# filter()
- `DataFrameGroupBy.filter(func, dropna=True, *args, **kwargs)`
- 특정 집계 조건을 만족하는 Group의 행들만 조회한다.
    1. DataFrameGroupBy의 group로 DataFrame을 함수에 전달한다.
    2. 함수는 받은 DataFrame을 이용해 집계한 값의 조건을 비교해서 반환한다.(반환타입: Bool) 
    3. 반환값이 True인 Group들의 모든 행들로 구성된 DataFrame을 반환한다.
- 매개변수
    - func: filtering 조건을 구현한 함수
        - 첫번째 매개변수로 Group으로 묶인 DataFrame을 받는다.
    - dropna=True
        - 필터를 통과하지 못한 group의 DataFrame의 값들을 drop시킨다. False로 설정하면 NA 처리해서 반환한다.
    - \*args, \*\*kwargs: filter 함수의 매개변수에 전달할 전달인자값.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# cnt1 - 사과: 10대, 귤: 20대, 배: 단단위, 딸기 30이상
data = dict(fruits=['사과', '사과','사과', '사과','사과','귤','귤','귤','귤','귤','배','배','배','배','배','딸기','딸기','딸기','딸기','딸기']
            ,cnt1=[10, 12, 13, 11, 12, 21, 22, 27, 24, 26, 7, 7, 8, 3, 2, 30, 35, 37, 41, 28]
            ,cnt2=[100,  103, 107, 107,  101,  51,  57, 58,  57, 51,  9, 9,  5,  7,  7,  208, 217, 213, 206, 204]
           )
df = pd.DataFrame(data)
df

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,100
1,사과,12,103
2,사과,13,107
3,사과,11,107
4,사과,12,101
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51


#### 과일중 cnt1의 평균이 20 이상인 과일들만 보기
- 집계결과가 아니라 조건을 만족하는 행들을 다 출력

In [15]:
r = df.groupby('fruits').mean()
r[r['cnt1'] > 20]
df[(df['fruits'] == '귤') | (df['fruits'] == '딸기')]

Unnamed: 0,fruits,cnt1,cnt2
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51
15,딸기,30,208
16,딸기,35,217
17,딸기,37,213
18,딸기,41,206
19,딸기,28,204


In [17]:
# df.groupby('fruits').filter(함수:그룹별 cnt1 평균이 20 이상이야???)
def check_cnt1_mean(X):
    """
    filter에 사용할 함수 
    매개변수: X - DataFrame (group별로 나뉜)
    반환값: bool - 특정 조건을 만족하는지 여부
    """
    return X['cnt1'].mean() >= 20

# group별로 특정 조건을 만족하는 행들을 조회 해주는 함수.
df.groupby('fruits').filter(check_cnt1_mean)

Unnamed: 0,fruits,cnt1,cnt2
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51
15,딸기,30,208
16,딸기,35,217
17,딸기,37,213
18,딸기,41,206
19,딸기,28,204


In [16]:
df.groupby('fruits').groups

{'귤': [5, 6, 7, 8, 9], '딸기': [15, 16, 17, 18, 19], '배': [10, 11, 12, 13, 14], '사과': [0, 1, 2, 3, 4]}

In [18]:
# lambda 식으로 구현
df.groupby('fruits').filter(lambda X : X['cnt1'].mean()>=20)

Unnamed: 0,fruits,cnt1,cnt2
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51
15,딸기,30,208
16,딸기,35,217
17,딸기,37,213
18,딸기,41,206
19,딸기,28,204


In [20]:
df.groupby('fruits').filter(check_cnt1_mean, dropna=False)

Unnamed: 0,fruits,cnt1,cnt2
0,,,
1,,,
2,,,
3,,,
4,,,
5,귤,21.0,51.0
6,귤,22.0,57.0
7,귤,27.0,58.0
8,귤,24.0,57.0
9,귤,26.0,51.0


In [21]:
df.groupby('fruits').filter(lambda X : X['cnt1'].mean()>=20, dropna=False)

Unnamed: 0,fruits,cnt1,cnt2
0,,,
1,,,
2,,,
3,,,
4,,,
5,귤,21.0,51.0
6,귤,22.0,57.0
7,귤,27.0,58.0
8,귤,24.0,57.0
9,귤,26.0,51.0


#### 매개변수 있는 filter함수

In [22]:
def check_mean(X, col_name, mean_thresh):
    """
    [Parameter]
        - X: DataFrame - group별 DataFrame
        - col_name: str - 평균을 계산할 컬럼명
        - mean_thresh: int - 컬럼의 평균값이 이 값 이상인지 비교할 기준값
    [return]  
        - bool: 컬럼의 평균이 mean_thresh 이상인지 여부
    """
    return X[col_name].mean() >= mean_thresh

In [25]:
df.groupby('fruits').mean()

Unnamed: 0_level_0,cnt1,cnt2
fruits,Unnamed: 1_level_1,Unnamed: 2_level_1
귤,24.0,54.8
딸기,34.2,209.6
배,5.4,7.4
사과,11.6,103.6


In [27]:
df.groupby('fruits').filter(check_mean, col_name='cnt2', mean_thresh=200)

Unnamed: 0,fruits,cnt1,cnt2
15,딸기,30,208
16,딸기,35,217
17,딸기,37,213
18,딸기,41,206
19,딸기,28,204


In [29]:
# df.groupby('fruits').filter(lambda X, col_name, thresh:X[col_name].mean()>=thresh,  col_name='cnt2', thresh=200)

Unnamed: 0,fruits,cnt1,cnt2
15,딸기,30,208
16,딸기,35,217
17,딸기,37,213
18,딸기,41,206
19,딸기,28,204


# transform
함수에 의해 처리된 값(반환값)으로 원래 값들을 변경(tranform) 해서 반환    
DataFrame에 Group 단위 통계량을 추가할 때 유용하다.
- `DataFrameGroupBy.transform(func, *args)`, `SeriesGroupBy.transform(func, *args)`
    - func: 매개변수로 그룹별로 Series를 받아 Series의 값들을 변환하여 (Series로)반환하는 함수객체
        - DataFrameGroupBy은 모든 컬럼의 값들을 group 별 Series로 전달한다.
    - *args: 함수에 전달할 추가 인자값이 있으면 매개변수 순서에 맞게 값을 전달한다. (위치기반 argument)
- transform() 함수를 groupby() 와 사용하면 컬럼의 각 원소들을 자신이 속한 그룹의 통계량으로 변환된 데이터셋을 생성할 수 있다.
- 컬럼의 값과 통계값을 비교해서 보거나 결측치 처리등에 사용할 수있다.

In [35]:
df.groupby('fruits').mean()

Unnamed: 0_level_0,cnt1,cnt2
fruits,Unnamed: 1_level_1,Unnamed: 2_level_1
귤,24.0,54.8
딸기,34.2,209.6
배,5.4,7.4
사과,11.6,103.6


In [36]:
df

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,100
1,사과,12,103
2,사과,13,107
3,사과,11,107
4,사과,12,101
5,귤,21,51
6,귤,22,57
7,귤,27,58
8,귤,24,57
9,귤,26,51


In [37]:
# agg(), filter(), transform() - DataFrameGroupby의 메소드: 판다스제공 집계함수-문자열, 사용자정의 집계함수-함수객체
r = df.groupby('fruits').transform('mean')
r['과일'] = df.fruits
r

Unnamed: 0,cnt1,cnt2,과일
0,11.6,103.6,사과
1,11.6,103.6,사과
2,11.6,103.6,사과
3,11.6,103.6,사과
4,11.6,103.6,사과
5,24.0,54.8,귤
6,24.0,54.8,귤
7,24.0,54.8,귤
8,24.0,54.8,귤
9,24.0,54.8,귤


In [38]:
def max_min_diff(X):
    """사용자정의 통계함수
    매개변수:  Series (그룹별로 넘어온다.)
    반환: 처리한 통계값
    """
    return X.max() - X.mean()

In [40]:
r = df.groupby('fruits').transform(max_min_diff)
r['과일'] = df.fruits
r

Unnamed: 0,cnt1,cnt2,과일
0,1.4,3.4,사과
1,1.4,3.4,사과
2,1.4,3.4,사과
3,1.4,3.4,사과
4,1.4,3.4,사과
5,3.0,3.2,귤
6,3.0,3.2,귤
7,3.0,3.2,귤
8,3.0,3.2,귤
9,3.0,3.2,귤


In [42]:
# lambda 식
df.groupby('fruits').transform(lambda X: X.max() - X.min())

Unnamed: 0,cnt1,cnt2
0,3,7
1,3,7
2,3,7
3,3,7
4,3,7
5,6,7
6,6,7
7,6,7
8,6,7
9,6,7


## 원본에 통계치 붙여서 비교하기

In [48]:
df2 = df.copy()

In [51]:
# cnt1의 과일별 평균을 df에 행별로 붙이기
cnt1_mean = df.groupby('fruits')['cnt1'].transform('mean')
cnt1_mean

0     11.6
1     11.6
2     11.6
3     11.6
4     11.6
5     24.0
6     24.0
7     24.0
8     24.0
9     24.0
10     5.4
11     5.4
12     5.4
13     5.4
14     5.4
15    34.2
16    34.2
17    34.2
18    34.2
19    34.2
Name: cnt1, dtype: float64

In [52]:
df.insert(2, 'cnt mean', cnt1_mean) # (컬럼을 삽입할 위치, 컬럼명, 컬럼값) => 원본변경

In [53]:
df

Unnamed: 0,fruits,cnt1,cnt mean,cnt2
0,사과,10,11.6,100
1,사과,12,11.6,103
2,사과,13,11.6,107
3,사과,11,11.6,107
4,사과,12,11.6,101
5,귤,21,24.0,51
6,귤,22,24.0,57
7,귤,27,24.0,58
8,귤,24,24.0,57
9,귤,26,24.0,51


In [60]:
# sampling=>표본추출. frac=샘플비율(0~1)
df = df.sample(frac=1).reset_index(drop=True)  #섞인 index명을 자동증가 정수로 변환.
df

Unnamed: 0,fruits,cnt1,cnt mean,cnt2
0,사과,12,11.6,101
1,배,8,5.4,5
2,귤,26,24.0,51
3,딸기,35,34.2,217
4,딸기,37,34.2,213
5,배,3,5.4,7
6,귤,24,24.0,57
7,딸기,30,34.2,208
8,귤,22,24.0,57
9,딸기,28,34.2,204


In [62]:
df['cnt2 mean'] = df.groupby('fruits')['cnt2'].transform('mean')
df

Unnamed: 0,fruits,cnt1,cnt mean,cnt2,cnt2 mean
0,사과,12,11.6,101,103.6
1,배,8,5.4,5,7.4
2,귤,26,24.0,51,54.8
3,딸기,35,34.2,217,209.6
4,딸기,37,34.2,213,209.6
5,배,3,5.4,7,7.4
6,귤,24,24.0,57,54.8
7,딸기,30,34.2,208,209.6
8,귤,22,24.0,57,54.8
9,딸기,28,34.2,204,209.6


## 결측치 처리
- 결측치: 모르는 값, 없는 값
- 결측치 처리
    1. 제거 (행/열단위)
        - 데이터양이 충분할 때 제거.
    2. 대체
        - 가장 가능성이 높은 값을 대체(평균, 중앙값, 최빈값)
        - 결측치를 표현하는 값으로 대체(없다라는 값으로 변경)



- transform이용해서 여기선 결측치를 같은 과일그룹의 평균값으로 변환
    - 전체 평균보다 좀더 정확할 수 있다.

In [65]:
data = dict(fruits=['사과', '사과','사과', '사과','사과','귤','귤','귤','귤','귤','배','배','배','배','배','딸기','딸기','딸기','딸기','딸기']
            ,cnt1=[10, 12, 13, 11, 12, 21, 22, 27, 24, 26, 7, 7, 8, 3, 2, 30, 35, 37, 41, 28]
            ,cnt2=[100,  103, 107, 107,  101,  51,  57, 58,  57, 51,  9, 9,  5,  7,  7,  208, 217, 213, 206, 204]
           )
df = pd.DataFrame(data)

df.loc[[0, 1, 5, 7, 10, 12, 15, 16], 'cnt2'] = None
df

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,
1,사과,12,
2,사과,13,107.0
3,사과,11,107.0
4,사과,12,101.0
5,귤,21,
6,귤,22,57.0
7,귤,27,
8,귤,24,57.0
9,귤,26,51.0


In [67]:
# 결측치 제거 - dropna()
df.dropna() # 결측치가 있는 행(기본)을 제거

Unnamed: 0,fruits,cnt1,cnt2
2,사과,13,107.0
3,사과,11,107.0
4,사과,12,101.0
6,귤,22,57.0
8,귤,24,57.0
9,귤,26,51.0
11,배,7,9.0
13,배,3,7.0
14,배,2,7.0
17,딸기,37,213.0


In [69]:
df.dropna(axis=1) # 결측치가 있는 열을 제거

Unnamed: 0,fruits,cnt1
0,사과,10
1,사과,12
2,사과,13
3,사과,11
4,사과,12
5,귤,21
6,귤,22
7,귤,27
8,귤,24
9,귤,26


In [73]:
df2 = df.copy()

In [74]:
# 대체 - fillna(대체값)
# 평균
df2['cnt2'] = df['cnt2'].fillna(round(df['cnt2'].mean(),2))  # 결측치를 대체한 것으로 변경.
df2

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,93.83
1,사과,12,93.83
2,사과,13,107.0
3,사과,11,107.0
4,사과,12,101.0
5,귤,21,93.83
6,귤,22,57.0
7,귤,27,93.83
8,귤,24,57.0
9,귤,26,51.0


In [77]:
a = pd.Series([1,2,None,3,None,4])
b = pd.Series([100,200,300,400,500,600])

a.fillna(10000)
a.fillna(b)  #a와 동일한 size의 Series를 넣으면 결측치와 동일한 index명의 값으로 결측치를 채운다.

0      1.0
1      2.0
2    300.0
3      3.0
4    500.0
5      4.0
dtype: float64

In [81]:
df['cnt2'].fillna(df.groupby('fruits')['cnt2'].transform('mean'), inplace=True)
df

Unnamed: 0,fruits,cnt1,cnt2
0,사과,10,105.0
1,사과,12,105.0
2,사과,13,107.0
3,사과,11,107.0
4,사과,12,101.0
5,귤,21,55.0
6,귤,22,57.0
7,귤,27,55.0
8,귤,24,57.0
9,귤,26,51.0


# TODO 
- data/diamonds.csv 조회

In [82]:
dia_df = pd.read_csv('data/diamonds.csv')
dia_df.shape

(53940, 10)

In [86]:
# cut 별 평균 가격이 4000 이상인 diamond 데이터들 조회 
r = dia_df.groupby('cut')['price'].mean()
r[r >= 4000]

cut
Fair       4358.757764
Premium    4584.257704
Name: price, dtype: float64

In [90]:
result = dia_df.groupby('cut').filter(lambda X:X['price'].mean()>=4000)
result['cut'].unique()

array(['Premium', 'Fair'], dtype=object)

In [91]:
result.head(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
12,0.22,Premium,F,SI1,60.4,61.0,342,3.88,3.84,2.33
14,0.2,Premium,E,SI2,60.2,62.0,345,3.79,3.75,2.27
15,0.32,Premium,E,I1,60.9,58.0,345,4.38,4.42,2.68
26,0.24,Premium,I,VS1,62.5,57.0,355,3.97,3.94,2.47
45,0.29,Premium,F,SI1,62.4,58.0,403,4.24,4.26,2.65
53,0.22,Premium,E,VS2,61.6,58.0,404,3.93,3.89,2.41
54,0.22,Premium,D,VS2,59.3,62.0,404,3.91,3.88,2.31


In [100]:
dia_df['color'][0]

'E'

In [106]:
# color 별 carat의 최대값과 최소값의 차이가 2이상 3미만인 모든 diamond 데이터들 조회
def carat_diff(X):
    # X: Group 별 DataFrame
    diff = X['carat'].max() - X['carat'].min()
    print(X['color'].iloc[0], diff, sep='-')
    return (diff >= 2) & (diff < 3)

# E,F,G

In [109]:
result = dia_df.groupby('color').filter(carat_diff)

D-3.1999999999999997
E-2.8499999999999996
F-2.8099999999999996
G-2.78
H-3.9
I-3.78
J-4.779999999999999


In [111]:
result['color'].unique()

array(['E', 'F', 'G'], dtype=object)

In [115]:
dia_df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [116]:
# clarity 별 평균 가격 컬럼을 DataFrame에 추가.
r = dia_df.groupby('clarity')['price'].transform('mean')
dia_df.insert(7, 'clarity price mean', r)

In [117]:
dia_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,clarity price mean,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,5063.028606,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3996.001148,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,3839.455391,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,3924.989395,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,5063.028606,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,3996.001148,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,3996.001148,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,3996.001148,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,5063.028606,6.15,6.12,3.74


# pivot_table()
엑셀의 pivot table 기능을 제공하는 메소드.    
분류별 집계(Group으로 묶어 집계)를 처리하는 함수로 group으로 묶고자 하는 컬럼을 행과 열로 위치시키고 집계값을 값으로 보여준다.    
역할은 groupby() 를 이용한 집계와 같다.

> pivot() 함수와 역할이 다르다.   
> pivot() 은 index와 column의 형태를 바꾸는 reshape 함수.

- `DataFrame.pivot_table(values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')`
- **매개변수**
    - index
        - 문자열 또는 리스트. index로 올 컬럼들 => groupby였으면 묶었을 컬럼
    - columns
        - 문자열 또는 리스트. column으로 올 컬럼들 => groupby였으면 묶었을 컬럼 (index/columns가 묶여서 groupby에 묶을 컬럼들이 된다.)
    - values
        - 문자열 또는 리스트. 집계할 대상 컬럼들
    - aggfunc
        - 집계함수 지정. 함수, 함수이름문자열, 함수리스트(함수이름 문자열/함수객체), dict: 집계할 함수
        - 기본(생략시): 평균을 구한다. (mean이 기본값)
    - fill_value, dropna
        - fill_value: 집계시 NA가 나올경우 채울 값
        - dropna: boolean. 컬럼의 전체값이 NA인 경우 그 컬럼 제거(기본: True)
    - margins/margins_name
        - margin: boolean(기본: False). 총집계결과를 만들지 여부.
        - margin_name: margin의 이름 문자열로 지정 (생략시 All)

In [118]:
flights = pd.read_csv('data/flights.csv')
flights.shape

(58492, 14)

In [119]:
flights.head(3)

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0


In [120]:
# AIRLINE, MONTH 별 AIR_TIME 평균 (두개 이상의 컬럼을 GROUP으로 묶어서 집계)
flights.groupby(['AIRLINE', 'MONTH'])['AIR_TIME'].mean()

AIRLINE  MONTH
AA       1        138.793103
         2        143.126560
         3        142.026163
         4        144.544267
         5        145.261538
                     ...    
WN       7        109.377500
         8        109.173248
         9        109.842239
         11       104.704244
         12       104.136304
Name: AIR_TIME, Length: 149, dtype: float64

## 1개의 컬럼을 grouping 해서 집계
- 항공사별 비행시간의 평균 
- 사용컬럼
    - grouping할 컬럼
        - AIRLINE: 항공사
    - 집계대상컬럼
        - AIR_TIME
- 집계: mean

In [121]:
# groupby
flights.groupby('AIRLINE')['AIR_TIME'].mean()

AIRLINE
AA    144.259404
AS    147.845052
B6    209.412963
DL    115.334187
EV     68.964016
F9    127.592337
HA    338.288288
MQ     61.318346
NK    135.736878
OO     76.010272
UA    155.650521
US    147.686755
VX    154.864097
WN    107.005897
Name: AIR_TIME, dtype: float64

In [127]:
flights.pivot_table(values='AIR_TIME',   #집계대상
                    index='AIRLINE',   # GROUPING할 대상 컬럼 중 INDEX로 올 컬럼 
#                     columns='AIRLINE',
                    aggfunc='mean' ,    #집계 함수 - default: mean
                    margins=True, # 총집계
                    margins_name='전체 평균'
                   )

Unnamed: 0_level_0,AIR_TIME
AIRLINE,Unnamed: 1_level_1
AA,144.259404
AS,147.845052
B6,209.412963
DL,115.334187
EV,68.964016
F9,127.592337
HA,338.288288
MQ,61.318346
NK,135.736878
OO,76.010272


In [128]:
# 여러 집계값/집계대상컬럼/index/columns의 컬럼을 여러개 넣을 경우 리스트로 묶어서 전달.
flights.pivot_table(values='AIR_TIME', index='AIRLINE', aggfunc=['min', 'median','max'])

Unnamed: 0_level_0,min,median,max
Unnamed: 0_level_1,AIR_TIME,AIR_TIME,AIR_TIME
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
AA,29.0,132.0,513.0
AS,58.0,131.0,402.0
B6,40.0,256.0,344.0
DL,22.0,93.0,577.0
EV,14.0,65.0,212.0
F9,55.0,118.0,296.0
HA,281.0,336.0,411.0
MQ,16.0,53.0,199.0
NK,39.0,127.0,295.0
OO,8.0,67.0,258.0


In [130]:
flights.pivot_table(values=['AIR_TIME', 'ARR_DELAY', 'DEP_DELAY'], index='AIRLINE', aggfunc=['min', 'median','max'])

Unnamed: 0_level_0,min,min,min,median,median,median,max,max,max
Unnamed: 0_level_1,AIR_TIME,ARR_DELAY,DEP_DELAY,AIR_TIME,ARR_DELAY,DEP_DELAY,AIR_TIME,ARR_DELAY,DEP_DELAY
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
AA,29.0,-60.0,-15.0,132.0,-5.0,-2.0,513.0,858.0,835.0
AS,58.0,-57.0,-22.0,131.0,-6.0,-5.0,402.0,344.0,338.0
B6,40.0,-51.0,-19.0,256.0,-2.0,0.0,344.0,331.0,348.0
DL,22.0,-57.0,-17.0,93.0,-7.0,-1.0,577.0,741.0,755.0
EV,14.0,-39.0,-17.0,65.0,-3.0,-2.0,212.0,669.0,672.0
F9,55.0,-43.0,-19.0,118.0,-1.0,-1.0,296.0,839.0,852.0
HA,281.0,-44.0,-24.0,336.0,0.0,-4.0,411.0,298.0,298.0
MQ,16.0,-37.0,-12.0,53.0,-4.0,-1.0,199.0,357.0,342.0
NK,39.0,-39.0,-16.0,127.0,2.0,0.0,295.0,474.0,461.0
OO,8.0,-44.0,-20.0,67.0,-3.0,-2.0,258.0,724.0,735.0


## 두개의 컬럼을 grouping 해서 집계
- 항공사/출발공항코드 별 취소 총수 (1이 취소이므로 합계를 구한다.)
- 사용컬럼
    - grouping할 컬럼
        - AIRLINE: 항공사
        - ORG_AIR: 출발 공항코드
    - 집계대상컬럼
        - CANCELLED: 취소여부 - 1:취소, 0: 취소안됨
- 집계: sum

In [131]:
flights.groupby(['AIRLINE', 'ORG_AIR'])['CANCELLED'].sum()

AIRLINE  ORG_AIR
AA       ATL         3
         DEN         4
         DFW        86
         IAH         3
         LAS         3
                    ..
WN       LAS         7
         LAX        32
         MSP         1
         PHX         6
         SFO        25
Name: CANCELLED, Length: 114, dtype: int64

In [137]:
flights.pivot_table(values='CANCELLED', 
                    index='AIRLINE', columns='ORG_AIR', # 그룹으로 묶을 컬럼들을 INDEX, COLOUMNS지정
                    aggfunc='sum', 
                    margins=True
#                     fill_value=-10000  # NaN을 지정한 값으로 변경
                    
                   )
# NaN인 것은 그 group들의 값을 가진 행이 없는 경우.

ORG_AIR,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO,All
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AA,3.0,4.0,86.0,3.0,3.0,11.0,3.0,35.0,4.0,2.0,154
AS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
B6,,0.0,0.0,,0.0,0.0,,0.0,0.0,1.0,1
DL,28.0,1.0,0.0,0.0,1.0,1.0,4.0,0.0,1.0,2.0,38
EV,18.0,6.0,27.0,36.0,,,6.0,53.0,0.0,,146
F9,0.0,2.0,1.0,0.0,1.0,1.0,1.0,4.0,0.0,0.0,10
HA,,,,,0.0,0.0,,,0.0,0.0,0
MQ,5.0,,62.0,0.0,,0.0,0.0,85.0,,,152
NK,1.0,1.0,6.0,0.0,1.0,1.0,3.0,10.0,2.0,,25
OO,3.0,25.0,2.0,10.0,0.0,15.0,4.0,41.0,9.0,33.0,142


In [139]:
# 회항, 취소 총 건수
flights.pivot_table(values=['DIVERTED', 'CANCELLED'], index='AIRLINE', columns='ORG_AIR', aggfunc='sum')

Unnamed: 0_level_0,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,CANCELLED,DIVERTED,DIVERTED,DIVERTED,DIVERTED,DIVERTED,DIVERTED,DIVERTED,DIVERTED,DIVERTED,DIVERTED
ORG_AIR,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO,ATL,DEN,DFW,IAH,LAS,LAX,MSP,ORD,PHX,SFO
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
AA,3.0,4.0,86.0,3.0,3.0,11.0,3.0,35.0,4.0,2.0,1.0,1.0,8.0,0.0,3.0,1.0,0.0,4.0,4.0,4.0
AS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B6,,0.0,0.0,,0.0,0.0,,0.0,0.0,1.0,,0.0,0.0,,1.0,0.0,,1.0,0.0,0.0
DL,28.0,1.0,0.0,0.0,1.0,1.0,4.0,0.0,1.0,2.0,18.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
EV,18.0,6.0,27.0,36.0,,,6.0,53.0,0.0,,6.0,0.0,3.0,4.0,,,1.0,1.0,0.0,
F9,0.0,2.0,1.0,0.0,1.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
HA,,,,,0.0,0.0,,,0.0,0.0,,,,,0.0,1.0,,,0.0,0.0
MQ,5.0,,62.0,0.0,,0.0,0.0,85.0,,,0.0,,2.0,0.0,,0.0,0.0,3.0,,
NK,1.0,1.0,6.0,0.0,1.0,1.0,3.0,10.0,2.0,,0.0,0.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,
OO,3.0,25.0,2.0,10.0,0.0,15.0,4.0,41.0,9.0,33.0,0.0,5.0,0.0,2.0,0.0,5.0,3.0,3.0,1.0,2.0


## 3개 이상의 컬럼을 grouping해서 집계
- 항공사/월/출발공항코드 별 취소 총수 
- grouping할 컬럼
    - AIRLINE:항공사
    - MONTH:월
    - ORG_AIR: 출발지 공항
- 집계 대상컬럼
    - CANCELLED: 취소여부
- 집계 : sum    

In [142]:
flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR'])['CANCELLED'].sum()#[:20]

AIRLINE  MONTH  ORG_AIR
AA       1      ATL        0
                DEN        0
                DFW        8
                IAH        0
                LAS        0
                          ..
WN       12     LAS        1
                LAX        2
                MSP        0
                PHX        0
                SFO        0
Name: CANCELLED, Length: 1133, dtype: int64

In [145]:
result = flights.pivot_table(values='CANCELLED', 
                            index=['AIRLINE', 'ORG_AIR'], 
                            columns='MONTH', 
                            aggfunc='sum')
result.to_csv('save_data/flights_stat.csv')

In [148]:
pd.options.display.max_rows = 200
pd.options.display.max_rows

200

In [151]:
pd.options.display.max_columns = 60
pd.options.display.max_columns

60

In [149]:
result

Unnamed: 0_level_0,MONTH,1,2,3,4,5,6,7,8,9,11,12
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AA,ATL,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AA,DEN,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
AA,DFW,8.0,33.0,13.0,4.0,8.0,7.0,1.0,2.0,1.0,3.0,6.0
AA,IAH,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
AA,LAS,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
AA,LAX,1.0,3.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0
AA,MSP,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
AA,ORD,0.0,7.0,1.0,5.0,2.0,4.0,3.0,3.0,0.0,6.0,4.0
AA,PHX,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
AA,SFO,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## 3개 이상의 컬럼을 grouping해서 집계 2
- 항공사/월/출발공항코드 별 최대/최소 연착시간
- grouping할 컬럼
    - AIRLINE:항공사
    - MONTH:월
    - ORG_AIR: 출발지 공항
- 집계 대상컬럼
    - ARR_DELAY: 연착시간
- 집계 : min, max    

In [153]:
flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR'])['ARR_DELAY'].agg(['min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min,max
AIRLINE,MONTH,ORG_AIR,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,1,ATL,-27.0,26.0
AA,1,DEN,-13.0,78.0
AA,1,DFW,-39.0,287.0
AA,1,IAH,-23.0,63.0
AA,1,LAS,-32.0,732.0
...,...,...,...,...
WN,12,LAS,-52.0,96.0
WN,12,LAX,-30.0,493.0
WN,12,MSP,-23.0,90.0
WN,12,PHX,-30.0,254.0


In [155]:
flights.pivot_table(values='ARR_DELAY', index=['AIRLINE', 'ORG_AIR'], columns='MONTH', aggfunc=['min', 'max'],
                   margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,min,min,min,min,min,min,min,min,min,min,min,max,max,max,max,max,max,max,max,max,max,max,max
Unnamed: 0_level_1,MONTH,1,2,3,4,5,6,7,8,9,11,12,All,1,2,3,4,5,6,7,8,9,11,12,All
AIRLINE,ORG_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
AA,ATL,-27.0,-26.0,-32.0,-30.0,-26.0,-23.0,-27.0,-32.0,-33.0,-33.0,-31.0,-33.0,26.0,16.0,25.0,115.0,25.0,159.0,319.0,84.0,196.0,255.0,203.0,319.0
AA,DEN,-13.0,-17.0,-19.0,-28.0,-20.0,-13.0,-30.0,-27.0,-27.0,-27.0,-40.0,-40.0,78.0,40.0,53.0,105.0,330.0,10.0,67.0,257.0,152.0,146.0,106.0,330.0
AA,DFW,-39.0,-29.0,-29.0,-37.0,-36.0,-33.0,-32.0,-32.0,-33.0,-45.0,-42.0,-45.0,287.0,311.0,234.0,275.0,285.0,602.0,203.0,268.0,241.0,349.0,293.0,602.0
AA,IAH,-23.0,-27.0,-13.0,-27.0,-19.0,-13.0,-19.0,-30.0,-31.0,-14.0,-15.0,-31.0,63.0,51.0,97.0,127.0,131.0,456.0,858.0,95.0,73.0,98.0,103.0,858.0
AA,LAS,-32.0,-25.0,-24.0,-19.0,-31.0,-27.0,-40.0,-28.0,-32.0,-31.0,-33.0,-40.0,732.0,20.0,111.0,626.0,54.0,206.0,157.0,157.0,36.0,89.0,219.0,732.0
AA,LAX,-42.0,-41.0,-31.0,-40.0,-41.0,-39.0,-42.0,-49.0,-38.0,-37.0,-45.0,-49.0,85.0,162.0,223.0,145.0,115.0,272.0,725.0,185.0,162.0,144.0,473.0,725.0
AA,MSP,-34.0,-32.0,-38.0,-10.0,-15.0,-17.0,-26.0,-22.0,-28.0,-33.0,-41.0,-41.0,106.0,95.0,3.0,20.0,76.0,125.0,40.0,14.0,17.0,142.0,50.0,142.0
AA,ORD,-38.0,-34.0,-34.0,-52.0,-32.0,-43.0,-35.0,-36.0,-46.0,-50.0,-41.0,-52.0,343.0,208.0,248.0,158.0,214.0,352.0,393.0,206.0,311.0,280.0,473.0,473.0
AA,PHX,-15.0,-16.0,-24.0,-22.0,-17.0,-26.0,-26.0,-26.0,-27.0,-35.0,-60.0,-60.0,117.0,81.0,87.0,82.0,339.0,55.0,215.0,551.0,226.0,263.0,155.0,551.0
AA,SFO,-20.0,-31.0,-31.0,-33.0,-45.0,-33.0,-46.0,-39.0,-38.0,-29.0,-37.0,-46.0,93.0,223.0,160.0,37.0,35.0,130.0,229.0,406.0,324.0,69.0,120.0,406.0


# apply() - Series, DataFrame의 데이터 일괄 처리

데이터프레임의 행들과 열들 또는 Series의 원소들에 공통된 처리를 할 때 apply 함수를 이용하면 반복문을 사용하지 않고 일괄 처리가 가능하다.

- DataFrame.apply(함수, axis=0, args=())
    - 인수로 행이나 열을 받는 함수를 apply 메서드의 인수로 넣으면 데이터프레임의 행이나 열들을 하나씩 함수에 전달한다.
    - 매개변수
        - 함수: DataFrame의 행들 또는 열들을 전달할 함수
        - axis: **0-행을 전달, 1-열을 전달 (기본값 0)** G: 0이 행이다...
        - args: 행/열 이외에 전달할 매개변수를 위치기반(순서대로) 튜플로 전달
- Series.apply(함수, args=())
    - 인수로 Series의 원소들을 받는 함수를 apply 메소드의 인수로 넣으면  Series의 원소들을 하나씩 함수로 전달한다.
    - 매개변수
        - 함수: Series의 원소들을 전달할 함수
        - args: 원소 이외에 전달할 매개변수를 위치기반(순서대로) 튜플로 전달

In [161]:
import numpy as np
import pandas as pd

arr = np.arange(24).reshape(6,4)
df = pd.DataFrame(arr, columns=['no1', 'no2', 'no3', 'no4'])
df

Unnamed: 0,no1,no2,no3,no4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23


In [176]:
# apply에 전달할 함수
def func(X):
    """
    매개변수 X 
    - dataframe.apply() -> Series 가 전달됨. (DF의 한행 또는 한열)
    - series.apply() -> series 원소가 전달.
    """
    print(X)
    print('---------------------------')
    return X**2

In [177]:
df.apply(func)

0     0
1     4
2     8
3    12
4    16
5    20
Name: no1, dtype: int32
---------------------------
0     1
1     5
2     9
3    13
4    17
5    21
Name: no2, dtype: int32
---------------------------
0     2
1     6
2    10
3    14
4    18
5    22
Name: no3, dtype: int32
---------------------------
0     3
1     7
2    11
3    15
4    19
5    23
Name: no4, dtype: int32
---------------------------


Unnamed: 0,no1,no2,no3,no4
0,0,1,4,9
1,16,25,36,49
2,64,81,100,121
3,144,169,196,225
4,256,289,324,361
5,400,441,484,529


In [178]:
df.apply(func, axis=1) #axis=1: 행단위로 전달

no1    0
no2    1
no3    2
no4    3
Name: 0, dtype: int32
---------------------------
no1    4
no2    5
no3    6
no4    7
Name: 1, dtype: int32
---------------------------
no1     8
no2     9
no3    10
no4    11
Name: 2, dtype: int32
---------------------------
no1    12
no2    13
no3    14
no4    15
Name: 3, dtype: int32
---------------------------
no1    16
no2    17
no3    18
no4    19
Name: 4, dtype: int32
---------------------------
no1    20
no2    21
no3    22
no4    23
Name: 5, dtype: int32
---------------------------


Unnamed: 0,no1,no2,no3,no4
0,0,1,4,9
1,16,25,36,49
2,64,81,100,121
3,144,169,196,225
4,256,289,324,361
5,400,441,484,529


In [175]:
df

Unnamed: 0,no1,no2,no3,no4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23


In [179]:
# 추가 매개변수가 있는 경우
def func2(X, value):
    return X + value

In [182]:
df.apply(func2, value=100)

Unnamed: 0,no1,no2,no3,no4
0,100,101,102,103
1,104,105,106,107
2,108,109,110,111
3,112,113,114,115
4,116,117,118,119
5,120,121,122,123


In [183]:
df.apply(lambda X: X * 2)

Unnamed: 0,no1,no2,no3,no4
0,0,2,4,6
1,8,10,12,14
2,16,18,20,22
3,24,26,28,30
4,32,34,36,38
5,40,42,44,46


In [186]:
df['no1']

0     0
1     4
2     8
3    12
4    16
5    20
Name: no1, dtype: int32

In [185]:
df['no1'].apply(func)

0
---------------------------
4
---------------------------
8
---------------------------
12
---------------------------
16
---------------------------
20
---------------------------


0      0
1     16
2     64
3    144
4    256
5    400
Name: no1, dtype: int64

In [188]:
df.apply(lambda X: X.min())

no1    0
no2    1
no3    2
no4    3
dtype: int32

In [187]:
df

Unnamed: 0,no1,no2,no3,no4
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23


# cut()/qcut() - 연속형(실수)을 범주형으로 변환
- cut() : 지정한 값을 기준으로 구간을 나눠 그룹으로 묶는다.
    - `pd.cut(x, bins,right=True, labels=None)`
    - 매개변수
        - x: 범주형으로 바꿀 대상. 1차원 배열형태(Series, 리스트, ndarray)의 자료구조
        - bins: 범주로 나눌때의 기준값(구간경계)들을 리스트로 묶어서 전달한다.
        - right: 구간경계의 오른쪽(True-기본)을 포함할지 왼쪽(False)을 포함할지
        - labels: 각 구간(범주)의 label을 리스트로 전달
            - 생략하면 범위를 범주명으로 사용한다. (ex: (10, 20], ()-포함안함, []-포함)
- qcut() :  대상배열의 최대값 ~ 최소값을 지정한 개수의 동등한 size(**원소의개수**)가 되도록 나눈다.
    - `pd.qcut(x, q, labels)`
    - 매개변수
        - x: 나눌 대상. 1차원 배열형태의 자료구조
        - q: 나눌 개수
        - labels: 각 구간(범주)의 label을 리스트로 전달

In [208]:
ages = pd.Series(np.random.randint(50, size=30))  # 0 ~ 50 숫자중에 30개의 정수를 random하게 생성
ages

0      0
1      0
2     17
3     16
4     31
5      5
6     41
7     27
8     41
9     32
10    34
11    10
12     4
13    29
14    14
15    38
16    32
17    29
18    47
19     3
20     8
21    11
22    37
23     7
24     5
25    12
26     2
27    27
28    42
29    30
dtype: int32

In [210]:
ages.value_counts().sort_index()

0     2
2     1
3     1
4     1
5     2
7     1
8     1
10    1
11    1
12    1
14    1
16    1
17    1
27    2
29    2
30    1
31    1
32    2
34    1
37    1
38    1
41    2
42    1
47    1
dtype: int64

In [212]:
ages

0      0
1      0
2     17
3     16
4     31
5      5
6     41
7     27
8     41
9     32
10    34
11    10
12     4
13    29
14    14
15    38
16    32
17    29
18    47
19     3
20     8
21    11
22    37
23     7
24     5
25    12
26     2
27    27
28    42
29    30
dtype: int32

In [213]:
# cut()을 이용해 구간별로 그룹화
bins = [-1, 10, 30, 40, 50]
# -1~10, 10~30, 30~40, 40~50
# right:False - 작은쪽을 포함, True(기본): 큰쪽을 포함
pd.cut(ages, bins=bins, right=False)  #ages의 각 index의 값이 어느 그룹에 포함되는지를 반환

# (): 포함안함, []: 포함
# (-1, 10]   -1은 포함안함, 10은 포함

0     [-1, 10)
1     [-1, 10)
2     [10, 30)
3     [10, 30)
4     [30, 40)
5     [-1, 10)
6     [40, 50)
7     [10, 30)
8     [40, 50)
9     [30, 40)
10    [30, 40)
11    [10, 30)
12    [-1, 10)
13    [10, 30)
14    [10, 30)
15    [30, 40)
16    [30, 40)
17    [10, 30)
18    [40, 50)
19    [-1, 10)
20    [-1, 10)
21    [10, 30)
22    [30, 40)
23    [-1, 10)
24    [-1, 10)
25    [10, 30)
26    [-1, 10)
27    [10, 30)
28    [40, 50)
29    [30, 40)
dtype: category
Categories (4, interval[int64, left]): [[-1, 10) < [10, 30) < [30, 40) < [40, 50)]

In [215]:
labels = ['10세이하', '10,20대', '30대', '40대']
# -1~10, 10~30, 30~40, 40~50
age_cate = pd.cut(ages, bins=bins, labels=labels)
age_cate

0      10세이하
1      10세이하
2     10,20대
3     10,20대
4        30대
5      10세이하
6        40대
7     10,20대
8        40대
9        30대
10       30대
11     10세이하
12     10세이하
13    10,20대
14    10,20대
15       30대
16       30대
17    10,20대
18       40대
19     10세이하
20     10세이하
21    10,20대
22       30대
23     10세이하
24     10세이하
25    10,20대
26     10세이하
27    10,20대
28       40대
29    10,20대
dtype: category
Categories (4, object): ['10세이하' < '10,20대' < '30대' < '40대']

In [217]:
age_df = pd.DataFrame({'나이':ages, '나이대':age_cate})
age_df

Unnamed: 0,나이,나이대
0,0,10세이하
1,0,10세이하
2,17,"10,20대"
3,16,"10,20대"
4,31,30대
5,5,10세이하
6,41,40대
7,27,"10,20대"
8,41,40대
9,32,30대


In [220]:
age_df.groupby('나이대')['나이'].mean()

나이대
10세이하      4.40
10,20대    21.20
30대       34.00
40대       42.75
Name: 나이, dtype: float64

In [223]:
# qcut() - N등분을 지정하면 동일한 원소의 개수로 N 개의 그룹으로 나눈다.
pd.qcut(ages, 3)

0     (-0.001, 10.667]
1     (-0.001, 10.667]
2     (10.667, 30.333]
3     (10.667, 30.333]
4       (30.333, 47.0]
5     (-0.001, 10.667]
6       (30.333, 47.0]
7     (10.667, 30.333]
8       (30.333, 47.0]
9       (30.333, 47.0]
10      (30.333, 47.0]
11    (-0.001, 10.667]
12    (-0.001, 10.667]
13    (10.667, 30.333]
14    (10.667, 30.333]
15      (30.333, 47.0]
16      (30.333, 47.0]
17    (10.667, 30.333]
18      (30.333, 47.0]
19    (-0.001, 10.667]
20    (-0.001, 10.667]
21    (10.667, 30.333]
22      (30.333, 47.0]
23    (-0.001, 10.667]
24    (-0.001, 10.667]
25    (10.667, 30.333]
26    (-0.001, 10.667]
27    (10.667, 30.333]
28      (30.333, 47.0]
29    (10.667, 30.333]
dtype: category
Categories (3, interval[float64, right]): [(-0.001, 10.667] < (10.667, 30.333] < (30.333, 47.0]]

In [225]:
age_cate2 = pd.qcut(ages, 3, labels=['상', '중', '하'])
age_cate2

0     상
1     상
2     중
3     중
4     하
5     상
6     하
7     중
8     하
9     하
10    하
11    상
12    상
13    중
14    중
15    하
16    하
17    중
18    하
19    상
20    상
21    중
22    하
23    상
24    상
25    중
26    상
27    중
28    하
29    중
dtype: category
Categories (3, object): ['상' < '중' < '하']

In [226]:
df2 = pd.DataFrame({'나이':ages, '나이대':age_cate2})
df2

Unnamed: 0,나이,나이대
0,0,상
1,0,상
2,17,중
3,16,중
4,31,하
5,5,상
6,41,하
7,27,중
8,41,하
9,32,하


In [227]:
df2.groupby('나이대')['나이'].count()

나이대
상    10
중    10
하    10
Name: 나이, dtype: int64

# TODO

In [4]:
import pandas as pd

In [5]:
# TODO 1: data/diamonds.csv 를 읽어 DataFrame으로 만든다.
dia_df = pd.read_csv('data/diamonds.csv')
dia_df.shape

(53940, 10)

In [6]:
# TODO 2: price 컬럼을 '고가', '중가', '저가' 세개의 범주값을 가지는 "price_cate" 컬럼을 생성한다.
r = dia_df['price'].agg(['min', 'max'])
print(r)
# min      326
# max    18823
# 이므로 0 ~ 5000: 저가, 5000 ~ 15000: 중가, 15000 ~ : 고가로 선택

min      326
max    18823
Name: price, dtype: int64


In [7]:
dia_df['price_cate'] = pd.cut(dia_df['price'], bins=[0, 5000, 15000, 20000], labels=['저가', '중가', '고가'])
dia_df

# 원소 개수를 기반으로 3개로 나눔 ([(325.999, 1240.0] < (1240.0, 4287.333] < (4287.333, 18823.0]])
dia_df['price_cate2'] = pd.qcut(dia_df['price'], 3, labels=['저가', '중가', '고가'])

In [8]:
dia_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,price_cate,price_cate2
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,저가,저가
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,저가,저가
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,저가,저가
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,저가,저가
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,저가,저가
...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,저가,중가
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,저가,중가
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,저가,중가
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,저가,중가


In [9]:
# TODO 3 가격대(price_cate) 별 carat의 평균을 조회
dia_df.groupby('price_cate')['carat'].mean()

price_cate
저가    0.572899
중가    1.324372
고가    1.977861
Name: carat, dtype: float64

In [10]:
dia_df.groupby('price_cate2')['carat'].mean()

price_cate2
저가    0.352691
중가    0.709712
고가    1.331734
Name: carat, dtype: float64

In [11]:
# TODO 4 가격대(price_cate)와 cut별 평균 가격(price)를 피봇테이블로 조회
dia_df.pivot_table(values='price', index='cut', columns='price_cate', margins=True)

price_cate,저가,중가,고가,All
cut,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fair,2651.319797,8268.131783,16682.268293,4358.757764
Good,2226.222374,8049.896583,16778.44186,3928.864452
Ideal,1688.642702,8445.635608,16803.335217,3457.54197
Premium,2024.964845,8474.121308,16778.272572,4584.257704
Very Good,1997.429797,8230.669355,16778.234332,3981.759891
All,1914.472289,8366.697756,16783.940181,3932.799722


In [13]:
# TODO 5 cut, color, price_cate 별 carat의 평균을 피봇테이블로 조회
dia_df.pivot_table(values='carat', index=['cut', 'color'], columns='price_cate', margins=True)

Unnamed: 0_level_0,price_cate,저가,중가,고가,All
cut,color,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fair,D,0.773871,1.244545,2.158333,0.920123
Fair,E,0.743352,1.31,1.833333,0.856607
Fair,F,0.751926,1.387541,2.022857,0.904712
Fair,G,0.806383,1.639865,2.126,1.023822
Fair,H,0.910508,1.737216,2.392222,1.219175
Fair,I,0.926694,1.751818,2.524286,1.198057
Fair,J,0.918026,1.933333,3.6075,1.341176
Good,D,0.617414,1.175,1.609286,0.744517
Good,E,0.608738,1.201341,1.854706,0.745134
Good,F,0.660458,1.218344,1.955625,0.77593
