# 예제 6-15 데이터 집계

In [1]:
import pandas as pd
import seaborn as sns

### titanic 데이터셋에서 5개 열을 선택하여 데이터프레임 만들기 

In [2]:
# Load the titanic example dataset through seaborn and keep only the five
# columns that the grouping examples below work with.
titanic = sns.load_dataset('titanic')
selected_columns = ['age', 'sex', 'class', 'fare', 'survived']
df = titanic.loc[:, selected_columns]

### class 열을 기준으로 분할 

In [3]:
# Group by the plain column label rather than a one-element list: with a
# length-1 list, pandas >= 2.0 yields 1-tuple keys such as ('First',) when the
# groups are iterated, which would not match the scalar keys in the recorded
# outputs further down.
grouped = df.groupby('class')

### 각 그룹에 대한 모든 숫자형 열의 표준편차를 집계한 후 데이터프레임으로 반환

In [5]:
# Aggregate only the numeric columns. 'sex' holds strings, and pandas >= 2.0
# raises TypeError for it instead of silently dropping the column as the
# older release that produced the recorded output did. Explicit column
# selection behaves the same on every pandas version.
std_all = grouped[['age', 'fare', 'survived']].std()
std_all

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,14.802856,78.380373,0.484026
Second,14.001077,13.417399,0.500623
Third,12.495398,11.778142,0.428949


### 각 그룹에 대한 fare 열의 표준편차를 집계하여 시리즈로 반환 

In [6]:
# Picking a single column from the group object gives a per-group Series.
std_fare = grouped['fare'].std()
std_fare

class
First     78.380373
Second    13.417399
Third     11.778142
Name: fare, dtype: float64

### 그룹 객체에 agg() 메소드 적용 - 사용자 정의 함수를 인자로 전달 

In [7]:
def min_max(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` only needs to provide ``max()`` and ``min()`` methods whose results
    can be subtracted; in this notebook it is applied to each column of each
    group through ``agg()``.
    """
    largest = x.max()
    smallest = x.min()
    return largest - smallest

### 각 그룹의 최대값과 최소값의 차이를 계산하여 그룹별로 집계 

In [8]:
# min_max subtracts values, which is undefined for the string column 'sex'.
# pandas >= 2.0 raises TypeError there rather than dropping the column, so
# apply the function to the numeric columns only.
agg_minmax = grouped[['age', 'fare', 'survived']].agg(min_max)
agg_minmax.head()

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,79.08,512.3292,1
Second,69.33,73.5,1
Third,73.58,69.55,1


### 여러 함수를 각 열에 동일하게 적용하여 집계 

# Passing a list of function names applies each of them to every column.
shared_functions = ['min', 'max']
agg_all = grouped.agg(shared_functions)
agg_all

### 각 열마다 다른 함수를 적용하여 집계 

In [10]:
# A dict maps each column to its own aggregation(s): min and max for fare,
# mean for age.
per_column_functions = {'fare': ['min', 'max'], 'age': 'mean'}
agg_sep = grouped.agg(per_column_functions)
agg_sep.head()

Unnamed: 0_level_0,fare,fare,age
Unnamed: 0_level_1,min,max,mean
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,0.0,512.3292,38.233441
Second,0.0,73.5,29.87763
Third,0.0,69.55,25.14062


# 예제 6-16 그룹 연산 데이터 변환 

### 그룹별 age 열의 평균 집계 연산 

In [11]:
# Per-class mean of the age column, indexed by class label.
age_mean = grouped['age'].mean()
age_mean

class
First     38.233441
Second    29.877630
Third     25.140620
Name: age, dtype: float64

### 그룹별 age 열의 표준편차 집계 연산 

In [12]:
# Per-class standard deviation of the age column, indexed by class label.
age_std = grouped['age'].std()
age_std

class
First     14.802856
Second    14.001077
Third     12.495398
Name: age, dtype: float64

### 그룹 객체의 age 열을 반복문으로 z-score 계산 후 출력 

In [14]:
# Standardise each class's ages with that class's own mean and standard
# deviation, then print the first three values per class.
for class_label, ages in grouped.age:
    center = age_mean.loc[class_label]
    scale = age_std.loc[class_label]
    zscores = (ages - center) / scale
    print('* origin :', class_label)
    print(zscores.head(3))
    print('\n')

* origin : First
1   -0.015770
3   -0.218434
6    1.065103
Name: age, dtype: float64


* origin : Second
9    -1.134029
15    1.794317
17         NaN
Name: age, dtype: float64


* origin : Third
0   -0.251342
2    0.068776
4    0.789041
Name: age, dtype: float64




### z-score를 계산하는 사용자 함수 정의 

In [15]:
def z_score(x):
    """Standardise ``x`` by subtracting its mean and dividing by its std.

    ``x`` must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic; in this notebook it receives the age values of one group at a
    time via ``transform()``.
    """
    centered = x - x.mean()
    scale = x.std()
    return centered / scale

### transform() 메소드를 이용하여 age 열의 데이터를 z-score로 변환 

In [16]:
# transform() runs z_score on each class's ages and returns the values
# aligned with the original rows.
age_zscore = grouped['age'].transform(z_score)

### 1, 2, 3 그룹의 첫 데이터 확인(변환 결과)

In [17]:
# Labels 1, 9 and 0 are the first rows printed for First, Second and Third in
# the explicit loop earlier, so these values can be compared with that output.
age_zscore.loc[[1, 9, 0]]

1   -0.015770
9   -1.134029
0   -0.251342
Name: age, dtype: float64

### transform 메소드 반환 값의 길이 

In [18]:
# transform() returns one value per input row (891 in the recorded result),
# not one value per group.
len(age_zscore)

891

### transform 메소드 반환 값 출력(첫 10개) 

In [19]:
# .loc slices by label and includes both endpoints, so 0:9 shows ten rows.
age_zscore.loc[0:9]

0   -0.251342
1   -0.015770
2    0.068776
3   -0.218434
4    0.789041
5         NaN
6    1.065103
7   -1.851931
8    0.148805
9   -1.134029
Name: age, dtype: float64

### transform 메소드 반환 객체의 자료형 

In [20]:
# Check which container type transform() returned for the single age column.
type(age_zscore)

pandas.core.series.Series

# 예제 6-17 그룹 객체 필터링 

### 데이터 개수가 200개 이상인 그룹만을 필터링 

In [23]:
# Reload the dataset and rebuild the five-column frame.
# NOTE(review): `grouped` is not rebuilt from this new `df` in the cells shown
# here; the filter examples below still use the earlier group object.
titanic = sns.load_dataset('titanic')
selected_columns = ['age', 'sex', 'class', 'fare', 'survived']
df = titanic.loc[:, selected_columns]

In [24]:
# Keep only the rows whose class has at least 200 members.
grouped_filter = grouped.filter(lambda members: len(members) >= 200)
grouped_filter.head()

Unnamed: 0,age,sex,fare,survived
0,22.0,male,7.25,0
1,38.0,female,71.2833,1
2,26.0,female,7.925,1
3,35.0,female,53.1,1
4,35.0,male,8.05,0


### age 열의 평균이 30보다 작은 그룹만 필터링

In [25]:
# Keep only the rows whose class has a mean age below 30.
age_filter = grouped.filter(lambda members: members['age'].mean() < 30)
age_filter.tail()

Unnamed: 0,age,sex,fare,survived
884,25.0,male,7.05,0
885,39.0,female,29.125,0
886,27.0,male,13.0,0
888,,female,23.45,0
890,32.0,male,7.75,0


# 예제 6-18 그룹 객체에 함수 매핑

### 집계: 각 그룹별 요약 통계 정보 집계 

In [32]:
# apply() calls the function once per class; describe() returns a summary
# table per class, and the tables are stacked under their class label.
agg_grouped = grouped.apply(lambda frame: frame.describe())
agg_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,count,186.0,216.0,216.0
First,mean,38.233441,84.154687,0.62963
First,std,14.802856,78.380373,0.484026
First,min,0.92,0.0,0.0
First,25%,27.0,30.92395,0.0
First,50%,37.0,60.2875,1.0
First,75%,49.0,93.5,1.0
First,max,80.0,512.3292,1.0
Second,count,173.0,184.0,184.0
Second,mean,29.87763,20.662183,0.472826


### z-score를 계산하는 사용자 함수 정의 

In [34]:
def z_score(x):
    """Return ``x`` rescaled to z-scores: ``(x - mean) / std``.

    ``x`` must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic; here it is handed the age values of one group at a time by
    ``apply()``.
    """
    average = x.mean()
    deviation = x.std()
    return (x - average) / deviation

In [35]:
# z_score returns a result with the same index as its input. With the default
# group_keys=True, pandas >= 2.0 prepends the class label as an extra index
# level to such apply() results, so the flat 0, 1, 2, ... index recorded below
# would not be reproduced. Grouping with group_keys=False keeps the original
# row index on old and new pandas alike.
age_zscore = df.groupby('class', group_keys=False)['age'].apply(z_score)
age_zscore.head()

0   -0.251342
1   -0.015770
2    0.068776
3   -0.218434
4    0.789041
Name: age, dtype: float64

### 필터링: age 열의 데이터 평균이 30보다 작은 그룹만을 필터링하여 출력 

In [37]:
# Here apply() returns one boolean per class: whether its mean age is under 30.
age_filter = grouped.apply(lambda members: members['age'].mean() < 30)
age_filter

class
First     False
Second     True
Third      True
dtype: bool

In [38]:
# Walk the per-class boolean flags and print the first rows of every class
# whose flag is set. items() yields (label, flag) pairs directly, and the flag
# is tested for truthiness instead of being compared with `== True`.
for class_label, is_selected in age_filter.items():
    if is_selected:
        age_filter_df = grouped.get_group(class_label)
        print(age_filter_df.head())
        print('\n')

     age     sex     fare  survived
9   14.0  female  30.0708         1
15  55.0  female  16.0000         1
17   NaN    male  13.0000         1
20  35.0    male  26.0000         0
21  34.0    male  13.0000         1


    age     sex     fare  survived
0  22.0    male   7.2500         0
2  26.0  female   7.9250         1
4  35.0    male   8.0500         0
5   NaN    male   8.4583         0
7   2.0    male  21.0750         0




# 예제 6-19 멀티 인덱스 

### class 열, sex 열을 기준으로 분할 

In [39]:
# Two grouping keys: every (class, sex) combination becomes its own group.
group_keys_cols = ['class', 'sex']
grouped = df.groupby(group_keys_cols)

### 그룹 객체에 연산 메소드 적용 

In [40]:
# Mean of every remaining column per (class, sex) group; both grouping keys
# become levels of the result's row index, which the selections below rely on.
gdf = grouped.mean()
gdf

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,female,34.611765,106.125798,0.968085
First,male,41.281386,67.226127,0.368852
Second,female,28.722973,21.970121,0.921053
Second,male,30.740707,19.741782,0.157407
Third,female,21.75,16.11881,0.5
Third,male,26.507589,12.661633,0.135447


### class 값이 First인 행을 선택하여 출력 

In [41]:
# A single label on the two-level index selects on the outer level (class)
# and leaves the inner level (sex) as the index of the result.
gdf.loc['First']

Unnamed: 0_level_0,age,fare,survived
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,34.611765,106.125798,0.968085
male,41.281386,67.226127,0.368852


### class 값이 First이고, sex값이 female인 행을 선택 

In [42]:
# A (class, sex) tuple addresses exactly one row; the result is a Series of
# that row's column values.
gdf.loc[('First', 'female')]

age          34.611765
fare        106.125798
survived      0.968085
Name: (First, female), dtype: float64

### sex 값이 male인 행을 선택하여 출력 

In [43]:
# xs() takes a cross-section on the named index level: the 'male' row of
# every class, with the sex level removed from the result.
gdf.xs('male', level='sex')

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,41.281386,67.226127,0.368852
Second,30.740707,19.741782,0.157407
Third,26.507589,12.661633,0.135447
