- 그룹 객체 필터링
  - 그룹 객체에 filter() 메소드를 적용할 때 조건식을 가진 함수를 전달하면 조건이 참인 그룹만을 남긴다.

In [78]:
# pandas, seaborn 라이브러리 불러오기

import pandas as pd
import seaborn as sns


# titanic 데이터셍에서 age, sex, class, fare, survived 열을 df로 만들기

titanic = sns.load_dataset('titanic')
df = titanic.loc[:, ['age', 'sex', 'class', 'fare', 'survived']]
df.head()

# class 열을 기준으로 분할
df_group = df.groupby(['class'], observed=True)

# 그룹별로 첫 2행을 확인
df_group.head(2)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
9,14.0,female,Second,30.0708,1
15,55.0,female,Second,16.0,1


In [80]:
# 그룹별 처음 5개 행 출력
df_group.head()

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
6,54.0,male,First,51.8625,0
7,2.0,male,Third,21.075,0
9,14.0,female,Second,30.0708,1
11,58.0,female,First,26.55,1


In [82]:
# 각 그룹의 첫번째 데이터를 확인
df_group.head(1)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
9,14.0,female,Second,30.0708,1


In [92]:
df_group.nth(0)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
9,14.0,female,Second,30.0708,1


In [94]:
# 그룹별 2번째 데이터 확인
df_group.nth(1)

Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
15,55.0,female,Second,16.0,1


In [96]:
# 데이터 개수가 200개 이상인 그룹만을 필터링하여 데이터프레임으로 반환 - 람다
df_group.filter(lambda x: len(x) >= 200)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.9250,1
3,35.0,female,First,53.1000,1
4,35.0,male,Third,8.0500,0
...,...,...,...,...,...
885,39.0,female,Third,29.1250,0
887,19.0,female,First,30.0000,1
888,,female,Third,23.4500,0
889,26.0,male,First,30.0000,1


In [98]:
# age 열의 평균이 30보다 작은 그룹만을 필터링하여 데이터프레임으로 반환 - 람다
df_group.filter(lambda x: x['age'].mean() < 30)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
2,26.0,female,Third,7.9250,1
4,35.0,male,Third,8.0500,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.0750,0
...,...,...,...,...,...
884,25.0,male,Third,7.0500,0
885,39.0,female,Third,29.1250,0
886,27.0,male,Second,13.0000,0
888,,female,Third,23.4500,0


- 그룹 객체에 함수 매핑
  - apply() 메소드는 판다스 객체의 개별 원소를 특정 함수에 일대일로 매핑

In [128]:
# 각 그룹별 요약 통계 정보 집계
df_group.apply(lambda x: x.describe(), include_groups=False)
#df_group[['age', 'fare','survived']].apply(lambda x: x.describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,count,186.0,216.0,216.0
First,mean,38.233441,84.154687,0.62963
First,std,14.802856,78.380373,0.484026
First,min,0.92,0.0,0.0
First,25%,27.0,30.92395,0.0
First,50%,37.0,60.2875,1.0
First,75%,49.0,93.5,1.0
First,max,80.0,512.3292,1.0
Second,count,173.0,184.0,184.0
Second,mean,29.87763,20.662183,0.472826


In [148]:
# 변환 : z-score 계산
# (val - mean) / std

def z_score(x):
    return (x - x.mean()) / x.std()


df_group[['age', 'fare']].apply(z_score)

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,1,-0.015770,-0.164217
First,3,-0.218434,-0.396205
First,6,1.065103,-0.411993
First,11,1.335321,-0.734938
First,23,-0.691315,-0.620751
...,...,...,...
Third,882,-0.251342,-0.268196
Third,884,-0.011254,-0.562529
Third,885,1.109159,1.311705
Third,888,,0.829880


In [192]:
# 필터링 : age열의 데이터 평균이 30보다 작은 그룹만을 필터링하여 출력


#df_group['age'].filter(lambda x: x.mean() < 30 )
#df_group.filter(lambda x: x['age'].mean() < 30 )
age_apply = df_group[['age', 'fare']].apply(lambda x: x['age'].mean() < 30 )
age_apply

class
First     False
Second     True
Third      True
dtype: bool

In [210]:
# 필터링 결과로 조건을 충족하는 데이터프레임 반환
df_group.filter(lambda x: x['age'].mean() < 30 )

df.loc[ df['class'].isin(age_apply[age_apply == True]).index , ['age', 'class', 'survived']]

Unnamed: 0,age,class,survived
0,22.0,Third,0
1,38.0,First,1
2,26.0,Third,1
3,35.0,First,1
4,35.0,Third,0
...,...,...,...
886,27.0,Second,0
887,19.0,First,1
888,,Third,0
889,26.0,First,1


### 멀티 인덱스
- groupby() 메소드에 여러 열을 리스트 형태로 전달하면 각 열들이 다중으로 행 인덱스를 구성 => 멀티 인덱스

In [220]:
# class열과 sex 열을 기준으로 그룹화
df_group2 = df.groupby(['class', 'age'], observed=True)
df_group2

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002137AB54B90>

In [61]:
# 그룹 객체에 mean() 메소드 적용하면 그룹별로 각 열의 평균값을 정리하여 반환


                     age        fare  survived
class  sex                                    
First  female  34.611765  106.125798  0.968085
       male    41.281386   67.226127  0.368852
Second female  28.722973   21.970121  0.921053
       male    30.740707   19.741782  0.157407
Third  female  21.750000   16.118810  0.500000
       male    26.507589   12.661633  0.135447


In [65]:
# 멀티인덱스에서 하나의 인덱스만 사용
# class 인덱스에서 First 라는 값을 가진 행 선택


              age        fare  survived
sex                                    
female  34.611765  106.125798  0.968085
male    41.281386   67.226127  0.368852


In [67]:
# 멀티인덱스에서 두 개의 인덱스 사용
# First , female 에 해당하는 행 추출


age          34.611765
fare        106.125798
survived      0.968085
Name: (First, female), dtype: float64
