In [1]:
import numpy as np
import pandas as pd

### Groupby

In [2]:
# Build the sample DataFrame, one tuple per row (Points, Rank, Team, Year).
df = pd.DataFrame(
    [
        (111, 1, "Riders", 2014),
        (222, 2, "Riders", 2015),
        (333, 2, "Devils", 2014),
        (44, 3, "Devils", 2015),
        (12, 3, "Kings", 2014),
    ],
    columns=["Points", "Rank", "Team", "Year"],
)

In [3]:
# Display the current contents of df.
df

Unnamed: 0,Points,Rank,Team,Year
0,111,1,Riders,2014
1,222,2,Riders,2015
2,333,2,Devils,2014
3,44,3,Devils,2015
4,12,3,Kings,2014


In [4]:
# Pattern: groupby(<key column>)[<column to aggregate>].<aggregation>()
# Here: total Points per Team.
(
    df
    .groupby("Team")["Points"]
    .sum()
)

Team
Devils    377
Kings      12
Riders    333
Name: Points, dtype: int64

In [9]:
# Grouping by more than one column:
# total Points for every (Team, Year) combination.
gr_df = (
    df
    .groupby(["Team", "Year"])["Points"]
    .sum()
)
gr_df

Team    Year
Devils  2014    333
        2015     44
Kings   2014     12
Riders  2014    111
        2015    222
Name: Points, dtype: int64

**하나의 컬럼(`["Points"]`)을 선택해 집계한 groupby의 결과물은 Series로 생성되며, 두 개의 컬럼으로 groupby할 경우 두 단계(level)로 이루어진 MultiIndex가 생성됨**

In [10]:
# Grouping by two columns yields a two-level index (Team, Year).
gr_df.index

MultiIndex(levels=[['Devils', 'Kings', 'Riders'], [2014, 2015]],
           codes=[[0, 0, 1, 2, 2], [0, 1, 0, 0, 1]],
           names=['Team', 'Year'])

In [11]:
# Label-based slice on the outer index level (Team); both endpoints are included.
gr_df.loc["Devils":"Kings"]

Team    Year
Devils  2014    333
        2015     44
Kings   2014     12
Name: Points, dtype: int64

### unstack()

- group으로 묶여진 데이터를 matrix 형태로 전환해줌

In [12]:
# Pivot the innermost index level (Year) into columns; missing
# (Team, Year) combinations show up as NaN.
gr_df.unstack(level="Year")

Year,2014,2015
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Devils,333.0,44.0
Kings,12.0,
Riders,111.0,222.0


In [13]:
# Swap the order of the two index levels (Team <-> Year); row order is unchanged.
gr_df.swaplevel("Team", "Year")

Year  Team  
2014  Devils    333
2015  Devils     44
2014  Kings      12
      Riders    111
2015  Riders    222
Name: Points, dtype: int64

In [15]:
# Aggregate over one index level: total Points per Team (level 0).
# `Series.sum(level=...)` was deprecated and then removed in pandas 2.0,
# so group on the index level explicitly instead.
gr_df.groupby(level=0).sum()

Team
Devils    377
Kings      12
Riders    333
Name: Points, dtype: int64

In [18]:
# Keep only the "split" step: a DataFrameGroupBy object keyed by Team,
# with no aggregation applied yet.
grouped = df.groupby(by="Team")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FB641B5940>

In [19]:
# Sum every remaining column within each group.
# Use the string name rather than the Python builtin `sum`: passing the
# builtin is deprecated in recent pandas (FutureWarning), and it breaks
# if `sum` is ever shadowed in the notebook namespace.
grouped.agg("sum")

Unnamed: 0_level_0,Points,Rank,Year
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,377,5,4029
Kings,12,3,2014
Riders,333,3,4029


In [20]:
# Mean of every remaining column within each group.
# Use the string name rather than `np.mean`: passing NumPy callables to
# `agg` is deprecated in recent pandas (FutureWarning).
grouped.agg("mean")

Unnamed: 0_level_0,Points,Rank,Year
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,188.5,2.5,2014.5
Kings,12.0,3.0,2014.0
Riders,166.5,1.5,2014.5


In [23]:
# Several aggregations can be computed in one call.
# String names are used instead of np.sum / np.mean / np.std: pandas
# currently swaps those callables for its own methods (so "std" is the
# sample standard deviation, ddof=1), and passing the NumPy functions is
# deprecated. Once the aliasing is removed, np.std (ddof=0) would
# silently produce different values.
grouped["Points"].agg(["sum", "mean", "std"])

Unnamed: 0_level_0,sum,mean,std
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Devils,377,188.5,204.35386
Kings,12,12.0,
Riders,333,166.5,78.488853


In [24]:
# Display the current contents of df.
df

Unnamed: 0,Points,Rank,Team,Year
0,111,1,Riders,2014
1,222,2,Riders,2015
2,333,2,Devils,2014
3,44,3,Devils,2015
4,12,3,Kings,2014


In [27]:
# Rebuild the sample DataFrame with more rows (three per team),
# one tuple per row (Points, Rank, Team, Year).
df = pd.DataFrame(
    [
        (111, 1, "Riders", 2014),
        (222, 2, "Riders", 2015),
        (333, 2, "Devils", 2014),
        (44, 3, "Devils", 2015),
        (12, 3, "Kings", 2014),
        (321, 3, "Kings", 2016),
        (155, 1, "Riders", 2016),
        (224, 2, "Devils", 2016),
        (225, 2, "Kings", 2015),
    ],
    columns=["Points", "Rank", "Team", "Year"],
)
df

Unnamed: 0,Points,Rank,Team,Year
0,111,1,Riders,2014
1,222,2,Riders,2015
2,333,2,Devils,2014
3,44,3,Devils,2015
4,12,3,Kings,2014
5,321,3,Kings,2016
6,155,1,Riders,2016
7,224,2,Devils,2016
8,225,2,Kings,2015


### transformation

- 요약 정보가 아닌 개별 데이터의 변환을 지원함

In [31]:
# Re-split the current df by Team so the following cells work on it.
grouped = df.groupby(by="Team")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FB66D43CF8>

In [32]:
# Replace every value with the maximum of its column within the same team.
def score(x):
    """Return the maximum of the values in `x`."""
    return x.max()

grouped.transform(score)

Unnamed: 0,Points,Rank,Year
0,222,2,2016
1,222,2,2016
2,333,3,2016
3,333,3,2016
4,321,3,2016
5,321,3,2016
6,222,2,2016
7,333,3,2016
8,321,3,2016


### filter

- 특정 조건으로 데이터를 검색할 때 사용<br>
- filter에 전달하는 함수는 그룹마다 하나의 boolean(True/False) 값을 반환해야 하며, True인 그룹의 행만 결과에 남음

In [36]:
# Display the current contents of df.
df

Unnamed: 0,Points,Rank,Team,Year
0,111,1,Riders,2014
1,222,2,Riders,2015
2,333,2,Devils,2014
3,44,3,Devils,2015
4,12,3,Kings,2014
5,321,3,Kings,2016
6,155,1,Riders,2016
7,224,2,Devils,2016
8,225,2,Kings,2015


In [33]:
# Keep only the rows of teams that have at least 3 records.
df.groupby("Team").filter(
    lambda team_rows: len(team_rows) >= 3
)

Unnamed: 0,Points,Rank,Team,Year
0,111,1,Riders,2014
1,222,2,Riders,2015
2,333,2,Devils,2014
3,44,3,Devils,2015
4,12,3,Kings,2014
5,321,3,Kings,2016
6,155,1,Riders,2016
7,224,2,Devils,2016
8,225,2,Kings,2015


In [38]:
# Keep only the rows of teams whose total Points is at least 600.
df.groupby("Team").filter(
    lambda team_rows: team_rows["Points"].sum() >= 600
)

Unnamed: 0,Points,Rank,Team,Year
2,333,2,Devils,2014
3,44,3,Devils,2015
7,224,2,Devils,2016


In [43]:
# Exercise: for the rows where Team is "Devils", total the Points per Year.
(
    df
    .loc[df["Team"] == "Devils"]
    .groupby("Year")["Points"]
    .sum()
)

Year
2014    333
2015     44
2016    224
Name: Points, dtype: int64

### merge