<a href="https://colab.research.google.com/github/JakeOh/202011_itw_bd21/blob/main/lab_da/da16_groupby.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GroupBy

*   분리(split) - 적용(apply) - 결합(combine)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the example dataset named 'tips' through seaborn; every later cell works on this frame.
# NOTE(review): load_dataset presumably fetches/caches the data from seaborn's online repository — confirm network access is available.
tips = sns.load_dataset('tips')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
tips.shape

(244, 7)

In [5]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


In [6]:
tips.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [7]:
tips['sex'].value_counts()

Male      157
Female     87
Name: sex, dtype: int64

In [8]:
tips.index

RangeIndex(start=0, stop=244, step=1)

In [9]:
tips.index.nlevels

1

In [10]:
tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [11]:
tips.columns.nlevels

1

*   성별 팁의 평균
    1. 성별(Female, Male) 부분집합을 찾는다 -> split(분리)
    2. 각 부분집합에서 팁 변수의 평균을 계산한다 -> apply(적용)
    3. 2번에서 계산된 결과를 하나로 합쳐서 표현 -> combine(결합)

1. Split

In [12]:
# Distinct values of the 'sex' column. unique() keeps order of first appearance
# (here Female, then Male), which is NOT the categorical's own category order.
genders = tips['sex'].unique()
genders

['Female', 'Male']
Categories (2, object): ['Female', 'Male']

In [13]:
# split: select the rows of each gender with a boolean mask
female = tips.loc[tips['sex'].eq('Female')]
male = tips.loc[tips['sex'].eq('Male')]

In [14]:
female

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
11,35.26,5.00,Female,No,Sun,Dinner,4
14,14.83,3.02,Female,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
...,...,...,...,...,...,...,...
226,10.09,2.00,Female,Yes,Fri,Lunch,2
229,22.12,2.88,Female,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [15]:
male

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
239,29.03,5.92,Male,No,Sat,Dinner,3
241,22.67,2.00,Male,Yes,Sat,Dinner,2


2. Apply

In [16]:
# apply: average the 'tip' column inside each gender subset
female_tip, male_tip = (rows['tip'].mean() for rows in (female, male))

In [17]:
print('female tip mean:', female_tip)
print('male tip mean:', male_tip)

female tip mean: 2.833448275862069
male tip mean: 3.0896178343949052


3. Combine

In [18]:
# combine: gather the per-gender means into one labelled Series.
# Each label is bound to its value explicitly instead of being taken from
# tips['sex'].unique(): unique() returns values in order of first appearance,
# so a differently ordered frame would silently swap the two labels.
s = pd.Series(data={'Female': female_tip, 'Male': male_tip})
s

Female    2.833448
Male      3.089618
dtype: float64

In [20]:
# combine: same result as a one-column DataFrame.
# Labels are written out explicitly so they always line up with the values;
# relying on the order of tips['sex'].unique() would mislabel the rows if the
# first record in the data were not 'Female'.
df = pd.DataFrame(data={'tip': [female_tip, male_tip]}, index=['Female', 'Male'])
df

Unnamed: 0,tip
Female,2.833448
Male,3.089618


In [27]:
def groupby_mean(df, by, value):
    """
    Compute the mean of one column for each group of another column.

    A hand-rolled split-apply-combine. It yields the same numbers as
    ``df.groupby(by)[[value]].mean()``, although the row order may differ.

    :param df: DataFrame to aggregate.
    :param by: str. Name of the column whose distinct values define the groups.
    :param value: str. Name of the column to average within each group.
    :return: DataFrame with a single column named ``value`` holding the group
        means, indexed by the distinct non-missing values of ``df[by]`` in
        order of first appearance.
    """
    # Missing keys are dropped: `df[by] == NaN` never matches any row, so a
    # NaN category could only ever produce an empty subset and a NaN mean.
    categories = df[by].dropna().unique()
    # split (boolean mask per category) + apply (mean of the value column)
    mean_by_category = [df.loc[df[by] == c, value].mean() for c in categories]
    # combine
    return pd.DataFrame(data={value: mean_by_category}, index=categories)

In [28]:
groupby_mean(df=tips, by='sex', value='tip')

Unnamed: 0,tip
Female,2.833448
Male,3.089618


In [29]:
tips.pivot_table(values='tip', index='sex', aggfunc='mean')

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Male,3.089618
Female,2.833448


In [30]:
groupby_mean(df=tips, by='sex', value='total_bill')

Unnamed: 0,total_bill
Female,18.056897
Male,20.744076


In [24]:
tips[tips['sex'] == 'Female']['total_bill'].mean()

18.056896551724137

In [31]:
tips.pivot_table(index='sex', values='total_bill', aggfunc='mean')

Unnamed: 0_level_0,total_bill
sex,Unnamed: 1_level_1
Male,20.744076
Female,18.056897


In [32]:
groupby_mean(df=tips, by='day', value='tip')

Unnamed: 0,tip
Sun,3.255132
Sat,2.993103
Thur,2.771452
Fri,2.734737


In [26]:
tips[tips['day'] == 'Thur']['tip'].mean()

2.771451612903226

In [33]:
tips.pivot_table(index='day', values='tip', aggfunc='mean')

Unnamed: 0_level_0,tip
day,Unnamed: 1_level_1
Thur,2.771452
Fri,2.734737
Sat,2.993103
Sun,3.255132



`pd.DataFrame.groupby()` 메서드


In [34]:
# groupby() only records how to split the frame; it returns a DataFrameGroupBy
# object (see the repr below), not a result. Nothing is aggregated yet.
grouped = tips.groupby(by='sex')
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f25cc2b7390>

In [37]:
# Iterating a GroupBy object yields (group key, sub-DataFrame) pairs,
# i.e. the "split" step made visible.
for cls, subset in grouped:
    print('Class:', cls)
    print(subset)
    print('-' * 50, '\n')  # visual separator between groups

Class: Male
     total_bill   tip   sex smoker  day    time  size
1         10.34  1.66  Male     No  Sun  Dinner     3
2         21.01  3.50  Male     No  Sun  Dinner     3
3         23.68  3.31  Male     No  Sun  Dinner     2
5         25.29  4.71  Male     No  Sun  Dinner     4
6          8.77  2.00  Male     No  Sun  Dinner     2
..          ...   ...   ...    ...  ...     ...   ...
236       12.60  1.00  Male    Yes  Sat  Dinner     2
237       32.83  1.17  Male    Yes  Sat  Dinner     2
239       29.03  5.92  Male     No  Sat  Dinner     3
241       22.67  2.00  Male    Yes  Sat  Dinner     2
242       17.82  1.75  Male     No  Sat  Dinner     2

[157 rows x 7 columns]
-------------------------------------------------- 

Class: Female
     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
11        35.26  5.00  Female     No   Sun  Dinner     4
14        14.83  3.02

In [38]:
# Same inspection, grouped by day of the week.
# Note: this rebinds the notebook-level name `grouped` (previously grouped by 'sex').
grouped = tips.groupby(by='day')
for cls, subset in grouped:
    print(cls)
    print(subset)
    print('-' * 50, '\n')  # visual separator between groups

Thur
     total_bill   tip     sex smoker   day    time  size
77        27.20  4.00    Male     No  Thur   Lunch     4
78        22.76  3.00    Male     No  Thur   Lunch     2
79        17.29  2.71    Male     No  Thur   Lunch     2
80        19.44  3.00    Male    Yes  Thur   Lunch     2
81        16.66  3.40    Male     No  Thur   Lunch     2
..          ...   ...     ...    ...   ...     ...   ...
202       13.00  2.00  Female    Yes  Thur   Lunch     2
203       16.40  2.50  Female    Yes  Thur   Lunch     2
204       20.53  4.00    Male    Yes  Thur   Lunch     4
205       16.47  3.23  Female    Yes  Thur   Lunch     3
243       18.78  3.00  Female     No  Thur  Dinner     2

[62 rows x 7 columns]
-------------------------------------------------- 

Fri
     total_bill   tip     sex smoker  day    time  size
90        28.97  3.00    Male    Yes  Fri  Dinner     2
91        22.49  3.50    Male     No  Fri  Dinner     2
92         5.75  1.00  Female    Yes  Fri  Dinner     2
93     

In [39]:
# Mean tip by sex.
# Selecting a single column name with ['tip'] gives a Series result.
tips.groupby(by='sex')['tip'].mean()  #> Series: shape(2,)

sex
Male      3.089618
Female    2.833448
Name: tip, dtype: float64

In [40]:
# Selecting with a list of column names, [['tip']], keeps the result a DataFrame.
tips.groupby(by='sex')[['tip']].mean()  #> DataFrame: shape(2, 1)

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Male,3.089618
Female,2.833448
