# 데이터 프레임 - 그룹 분석

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
import seaborn as sns
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# 각 품종별 4가지 피쳐의 평균
iris.groupby('species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [6]:
# 각 품종별 sepal_length의 표준편차
iris[['sepal_length', 'species']].groupby('species').std()

Unnamed: 0_level_0,sepal_length
species,Unnamed: 1_level_1
setosa,0.35249
versicolor,0.516171
virginica,0.63588


In [8]:
iris.groupby('species')['sepal_length'].std()

species
setosa        0.352490
versicolor    0.516171
virginica     0.635880
Name: sepal_length, dtype: float64

In [9]:
# 각 품종별 sepal_length의 평균, 표준편차, 최대값, 최소값
iris.groupby('species')['sepal_length'].agg(['mean', 'std', 'max', 'min'])

Unnamed: 0_level_0,mean,std,max,min
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,0.35249,5.8,4.3
versicolor,5.936,0.516171,7.0,4.9
virginica,6.588,0.63588,7.9,4.9


### tips data 사례

In [27]:
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [12]:
tips['tip_pct'] = (tips.tip / tips.total_bill * 100).round(2)
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,5.94
1,10.34,1.66,Male,No,Sun,Dinner,3,16.05
2,21.01,3.5,Male,No,Sun,Dinner,3,16.66
3,23.68,3.31,Male,No,Sun,Dinner,2,13.98
4,24.59,3.61,Female,No,Sun,Dinner,4,14.68


In [13]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   tip_pct     244 non-null    float64 
dtypes: category(4), float64(3), int64(1)
memory usage: 9.3 KB


In [15]:
tips.describe()

Unnamed: 0,total_bill,tip,size,tip_pct
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,16.079754
std,8.902412,1.383638,0.9511,6.10702
min,3.07,1.0,1.0,3.56
25%,13.3475,2.0,2.0,12.91
50%,17.795,2.9,2.0,15.475
75%,24.1275,3.5625,3.0,19.1475
max,50.81,10.0,6.0,71.03


In [16]:
# 성별 데이터 갯수
tips.groupby('sex').count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size,tip_pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Male,157,157,157,157,157,157,157
Female,87,87,87,87,87,87,87


In [17]:
tips.groupby('sex').size()

sex
Male      157
Female     87
dtype: int64

In [19]:
# 성별 흡연유무 데이터 갯수
tips.groupby(['sex', 'smoker']).size()

sex     smoker
Male    Yes       60
        No        97
Female  Yes       33
        No        54
dtype: int64

In [20]:
# 인덱스가 2개인 다중 인덱스를 갖는 시리즈
tips.groupby(['sex', 'smoker']).size()['Male']['Yes']

60

In [23]:
# 성별 팁 비율의 평균, 최소값, 최대값
tips.groupby('sex')['tip_pct'].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,mean,min,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,15.764713,3.56,71.03
Female,16.648276,5.64,41.67


In [24]:
# 성별, 흡연 유무별 팁 비율의 평균
tips.groupby(['sex', 'smoker'])['tip_pct'].mean()

sex     smoker
Male    Yes       15.276667
        No        16.066598
Female  Yes       18.214545
        No        15.691111
Name: tip_pct, dtype: float64

In [30]:
tips.groupby('sex')['tip'].agg(['mean', 'std', 'max', 'min'])

Unnamed: 0_level_0,mean,std,max,min
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Male,3.089618,1.489102,10.0,1.0
Female,2.833448,1.159495,6.5,1.0


In [32]:
# 모든 고려요소별 팁 비율의 평군(size만 빼고)
tips.groupby(['sex', 'smoker', 'time'])[['tip_pct']].mean()

KeyError: "Columns not found: 'tip_pct'"