<a href="https://colab.research.google.com/github/JakeOh/202511_BD53/blob/main/lab_python/da12_shape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DataFrame 모양(shape) 변경

wide(columns)  <---> long(rows)

In [28]:
import numpy as np
import pandas as pd
import seaborn as sns

# stack vs unstack

In [2]:
# Build a 2x3 demo frame: values 1..6, rows labelled X/Y, columns labelled a/b/c.
df = pd.DataFrame(
    np.arange(1, 7).reshape(2, 3),
    index=list('XY'),
    columns=list('abc'),
)
df  # display the wide-format frame

Unnamed: 0,a,b,c
X,1,2,3
Y,4,5,6


In [3]:
# stack: wide (columns) --> long (rows).
# future_stack=True opts into the new stack implementation, consistent with the
# other stack() calls in this notebook; the result for this frame is unchanged.
df_stacked = df.stack(future_stack=True)
df_stacked  #> the column names (a/b/c) become the innermost level of the row index.

Unnamed: 0,Unnamed: 1,0
X,a,1
X,b,2
X,c,3
Y,a,4
Y,b,5
Y,c,6


In [6]:
# unstack: long (rows) --> wide (columns).
# level=-1 is the default, written out here: the last index level becomes the columns.
df_unstack = df_stacked.unstack(level=-1)
df_unstack

Unnamed: 0,a,b,c
X,1,2,3
Y,4,5,6


In [8]:
# Unstack the first index level instead: the X/Y labels become the columns.
df_stacked.unstack(0)

Unnamed: 0,X,Y
a,1,4
b,2,5
c,3,6


## 컬럼 이름이 multi-level 인덱스인 경우

In [10]:
# 2x6 frame holding 1..12 whose columns are a two-level MultiIndex:
# day (Fri/Sat/Sun) on the outer level, meal (Lunch/Dinner) on the inner level.
df = pd.DataFrame(
    np.arange(1, 13).reshape(2, 6),
    columns=pd.MultiIndex.from_product(
        [['Fri', 'Sat', 'Sun'], ['Lunch', 'Dinner']]
    ),
)
df

Unnamed: 0_level_0,Fri,Fri,Sat,Sat,Sun,Sun
Unnamed: 0_level_1,Lunch,Dinner,Lunch,Dinner,Lunch,Dinner
0,1,2,3,4,5,6
1,7,8,9,10,11,12


In [11]:
# Inspect the column labels: a two-level MultiIndex of (day, meal) tuples.
df.columns

MultiIndex([('Fri',  'Lunch'),
            ('Fri', 'Dinner'),
            ('Sat',  'Lunch'),
            ('Sat', 'Dinner'),
            ('Sun',  'Lunch'),
            ('Sun', 'Dinner')],
           )

In [19]:
# level=-1 is the default, written out here: the last column level
# (Lunch/Dinner) is moved into the row index.
df.stack(level=-1, future_stack=True)

Unnamed: 0,Unnamed: 1,Fri,Sat,Sun
0,Lunch,1,3,5
0,Dinner,2,4,6
1,Lunch,7,9,11
1,Dinner,8,10,12


In [14]:
# Stack the first column level: the day names (Fri/Sat/Sun) move into the row index.
df.stack(0, future_stack=True)

Unnamed: 0,Unnamed: 1,Lunch,Dinner
0,Fri,1,2
0,Sat,3,4
0,Sun,5,6
1,Fri,7,8
1,Sat,9,10
1,Sun,11,12


# pivot vs melt

In [20]:
# Long-format demo data: one row per (time, day) pair with its tip and total_bill.
df = pd.DataFrame({
    'time': ['Lunch', 'Lunch', 'Lunch', 'Dinner', 'Dinner', 'Dinner'],
    'day': ['Fri', 'Sat', 'Sun', 'Fri', 'Sat', 'Sun'],
    'tip': np.arange(1, 7),
    'total_bill': np.arange(10, 70, 10),
})
df

Unnamed: 0,time,day,tip,total_bill
0,Lunch,Fri,1,10
1,Lunch,Sat,2,20
2,Lunch,Sun,3,30
3,Dinner,Fri,4,40
4,Dinner,Sat,5,50
5,Dinner,Sun,6,60


## pivot

카테고리 타입 컬럼의 값들이 컬럼 이름 또는 (row) 인덱스로 변환.

`pd.DataFrame.pivot()` 메서드 파라미터:

*   `columns`: pivoting된 데이터프레임에서 컬럼 이름으로 사용하기 위한 변수 이름(들).
*   `index`: pivoting된 데이터프레임에서 인덱스로 사용하기 위한 변수 이름(들).
*   `values`: pivoting된 데이터프레임에서 각 셀을 채울 수 있는 값들을 가지고 있는 변수 이름(들).


In [21]:
# Rows come from 'time', columns from 'day', and each cell holds the matching 'tip'.
df.pivot(index='time', columns='day', values='tip')

day,Fri,Sat,Sun
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dinner,4,5,6
Lunch,1,2,3


In [23]:
# Rows come from 'day', columns from 'time', and each cell holds the matching 'total_bill'.
df.pivot(index='day', columns='time', values='total_bill')

time,Dinner,Lunch
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,40,10
Sat,50,20
Sun,60,30


## melt

`pd.DataFrame.melt()` 메서드 파라미터:

*   `id_vars`: melting될 때 컬럼으로 유지될 변수 이름(들).
    *   `id_vars`에 설정하지 않은 변수 이름들은 **variable 컬럼**의 값들로 melting됨.
    *   `id_vars`에 설정하지 않은 변수의 값들은 **value 컬럼**의 값들로 melting됨.
*   `var_name`: variable 컬럼의 이름을 대체할 문자열. 옵션.
*   `value_name`: value 컬럼의 이름을 대체할 문자열. 옵션.


In [24]:
# Wide-format demo data, written row by row: one row per gender with lunch and dinner counts.
df = pd.DataFrame(
    [['Female', 1, 10],
     ['Male', 5, 20]],
    columns=['gender', 'lunch', 'dinner'],
)
df

Unnamed: 0,gender,lunch,dinner
0,Female,1,10
1,Male,5,20


In [25]:
# Keep 'gender' as an identifier column; the remaining column names go to
# 'variable' and their values go to 'value'.
df.melt(id_vars=['gender'])

Unnamed: 0,gender,variable,value
0,Female,lunch,1
1,Male,lunch,5
2,Female,dinner,10
3,Male,dinner,20


In [27]:
# Same melt, but name the variable column 'time' and the value column 'size'.
df.melt(['gender'], var_name='time', value_name='size')

Unnamed: 0,gender,time,size
0,Female,lunch,1
1,Male,lunch,5
2,Female,dinner,10
3,Male,dinner,20


# pivot_table

groupby 기능과 통계 함수 적용 결과를 unstack하는 함수.

`pd.DataFrame.pivot_table()` 메서드 파라미터:

*   `values`: 집계(통계) 함수를 적용할 값들을 가지고 있는 변수 이름(들).
*   `index`: 피벗 테이블의 인덱스로 사용할 값들을 가지고 있는 변수 이름(들).
*   `columns`: 피벗 테이블의 컬럼 이름으로 사용할 값들을 가지고 있는 변수 이름(들).
*   `aggfunc`: aggregation function(집계/통계 함수). 기본값은 'mean'.


In [29]:
# Load seaborn's 'tips' example dataset.
tips = sns.load_dataset('tips')

In [30]:
# Preview the first five rows (n=5 is the default).
tips.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## 성별 팁의 평균

In [32]:
# Mean tip per sex, computed with groupby.
by_sex = tips.groupby(['sex'], observed=True)['tip'].mean()
by_sex

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Male,3.089618
Female,2.833448


In [34]:
# Mean tip per sex as a pivot table; aggfunc='mean' is the default and may be omitted.
tips.pivot_table(index='sex', values='tip', aggfunc='mean', observed=True)

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Male,3.089618
Female,2.833448


## 성별, 흡연여부 별 팁의 평균

In [36]:
# Mean tip for every (sex, smoker) combination, computed with groupby.
by_sex_smoker = tips.groupby(['sex', 'smoker'], observed=True)['tip'].mean()
by_sex_smoker

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,3.051167
Male,No,3.113402
Female,Yes,2.931515
Female,No,2.773519


In [37]:
# Move the last index level (smoker) to the columns; level=-1 is the default.
by_sex_smoker.unstack(level=-1)

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3.051167,3.113402
Female,2.931515,2.773519


In [41]:
# Move the first index level (sex) to the columns, addressed here by name.
by_sex_smoker.unstack('sex')

sex,Male,Female
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,3.051167,2.931515
No,3.113402,2.773519


In [39]:
# Mean tip with both sex and smoker kept in the row index.
tips.pivot_table(index=['sex', 'smoker'], values='tip', aggfunc='mean', observed=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip
sex,smoker,Unnamed: 2_level_1
Male,Yes,3.051167
Male,No,3.113402
Female,Yes,2.931515
Female,No,2.773519


In [40]:
# Mean tip with sex on the rows and smoker on the columns.
tips.pivot_table(index='sex', columns='smoker', values='tip', aggfunc='mean', observed=True)

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3.051167,3.113402
Female,2.931515,2.773519


In [42]:
# Mean tip with smoker on the rows and sex on the columns.
tips.pivot_table(index='smoker', columns='sex', values='tip', aggfunc='mean', observed=True)

sex,Male,Female
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1
Yes,3.051167,2.931515
No,3.113402,2.773519


## 성별 팁과 영수증의 평균

In [48]:
# Overall column means of tip and total_bill (axis=0 is the default).
tips[['tip', 'total_bill']].mean(axis=0)

Unnamed: 0,0
tip,2.998279
total_bill,19.785943


In [50]:
# Mean tip and mean total_bill per sex, computed with groupby.
value_means = tips.groupby(['sex'], observed=True)[['tip', 'total_bill']]
by_sex = value_means.mean()
by_sex

Unnamed: 0_level_0,tip,total_bill
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3.089618,20.744076
Female,2.833448,18.056897


In [52]:
# Mean tip and mean total_bill per sex as a pivot table.
tips.pivot_table(index='sex', values=['tip', 'total_bill'], aggfunc='mean', observed=True)

Unnamed: 0_level_0,tip,total_bill
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,3.089618,20.744076
Female,2.833448,18.056897


## 성별 흡연여부별 팁과 영수증의 평균

In [54]:
# Mean tip and mean total_bill for every (sex, smoker) combination.
by_sex_smoker = (
    tips
    .groupby(['sex', 'smoker'], observed=True)[['tip', 'total_bill']]
    .mean()
)
by_sex_smoker

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,3.051167,22.2845
Male,No,3.113402,19.791237
Female,Yes,2.931515,17.977879
Female,No,2.773519,18.105185


In [55]:
# Move the last index level (smoker) under each value column; level=-1 is the default.
by_sex_smoker.unstack(level=-1)

Unnamed: 0_level_0,tip,tip,total_bill,total_bill
smoker,Yes,No,Yes,No
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Male,3.051167,3.113402,22.2845,19.791237
Female,2.931515,2.773519,17.977879,18.105185


In [56]:
# Move the first index level (sex) under each value column, addressed here by name.
by_sex_smoker.unstack('sex')

Unnamed: 0_level_0,tip,tip,total_bill,total_bill
sex,Male,Female,Male,Female
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Yes,3.051167,2.931515,22.2845,17.977879
No,3.113402,2.773519,19.791237,18.105185


In [58]:
# Mean tip and total_bill with both sex and smoker kept in the row index.
tips.pivot_table(
    index=['sex', 'smoker'],
    values=['tip', 'total_bill'],
    aggfunc='mean',
    observed=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,total_bill
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,3.051167,22.2845
Male,No,3.113402,19.791237
Female,Yes,2.931515,17.977879
Female,No,2.773519,18.105185


In [60]:
# Mean tip and total_bill with sex on the rows and smoker nested under each value column.
tips.pivot_table(
    index='sex',
    columns='smoker',
    values=['tip', 'total_bill'],
    aggfunc='mean',
    observed=True,
)

Unnamed: 0_level_0,tip,tip,total_bill,total_bill
smoker,Yes,No,Yes,No
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Male,3.051167,3.113402,22.2845,19.791237
Female,2.931515,2.773519,17.977879,18.105185


In [61]:
# Mean tip and total_bill with smoker on the rows and sex nested under each value column.
tips.pivot_table(
    index='smoker',
    columns='sex',
    values=['tip', 'total_bill'],
    aggfunc='mean',
    observed=True,
)

Unnamed: 0_level_0,tip,tip,total_bill,total_bill
sex,Male,Female,Male,Female
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Yes,3.051167,2.931515,22.2845,17.977879
No,3.113402,2.773519,19.791237,18.105185


## 성별, 요일별, 시간별 팁의 평균

In [66]:
# Mean tip for every (sex, day, time) combination, computed with groupby.
by_sex_day_time = tips.groupby(['sex', 'day', 'time'], observed=True)['tip'].mean()
by_sex_day_time

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tip
sex,day,time,Unnamed: 3_level_1
Male,Thur,Lunch,2.980333
Male,Fri,Lunch,1.9
Male,Fri,Dinner,3.032857
Male,Sat,Dinner,3.083898
Male,Sun,Dinner,3.220345
Female,Thur,Lunch,2.561935
Female,Thur,Dinner,3.0
Female,Fri,Lunch,2.745
Female,Fri,Dinner,2.81
Female,Sat,Dinner,2.801786
Female,Sun,Dinner,3.367222


In [67]:
# Move the last index level (time) to the columns; level=-1 is the default.
by_sex_day_time.unstack(level=-1)

Unnamed: 0_level_0,time,Lunch,Dinner
sex,day,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Thur,2.980333,
Male,Fri,1.9,3.032857
Male,Sat,,3.083898
Male,Sun,,3.220345
Female,Thur,2.561935,3.0
Female,Fri,2.745,2.81
Female,Sat,,2.801786
Female,Sun,,3.367222


In [68]:
# Move the middle index level (day) to the columns, addressed here by name.
by_sex_day_time.unstack('day')

Unnamed: 0_level_0,day,Thur,Fri,Sat,Sun
sex,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Male,Lunch,2.980333,1.9,,
Male,Dinner,,3.032857,3.083898,3.220345
Female,Lunch,2.561935,2.745,,
Female,Dinner,3.0,2.81,2.801786,3.367222


In [69]:
# Move the first index level (sex) to the columns, addressed here by name.
by_sex_day_time.unstack('sex')

Unnamed: 0_level_0,sex,Male,Female
day,time,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,Lunch,2.980333,2.561935
Thur,Dinner,,3.0
Fri,Lunch,1.9,2.745
Fri,Dinner,3.032857,2.81
Sat,Dinner,3.083898,2.801786
Sun,Dinner,3.220345,3.367222


In [77]:
# Move two index levels (day and time) to the columns at once, leaving sex on the rows.
by_sex_day_time.unstack(['day', 'time'])

day,Thur,Fri,Fri,Sat,Sun,Thur
time,Lunch,Lunch,Dinner,Dinner,Dinner,Dinner
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Male,2.980333,1.9,3.032857,3.083898,3.220345,
Female,2.561935,2.745,2.81,2.801786,3.367222,3.0


In [78]:
# Mean tip with sex, day and time all kept in the row index.
tips.pivot_table(index=['sex', 'day', 'time'], values='tip', aggfunc='mean', observed=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tip
sex,day,time,Unnamed: 3_level_1
Male,Thur,Lunch,2.980333
Male,Fri,Lunch,1.9
Male,Fri,Dinner,3.032857
Male,Sat,Dinner,3.083898
Male,Sun,Dinner,3.220345
Female,Thur,Lunch,2.561935
Female,Thur,Dinner,3.0
Female,Fri,Lunch,2.745
Female,Fri,Dinner,2.81
Female,Sat,Dinner,2.801786
Female,Sun,Dinner,3.367222


In [79]:
# Mean tip with sex, day and time all placed on the columns; the single row is 'tip'.
tips.pivot_table(columns=['sex', 'day', 'time'], values='tip', aggfunc='mean', observed=True)

sex,Male,Male,Male,Male,Male,Female,Female,Female,Female,Female,Female
day,Thur,Fri,Fri,Sat,Sun,Thur,Thur,Fri,Fri,Sat,Sun
time,Lunch,Lunch,Dinner,Dinner,Dinner,Lunch,Dinner,Lunch,Dinner,Dinner,Dinner
tip,2.980333,1.9,3.032857,3.083898,3.220345,2.561935,3.0,2.745,2.81,2.801786,3.367222


In [81]:
# Mean tip with (day, time) on the rows and sex on the columns.
tips.pivot_table(
    index=['day', 'time'],
    columns='sex',
    values='tip',
    aggfunc='mean',
    observed=True,
)

Unnamed: 0_level_0,sex,Male,Female
day,time,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,Lunch,2.980333,2.561935
Thur,Dinner,,3.0
Fri,Lunch,1.9,2.745
Fri,Dinner,3.032857,2.81
Sat,Dinner,3.083898,2.801786
Sun,Dinner,3.220345,3.367222


In [82]:
# Mean tip with sex on the rows and (day, time) on the columns.
tips.pivot_table(
    index='sex',
    columns=['day', 'time'],
    values='tip',
    aggfunc='mean',
    observed=True,
)

day,Thur,Thur,Fri,Fri,Sat,Sun
time,Lunch,Dinner,Lunch,Dinner,Dinner,Dinner
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Male,2.980333,,1.9,3.032857,3.083898,3.220345
Female,2.561935,3.0,2.745,2.81,2.801786,3.367222


## 성별 팁의 최솟값, 중앙값, 최댓값

## 성별, 요일별 영수증 최솟값, 중앙값, 최댓값

## 성별, 흡연여부별, 요일별 팁의 중앙값

## 성별, 흡연여부별, 요일별, 시간별 팁의 중앙값