In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import scipy as sp
import scipy.stats as stats
from sklearn.preprocessing import *

In [None]:
"""
stack이 (위에서 아래로 길게, 높게) 쌓는 것이라면,

unstack은 쌓은 것을 옆으로 (왼쪽에서 오른쪽으로 넓게) 늘어놓는 것이다.
"""

In [93]:
# Build the example frame.
# pd.MultiIndex.from_tuples turns a list of (customer, year) pairs into a
# two-level row index.
index_pairs = [
    ("cust_1", "2015"),
    ("cust_1", "2016"),
    ("cust_2", "2015"),
    ("cust_2", "2016"),
]
product_cols = ["prd1", "prd2", "prd3", "prd4"]

index = pd.MultiIndex.from_tuples(index_pairs)
df = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=index,
    columns=product_cols,
)

# Print the index structure, then display the frame itself.
print(df.index)
df


MultiIndex(levels=[['cust_1', 'cust_2'], ['2015', '2016']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])


Unnamed: 0,Unnamed: 1,prd1,prd2,prd3,prd4
cust_1,2015,0,1,2,3
cust_1,2016,4,5,6,7
cust_2,2015,8,9,10,11
cust_2,2016,12,13,14,15


In [94]:
# stack(): move the column labels (prd1..prd4) into a new innermost level of
# the row index, producing one long column of values.
df_s = df.stack()
df_s

# The result comes back as a Series.

cust_1  2015  prd1     0
              prd2     1
              prd3     2
              prd4     3
        2016  prd1     4
              prd2     5
              prd3     6
              prd4     7
cust_2  2015  prd1     8
              prd2     9
              prd3    10
              prd4    11
        2016  prd1    12
              prd2    13
              prd3    14
              prd4    15
dtype: int32

In [10]:
# Inspect the index of the stacked Series: the former column labels now form
# a third level after customer and year.
df_s.index

MultiIndex(levels=[['cust_1', 'cust_2'], ['2015', '2016'], ['prd1', 'prd2', 'prd3', 'prd4']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]])

In [19]:
# Indexing the stacked Series.
# Look at the 2015 data: `:` keeps every customer on the first level and
# "2015" selects one label on the second (year) level.
df_s[:, "2015"]

cust_1  prd1     0
        prd2     1
        prd3     2
        prd4     3
cust_2  prd1     8
        prd2     9
        prd3    10
        prd4    11
dtype: int32

In [20]:
# Look at the cust_2 data: a single label selects on the outermost level and
# leaves the year and product levels in the result.
df_s["cust_2"]

2015  prd1     8
      prd2     9
      prd3    10
      prd4    11
2016  prd1    12
      prd2    13
      prd3    14
      prd4    15
dtype: int32

In [23]:
# Look at cust_1's 2016 data: one label per outer level, leaving only the
# product level in the result.
df_s["cust_1", "2016"]

prd1    4
prd2    5
prd3    6
prd4    7
dtype: int32

In [40]:
# Look at prd3 and prd4 for cust_1 in 2016: select the (cust_1, 2016)
# sub-Series first, then label-slice it from "prd3" to the end.
df_s["cust_1", "2016"]["prd3":]

prd3    6
prd4    7
dtype: int32

In [50]:
# Show the original frame again as a reference point before missing values
# are written into it.
df

Unnamed: 0,Unnamed: 1,prd1,prd2,prd3,prd4
cust_1,2015,0,1,2,3
cust_1,2016,4,5,6,7
cust_2,2015,8,9,10,11
cust_2,2016,12,13,14,15


In [66]:
# Insert missing values: blank out prd4 for cust_2 in both 2015 and 2016.
# The row selector is a (customer, list-of-years) tuple over the two index levels.
target_years = ["2015", "2016"]
nan_rows = ("cust_2", target_years)
df.loc[nan_rows, "prd4"] = np.nan
df

Unnamed: 0,Unnamed: 1,prd1,prd2,prd3,prd4
cust_1,2015,0,1,2,3.0
cust_1,2016,4,5,6,7.0
cust_2,2015,8,9,10,
cust_2,2016,12,13,14,


In [67]:
# stack() with dropna=False: entries whose value is NaN stay in the stacked
# result (the prd4 rows for cust_2 appear as NaN).
# NOTE(review): newer pandas releases may deprecate the `dropna` keyword of
# stack() — confirm against the installed version.
df.stack(dropna=False)

cust_1  2015  prd1     0.0
              prd2     1.0
              prd3     2.0
              prd4     3.0
        2016  prd1     4.0
              prd2     5.0
              prd3     6.0
              prd4     7.0
cust_2  2015  prd1     8.0
              prd2     9.0
              prd3    10.0
              prd4     NaN
        2016  prd1    12.0
              prd2    13.0
              prd3    14.0
              prd4     NaN
dtype: float64

In [68]:
# stack() with dropna=True: entries whose value is NaN are left out of the
# stacked result (the prd4 rows for cust_2 disappear).
# NOTE(review): newer pandas releases may deprecate the `dropna` keyword of
# stack() — confirm against the installed version.
df.stack(dropna=True)

cust_1  2015  prd1     0.0
              prd2     1.0
              prd3     2.0
              prd4     3.0
        2016  prd1     4.0
              prd2     5.0
              prd3     6.0
              prd4     7.0
cust_2  2015  prd1     8.0
              prd2     9.0
              prd3    10.0
        2016  prd1    12.0
              prd2    13.0
              prd3    14.0
dtype: float64

In [77]:
# unstack(): start by checking which index levels are available to pivot.
df_s.index
# There are three levels in total, numbered 0, 1 and 2.

MultiIndex(levels=[['cust_1', 'cust_2'], ['2015', '2016'], ['prd1', 'prd2', 'prd3', 'prd4']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]])

In [95]:
# Pivot level 2 (the product labels) out of the row index into columns;
# customer and year remain as the row index.
df_s.unstack(level=2)

Unnamed: 0,Unnamed: 1,prd1,prd2,prd3,prd4
cust_1,2015,0,1,2,3
cust_1,2016,4,5,6,7
cust_2,2015,8,9,10,11
cust_2,2016,12,13,14,15


In [96]:
# Pivot level 1 (the year labels) into columns; customer and product remain
# as the row index.
df_s.unstack(level=1)

Unnamed: 0,Unnamed: 1,2015,2016
cust_1,prd1,0,4
cust_1,prd2,1,5
cust_1,prd3,2,6
cust_1,prd4,3,7
cust_2,prd1,8,12
cust_2,prd2,9,13
cust_2,prd3,10,14
cust_2,prd4,11,15


In [97]:
# Pivot level 0 (the customer labels) into columns; year and product remain
# as the row index.
df_s.unstack(level=0)

Unnamed: 0,Unnamed: 1,cust_1,cust_2
2015,prd1,0,8
2015,prd2,1,9
2015,prd3,2,10
2015,prd4,3,11
2016,prd1,4,12
2016,prd2,5,13
2016,prd3,6,14
2016,prd4,7,15


In [89]:
# Pivot levels 0 and 1 together: the columns become a two-level
# (customer, year) header and only the product level stays on the rows.
df_s.unstack(level=[0, 1])

Unnamed: 0_level_0,cust_1,cust_1,cust_2,cust_2
Unnamed: 0_level_1,2015,2016,2015,2016
prd1,0,4,8,12
prd2,1,5,9,13
prd3,2,6,10,14
prd4,3,7,11,15


In [111]:
# reset_index(): turn the given index levels into ordinary columns.
# Step 1: pivot the product level (2) back into columns.
df_wide = df_s.unstack(level=2)
# Step 2: move the two remaining index levels (0 and 1) into columns.
df_u = df_wide.reset_index(level=[0, 1])
df_u

Unnamed: 0,level_0,level_1,prd1,prd2,prd3,prd4
0,cust_1,2015,0,1,2,3
1,cust_1,2016,4,5,6,7
2,cust_2,2015,8,9,10,11
3,cust_2,2016,12,13,14,15


In [116]:
# Change the column names with rename(columns=..., index=...).
# By default (inplace=False) rename() leaves its input untouched and returns
# a new frame, so the result is bound back to df_u rather than mutating the
# existing object with inplace=True. Re-running this cell is harmless: once
# the columns are renamed, the mapping no longer matches anything.
df_u = df_u.rename(columns={"level_0": "custID", "level_1": "year"})
df_u

Unnamed: 0,custID,year,prd1,prd2,prd3,prd4
0,cust_1,2015,0,1,2,3
1,cust_1,2016,4,5,6,7
2,cust_2,2015,8,9,10,11
3,cust_2,2016,12,13,14,15


In [117]:
# Display df_u once more to confirm the renamed columns (custID, year)
# persist on the variable.
df_u

Unnamed: 0,custID,year,prd1,prd2,prd3,prd4
0,cust_1,2015,0,1,2,3
1,cust_1,2016,4,5,6,7
2,cust_2,2015,8,9,10,11
3,cust_2,2016,12,13,14,15
