# Pandas - Index & Groupby

In [3]:
import numpy as np
import pandas as pd

In [8]:
# Seed NumPy's global RNG so the random columns 'a' and 'b' come out the same
# on every Restart & Run All. The seed value itself is arbitrary.
np.random.seed(42)

# Column-wise source for the demo frame: two categorical columns, a weekly
# date column (stored under the key 'data'), and two random numeric columns.
data = {
    'color': ['black', 'white', 'black', 'white', 'black',
              'white', 'black', 'white', 'black', 'white'],
    'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
    # Ten consecutive weekly timestamps starting from 1/1/2019.
    'data': pd.date_range('1/1/2019', periods=10, freq='W'),
    # Ten draws from the standard normal distribution.
    'a': np.random.randn(10),
    # Ten draws from a normal distribution with mean 0.5 and std 2.
    'b': np.random.normal(0.5, 2, 10),
}

In [10]:
# Row labels, one entry per row: the outer level is a class letter and the
# inner level is a country code.
class_labels = ['A', 'B', 'B', 'B', 'C', 'A', 'B', 'A', 'C', 'C']
country_labels = ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA']

# Combine the two parallel label lists into a two-level MultiIndex.
index = pd.MultiIndex.from_arrays(
    [class_labels, country_labels],
    names=['class', 'country'],
)

In [11]:
# Assemble the demo frame: the columns come from `data`, and every row is
# labelled by the two-level (class, country) `index`.
df = pd.DataFrame(data=data, index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,JP,black,S,2019-01-06,-0.227358,0.166195
B,CN,white,M,2019-01-13,-1.215936,1.682201
B,US,black,L,2019-01-20,1.17344,-1.091643
B,US,white,M,2019-01-27,0.423253,-0.238322
C,US,black,L,2019-02-03,-0.09764,0.420131
A,CN,white,S,2019-02-10,0.39301,-1.197277
B,CN,black,S,2019-02-17,0.679828,-1.519297
A,CA,white,XL,2019-02-24,-0.715628,2.794433
C,JP,black,XL,2019-03-03,-0.812198,-2.287517
C,CA,white,M,2019-03-10,-0.784265,-1.598787


## Groupby

In [13]:
# Partition the rows by the value in the 'size' column.
size_lvl = df.groupby('size')

# Iterating a GroupBy yields (group key, sub-frame) pairs; print each pair.
for size_key, size_rows in size_lvl:
    print((size_key, size_rows))

('L',                color size       data        a         b
class country                                          
B     US       black    L 2019-01-20  1.17344 -1.091643
C     US       black    L 2019-02-03 -0.09764  0.420131)
('M',                color size       data         a         b
class country                                           
B     CN       white    M 2019-01-13 -1.215936  1.682201
      US       white    M 2019-01-27  0.423253 -0.238322
C     CA       white    M 2019-03-10 -0.784265 -1.598787)
('S',                color size       data         a         b
class country                                           
A     JP       black    S 2019-01-06 -0.227358  0.166195
      CN       white    S 2019-02-10  0.393010 -1.197277
B     CN       black    S 2019-02-17  0.679828 -1.519297)
('XL',                color size       data         a         b
class country                                           
A     CA       white   XL 2019-02-24 -0.715628  2.794433
C     J

In [15]:
# Sum the numeric columns within each size group and tag the result columns
# with a 'sum_' prefix. The columns are selected explicitly because newer
# pandas versions no longer silently drop non-numeric columns from `sum()`.
size_lvl[['a', 'b']].sum().add_prefix('sum_')

Unnamed: 0_level_0,sum_a,sum_b
size,Unnamed: 1_level_1,Unnamed: 2_level_1
L,1.0758,-0.671512
M,-1.576949,-0.154908
S,0.84548,-2.550379
XL,-1.527826,0.506916


In [17]:
# Pull out only the rows whose size is 'M'.
medium_rows = size_lvl.get_group('M')
medium_rows

Unnamed: 0_level_0,Unnamed: 1_level_0,color,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B,CN,white,2019-01-13,-1.215936,1.682201
B,US,white,2019-01-27,0.423253,-0.238322
C,CA,white,2019-03-10,-0.784265,-1.598787


In [24]:
def get_letter_type(letter):
    """Classify a label by whether it contains the character 'a'.

    Parameters
    ----------
    letter : str
        The label to classify (any object supporting ``'a' in letter``).

    Returns
    -------
    str
        ``'feature_1'`` if ``'a'`` occurs in ``letter``, otherwise
        ``'feature_2'``.
    """
    return 'feature_1' if 'a' in letter else 'feature_2'
    
# Split the frame column-wise according to `get_letter_type`.
# `DataFrame.groupby(..., axis=1)` is deprecated in recent pandas, so group the
# column labels themselves (the function is applied to each label) and then
# select the matching columns from `df`.
column_groups = df.columns.to_series().groupby(get_letter_type)
for feature, columns in column_groups:
    print((feature, df[columns.tolist()]))

('feature_1',                     data         a
class country                     
A     JP      2019-01-06 -0.227358
B     CN      2019-01-13 -1.215936
      US      2019-01-20  1.173440
      US      2019-01-27  0.423253
C     US      2019-02-03 -0.097640
A     CN      2019-02-10  0.393010
B     CN      2019-02-17  0.679828
A     CA      2019-02-24 -0.715628
C     JP      2019-03-03 -0.812198
      CA      2019-03-10 -0.784265)
('feature_2',                color size         b
class country                      
A     JP       black    S  0.166195
B     CN       white    M  1.682201
      US       black    L -1.091643
      US       white    M -0.238322
C     US       black    L  0.420131
A     CN       white    S -1.197277
B     CN       black    S -1.519297
A     CA       white   XL  2.794433
C     JP       black   XL -2.287517
      CA       white    M -1.598787)


In [26]:
# Group on both index levels, referenced by position:
# level 0 is 'class' and level 1 is 'country'.
by_class_country = df.groupby(level=[0, 1])
for level_key, level_rows in by_class_country:
    print((level_key, level_rows))

(('A', 'CA'),                color size       data         a         b
class country                                           
A     CA       white   XL 2019-02-24 -0.715628  2.794433)
(('A', 'CN'),                color size       data        a         b
class country                                          
A     CN       white    S 2019-02-10  0.39301 -1.197277)
(('A', 'JP'),                color size       data         a         b
class country                                           
A     JP       black    S 2019-01-06 -0.227358  0.166195)
(('B', 'CN'),                color size       data         a         b
class country                                           
B     CN       white    M 2019-01-13 -1.215936  1.682201
      CN       black    S 2019-02-17  0.679828 -1.519297)
(('B', 'US'),                color size       data         a         b
class country                                           
B     US       black    L 2019-01-20  1.173440 -1.091643
      US       wh

## Aggregation

In [31]:
# Group the rows by the (size, color) pair.
group_2 = df.groupby(['size', 'color'])

# Per group: the smallest 'a' and the average 'b'. The aggregations are named
# by string because passing NumPy callables (np.min / np.mean) to `agg` emits
# a FutureWarning in recent pandas; the strings select the same reductions.
group_2.agg({'a': 'min', 'b': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
size,color,Unnamed: 2_level_1,Unnamed: 3_level_1
L,black,-0.09764,-0.335756
M,white,-1.215936,-0.051636
S,black,-0.227358,-0.676551
S,white,0.39301,-1.197277
XL,black,-0.812198,-2.287517
XL,white,-0.715628,2.794433


In [32]:
def data_range(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()
# Broadcast each size group's range (max - min) back onto its rows.
# The columns are selected explicitly: `data_range` subtracts values, which is
# not defined for the string column 'color', and newer pandas versions no
# longer drop such columns silently.
df.groupby('size')[['data', 'a', 'b']].transform(data_range)

Unnamed: 0_level_0,Unnamed: 1_level_0,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,JP,42 days,0.907185,1.685492
B,CN,56 days,1.639189,3.280988
B,US,14 days,1.27108,1.511775
B,US,56 days,1.639189,3.280988
C,US,14 days,1.27108,1.511775
A,CN,42 days,0.907185,1.685492
B,CN,42 days,0.907185,1.685492
A,CA,7 days,0.09657,5.08195
C,JP,7 days,0.09657,5.08195
C,CA,56 days,1.639189,3.280988


In [37]:
# Blank out columns at positions 3 and 4 ('a' and 'b') of the second row so
# there are missing values to fill in. This mutates `df` in place.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df.iloc[1, 3:5] = np.nan
df

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,JP,black,S,2019-01-06,-0.227358,0.166195
B,CN,white,M,2019-01-13,,
B,US,black,L,2019-01-20,1.17344,-1.091643
B,US,white,M,2019-01-27,0.423253,-0.238322
C,US,black,L,2019-02-03,-0.09764,0.420131
A,CN,white,S,2019-02-10,0.39301,-1.197277
B,CN,black,S,2019-02-17,0.679828,-1.519297
A,CA,white,XL,2019-02-24,-0.715628,2.794433
C,JP,black,XL,2019-03-03,-0.812198,-2.287517
C,CA,white,M,2019-03-10,-0.784265,-1.598787


In [38]:
def f(x):
    """Return ``x`` with its missing values replaced by the mean of ``x``."""
    return x.fillna(x.mean())
# Fill each missing value with the mean of its (size, color) group.
# Regroup here instead of reusing `group_2`, which was created before the NaNs
# were written into `df`, and restrict the transform to the numeric columns
# 'a' and 'b' (the ones that received NaNs).
df.groupby(['size', 'color'])[['a', 'b']].transform(f)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1
A,JP,-0.227358,0.166195
B,CN,-0.180506,-0.918554
B,US,1.17344,-1.091643
B,US,0.423253,-0.238322
C,US,-0.09764,0.420131
A,CN,0.39301,-1.197277
B,CN,0.679828,-1.519297
A,CA,-0.715628,2.794433
C,JP,-0.812198,-2.287517
C,CA,-0.784265,-1.598787
