# Pandas - Index & Groupby

In [1]:
import numpy as np
import pandas as pd

In [34]:
data = {'color': ['black', 'white', 'black', 'white', 'black', 'white', 'black', 'white', 'black', 'white'],
        'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
        'data': pd.date_range('1/1/2019', periods=10, freq='W'),
        'a': np.random.randn(10),
        'b': np.random.normal(0.5, 2, 10)}

index = [['A', 'B', 'B', 'B', 'C', 'A', 'B', 'A', 'C', 'C'], ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA']]
index = pd.MultiIndex.from_arrays(index, names=['class', 'country'])

In [35]:
df = pd.DataFrame(data, index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,JP,black,S,2019-01-06,1.504432,0.243258
B,CN,white,M,2019-01-13,-0.126859,3.785009
B,US,black,L,2019-01-20,-1.203678,0.023889
B,US,white,M,2019-01-27,-0.144241,3.149552
C,US,black,L,2019-02-03,-0.385681,-0.641574
A,CN,white,S,2019-02-10,1.250439,-0.607894
B,CN,black,S,2019-02-17,0.087329,-2.457463
A,CA,white,XL,2019-02-24,-1.102457,0.097761
C,JP,black,XL,2019-03-03,-1.173527,1.404316
C,CA,white,M,2019-03-10,2.81217,-0.07898


In [32]:
df2 = pd.DataFrame(data)
df2.append({'color': 'green', 'size': 'XS', 'data': '2019-02-01 00:00:00', 'a': 1, 'b': -3}, ignore_index=True)

Unnamed: 0,color,size,data,a,b
0,black,S,2019-01-06 00:00:00,3.379347,-1.720918
1,white,M,2019-01-13 00:00:00,1.085067,-1.771311
2,black,L,2019-01-20 00:00:00,-0.784112,-1.84251
3,white,M,2019-01-27 00:00:00,1.789608,0.789167
4,black,L,2019-02-03 00:00:00,0.229144,-1.256342
5,white,S,2019-02-10 00:00:00,-0.257723,3.844117
6,black,S,2019-02-17 00:00:00,-2.262585,4.73239
7,white,XL,2019-02-24 00:00:00,-0.835432,-1.177477
8,black,XL,2019-03-03 00:00:00,0.367152,-0.356291
9,white,M,2019-03-10 00:00:00,-0.693315,3.003024


In [33]:
# more efficent
dic = dict({'color': ['green'], 'size': ['XS'], 'data': ['2019-02-01 00:00:00'], 'a': [1], 'b': [-3]})

pd.concat([df2, pd.DataFrame(dic)], ignore_index=True)

Unnamed: 0,color,size,data,a,b
0,black,S,2019-01-06 00:00:00,3.379347,-1.720918
1,white,M,2019-01-13 00:00:00,1.085067,-1.771311
2,black,L,2019-01-20 00:00:00,-0.784112,-1.84251
3,white,M,2019-01-27 00:00:00,1.789608,0.789167
4,black,L,2019-02-03 00:00:00,0.229144,-1.256342
5,white,S,2019-02-10 00:00:00,-0.257723,3.844117
6,black,S,2019-02-17 00:00:00,-2.262585,4.73239
7,white,XL,2019-02-24 00:00:00,-0.835432,-1.177477
8,black,XL,2019-03-03 00:00:00,0.367152,-0.356291
9,white,M,2019-03-10 00:00:00,-0.693315,3.003024


### group by specific columns

In [36]:
size_lvl = df.groupby('size')

for i in size_lvl:
    print(i)

('L',                color size       data         a         b
class country                                           
B     US       black    L 2019-01-20 -1.203678  0.023889
C     US       black    L 2019-02-03 -0.385681 -0.641574)
('M',                color size       data         a         b
class country                                           
B     CN       white    M 2019-01-13 -0.126859  3.785009
      US       white    M 2019-01-27 -0.144241  3.149552
C     CA       white    M 2019-03-10  2.812170 -0.078980)
('S',                color size       data         a         b
class country                                           
A     JP       black    S 2019-01-06  1.504432  0.243258
      CN       white    S 2019-02-10  1.250439 -0.607894
B     CN       black    S 2019-02-17  0.087329 -2.457463)
('XL',                color size       data         a         b
class country                                           
A     CA       white   XL 2019-02-24 -1.102457  0.097761
C  

### select specific group

In [43]:
size_lvl.get_group('M')

Unnamed: 0_level_0,Unnamed: 1_level_0,color,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B,CN,white,2019-01-13,0.403105,-0.061578
B,US,white,2019-01-27,-0.950894,-1.255788
C,CA,white,2019-03-10,0.997361,1.465797


In [45]:
# 0: class, 1: country
for i in list(df.groupby(level=[0,1])):
    print(i)

(('A', 'CA'),                color size       data         a         b
class country                                           
A     CA       white   XL 2019-02-24  0.389593  4.642428)
(('A', 'CN'),                color size       data         a         b
class country                                           
A     CN       white    S 2019-02-10 -0.198026 -2.008272)
(('A', 'JP'),                color size       data         a         b
class country                                           
A     JP       black    S 2019-01-06 -0.350326  3.920069)
(('B', 'CN'),                color size       data         a         b
class country                                           
B     CN       white    M 2019-01-13  0.403105 -0.061578
      CN       black    S 2019-02-17  0.519806  1.000723)
(('B', 'US'),                color size       data         a         b
class country                                           
B     US       black    L 2019-01-20 -1.782448  1.430286
      US      

## Aggregation

In [41]:
size_lvl.sum().add_prefix('sum_')

Unnamed: 0_level_0,sum_a,sum_b
size,Unnamed: 1_level_1,Unnamed: 2_level_1
L,-1.589359,-0.617685
M,2.541069,6.855581
S,2.8422,-2.8221
XL,-2.275984,1.502077


In [42]:
df.groupby('size')['a', 'b'].agg(['sum'])

Unnamed: 0_level_0,a,b
Unnamed: 0_level_1,sum,sum
size,Unnamed: 1_level_2,Unnamed: 2_level_2
L,-1.589359,-0.617685
M,2.541069,6.855581
S,2.8422,-2.8221
XL,-2.275984,1.502077


In [43]:
df.groupby(['size', 'color']).agg({'a': np.min, 'b': np.mean})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
size,color,Unnamed: 2_level_1,Unnamed: 3_level_1
L,black,-1.203678,-0.308842
M,white,-0.144241,2.285194
S,black,0.087329,-1.107103
S,white,1.250439,-0.607894
XL,black,-1.173527,1.404316
XL,white,-1.102457,0.097761


### apply customize function

In [47]:
data_range = lambda x: x.max() - x.min()
df.groupby('size').transform(data_range)

Unnamed: 0_level_0,Unnamed: 1_level_0,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,JP,42 days,0.870131,5.928341
B,CN,56 days,1.948255,2.721586
B,US,14 days,2.761798,0.873576
B,US,56 days,1.948255,2.721586
C,US,14 days,2.761798,0.873576
A,CN,42 days,0.870131,5.928341
B,CN,42 days,0.870131,5.928341
A,CA,7 days,0.956596,1.993194
C,JP,7 days,0.956596,1.993194
C,CA,56 days,1.948255,2.721586


In [48]:
df.iloc[1, 3:5] = np.NaN
df

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,JP,black,S,2019-01-06,-0.350326,3.920069
B,CN,white,M,2019-01-13,,
B,US,black,L,2019-01-20,-1.782448,1.430286
B,US,white,M,2019-01-27,-0.950894,-1.255788
C,US,black,L,2019-02-03,0.979349,0.55671
A,CN,white,S,2019-02-10,-0.198026,-2.008272
B,CN,black,S,2019-02-17,0.519806,1.000723
A,CA,white,XL,2019-02-24,0.389593,4.642428
C,JP,black,XL,2019-03-03,-0.567004,2.649234
C,CA,white,M,2019-03-10,0.997361,1.465797


In [59]:
f = lambda x: x.fillna(x.mean())
df = df.groupby(['class', 'country']).transform(f)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1
A,JP,-0.350326,3.920069
B,CN,0.519806,1.000723
B,US,-1.782448,1.430286
B,US,-0.950894,-1.255788
C,US,0.979349,0.55671
A,CN,-0.198026,-2.008272
B,CN,0.519806,1.000723
A,CA,0.389593,4.642428
C,JP,-0.567004,2.649234
C,CA,0.997361,1.465797


In [50]:
for i in df.groupby('color'):
    print(i)

('black',                color size       data         a         b
class country                                           
A     JP       black    S 2019-01-06 -0.350326  3.920069
B     US       black    L 2019-01-20 -1.782448  1.430286
C     US       black    L 2019-02-03  0.979349  0.556710
B     CN       black    S 2019-02-17  0.519806  1.000723
C     JP       black   XL 2019-03-03 -0.567004  2.649234)
('white',                color size       data         a         b
class country                                           
B     CN       white    M 2019-01-13       NaN       NaN
      US       white    M 2019-01-27 -0.950894 -1.255788
A     CN       white    S 2019-02-10 -0.198026 -2.008272
      CA       white   XL 2019-02-24  0.389593  4.642428
C     CA       white    M 2019-03-10  0.997361  1.465797)


## Rolling

In [51]:
df.groupby('color').rolling(3).a.sum()

color  class  country
black  A      JP              NaN
       B      US              NaN
       C      US        -1.153425
       B      CN        -0.283293
       C      JP         0.932151
white  B      CN              NaN
              US              NaN
       A      CN              NaN
              CA        -0.759327
       C      CA         1.188928
Name: a, dtype: float64

## Expanding

In [52]:
df.groupby('color').expanding(3).a.sum()

color  class  country
black  A      JP              NaN
       B      US              NaN
       C      US        -1.153425
       B      CN        -0.633619
       C      JP        -1.200623
white  B      CN              NaN
              US              NaN
       A      CN              NaN
              CA        -0.759327
       C      CA         0.238034
Name: a, dtype: float64

## Filter

In [58]:
df.groupby('class').filter(lambda x: len(x) > 3)

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B,CN,white,M,2019-01-13,,
B,US,black,L,2019-01-20,-1.782448,1.430286
B,US,white,M,2019-01-27,-0.950894,-1.255788
B,CN,black,S,2019-02-17,0.519806,1.000723


## Apply

In [54]:
df.groupby('class')['a'].apply(lambda x: x+1)

ValueError: cannot handle a non-unique multi-index!