# Basic Pandas Notes

In [2]:
import numpy as np
import pandas as pd

## 1.1. Create DataFrame

In [3]:
# Seed NumPy's global generator so the random columns 'a' and 'b' come out
# the same on every "Restart & Run All"; without it every run (and every
# stored output) shows different numbers.
np.random.seed(0)

# Column-oriented input: each key becomes a column of 10 values.
data = {'color': ['black', 'white', 'black', 'white', 'black', 'white', 'black', 'white', 'black', 'white'],
        'size': ['S', 'M', 'L', 'M', 'L', 'S', 'S', 'XL', 'XL', 'M'],
        'data': pd.date_range('1/1/2019', periods=10, freq='W'),  # 10 weekly dates
        'a': np.random.randn(10),                                # standard-normal draws
        'b': np.random.normal(0.5, 2, 10)}                       # normal draws, mean 0.5, std 2

pd.DataFrame(data)

Unnamed: 0,color,size,data,a,b
0,black,S,2019-01-06,-1.244961,1.335967
1,white,M,2019-01-13,0.092807,-0.95569
2,black,L,2019-01-20,-0.217615,1.533423
3,white,M,2019-01-27,-1.263106,1.183799
4,black,L,2019-02-03,1.240875,3.517104
5,white,S,2019-02-10,-1.504601,0.00023
6,black,S,2019-02-17,-1.488004,1.238074
7,white,XL,2019-02-24,1.388901,1.649747
8,black,XL,2019-03-03,0.459483,0.437548
9,white,M,2019-03-10,0.847091,0.742776


## 1.2. Adding Index

In [4]:
# One label per row for each of the two index levels
class_labels = ['A', 'B', 'B', 'B', 'C', 'A', 'B', 'A', 'C', 'C']
country_labels = ['JP', 'CN', 'US', 'US', 'US', 'CN', 'CN', 'CA', 'JP', 'CA']

# Pair the two label lists up into a two-level (class, country) index
index = pd.MultiIndex.from_arrays([class_labels, country_labels], names=['class', 'country'])

# Same column data as before, now addressed by the (class, country) index
df = pd.DataFrame(data, index=index)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,JP,black,S,2019-01-06,-1.244961,1.335967
B,CN,white,M,2019-01-13,0.092807,-0.95569
B,US,black,L,2019-01-20,-0.217615,1.533423
B,US,white,M,2019-01-27,-1.263106,1.183799
C,US,black,L,2019-02-03,1.240875,3.517104
A,CN,white,S,2019-02-10,-1.504601,0.00023
B,CN,black,S,2019-02-17,-1.488004,1.238074
A,CA,white,XL,2019-02-24,1.388901,1.649747
C,JP,black,XL,2019-03-03,0.459483,0.437548
C,CA,white,M,2019-03-10,0.847091,0.742776


## 1.3. Append DataFrame

In [5]:
df2 = pd.DataFrame(data)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# row is wrapped in a one-row DataFrame and concatenated instead.
# The date is given as a Timestamp rather than a string so that the 'data'
# column is not turned into a mixed object column.
green_row = pd.DataFrame([{'color': 'green', 'size': 'XS', 'data': pd.Timestamp('2019-02-01'), 'a': 1, 'b': -3}])
pd.concat([df2, green_row], ignore_index=True)

Unnamed: 0,color,size,data,a,b
0,black,S,2019-01-06 00:00:00,-1.244961,1.335967
1,white,M,2019-01-13 00:00:00,0.092807,-0.95569
2,black,L,2019-01-20 00:00:00,-0.217615,1.533423
3,white,M,2019-01-27 00:00:00,-1.263106,1.183799
4,black,L,2019-02-03 00:00:00,1.240875,3.517104
5,white,S,2019-02-10 00:00:00,-1.504601,0.00023
6,black,S,2019-02-17 00:00:00,-1.488004,1.238074
7,white,XL,2019-02-24 00:00:00,1.388901,1.649747
8,black,XL,2019-03-03 00:00:00,0.459483,0.437548
9,white,M,2019-03-10 00:00:00,0.847091,0.742776


In [11]:
# more efficient way: wrap the new row in a DataFrame and concatenate once
# (the date is a Timestamp so the 'data' column is not mixed with strings)
extra_row = {'color': ['green'], 'size': ['XS'], 'data': [pd.Timestamp('2019-02-01')], 'a': [1], 'b': [-3]}

# append row wise; stored in a variable because only the last expression of a
# cell is displayed, so a bare call here would be thrown away
df2_plus_row = pd.concat([df2, pd.DataFrame(extra_row)], axis=0, ignore_index=True)

# append column wise; the Series has no name, so the new column is labelled 0
extra_col = pd.Series(np.linspace(4, 20, 10))
df2_plus_col = pd.concat([df2, extra_col], axis=1)
df2_plus_col

Unnamed: 0,color,size,data,a,b,0
0,black,S,2019-01-06,-1.244961,1.335967,4.0
1,white,M,2019-01-13,0.092807,-0.95569,5.777778
2,black,L,2019-01-20,-0.217615,1.533423,7.555556
3,white,M,2019-01-27,-1.263106,1.183799,9.333333
4,black,L,2019-02-03,1.240875,3.517104,11.111111
5,white,S,2019-02-10,-1.504601,0.00023,12.888889
6,black,S,2019-02-17,-1.488004,1.238074,14.666667
7,white,XL,2019-02-24,1.388901,1.649747,16.444444
8,black,XL,2019-03-03,0.459483,0.437548,18.222222
9,white,M,2019-03-10,0.847091,0.742776,20.0


## 2.1. Query Row Data

In [25]:
# A scalar integer position selects one row and hands it back as a Series
first_row = df.iloc[0]
first_row

color                  black
size                       S
data     2019-01-06 00:00:00
a                   -1.24496
b                    1.33597
Name: (A, JP), dtype: object

In [26]:
# The same position wrapped in a list keeps the result a (one-row) DataFrame
first_row_frame = df.iloc[[0]]
first_row_frame

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,JP,black,S,2019-01-06,-1.244961,1.335967


In [34]:
# A label of the outer index level ('class') selects every matching row;
# the result is a DataFrame indexed by the remaining level ('country')
class_a_rows = df.loc['A']
class_a_rows

Unnamed: 0_level_0,color,size,data,a,b
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JP,black,S,2019-01-06,-1.244961,1.335967
CN,white,S,2019-02-10,-1.504601,0.00023
CA,white,XL,2019-02-24,1.388901,1.649747


## 2.2. Query Column Data

In [22]:
# All rows, scalar integer column position -> the column comes back as a Series
first_column = df.iloc[:, 0]
first_column

class  country
A      JP         black
B      CN         white
       US         black
       US         white
C      US         black
A      CN         white
B      CN         black
A      CA         white
C      JP         black
       CA         white
Name: color, dtype: object

In [37]:
# All rows, scalar column label -> the column comes back as a Series
color_column = df.loc[:, 'color']
color_column

class  country
A      JP         black
B      CN         white
       US         black
       US         white
C      US         black
A      CN         white
B      CN         black
A      CA         white
C      JP         black
       CA         white
Name: color, dtype: object

In [28]:
# Column position wrapped in a list -> the result stays a DataFrame;
# only the first five rows are shown
first_column_frame = df.iloc[:, [0]]
first_column_frame.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,color
class,country,Unnamed: 2_level_1
A,JP,black
B,CN,white
B,US,black
B,US,white
C,US,black


In [38]:
# Column label wrapped in a list -> the result stays a DataFrame;
# only the first five rows are shown
color_frame = df[['color']]
color_frame.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,color
class,country,Unnamed: 2_level_1
A,JP,black
B,CN,white
B,US,black
B,US,white
C,US,black


## 3.1. Group by

In [63]:
# Split the rows into one group per distinct value of the 'size' column
size_lvl = df.groupby('size')

# Iterating a GroupBy yields (group key, sub-DataFrame) pairs; each pair is
# printed as a tuple
for size_key, size_rows in size_lvl:
    print((size_key, size_rows))

('L',                color size       data         a         b
class country                                           
B     US       black    L 2019-01-20  0.386777 -0.366475
C     US       black    L 2019-02-03  0.304819  1.441471)
('M',                color size       data         a         b
class country                                           
B     CN       white    M 2019-01-13 -1.820500  1.562026
      US       white    M 2019-01-27 -1.604988  3.103769
C     CA       white    M 2019-03-10 -0.621649 -0.967063)
('S',                color size       data         a         b
class country                                           
A     JP       black    S 2019-01-06  0.652694  2.277243
      CN       white    S 2019-02-10 -0.356302  0.380898
B     CN       black    S 2019-02-17 -0.440898 -3.134000)
('XL',                color size       data         a        b
class country                                          
A     CA       white   XL 2019-02-24 -1.118385  1.21721
C     

## 3.2. Select specific group

In [65]:
# Pull out only the rows whose group key is 'M'
medium_rows = size_lvl.get_group('M')
medium_rows

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B,CN,white,M,2019-01-13,-1.8205,1.562026
B,US,white,M,2019-01-27,-1.604988,3.103769
C,CA,white,M,2019-03-10,-0.621649,-0.967063


## 4.1. Aggregation

In [66]:
# Sum the numeric columns of each size group and tag the result columns with
# 'sum_'. numeric_only=True is explicit because pandas >= 2.0 no longer skips
# non-numeric columns by default, and the datetime column cannot be summed.
size_lvl.sum(numeric_only=True).add_prefix('sum_')

Unnamed: 0_level_0,sum_a,sum_b
size,Unnamed: 1_level_1,Unnamed: 2_level_1
L,0.691596,1.074996
M,-4.047138,3.698732
S,-0.144507,-0.47586
XL,-0.988178,2.28193


In [67]:
# One aggregation per column: minimum of 'a', mean of 'b', for every
# (size, color) combination. The aggregations are named by string because
# passing NumPy callables such as np.min / np.mean to agg is deprecated
# (FutureWarning since pandas 2.1).
df.groupby(['size', 'color']).agg({'a': 'min', 'b': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
size,color,Unnamed: 2_level_1,Unnamed: 3_level_1
L,black,0.304819,0.537498
M,white,-1.8205,1.232911
S,black,-0.440898,-0.428379
S,white,-0.356302,0.380898
XL,black,0.130208,1.06472
XL,white,-1.118385,1.21721


## 5.1. Apply custom functions

In [96]:
# Transform
def data_range(x):
    """Return the spread (max - min) of the values in ``x``."""
    return x.max() - x.min()


# transform() returns one value per original row, i.e. each group's result is
# repeated for all of the group's rows.
# The columns are selected explicitly: 'color' holds strings, for which
# max - min is undefined, and pandas >= 2.0 raises instead of silently
# dropping such columns.
df.groupby('size')[['data', 'a', 'b']].transform(data_range)

Unnamed: 0_level_0,Unnamed: 1_level_0,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,JP,42 days,1.093592,5.411243
B,CN,56 days,1.198851,4.070832
B,US,14 days,0.081958,1.807946
B,US,56 days,1.198851,4.070832
C,US,14 days,0.081958,1.807946
A,CN,42 days,1.093592,5.411243
B,CN,42 days,1.093592,5.411243
A,CA,7 days,1.248593,0.152489
C,JP,7 days,1.248593,0.152489
C,CA,56 days,1.198851,4.070832


In [98]:
# Apply
def spread(values):
    """Difference between the largest and the smallest value of a group."""
    return values.max() - values.min()


# One result per 'size' group, computed on column 'a' only
a_by_size = df.groupby('size')['a']
a_by_size.apply(spread)

size
L     0.081958
M     1.198851
S     1.093592
XL    1.248593
Name: a, dtype: float64

## 6.1. Rolling

Create an n-row sliding window within each group and aggregate the rows inside it. Positions whose window does not yet contain n rows return `NaN`.

In [90]:
# Two-row window inside each color group; the first row of every group has
# no predecessor in its window, so its sum is NaN
two_row_windows = df.groupby('color').rolling(window=2)
two_row_windows['a'].sum()

color  class  country
black  A      JP              NaN
       B      US         1.039471
       C      US         0.691596
       B      CN        -0.136079
       C      JP        -0.310691
white  B      CN              NaN
              US        -3.425488
       A      CN        -1.961290
              CA        -1.474688
       C      CA        -1.740035
Name: a, dtype: float64

## 7.1. Expanding

Create a window within each group that starts at the group's first row and grows by one row at every step, aggregating all rows seen so far (for example, a cumulative sum).

In [84]:
# Cumulative sum: an expanding window needs at least one row (min_periods=1),
# so every row of a color group gets the running total of 'a' so far
growing_windows = df.groupby('color').expanding(min_periods=1)
growing_windows['a'].sum()

color  class  country
black  A      JP         0.652694
       B      US         1.039471
       C      US         1.344290
       B      CN         0.903392
       C      JP         1.033599
white  B      CN        -1.820500
              US        -3.425488
       A      CN        -3.781790
              CA        -4.900176
       C      CA        -5.521825
Name: a, dtype: float64

## 8.1. Filter

In [91]:
def has_more_than_three_rows(group):
    """Tell whether a group holds more than three rows."""
    return len(group) > 3


# Keep only the rows of those 'class' groups that pass the size check
df.groupby('class').filter(has_more_than_three_rows)

Unnamed: 0_level_0,Unnamed: 1_level_0,color,size,data,a,b
class,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B,CN,white,M,2019-01-13,-1.8205,1.562026
B,US,black,L,2019-01-20,0.386777,-0.366475
B,US,white,M,2019-01-27,-1.604988,3.103769
B,CN,black,S,2019-02-17,-0.440898,-3.134
