# Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small example frame: 'grp' is a two-valued key, 'z' ends with a missing
# value, and the rows are labelled 'a' through 'f'.
df = pd.DataFrame(
    {
        'grp': [1, 2, 1, 2, 1, 2],
        'x': range(6, 0, -1),
        'y': range(4, 10),
        'z': [3, 4, 5, 6, 7, None],
    },
    index=list('abcdef'),
)

Pandas supports labeled row indexes, including hierarchical ones (`MultiIndex`). In this example, the record identifiers `a`–`f` are stored as the row index.

# Accessing Data

Python uses 0-based indexing, and `iloc` selects rows and columns by integer position.

In [3]:
df.iloc[1, 1]   # row at position 1 (second row), column at position 1 (second column, 'x')

5

In [4]:
df.iloc[1:3]   # rows at positions 1 and 2 (the end position 3 is excluded), all columns

Unnamed: 0,grp,x,y,z
b,2,5,5,4.0
c,1,4,6,5.0


In [5]:
df.iloc[:, 1:]   # all rows, columns from the second one through the last

Unnamed: 0,x,y,z
a,6,4,3.0
b,5,5,4.0
c,4,6,5.0
d,3,7,6.0
e,2,8,7.0
f,1,9,


The `loc` indexer selects by label. It returns a Series when a single row (or column) is selected and a DataFrame when multiple rows or columns are selected.

In [6]:
df.loc['c']    # row with index label 'c' (a single match is returned as a Series)

grp    1.0
x      4.0
y      6.0
z      5.0
Name: c, dtype: float64

In [7]:
df.loc[:, 'x']   # all rows, column 'x' (returned as a Series)

a    6
b    5
c    4
d    3
e    2
f    1
Name: x, dtype: int64

In [8]:
df.loc[:, ['x','z']]   # all rows, columns 'x' and 'z' (a list of labels gives a DataFrame)

Unnamed: 0,x,z
a,6,3.0
b,5,4.0
c,4,5.0
d,3,6.0
e,2,7.0
f,1,


In [9]:
df.loc[:, 'x':'z']   # all rows, all columns from 'x' through 'z' (both endpoints included)

Unnamed: 0,x,y,z
a,6,4,3.0
b,5,5,4.0
c,4,6,5.0
d,3,7,6.0
e,2,8,7.0
f,1,9,


In [10]:
# .iloc makes the positional lookup explicit; a bare integer key on a
# label-indexed Series (df.loc['c'][1]) is deprecated in recent pandas.
df.loc['c'].iloc[1]   # row labelled 'c', second column

4.0

# Common Operations

In [11]:
# with skipna=False the missing value in 'z' makes the mean NaN
df['z'].mean(skipna=False)

nan

In [12]:
# the missing value in 'z' is ignored here: this is the mean of 3, 4, 5, 6, 7
df['z'].mean()

5.0

In [13]:
# aggregating a one-column DataFrame yields a Series indexed by column name
df[['z']].agg('mean')

z    5.0
dtype: float64

In [14]:
# add a derived column 'z1'; the result is a new frame and df keeps its four columns
df.assign(z1=df['z'] + 1)

Unnamed: 0,grp,x,y,z,z1
a,1,6,4,3.0,4.0
b,2,5,5,4.0,5.0
c,1,4,6,5.0,6.0
d,2,3,7,6.0,7.0
e,1,2,8,7.0,8.0
f,2,1,9,,


In [15]:
# rename column 'x' to 'x_new' in the returned frame
df.rename(columns={'x': 'x_new'})

Unnamed: 0,grp,x_new,y,z
a,1,6,4,3.0
b,2,5,5,4.0
c,1,4,6,5.0
d,2,3,7,6.0
e,1,2,8,7.0
f,2,1,9,


In [16]:
# broadcast the overall mean of 'x' to every row, then keep two columns
(
    df
    .assign(x_mean=df['x'].mean())
    [['x_mean', 'y']]
)

Unnamed: 0,x_mean,y
a,3.5,4
b,3.5,5
c,3.5,6
d,3.5,7
e,3.5,8
f,3.5,9


In [17]:
# sort rows by 'x' (ascending order by default)
df.sort_values(by='x')

Unnamed: 0,grp,x,y,z
f,2,1,9,
e,1,2,8,7.0
d,2,3,7,6.0
c,1,4,6,5.0
b,2,5,5,4.0
a,1,6,4,3.0


In [18]:
# sort by 'grp' ascending, then by 'x' descending within each grp
df.sort_values(
    by=['grp', 'x'],
    ascending=[True, False],
)

Unnamed: 0,grp,x,y,z
a,1,6,4,3.0
c,1,4,6,5.0
e,1,2,8,7.0
b,2,5,5,4.0
d,2,3,7,6.0
f,2,1,9,


# Grouping data and aggregation

In [19]:
# mean of 'x' within each 'grp'; the group keys become the index of the result
df.groupby('grp')['x'].mean()

grp
1    4
2    3
Name: x, dtype: int64

In [20]:
# group mean of 'x', with the resulting Series renamed to 'my_mean'
df.groupby('grp')['x'].mean().rename("my_mean")

grp
1    4
2    3
Name: my_mean, dtype: int64

In [21]:
# attach each row's group mean of 'x'; rsuffix renames the clashing 'x'
# coming from the right-hand side to 'x_mean'
df.join(
    df.groupby('grp')['x'].mean(),
    on='grp',
    rsuffix='_mean',
)

Unnamed: 0,grp,x,y,z,x_mean
a,1,6,4,3.0,4
b,2,5,5,4.0,3
c,1,4,6,5.0,4
d,2,3,7,6.0,3
e,1,2,8,7.0,4
f,2,1,9,,3


In [22]:
# join each row to its group mean of 'x', then keep only 'grp' and the
# joined 'x_mean' column
df.join(
    df.groupby('grp')['x'].mean(),
    on='grp',
    rsuffix='_mean',
)[['grp', 'x_mean']]

Unnamed: 0,grp,x_mean
a,1,4
b,2,3
c,1,4
d,2,3
e,1,4
f,2,3


# More advanced commands

In [23]:
# mean of cos(z); the missing value in 'z' does not turn the result into NaN
df[['z']].agg(lambda col: np.mean(np.cos(col)))

z    0.07082
dtype: float64

In [24]:
# Name the reductions as strings: passing the Python builtins max/min to agg
# is deprecated in recent pandas, while 'max'/'min' select the pandas methods.
df.agg({'x': 'max', 'y': 'min'})

x    6
y    4
dtype: int64

In [25]:
# column-wise means of 'x' and 'y'
df[['x','y']].mean()

x    3.5
y    6.5
dtype: float64

In [26]:
# keep the columns whose name matches the regex (here 'x' and 'y'), then average them
df.filter(regex="[xy]").mean()

x    3.5
y    6.5
dtype: float64

In [27]:
# String names select the pandas reductions and also become the row labels
# of the result; passing the builtins max/min is deprecated in recent pandas.
df[['x', 'y']].agg(['max', 'min'])

Unnamed: 0,x,y
max,6,9
min,1,4


In [28]:
# np.corrcoef returns the correlation matrix; entry [0, 1] is the x-y
# correlation, which is then broadcast to every row
df.assign(x_y_cor=np.corrcoef(df['x'], df['y'])[0, 1])

Unnamed: 0,grp,x,y,z,x_y_cor
a,1,6,4,3.0,-1.0
b,2,5,5,4.0,-1.0
c,1,4,6,5.0,-1.0
d,2,3,7,6.0,-1.0
e,1,2,8,7.0,-1.0
f,2,1,9,,-1.0


In [29]:
# row-wise minimum of 'x' and 'y', computed one row at a time
df.assign(x_y_min=df.apply(lambda row: min(row.x, row.y), axis=1))

Unnamed: 0,grp,x,y,z,x_y_min
a,1,6,4,3.0,4.0
b,2,5,5,4.0,5.0
c,1,4,6,5.0,4.0
d,2,3,7,6.0,3.0
e,1,2,8,7.0,2.0
f,2,1,9,,1.0


In [30]:
# For each row, name whichever of 'x' and 'y' holds the larger value (the
# first one on ties). Restricting to those two columns keeps 'grp' and 'z'
# out of the comparison, matching the x_y_argmax column name.
df.assign(x_y_argmax=df[['x', 'y']].idxmax(axis=1))

Unnamed: 0,grp,x,y,z,x_y_argmax
a,1,6,4,3.0,x
b,2,5,5,4.0,x
c,1,4,6,5.0,y
d,2,3,7,6.0,y
e,1,2,8,7.0,y
f,2,1,9,,y


In [31]:
df.groupby('grp').head(2)   # first two rows of each group; rows stay in their original order, not sorted by grp

Unnamed: 0,grp,x,y,z
a,1,6,4,3.0
b,2,5,5,4.0
c,1,4,6,5.0
d,2,3,7,6.0


In [32]:
# a list-valued aggregation: smallest and largest 'x'
df[['x']].agg(lambda col: [min(col), max(col)])

Unnamed: 0,x
0,1
1,6


# Joining data frames

In [33]:
# preparation: a second frame keyed by 'grp' with an extra column 'w'
df2 = pd.DataFrame(
    {
        'grp': [1, 3],
        'w': [10, 11],
    }
)
df2

Unnamed: 0,grp,w
0,1,10
1,3,11


In [34]:
# inner join on 'grp': only keys present in both frames survive, and the
# result gets a new integer index
pd.merge(df, df2, on='grp', how='inner')

Unnamed: 0,grp,x,y,z,w
0,1,6,4,3.0,10
1,1,4,6,5.0,10
2,1,2,8,7.0,10


In [35]:
# outer join on 'grp': keys from either frame are kept, with NaN where one
# side has no match
pd.merge(df, df2, on='grp', how='outer')

Unnamed: 0,grp,x,y,z,w
0,1,6.0,4.0,3.0,10.0
1,1,4.0,6.0,5.0,10.0
2,1,2.0,8.0,7.0,10.0
3,2,5.0,5.0,4.0,
4,2,3.0,7.0,6.0,
5,2,1.0,9.0,,
6,3,,,,11.0


In [36]:
# left join on 'grp': every row of df is kept; 'w' is NaN where df2 has no
# matching grp
pd.merge(df, df2, on='grp', how='left')

Unnamed: 0,grp,x,y,z,w
0,1,6,4,3.0,10.0
1,2,5,5,4.0,
2,1,4,6,5.0,10.0
3,2,3,7,6.0,
4,1,2,8,7.0,10.0
5,2,1,9,,


In [37]:
# right join on 'grp': every row of df2 is kept; df's columns are NaN where
# df has no matching grp
pd.merge(df, df2, on='grp', how='right')

Unnamed: 0,grp,x,y,z,w
0,1,6.0,4.0,3.0,10
1,1,4.0,6.0,5.0,10
2,1,2.0,8.0,7.0,10
3,3,,,,11


In [38]:
# semi-join: rows of df whose grp value occurs in df2 (row labels are kept)
df[df['grp'].isin(df2['grp'])]

Unnamed: 0,grp,x,y,z
a,1,6,4,3.0
c,1,4,6,5.0
e,1,2,8,7.0


In [39]:
# anti-join: rows of df whose grp value does not occur in df2
df[~df['grp'].isin(df2['grp'])]

Unnamed: 0,grp,x,y,z
b,2,5,5,4.0
d,2,3,7,6.0
f,2,1,9,
