In [3]:
import pandas as pd
import numpy as np

In [8]:
# Left-hand table for the merge examples: one `lval` per key.
left = pd.DataFrame(
    dict(
        key=['foo', 'bar'],
        lval=[1, 2],
    )
)
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2


In [7]:
# Right-hand table for the merge examples: one `rval` per key.
right = pd.DataFrame(
    dict(
        key=['foo', 'bar'],
        rval=[4, 5],
    )
)
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5


In [6]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


In [12]:
# how='outer' keeps every key present in either frame. `left` and `right`
# hold the same keys ('foo', 'bar'), so this returns the same set of rows
# as the default inner merge.
pd.merge(left, right, on='key', how='outer')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,bar,2,5


# Grouping

- splitting the data into groups based on some criteria
- applying a function to each group independently
- combining the results into a data structure

In [14]:
# Seed a local generator so the random columns -- and every grouped result
# derived from them -- come out identical on each Restart & Run All.
rng = np.random.default_rng(42)

df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': rng.standard_normal(8),
    'D': rng.standard_normal(8),
})

In [16]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-1.170732,1.442065
1,bar,one,-0.000726,0.089222
2,foo,two,0.334868,-0.515416
3,bar,three,-0.550426,-0.507564
4,foo,two,0.123697,0.275085
5,bar,two,-0.834651,-0.717183
6,foo,one,-0.314462,1.376682
7,foo,three,-0.836761,-0.678199


In [15]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.000726,0.089222
bar,three,-0.550426,-0.507564
bar,two,-0.834651,-0.717183
foo,one,-1.485193,2.818747
foo,three,-0.836761,-0.678199
foo,two,0.458565,-0.240332


In [18]:
# Name the reductions with string aliases: handing np.sum / np.max to agg()
# raises a FutureWarning on pandas >= 2.1, which recommends the strings.
df.groupby('A').agg({'C': 'sum', 'D': 'max'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-1.385803,0.089222
foo,-1.86339,1.442065


# Stack

The `stack()` method "compresses" a level in the DataFrame's columns. Let's see now what it means.

Firstly, we are going to create the data-frame to work with.

In [28]:
# Pair each outer label with an inner label; the pairs become the rows of
# the MultiIndex built from `tuples`.
outer_labels = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
inner_labels = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
tuples = list(zip(outer_labels, inner_labels))

tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [34]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'seconds'])

In [39]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'seconds'])

In [35]:
# Seeded generator so the values shown here (and everything stacked or
# unstacked from them) can be reproduced on a fresh kernel.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((8, 2)), index=index, columns=['A', 'B'])

In [40]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,seconds,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,2.264679,2.186907
bar,two,0.433804,0.531699
baz,one,0.36834,0.327758
baz,two,1.060215,0.182378
foo,one,-0.698839,-1.984382
foo,two,0.07008,-0.720712
qux,one,0.061207,-0.193692
qux,two,0.918153,-1.351546


In [36]:
df2 = df[:4]

In [41]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,seconds,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,2.264679,2.186907
bar,two,0.433804,0.531699
baz,one,0.36834,0.327758
baz,two,1.060215,0.182378


In [37]:
stacked = df2.stack()

In [38]:
stacked

first  seconds   
bar    one      A    2.264679
                B    2.186907
       two      A    0.433804
                B    0.531699
baz    one      A    0.368340
                B    0.327758
       two      A    1.060215
                B    0.182378
dtype: float64

In [42]:
type(stacked)

pandas.core.series.Series

In [47]:
# With no argument, unstack() pivots the innermost index level (here the
# former column labels A/B) back into columns, restoring the layout of df2.
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,seconds,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,2.264679,2.186907
bar,two,0.433804,0.531699
baz,one,0.36834,0.327758
baz,two,1.060215,0.182378


In [48]:
# unstack(1) pivots index level 1 ('seconds') into columns instead, so
# 'one'/'two' become the column labels while A/B stay in the row index.
stacked.unstack(1)

Unnamed: 0_level_0,seconds,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,2.264679,0.433804
bar,B,2.186907,0.531699
baz,A,0.36834,1.060215
baz,B,0.327758,0.182378


In [49]:
# Level 2 is the innermost level of `stacked` (the A/B labels), so this is
# the same as calling unstack() with no argument.
stacked.unstack(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,seconds,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,2.264679,2.186907
bar,two,0.433804,0.531699
baz,one,0.36834,0.327758
baz,two,1.060215,0.182378


# Pivot Tables

In [50]:
# Seeded generator keeps D and E -- and the pivot table built from them --
# reproducible across kernel restarts.
rng = np.random.default_rng(42)

df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': rng.standard_normal(12),
    'E': rng.standard_normal(12),
})

In [51]:
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.077173,0.131909
1,one,B,foo,1.32594,-0.859668
2,two,C,foo,-0.424493,-0.381147
3,three,A,bar,-0.205724,-0.925148
4,one,B,bar,0.00078,-1.09653
5,one,C,bar,1.622929,-1.949259
6,two,A,foo,0.812803,-0.385647
7,three,B,foo,0.087974,-1.798565
8,one,C,foo,-0.550275,-0.421799
9,one,A,bar,3.606069,0.540411


In [52]:
# aggfunc is left at its default ('mean'), so each cell is the mean of D for
# that (A, B) row and C column; combinations absent from df come out as NaN.
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,3.606069,-0.077173
one,B,0.00078,1.32594
one,C,1.622929,-0.550275
three,A,-0.205724,
three,B,,0.087974
three,C,-0.667848,
two,A,,0.812803
two,B,0.655862,
two,C,,-0.424493


Sometimes, pre-built Pandas functions are not enough, and therefore, it's important to know how to apply your own functions to Pandas objects.

To apply your own library's functions, or another library's functions, to Pandas objects, you should be aware of the methods below. The appropriate method to use depends on whether your function expects to operate on an entire `DataFrame` or `Series`, or row- or column-wise.

# Tablewise Function Application

In [54]:
def extract_city_name(df):
  """
  Derive a ``city_name`` column from ``city_and_code``.

  Each ``city_and_code`` value is split on "," and the first piece is kept,
  e.g. "Chicago, IL" -> "Chicago".

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a string column named ``city_and_code``.

  Returns
  -------
  pandas.DataFrame
      A new frame carrying the extra ``city_name`` column. ``df`` itself is
      left untouched, so the function can be re-run and used in ``pipe()``
      chains without altering its input.
  """
  city_name = df['city_and_code'].str.split(",").str.get(0)
  # assign() returns a copy rather than writing into the caller's frame.
  return df.assign(city_name=city_name)

In [62]:
def add_country_name(df, country_name=None):
  """
  Build a ``city_and_country`` column as "<city_name>-<country_name>".

  Example: city_name "Chicago" with country_name "US" -> "Chicago-US".

  Parameters
  ----------
  df : pandas.DataFrame
      Frame with a string column named ``city_name``.
  country_name : str
      Suffix appended after the "-" separator. The ``None`` default exists
      only so the argument can be passed by keyword; a value is required.

  Returns
  -------
  pandas.DataFrame
      A new frame carrying the extra ``city_and_country`` column; ``df``
      itself is not modified.

  Raises
  ------
  TypeError
      If ``country_name`` is left as ``None``.
  """
  if country_name is None:
    # Fail with a clear message instead of an opaque str + None error
    # raised from inside the concatenation.
    raise TypeError("add_country_name() requires country_name, e.g. country_name='US'")
  city_and_country = df['city_name'] + "-" + country_name
  # assign() returns a copy rather than writing into the caller's frame.
  return df.assign(city_and_country=city_and_country)


In [63]:
df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']})

In [64]:
df_p

Unnamed: 0,city_and_code
0,"Chicago, IL"


In [65]:
add_country_name(extract_city_name(df_p), country_name='US')

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago-US


Pandas encourages us to use `pipe()` for the problem above, which is known as 'method chaining'. `pipe` makes it easy to use your own or another library's functions in method chains, alongside Pandas' methods. Compare the first approach with the following:

In [66]:
(df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US"))

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,Chicago-US


# Row or column-wise function application

Arbitrary functions can be applied along the axes of a `DataFrame` using the `apply()` method, which, like the descriptive statistics methods, takes an optional axis argument.

In [67]:
# Seeded generator so the values below are reproducible on a fresh kernel.
rng = np.random.default_rng(42)

# The three Series deliberately cover different index labels; the frame is
# aligned on their union, leaving NaN where a Series has no value.
df = pd.DataFrame({
    'one': pd.Series(rng.standard_normal(3), index=['a', 'b', 'c']),
    'two': pd.Series(rng.standard_normal(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(rng.standard_normal(3), index=['b', 'c', 'd']),
})

In [68]:
df

Unnamed: 0,one,two,three
a,-1.140556,0.868615,
b,2.384543,-0.546408,0.202538
c,-1.16787,0.15567,0.323198
d,,0.890579,0.244664


In [70]:
df.apply(np.mean)

one      0.025372
two      0.342114
three    0.256800
dtype: float64

In [71]:
df.apply(np.mean, axis=1)

a   -0.135970
b    0.680224
c   -0.229667
d    0.567621
dtype: float64

In [72]:
df.apply(lambda x: x.max() - x.min())

one      3.552413
two      1.436987
three    0.120660
dtype: float64

You can use `apply()` to apply your own function.

In [74]:
def own_function(x):
  """Return ``x`` multiplied by itself (element-wise for pandas objects)."""
  squared = x * x
  return squared

In [75]:
df.apply(own_function)

Unnamed: 0,one,two,three
a,1.300868,0.754493,
b,5.686043,0.298562,0.041022
c,1.36392,0.024233,0.104457
d,,0.793131,0.05986


You may also pass additional arguments and keyword arguments to the `apply()` method. For instance, consider the following function you would like to apply.

In [77]:
def subtract_and_divide(x, sub, divide=1):
  """Subtract ``sub`` from ``x``, then divide the result by ``divide`` (default 1)."""
  shifted = x - sub
  return shifted / divide

In [None]:
# `sub` travels positionally through `args`; `divide` is forwarded by keyword.
df.apply(subtract_and_divide, args=(5,), divide=3)

Unnamed: 0,one,two,three
a,-2.046852,-1.377128,
b,-0.871819,-1.848803,-1.599154
c,-2.055957,-1.614777,-1.558934
d,,-1.369807,-1.585112


`args` has to be an iterable (pandas documents it as a tuple). Therefore, even if you pass only one argument, you have to wrap it in a one-element tuple: `args=(5,)`

In [87]:
def subtract(x, sub):
  """Return ``x`` with ``sub`` subtracted from it."""
  difference = x - sub
  return difference

In [88]:
df.apply(subtract, args=(5,))

Unnamed: 0,one,two,three
a,-6.140556,-4.131385,
b,-2.615457,-5.546408,-4.797462
c,-6.16787,-4.84433,-4.676802
d,,-4.109421,-4.755336
