# Pandas Merge and Group By

In [4]:
import pandas as pd
import numpy as np

## Concat

In [5]:
# Seed NumPy's global RNG so that Restart Kernel -> Run All reproduces the same
# random values here and in the later cells that also draw from np.random.
np.random.seed(42)

# 10 rows x 4 columns of standard-normal draws; row and column labels default to 0..n-1.
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,-0.281476,0.940803,1.822935,0.491976
1,1.663102,0.153556,0.609241,-0.692249
2,-1.229036,0.279317,-0.983856,0.713002
3,-0.271544,-0.622131,-0.218752,-1.094215
4,0.684614,-0.388993,0.145258,0.378499
5,-0.398107,0.305635,-0.947333,-0.12178
6,1.372599,0.098666,1.153004,-0.133284
7,-0.132147,0.919066,0.33612,-0.876079
8,0.645154,-2.171154,1.450882,1.10055
9,0.841043,1.168713,-0.438416,-0.231943


In [6]:
# Cut the frame into three consecutive row chunks: rows 0-2, rows 3-6, and row 7 onward.
row_bounds = [(0, 3), (3, 7), (7, None)]
pieces = [df.iloc[start:stop] for start, stop in row_bounds]
pieces

[          0         1         2         3
 0 -0.281476  0.940803  1.822935  0.491976
 1  1.663102  0.153556  0.609241 -0.692249
 2 -1.229036  0.279317 -0.983856  0.713002,
           0         1         2         3
 3 -0.271544 -0.622131 -0.218752 -1.094215
 4  0.684614 -0.388993  0.145258  0.378499
 5 -0.398107  0.305635 -0.947333 -0.121780
 6  1.372599  0.098666  1.153004 -0.133284,
           0         1         2         3
 7 -0.132147  0.919066  0.336120 -0.876079
 8  0.645154 -2.171154  1.450882  1.100550
 9  0.841043  1.168713 -0.438416 -0.231943]

In [7]:
# Glue the chunks back together row-wise (axis=0 is the default, spelled out here).
pd.concat(pieces, axis=0)

Unnamed: 0,0,1,2,3
0,-0.281476,0.940803,1.822935,0.491976
1,1.663102,0.153556,0.609241,-0.692249
2,-1.229036,0.279317,-0.983856,0.713002
3,-0.271544,-0.622131,-0.218752,-1.094215
4,0.684614,-0.388993,0.145258,0.378499
5,-0.398107,0.305635,-0.947333,-0.12178
6,1.372599,0.098666,1.153004,-0.133284
7,-0.132147,0.919066,0.33612,-0.876079
8,0.645154,-2.171154,1.450882,1.10055
9,0.841043,1.168713,-0.438416,-0.231943


## Merge

In [17]:
# Left-hand table for the merge demo, built from (key, lval) records.
left = pd.DataFrame(
    [('foo', 1), ('bar', 2), ('qux', 4)],
    columns=['key', 'lval'],
)
left

Unnamed: 0,key,lval
0,foo,1
1,bar,2
2,qux,4


In [15]:
# Right-hand table for the merge demo, built from (key, rval) records.
# Note that 'foo' appears twice.
right = pd.DataFrame(
    [('foo', 4), ('bar', 5), ('foo', 6), ('baz', 7)],
    columns=['key', 'rval'],
)
right

Unnamed: 0,key,rval
0,foo,4
1,bar,5
2,foo,6
3,baz,7


In [20]:
# Left join on 'key': every row of `left` is kept, and keys with no match in
# `right` get NaN in the right-hand columns.
left.merge(right, how='left', on='key')


Unnamed: 0,key,lval,rval
0,foo,1,4.0
1,foo,1,6.0
2,bar,2,5.0
3,qux,4,


## Grouping

- Splitting the data into groups based on some criteria

- Applying a function to each group independently

- Combining the results into a data structure

In [21]:
# Toy frame for the groupby demos: two label columns (A, B) and two
# standard-normal value columns (C, D).
group_labels = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
sub_labels = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']

df = pd.DataFrame({
    'A': group_labels,
    'B': sub_labels,
    'C': np.random.randn(8),  # C is drawn before D
    'D': np.random.randn(8),
})
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.100435,0.489031
1,bar,one,-0.130215,0.772347
2,foo,two,-0.341784,-0.477096
3,bar,three,0.570021,0.052839
4,foo,two,2.413235,-0.096813
5,bar,two,-0.199488,-1.538264
6,foo,one,-2.010569,-1.202398
7,foo,three,-0.376203,0.265906


In [27]:
# Sum the numeric columns for each value of A. C and D are selected explicitly
# because, since pandas 2.0, GroupBy.sum() no longer drops non-numeric columns
# by default: without the selection the string column B would be concatenated
# per group instead of being left out.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.240318,-0.713077
foo,-0.415756,-1.02137


In [28]:
# Group on both label columns at once and sum the remaining columns per (A, B) pair.
df.groupby(by=['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.130215,0.772347
bar,three,0.570021,0.052839
bar,two,-0.199488,-1.538264
foo,one,-2.111004,-0.713367
foo,three,-0.376203,0.265906
foo,two,2.071451,-0.573909


In [29]:
# Per-column aggregation: total of C and maximum of D for each value of A.
# The aggregations are named by string rather than passed as np.sum / np.max:
# handing NumPy callables to agg() is deprecated in recent pandas (FutureWarning),
# and the string names select the same built-in groupby reductions.
df.groupby('A').agg({'C': 'sum', 'D': 'max'})

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.240318,0.772347
foo,-0.415756,0.489031


## Stack

In [31]:
# Two parallel label lists, paired element-wise into (first, second) index tuples.
first_labels = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_labels = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
tuples = list(zip(first_labels, second_labels))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

# 8 rows x 2 columns of standard-normal draws on the two-level index.
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.06862,-0.493816
bar,two,0.938605,0.107481
baz,one,0.569797,1.178731
baz,two,0.731166,0.482614
foo,one,0.262678,0.232581
foo,two,0.568244,-0.77552
qux,one,-1.522894,0.967612
qux,two,0.434904,1.136872


In [33]:
# Keep only the first four rows for the stack/unstack demos.
df2 = df.head(4)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.06862,-0.493816
bar,two,0.938605,0.107481
baz,one,0.569797,1.178731
baz,two,0.731166,0.482614


In [34]:
# Pivot the column labels into a new innermost index level (level=-1 is the default).
stacked = df2.stack(level=-1)
stacked

first  second   
bar    one     A    1.068620
               B   -0.493816
       two     A    0.938605
               B    0.107481
baz    one     A    0.569797
               B    1.178731
       two     A    0.731166
               B    0.482614
dtype: float64

In [35]:
# Inverse of stack(): move the innermost index level back out to the columns
# (level=-1 is the default).
stacked.unstack(level=-1)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.06862,-0.493816
bar,two,0.938605,0.107481
baz,one,0.569797,1.178731
baz,two,0.731166,0.482614


In [36]:
# Move index level 1, which is named 'second', out to the columns.
stacked.unstack(level='second')

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,1.06862,0.938605
bar,B,-0.493816,0.107481
baz,A,0.569797,0.731166
baz,B,1.178731,0.482614


In [37]:
# Move index level 0, which is named 'first', out to the columns.
stacked.unstack(level='first')

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,1.06862,0.569797
one,B,-0.493816,1.178731
two,A,0.938605,0.731166
two,B,0.107481,0.482614


## Pivot Tables

In [38]:
# Toy frame for the pivot-table demo: three label columns built by repeating
# short patterns out to 12 rows, plus two standard-normal value columns.
a_pattern = ['one', 'one', 'two', 'three']
b_pattern = ['A', 'B', 'C']
c_pattern = ['foo', 'foo', 'foo', 'bar', 'bar', 'bar']

df = pd.DataFrame({
    'A': [label for _ in range(3) for label in a_pattern],
    'B': [label for _ in range(4) for label in b_pattern],
    'C': [label for _ in range(2) for label in c_pattern],
    'D': np.random.randn(12),  # D is drawn before E
    'E': np.random.randn(12),
})
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1.27509,0.122414
1,one,B,foo,0.771148,2.005911
2,two,C,foo,0.316802,-1.280823
3,three,A,bar,0.347597,1.438667
4,one,B,bar,0.401804,0.438934
5,one,C,bar,-1.195305,-0.468312
6,two,A,foo,-0.315526,2.066043
7,three,B,foo,-0.677779,0.889513
8,one,C,foo,-1.41918,1.906604
9,one,A,bar,0.483037,0.681947


In [39]:
# Spread the values of D into one column per value of C, indexed by (A, B).
# (A, B, C) combinations that do not occur in the data show up as NaN.
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.483037,1.27509
one,B,0.401804,0.771148
one,C,-1.195305,-1.41918
three,A,0.347597,
three,B,,-0.677779
three,C,0.458248,
two,A,,-0.315526
two,B,-0.203457,
two,C,,0.316802
