# Combining and Merging Datasets

In [1]:
import numpy as np
import pandas as pd

In [2]:
# related data objects

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})

df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})


left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

A1 = df = pd.DataFrame({'A': [1, 5, 3, 77, 6],'B': [5, 4, 99, 5, 8]})

B1 = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],'col1': range(6),'col2': [5,0,3,3,7,9]},
                  columns = ['key', 'col1', 'col2'])


Q1
- a. print df1
- b. print df2
- c. merge both the datasets of df1 and df2
- d. merge both the datasets of df1 and df2 by inner
- e. merge both the datasets of df1 and df2 by outer

In [3]:
# a. 
print(df1)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   a      5
6   b      6


In [4]:
# b.
print(df2)

  key  data2
0   a      0
1   b      1
2   d      2


In [6]:
# c. 
pd.merge(df1, df2)


Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [7]:
# d.
pd.merge(df1, df2, how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [8]:
# e.
pd.merge(df1, df2, how='outer')

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


Q2
- a. merge both the datasets of df3 and df4
- b. merge both the datasets of df3 and df4 based on df3 columns
- c. merge both the datasets of df3 and df4 based on df4 columns

In [10]:
# a.
pd.merge(df3, df4, left_on = 'lkey', right_on = 'rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [13]:
# b.
pd.merge(df3, df4, how='left', left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1.0
1,b,1,b,1.0
2,a,2,a,0.0
3,c,3,,
4,a,4,a,0.0
5,a,5,a,0.0
6,b,6,b,1.0


In [14]:
# c.
pd.merge(df3, df4, how='right', left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,a,2.0,a,0
1,a,4.0,a,0
2,a,5.0,a,0
3,b,0.0,b,1
4,b,1.0,b,1
5,b,6.0,b,1
6,,,d,2


Q3
- a. merge the dataset of left and right by outer
- b. merge the dataset of left and right based on key1 column
- c. Suffix the conflicting column name of result output of question 3b as _left and _right

In [15]:
# a.
pd.merge(left, right, how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [17]:
# b.
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [19]:
#c.
pd.merge(left, right, on='key1', suffixes=['_left', '_right'])

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


Q4. 
- a. merge the dataset of left1 and right1
- b. merge the dataset of left1 and right1 by outer

In [21]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [22]:
pd.merge(left1, right1, how='outer', left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


Q5: Aggregation
- a. aggregates mean within each column on A1
- b. aggregates mean within each row on A1
- c. load the 'planets' dataset form seaborn package
- d. drop na on planets dataset, then describe that computes several common aggregates

In [15]:
print(A1)

    A   B
0   1   5
1   5   4
2   3  99
3  77   5
4   6   8


In [3]:
# a.
A1.agg(['mean'])

Unnamed: 0,A,B
mean,18.4,24.2


In [4]:
# b.
A1.agg(['mean'], axis=1)

Unnamed: 0,mean
0,3.0
1,4.5
2,51.0
3,41.0
4,7.0


In [10]:
# c. 
import seaborn as sns
planets = sns.load_dataset('planets')

In [13]:
# d.
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


Q6
- a. set 'number' column as the index of planets
- b. group by the sum of the new dataset by number 'key'
- c. group by the sum of 'orbital_period' in the new dataset by 'year' column

In [14]:
#a.

planets.set_index(['number'])

Unnamed: 0_level_0,method,orbital_period,mass,distance,year
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Radial Velocity,269.300000,7.10,77.40,2006
1,Radial Velocity,874.774000,2.21,56.95,2008
1,Radial Velocity,763.000000,2.60,19.84,2011
1,Radial Velocity,326.030000,19.40,110.62,2007
1,Radial Velocity,516.220000,10.50,119.47,2009
...,...,...,...,...,...
1,Transit,3.941507,,172.00,2006
1,Transit,2.615864,,148.00,2007
1,Transit,3.191524,,174.00,2007
1,Transit,4.125083,,293.00,2008


In [16]:
# b.

planets.groupby(['number']).sum()

Unnamed: 0_level_0,orbital_period,mass,distance,year
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1466106.0,1022.19753,149664.47,1195018
2,162147.3,265.3161,40715.28,520627
3,41657.18,47.67735,8309.82,176831
4,306063.8,12.8244,318.64,64245
5,5994.579,4.667,4867.65,60322
6,4090.624,0.694,4032.12,48254
7,834.5253,0.0,5460.0,14091


In [21]:
# c.

planets.groupby(['year']).sum(['orbital_period'])

Unnamed: 0_level_0,number,orbital_period,mass,distance
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1989,1,83.888,11.68,40.57
1992,6,91.8039,0.0,0.0
1994,3,98.2114,0.0,0.0
1995,1,4.230785,0.472,15.36
1996,15,2015.769933,7.9176,95.2
1997,1,39.845,1.04,17.43
1998,11,571.553306,13.4356,131.51
1999,24,8284.213789,45.313,464.21
2000,27,8655.12517,50.8622,492.04
2001,15,8823.6527,37.74,438.38


Q6: 
- a. print B1
- b. group by the sum of B1 by 'key'
- c. set a function filter_func that keep all groups in which sum of col2 > 7
- d. apply c on B1

In [22]:
# a.
print(B1)

  key  col1  col2
0   A     0     5
1   B     1     0
2   C     2     3
3   A     3     3
4   B     4     7
5   C     5     9


In [25]:
# b.
B1.groupby(['key']).sum()

Unnamed: 0_level_0,col1,col2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,8
B,5,7
C,7,12


In [29]:
# c.
def filter_fun(df):
    return df['col2'].sum()>7

In [33]:
# d.
B1.groupby('key').filter(filter_fun)

Unnamed: 0,key,col1,col2
0,A,0,5
2,C,2,3
3,A,3,3
5,C,5,9
