In [1]:
import pandas as pd
import numpy as np

# what if we have duplicate indices

In [2]:
df1 = pd.DataFrame(np.arange(0, 9).reshape(3, 3),
                   index = [1,2,3],
                   columns = ['a','b','c'])
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3),
                   index = [1,2,3],
                   columns = ['a','b','c'])
print(df1)
print(df2)

   a  b  c
1  0  1  2
2  3  4  5
3  6  7  8
    a   b   c
1   9  10  11
2  12  13  14
3  15  16  17


In [17]:
pd.concat([df1, df2]) # pandas just do it and ignores the duplicated indices

Unnamed: 0,a,b,c
1,0,1,2
2,3,4,5
3,6,7,8
1,9,10,11
2,12,13,14
3,15,16,17


In [18]:
pd.concat([df1, df2], ignore_index = True) # reindexes 

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11
4,12,13,14
5,15,16,17


In [3]:
# specify the ladel of the source data
pd.concat([df1, df2], axis = 1, keys = ['df1', 'df2'])

Unnamed: 0_level_0,df1,df1,df1,df2,df2,df2
Unnamed: 0_level_1,a,b,c,a,b,c
1,0,1,2,9,10,11
2,3,4,5,12,13,14
3,6,7,8,15,16,17


In [4]:
# combine data that share some columns
df3 = pd.DataFrame(np.arange(27, 36).reshape(3, 3),
                  index = [4,5,6],
                  columns=['b','c','d'])
df3

Unnamed: 0,b,c,d
4,27,28,29
5,30,31,32
6,33,34,35


In [21]:
pd.concat([df1, df3])

Unnamed: 0,a,b,c,d
1,0.0,1,2,
2,3.0,4,5,
3,6.0,7,8,
4,,27,28,29.0
5,,30,31,32.0
6,,33,34,35.0


In [5]:
pd.concat([df1, df3], axis = 1)

Unnamed: 0,a,b,c,b.1,c.1,d
1,0.0,1.0,2.0,,,
2,3.0,4.0,5.0,,,
3,6.0,7.0,8.0,,,
4,,,,27.0,28.0,29.0
5,,,,30.0,31.0,32.0
6,,,,33.0,34.0,35.0


In [22]:
# only get the data that share the same columns
pd.concat([df1, df3], join = 'inner')

Unnamed: 0,b,c
1,1,2
2,4,5
3,7,8
4,27,28
5,30,31
6,33,34


In [23]:
# same as pd.concat([df1, df3])
pd.concat([df1, df3], join='outer')

Unnamed: 0,a,b,c,d
1,0.0,1,2,
2,3.0,4,5,
3,6.0,7,8,
4,,27,28,29.0
5,,30,31,32.0
6,,33,34,35.0


In [30]:
# change the indices of df3 to ['a','b','c']
# df3 doesn't have column 'a' so we fill zero in the new produced column 'a'

df3.reindex(columns = df1.columns, fill_value = 0)

Unnamed: 0,a,b,c
4,0,27,28
5,0,30,31
6,0,33,34


In [31]:
# now the indices of df1 and df3 are the same
pd.concat([df1, df3.reindex(columns = df1.columns)])

Unnamed: 0,a,b,c
1,0.0,1,2
2,3.0,4,5
3,6.0,7,8
4,,27,28
5,,30,31
6,,33,34


# Combine datasets: merge and join

Using pd.nerge() implements a number of types of joins
- one-to-one
- many-to one
- many-to-many

## one-to-one

In [6]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})

df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_date': [2004, 2008, 2012, 2014]})

display(df1, df2)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [8]:
# pandas automatically finds the same column 'employee' to merge
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


# many-to-one

In [7]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                    'supervisor': ['Carly', 'Guido', 'Steve']})
df4

Unnamed: 0,group,supervisor
0,Accounting,Carly
1,Engineering,Guido
2,HR,Steve


In [9]:
# based on column 'group'
pd.merge(df3, df4)

Unnamed: 0,employee,group,hire_date,supervisor
0,Bob,Accounting,2008,Carly
1,Jake,Engineering,2012,Guido
2,Lisa,Engineering,2004,Guido
3,Sue,HR,2014,Steve


# many-to-many

In [10]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting', 'Engineering', 'Engineerging', 'HR', 'HR'],
                    'skills': ['math', 'spreadsheet','coding', 'linux', 'organization', 'spreadsheet']})
df5

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,spreadsheet
2,Engineering,coding
3,Engineerging,linux
4,HR,organization
5,HR,spreadsheet


In [38]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [39]:
# based on column 'group'
pd.merge(df1, df5)

Unnamed: 0,employee,group,skills
0,Bob,Accounting,math
1,Bob,Accounting,spreadsheet
2,Jake,Engineering,coding
3,Lisa,Engineering,coding
4,Sue,HR,organization
5,Sue,HR,spreadsheet


# Combine two datasets with different column name but same values
- with left_join and right_join

In [110]:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary': [70000, 80000, 12000, 9000]})
df3

Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,12000
3,Sue,9000


In [41]:
display(df1, df3)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,12000
3,Sue,9000


In [43]:
pd.merge(df1, df3, left_on='employee', right_on='name')

# take df1's column 'employee'and df3's column 'name' since they are the same
# if there is no common column, code goes wrong

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,70000
1,Jake,Engineering,Jake,80000
2,Lisa,Engineering,Lisa,12000
3,Sue,HR,Sue,9000


In [44]:
# drop the duplicates columns, and we need to specify the axis
pd.merge(df1, df3, left_on = 'employee', right_on = 'name').drop('name', axis = 1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,12000
3,Sue,HR,9000


In [45]:
display(df1, df2)

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


Unnamed: 0,employee,hire_date
0,Lisa,2004
1,Bob,2008
2,Jake,2012
3,Sue,2014


In [102]:
# set specific column to be index

df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
display(df1a, df2a)

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


Unnamed: 0_level_0,hire_date
employee,Unnamed: 1_level_1
Lisa,2004
Bob,2008
Jake,2012
Sue,2014


In [103]:
# bonus: how to add a fucking new row
row = pd.DataFrame([{'employee':'Frank', 'group': 'HR'}]).set_index('employee')
df1a = pd.concat([df1a, row])
df1a

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR
Frank,HR


In [104]:
# and delete the row
df1a.drop(['Frank'], inplace = True)
df1a

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


## merge datasets based on index with left_index and right_index

In [107]:
# if df doesn't have index, we can use left_on and right_on
pd.merge(df1a, df2a, left_index = True, right_index = True)

Unnamed: 0_level_0,group,hire_date
employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Bob,Accounting,2008
Jake,Engineering,2012
Lisa,Engineering,2004
Sue,HR,2014


In [114]:
display(df1a, df3)

Unnamed: 0_level_0,group
employee,Unnamed: 1_level_1
Bob,Accounting
Jake,Engineering
Lisa,Engineering
Sue,HR


Unnamed: 0,name,salary
0,Bob,70000
1,Jake,80000
2,Lisa,12000
3,Sue,9000


In [115]:
# merge with mix of index and column 
pd.merge(df1a, df3, left_index = True, right_on = 'name')

Unnamed: 0,group,name,salary
0,Accounting,Bob,70000
1,Engineering,Jake,80000
2,Engineering,Lisa,12000
3,HR,Sue,9000


In [116]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
                    'food': ['fish', 'beans', 'bread']})

df7 = pd.DataFrame({'name': ['Mary', 'Joseph'], 
                    'drink': ['wine', 'beer']})
display(df6, df7)

Unnamed: 0,name,food
0,Peter,fish
1,Paul,beans
2,Mary,bread


Unnamed: 0,name,drink
0,Mary,wine
1,Joseph,beer


In [117]:
pd.merge(df6, df7)

# defualt is intersection, equals to pd.merge(df6, df7, how = 'inner')

Unnamed: 0,name,food,drink
0,Mary,bread,wine


In [118]:
pd.merge(df6, df7, how = 'outer')

Unnamed: 0,name,food,drink
0,Peter,fish,
1,Paul,beans,
2,Mary,bread,wine
3,Joseph,,beer


# Overlapping column names
- When we have contradicting values.

In [119]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [1, 2, 3, 4]})

df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'rank': [3, 1, 4, 2]})
display(df8, df9)

Unnamed: 0,name,rank
0,Bob,1
1,Jake,2
2,Lisa,3
3,Sue,4


Unnamed: 0,name,rank
0,Bob,3
1,Jake,1
2,Lisa,4
3,Sue,2


In [46]:
pd.merge(df8, df9, on = 'name')

# bacause the data is conflicting (same with different rank), pandas gives them suffixes

Unnamed: 0,name,rank_x,rank_y
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


In [47]:
# specify the suffixes
pd.merge(df8, df9, on = 'name', suffixes = ['_L', '_R'])

Unnamed: 0,name,rank_L,rank_R
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2


# Aggregation and Grouping

## groupby
 - input -> split -> apply -> output

In [3]:
# import planets data

import seaborn as sns
planets = sns.load_dataset('planets')
planets

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [4]:
# summary each column in the dataset
planets.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [8]:
df = pd.DataFrame({'key': ['a', 'b', 'c', 'a', 'b', 'c'],
                   'data': range(6)})
df

Unnamed: 0,key,data
0,a,0
1,b,1
2,c,2
3,a,3
4,b,4
5,c,5


In [44]:
# groupby returns a object, so we often combine it with other functions
# use .aggregate() or .get_group() to show the data
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001FE86677C70>

In [45]:
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
a,3
b,5
c,7


In [46]:

df.groupby('key').value_counts()

key  data
a    0       1
     3       1
b    1       1
     4       1
c    2       1
     5       1
dtype: int64

In [47]:
# we use 'key' to group by, so if we want to use get_group, we need to specify a key
df.groupby('key').get_group('b')

Unnamed: 0,key,data
1,b,1
4,b,4


## Column indexing

In [56]:
planets

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [59]:
# groupby base on column 'method', and describe each group in method
planets.groupby('method').describe()

Unnamed: 0_level_0,number,number,number,number,number,number,number,number,orbital_period,orbital_period,...,distance,distance,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Astrometry,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,631.18,...,19.3225,20.77,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,1.666667,0.5,1.0,1.0,2.0,2.0,2.0,9.0,4751.644444,...,500.0,500.0,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,1.315789,0.933035,1.0,1.0,1.0,1.0,4.0,12.0,118247.7375,...,132.6975,165.0,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,1.173913,0.387553,1.0,1.0,1.0,1.0,2.0,7.0,3153.571429,...,4747.5,7720.0,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,1.666667,0.57735,1.0,1.5,2.0,2.0,2.0,3.0,0.709307,...,1180.0,1180.0,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,2.2,1.095445,1.0,1.0,3.0,3.0,3.0,5.0,7343.021201,...,1200.0,1200.0,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,1170.0,...,,,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,1.721519,1.157141,1.0,1.0,1.0,2.0,6.0,553.0,823.35468,...,59.2175,354.0,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,1.95466,1.399119,1.0,1.0,1.0,2.0,7.0,397.0,21.102073,...,650.0,8500.0,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2.25,0.5,2.0,2.0,2.0,2.25,3.0,3.0,79.7835,...,1487.0,2119.0,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


## aggregate, filter, transform and apply

In [30]:
df = pd.DataFrame({'key': ['a', 'b', 'c', 'a', 'b', 'c'],
                   'data1': range(6),
                   'data2': np.random.randint(0, 10, 6)})
df

Unnamed: 0,key,data1,data2
0,a,0,2
1,b,1,3
2,c,2,6
3,a,3,9
4,b,4,6
5,c,5,8


In [31]:
# agg and aggregate seem to be the same thing
# show 6 columns: group 'a' (min, median, max) for data1 and data2
#                 group 'b' (min, median, max) for data1 and data2
#                 group 'c' (min, median, max) for data1 and data2

df.groupby('key').agg([min, np.median, max])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,0,1.5,3,2,5.5,9
b,1,2.5,4,3,4.5,6
c,2,3.5,5,6,7.0,8


In [32]:
# 'min' and min seem to be the same thing
# specify we only want data1's min and data2's max in each group

df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,9
b,1,6
c,2,8


In [33]:
# show the std of data1 and data2 in each group
df.groupby('key').std()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.12132,4.949747
b,2.12132,2.12132
c,2.12132,1.414214


In [42]:
def filter_fun(x):
    return x['data2'].std() > 2

df.groupby('key').filter(filter_fun).groupby(['key']).sum()

# actually I don't understand why we use groupby here
# but groupby indeed produces an iterable object so that .filter() can use
# It's stupid to type 2 times 'groupby'/ Is there any other clever way?

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3,11
b,5,9


## Transformation

In [62]:
# 這邊千萬不要 run!!!!!!! 我打得好辛苦
display(df, df.groupby('key').aggregate({'data1': 'min', 'data2': 'max'}))

#   data1's min  data2's max
# a 0            3
# b 1            5
# c 2            9

Unnamed: 0,key,data1,data2
0,a,0,0
1,b,1,5
2,c,2,2
3,a,3,3
4,b,4,0
5,c,5,9


Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,3
b,1,5
c,2,9


In [67]:
# transform can pass data into a function
df.groupby('key').transform(lambda x: x - x.mean())

#   data1   data2
# 0 0-1.5   0-1.5
# 1 1-2.5   5-2.5
# 2 2-3.5   2-5.5
# 3 3-1.5   3-1.5
# 4 4-2.5   0-2.5
# 5 5-3.5   9-5.5

Unnamed: 0,data1,data2
0,-1.5,-1.5
1,-1.5,2.5
2,-1.5,-3.5
3,1.5,1.5
4,1.5,-2.5
5,1.5,3.5


## Apply

In [64]:
df

Unnamed: 0,key,data1,data2
0,a,0,0
1,b,1,5
2,c,2,2
3,a,3,3
4,b,4,0
5,c,5,9


In [68]:
# I don't know the difference between apply and transform

def norm_by_data2(x):
    x['data1'] /= x['data2'].sum()
    return x

df.groupby('key').apply(norm_by_data2)

#   data1    data2
# 0 0/(0+3)  0
# 1 1/(5+0)  5
# 2 2/(2+9)  2
# 3 3/(0+3)  3
# 4 4/(5+0)  0
# 5 5/(2+9)  9

Unnamed: 0,key,data1,data2
0,a,0.0,0
1,b,0.2,5
2,c,0.181818,2
3,a,1.0,3
4,b,0.8,0
5,c,0.454545,9


In [73]:
#     [0, 1, 2, 3, 4, 5] 
lst = [0, 1, 0, 1, 2, 0] # corresponding to the above indices of df
display(df, df.groupby(lst).sum())

# we provide keys, which can be a list or even a statement, to groupby and there will be 3 group(0, 1, 2)
# group 0: indices 0, 2, 5
# group 1: indices 1, 3
# group 2: indices 4

#   data1    data2
# 0 0+2+5    0+2+9
# 1 1+3      5+3
# 2 4        0

Unnamed: 0,key,data1,data2
0,a,0,0
1,b,1,5
2,c,2,2
3,a,3,3
4,b,4,0
5,c,5,9


Unnamed: 0,data1,data2
0,7,11
1,4,8
2,4,0


In [74]:
df2 = df.set_index('key')
df2

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0,0
b,1,5
c,2,2
a,3,3
b,4,0
c,5,9


In [75]:
mapping = {'a': 'vowel', 'b': 'consonant', 'c': 'consonant'}
df2.groupby(mapping).sum()

#                  data1    data2
# consonant(b, c)  1+2+4+5  5+2+0+9
# vowel(a)         0+3      0+3

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
consonant,12,16
vowel,3,3
