## Combining Datasets

In [1]:
import pandas as pd

## Merging Datasets

In [4]:
# 'col1' is deliberately present in both dataframes: it is the column the two are merged on below.
left = pd.DataFrame({
    'col1': [1, 2, 3, 4, 5],
    'col2': ['a', 'b', 'c', 'd', 'e'],
})
right = pd.DataFrame({
    'col1': [3, 4, 5, 6, 7],
    'col3': ['f', 'g', 'h', 'i', 'j'],
})

# Show both inputs, one after the other, before merging.
print(left, right, sep='\n')

   col1 col2
0     1    a
1     2    b
2     3    c
3     4    d
4     5    e
   col1 col3
0     3    f
1     4    g
2     5    h
3     6    i
4     7    j


In [7]:
# The left= and right= keywords are optional (the frames can be passed positionally) but are kept here for clarity.
# on= names the column to join on.
# how= is the join type; 'inner' is the default when it is omitted.

# Use an inner join when we want only the rows whose col1 value appears in both dataframes (the intersection).
pd.merge(left=left,right=right,on='col1',how = 'inner')

Unnamed: 0,col1,col2,col3
0,3,c,f
1,4,d,g
2,5,e,h


In [8]:
# Use a left join when we want to keep every col1 value from the left dataframe;
# rows with no match in the right dataframe get a missing value in col3.
pd.merge(left=left,right=right,on='col1',how = 'left')

Unnamed: 0,col1,col2,col3
0,1,a,
1,2,b,
2,3,c,f
3,4,d,g
4,5,e,h


In [9]:
# Use a right join when we want to keep every col1 value from the right dataframe;
# rows with no match in the left dataframe get a missing value in col2.
pd.merge(left=left,right=right,on='col1',how = 'right')

Unnamed: 0,col1,col2,col3
0,3,c,f
1,4,d,g
2,5,e,h
3,6,,i
4,7,,j


In [10]:
# Use an outer join when we want every col1 value from both the left and the right dataframe (the union);
# whichever side has no match is filled with a missing value.
pd.merge(left=left,right=right,on='col1',how = 'outer')

Unnamed: 0,col1,col2,col3
0,1,a,
1,2,b,
2,3,c,f
3,4,d,g
4,5,e,h
5,6,,i
6,7,,j


## Concat

In [11]:
# First batch of accounts; it has the same three columns as df2 so the two can be concatenated.
df1 = pd.DataFrame(
    {
        'account_id': [1, 2, 3, 11382],
        'gender': ['female', 'male', 'female', 'male'],
        'age': [55, 25, 29, 39],
    }
)

df1

Unnamed: 0,account_id,gender,age
0,1,female,55
1,2,male,25
2,3,female,29
3,11382,male,39


In [12]:
# Second batch of accounts; it has the same three columns as df1 so the two can be concatenated.
df2 = pd.DataFrame(
    {
        'account_id': [4, 5, 6, 7],
        'gender': ['female', 'male', 'female', 'male'],
        'age': [19, 28, 14, 15],
    }
)

df2

Unnamed: 0,account_id,gender,age
0,4,female,19
1,5,male,28
2,6,female,14
3,7,male,15


In [14]:
# axis defaults to 0, so concat stacks the frames vertically: df2's rows are appended below df1's (it does not join on the index).
# Each frame keeps its own index labels, which is why the index reads 0-3 and then 0-3 again.
# Pass axis=1 to place the frames side by side instead.
pd.concat([df1,df2])

Unnamed: 0,account_id,gender,age
0,1,female,55
1,2,male,25
2,3,female,29
3,11382,male,39
0,4,female,19
1,5,male,28
2,6,female,14
3,7,male,15


In [15]:
# ignore_index=True drops each frame's own index labels and renumbers the combined rows 0..7,
# which removes the repeated 0-3 labels.
pd.concat([df1,df2],ignore_index=True)

Unnamed: 0,account_id,gender,age
0,1,female,55
1,2,male,25
2,3,female,29
3,11382,male,39
4,4,female,19
5,5,male,28
6,6,female,14
7,7,male,15


## Hero dataset merge and concat using pkl files

In [17]:
# Load the three pickled hero dataframes (paths are relative to the current working directory).
# NOTE(review): read_pickle unpickles the file, and unpickling can execute arbitrary code --
# only load .pkl files that come from a trusted source.
hero_powers=pd.read_pickle('hero_powers.pkl')
hero_dc = pd.read_pickle('hero_dc.pkl')
hero_marvel = pd.read_pickle('hero_marvel.pkl')

#### Since hero_dc and hero_marvel have the same columns, we will use concat to stack them.

In [18]:
# Peek at the first row to see which columns the DC frame has.
hero_dc.head(1)

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Alignment,Weight
0,Abin Sur,Male,blue,Ungaran,No Hair,185,DC Comics,good,90.0


In [19]:
# Peek at the first row to see which columns the Marvel frame has (compare with the DC frame).
hero_marvel.head(1)

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Alignment,Weight
0,A-Bomb,Male,yellow,Human,No Hair,203,Marvel Comics,good,441


In [21]:
# (rows, columns) of the DC frame.
hero_dc.shape

(215, 9)

In [22]:
# (rows, columns) of the Marvel frame.
hero_marvel.shape

(388, 9)

In [23]:
# Expected total number of rows after concat: DC rows + Marvel rows (the column count stays at 9).
215+388

603

In [24]:
# Stack the DC and Marvel rows into a single frame; ignore_index=True renumbers the rows
# instead of repeating each frame's own index labels.
hero_info = pd.concat([hero_dc,hero_marvel],ignore_index=True)

In [25]:
# Check the row count, dtypes and non-null counts of the concatenated frame.
hero_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 603 entries, 0 to 602
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   name        603 non-null    string  
 1   Gender      603 non-null    category
 2   Eye color   603 non-null    string  
 3   Race        603 non-null    string  
 4   Hair color  603 non-null    string  
 5   Height      603 non-null    int64   
 6   Publisher   603 non-null    string  
 7   Alignment   603 non-null    category
 8   Weight      532 non-null    float64 
dtypes: category(2), float64(1), int64(1), string(5)
memory usage: 34.6 KB


In [27]:
# Sanity check: the per-publisher counts should equal the row counts of the two source frames (215 DC, 388 Marvel).
hero_info['Publisher'].value_counts()

Publisher
Marvel Comics    388
DC Comics        215
Name: count, dtype: Int64

#### Since hero_info and hero_powers do not have the same columns, we will merge them instead, matching the `name` column of hero_info to the `hero_names` column of hero_powers

In [28]:
# Peek at hero_info to find its key column ('name').
hero_info.head(1)

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Alignment,Weight
0,Abin Sur,Male,blue,Ungaran,No Hair,185,DC Comics,good,90.0


In [30]:
# Peek at hero_powers to find its key column ('hero_names').
hero_powers.head(1)

Unnamed: 0,hero_names,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,Energy Absorption,Flight,...,Web Creation,Reality Warping,Odin Force,Symbiote Costume,Speed Force,Phoenix Force,Molecular Dissipation,Vision - Cryo,Omnipresent,Omniscient
0,3-D Man,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# The join key has a different name in each frame ('name' in hero_info, 'hero_names' in hero_powers),
# so we use left_on / right_on instead of on. Both key columns are kept in the result.
# how='inner' keeps only the heroes that appear in both frames.
hero = pd.merge(hero_info,hero_powers,left_on = 'name', right_on = 'hero_names', how = 'inner' )

In [33]:
# Peek at the merged frame: the hero_info columns come first, followed by the hero_powers columns.
hero.head(1)

Unnamed: 0,name,Gender,Eye color,Race,Hair color,Height,Publisher,Alignment,Weight,hero_names,...,Web Creation,Reality Warping,Odin Force,Symbiote Costume,Speed Force,Phoenix Force,Molecular Dissipation,Vision - Cryo,Omnipresent,Omniscient
0,Abin Sur,Male,blue,Ungaran,No Hair,185,DC Comics,good,90.0,Abin Sur,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# (rows, columns) of the merged frame; compare the row count with hero_info's 603 rows to see the effect of the inner join.
hero.shape

(537, 177)

In [35]:
# Save the merged frame to hero.pkl (path is relative to the current working directory).
hero.to_pickle('hero.pkl')