# Merging DataFrames

In [2]:
import pandas as pd

# Staff: one row per employee, indexed by name.
staff_df = pd.DataFrame(
    [('Kelly', 'Director of HR'),
     ('Sally', 'Course liasion'),
     ('James', 'Grader')],
    columns=['Name', 'Role'],
).set_index('Name')

# Students: one row per student, indexed by name.
student_df = pd.DataFrame(
    [('James', 'Busiess'),
     ('Mike', 'Law'),
     ('Sally', 'Engineering')],
    columns=['Name', 'School'],
).set_index('Name')

# Print both tables with blank lines between them.
print(staff_df.head(), '\n', student_df.head(), sep='\n')

                 Role
Name                 
Kelly  Director of HR
Sally  Course liasion
James          Grader


            School
Name              
James      Busiess
Mike           Law
Sally  Engineering


In [3]:
# Outer join on the shared 'Name' index: keep everyone found in either table
staff_df.merge(student_df, how='outer', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Busiess
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


In [4]:
# Inner join on the 'Name' index: only people present in both tables
staff_df.merge(student_df, how='inner', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liasion,Engineering
James,Grader,Busiess


In [5]:
# Left join on the 'Name' index: every staff member, with School where one exists
staff_df.merge(student_df, how='left', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course liasion,Engineering
James,Grader,Busiess


In [6]:
# Right join on the 'Name' index: every student, with Role where one exists
staff_df.merge(student_df, how='right', left_index=True, right_index=True)

Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Busiess
Mike,,Law
Sally,Course liasion,Engineering


In [7]:
# Joining on a column instead of on the indices

# Move 'Name' out of the index and back into an ordinary column.
# NOTE: both names are rebound here, so re-running this cell on its own
# calls reset_index() again on the already-reset frames.
staff_df = staff_df.reset_index()
student_df = student_df.reset_index()

# Right join keyed on the 'Name' column
staff_df.merge(student_df, how='right', on='Name')

Unnamed: 0,Name,Role,School
0,James,Grader,Busiess
1,Mike,,Law
2,Sally,Course liasion,Engineering


In [8]:
# Conflicting columns: both frames carry their own 'Location'.

staff_df = pd.DataFrame(
    [('Kelly', 'Director of HR', 'State Street'),
     ('Sally', 'Course liasion', 'Washington Avenue'),
     ('James', 'Grades', 'Washington Avenue')],
    columns=['Name', 'Role', 'Location'],
)

student_df = pd.DataFrame(
    [('James', 'Business', '1024 Billiard Avenue'),
     ('Mike', 'Law', 'Fraternity House #22'),
     ('Sally', 'Engineering', '512 Wilson Crescent')],
    columns=['Name', 'School', 'Location'],
)

# Joining on 'Name' only keeps both 'Location' columns, disambiguated with
# the default suffixes: Location_x (staff) and Location_y (students).
staff_df.merge(student_df, how='left', on='Name')

Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Director of HR,State Street,,
1,Sally,Course liasion,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grades,Washington Avenue,Business,1024 Billiard Avenue


In [9]:
# Merging on multiple columns
#
# Two different people share the first name 'James' (Wilde on staff, Hammond
# as a student). Joining on 'First Name' alone would wrongly pair them, so the
# join key is the combination of first AND last name.

staff_df = pd.DataFrame([{'First Name' : 'Kelly', 'Last Name' : 'Desjardins',
                          'Role' : 'Director of HR'},
                         {'First Name' : 'Sally', 'Last Name' : 'Brooks',
                          'Role' : 'Course liasion'},
                         {'First Name' : 'James', 'Last Name' : 'Wilde',
                          'Role' : 'Grader'}])

# 'James' (previously misspelled 'Jammes') must match the staff member's first
# name for the example to show why the last name is needed as well.
student_df = pd.DataFrame([{'First Name' : 'James', 'Last Name' : 'Hammond',
                            'School' : 'Business'},
                           {'First Name' : 'Mike', 'Last Name' : 'Smith',
                            'School' : 'Law'},
                           {'First Name' : 'Sally', 'Last Name' : 'Brooks',
                            'School' : 'Engineering'}])

# Only Sally Brooks matches on both key columns.
pd.merge(staff_df, student_df, how = 'inner', on = ['First Name', 'Last Name'])

Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liasion,Engineering


In [10]:
# Merging combines frames 'horizontally' (adds columns); concatenating stacks them 'vertically' (adds rows)

In [11]:
%%capture
df_2011 = pd.read_csv('/Users/jonathansuarezcaceres/Downloads/1_Data Science/Intro to DS with Python/Course1_Resources/resources/week-3/datasets/college_scorecard/MERGED2011_12_PP.csv',
                     error_bad_lines = False)
df_2012 = pd.read_csv('/Users/jonathansuarezcaceres/Downloads/1_Data Science/Intro to DS with Python/Course1_Resources/resources/week-3/datasets/college_scorecard/MERGED2012_13_PP.csv',
                     error_bad_lines = False)
df_2013 = pd.read_csv('/Users/jonathansuarezcaceres/Downloads/1_Data Science/Intro to DS with Python/Course1_Resources/resources/week-3/datasets/college_scorecard/MERGED2013_14_PP.csv',
                     error_bad_lines = False)

In [12]:
# Preview the first five rows of the 2011 frame
df_2011.head(n=5)

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
1,size 309076268


In [14]:
# Row count of each yearly frame, one per line
print(len(df_2011), len(df_2012), len(df_2013), sep='\n')

2
2
2


In [16]:
# Stack the three yearly frames row-wise (axis=0 is concat's default)
frames = [df_2011, df_2012, df_2013]
pd.concat(frames, axis=0)

Unnamed: 0,version https://git-lfs.github.com/spec/v1
0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
1,size 309076268
0,oid sha256:55cfd6746fdcc1cb5a29350c5a39c742ae8...
1,size 157050855
0,oid sha256:dbef09960b9dd4392f144a05562af3639d8...
1,size 157811280


In [17]:
# Total number of rows across the three yearly frames
sum(map(len, (df_2011, df_2012, df_2013)))

6

In [18]:
# Same row-wise stack, with an outer index level naming each source frame
pd.concat(frames, axis=0, keys=['2011', '2012', '2013'])

Unnamed: 0,Unnamed: 1,version https://git-lfs.github.com/spec/v1
2011,0,oid sha256:889cb25b86e8bd07b2aa82b253e3fcd0d8c...
2011,1,size 309076268
2012,0,oid sha256:55cfd6746fdcc1cb5a29350c5a39c742ae8...
2012,1,size 157050855
2013,0,oid sha256:dbef09960b9dd4392f144a05562af3639d8...
2013,1,size 157811280
