##### ATPA 2.4 - Relational Databases

In [2]:
# CHUNK 1: Data for joins examples
import pandas as pd

# Two small stock tables keyed on "Ticker". Their ticker sets only partly
# overlap (PCG is only in Table_1, F is only in Table_2), so each join type
# demonstrated afterwards returns a different set of rows.
Table_1 = pd.DataFrame(
    [
        ("TSLA", "Tesla, Inc."),
        ("AMZN", "Amazon.com, Inc."),
        ("WFC", "Wells Fargo & Company"),
        ("PCG", "PG&E Corporation"),
    ],
    columns=["Ticker", "Name"],
)
Table_2 = pd.DataFrame(
    [
        ("WFC", 49),
        ("TSLA", 840),
        ("F", 16),
        ("AMZN", 3400),
    ],
    columns=["Ticker", "Price"],
)
# Show both tables from one cell: display() renders Table_1 explicitly, and
# Table_2 is rendered because it is the cell's last expression.
display(Table_1)
Table_2

Unnamed: 0,Ticker,Name
0,TSLA,"Tesla, Inc."
1,AMZN,"Amazon.com, Inc."
2,WFC,Wells Fargo & Company
3,PCG,PG&E Corporation


Unnamed: 0,Ticker,Price
0,WFC,49
1,TSLA,840
2,F,16
3,AMZN,3400


In [3]:
# CHUNK 2: Left join
# Keep every row of Table_1 (the left table) and attach Price where the Ticker
# also exists in Table_2; tickers missing from Table_2 get a NaN Price.
pd.merge(Table_1, Table_2, how="left", on="Ticker")

Unnamed: 0,Ticker,Name,Price
0,TSLA,"Tesla, Inc.",840.0
1,AMZN,"Amazon.com, Inc.",3400.0
2,WFC,Wells Fargo & Company,49.0
3,PCG,PG&E Corporation,


In [4]:
# CHUNK 3: Right join
# Keep every row of Table_2 (the right table) and attach Name where the Ticker
# also exists in Table_1; tickers missing from Table_1 get a NaN Name.
pd.merge(Table_1, Table_2, how="right", on="Ticker")

Unnamed: 0,Ticker,Name,Price
0,WFC,Wells Fargo & Company,49
1,TSLA,"Tesla, Inc.",840
2,F,,16
3,AMZN,"Amazon.com, Inc.",3400


In [5]:
# CHUNK 4: Inner join
# Keep only the tickers that appear in both Table_1 and Table_2.
pd.merge(Table_1, Table_2, how="inner", on="Ticker")

Unnamed: 0,Ticker,Name,Price
0,TSLA,"Tesla, Inc.",840
1,AMZN,"Amazon.com, Inc.",3400
2,WFC,Wells Fargo & Company,49


In [6]:
# CHUNK 5: Outer or Full join
# Keep every ticker from either table; columns coming from the table that
# lacks a given ticker are filled with NaN.
pd.merge(Table_1, Table_2, how="outer", on="Ticker")

Unnamed: 0,Ticker,Name,Price
0,TSLA,"Tesla, Inc.",840.0
1,AMZN,"Amazon.com, Inc.",3400.0
2,WFC,Wells Fargo & Company,49.0
3,PCG,PG&E Corporation,
4,F,,16.0


In [7]:
# CHUNK 6: Joins with multiple keys
# A row is identified by the (Year, Month) pair, so both columns are passed
# as join keys.
Revenue = pd.DataFrame(
    {
        "Year": [2019, 2019, 2020, 2020],
        "Month": ["March", "September", "March", "September"],
        "Revenue": [8400, 9600, 9100, 10300],
    }
)
Employee = pd.DataFrame(
    {
        "Year": [2019, 2019, 2020, 2020],
        "Month": ["March", "September", "March", "September"],
        "Employees": [55, 56, 56, 60],
    }
)
pd.merge(Revenue, Employee, how="outer", on=["Year", "Month"])

Unnamed: 0,Year,Month,Revenue,Employees
0,2019,March,8400,55
1,2019,September,9600,56
2,2020,March,9100,56
3,2020,September,10300,60


In [9]:
# Contrast with the two-key join: matching on Year alone pairs every Revenue
# row with every Employee row of the same year, so the 4 rows become 8 and the
# non-key Month column is duplicated as Month_x / Month_y.
pd.merge(Revenue, Employee, how="outer", on="Year")

Unnamed: 0,Year,Month_x,Revenue,Month_y,Employees
0,2019,March,8400,March,55
1,2019,March,8400,September,56
2,2019,September,9600,March,55
3,2019,September,9600,September,56
4,2020,March,9100,March,56
5,2020,March,9100,September,60
6,2020,September,10300,March,56
7,2020,September,10300,September,60


In [10]:
# CHUNK 7: Joins with duplicate keys
# "Diet" repeats in Patient_Diet but is unique in Diet_Cost, so the left join
# acts as a lookup: each patient row receives the cost of its diet.
Patient_Diet = pd.DataFrame(
    {
        "Patient": ["A", "B", "C", "D"],
        "Diet": ["Traditional", "Custom", "Custom", "Traditional"],
    }
)
Diet_Cost = pd.DataFrame(
    {
        "Diet": ["Traditional", "Custom"],
        "Cost": [40, 65],
    }
)
pd.merge(Patient_Diet, Diet_Cost, how="left", on="Diet")


Unnamed: 0,Patient,Diet,Cost
0,A,Traditional,40
1,B,Custom,65
2,C,Custom,65
3,D,Traditional,40


In [None]:
# CHUNK 8: Combine columns
# Both tables have the same three rows (index 0-2); concatenating along the
# column axis places the "Week 3"/"Week 4" columns next to "Week 1"/"Week 2".
Table_3 = pd.DataFrame(
    {
        "Week 1": [2, 3, 6],
        "Week 2": [2, 3, 7],
    }
)
Table_4 = pd.DataFrame(
    {
        "Week 3": [3, 3, 6],
        "Week 4": [5, 2, 5],
    }
)
pd.concat([Table_3, Table_4], axis="columns")  # same as axis=1: side by side

In [None]:
# CHUNK 9: Combine rows
# Both tables share the same columns; concatenating along the row axis stacks
# Table_6 underneath Table_5. Each table's original row labels are kept.
Table_5 = pd.DataFrame(
    {
        "ID": ["A", "B", "C"],
        "Week 1": [2, 3, 6],
        "Week 2": [2, 3, 7],
    }
)
Table_6 = pd.DataFrame(
    {
        "ID": ["D", "E", "F"],
        "Week 1": [3, 3, 6],
        "Week 2": [5, 2, 5],
    }
)
pd.concat([Table_5, Table_6], axis="index")  # same as axis=0: stacked vertically

In [14]:
# Preview the exercise tables: row count plus the first two rows of each.
#
# The CSVs are read in this cell so that it runs on a fresh kernel
# (Restart & Run All) instead of depending on frames that are only created
# by a cell further down the notebook.
movie_boxoffice = pd.read_csv("Data/movie_boxoffice.csv")
economy = pd.read_csv("Data/economy.csv")
movie_details = pd.read_csv("Data/movie_details.csv")
rotten_rutabagas = pd.read_csv("Data/rotten_rutabagas.csv")
movie_economy = pd.read_csv("Data/movie_economy.csv")

# One loop instead of five copy-pasted print/display pairs; the printed labels
# and the order of the tables are unchanged.
for table_name, table in [
    ("movie_boxoffice", movie_boxoffice),
    ("economy", economy),
    ("movie_details", movie_details),
    ("rotten_rutabagas", rotten_rutabagas),
    ("movie_economy", movie_economy),
]:
    print(table_name, table.shape[0])
    display(table.head(2))

movie_boxoffice 14


Unnamed: 0,movie,boxoffice
0,The Force,944
1,Revengers,853


economy 5


Unnamed: 0,year,SP500
0,2015,2028.18
1,2016,1918.6


movie_details 11


Unnamed: 0,movie,MPAA_rating,year
0,Bad Kids,R,2020
1,Leech,R,2019


rotten_rutabagas 16


Unnamed: 0,rating,movie
0,100,Leech
1,100,Avian Woman


movie_economy 9


Unnamed: 0,year,USTotalBoxOffice
0,2020,11.738
1,2019,11.259


In [21]:
# CHUNK 10: Exercise 2.4.1: Construct the indicated joins
#
# Every join result is stored under its own name and displayed. A bare
# expression in the middle of a cell is evaluated and then thrown away, so
# only the last expression of a cell is rendered automatically.
# `how` and `on` are passed by keyword so each join type is explicit.

movie_boxoffice = pd.read_csv("Data/movie_boxoffice.csv")
economy = pd.read_csv("Data/economy.csv")
movie_details = pd.read_csv("Data/movie_details.csv")
rotten_rutabagas = pd.read_csv("Data/rotten_rutabagas.csv")
movie_economy = pd.read_csv("Data/movie_economy.csv")

# 1 Combine US economy and movie economy
economy_combined = economy.merge(movie_economy, how="inner", on="year")
display(economy_combined)

# 2 Combine MPAA rating and Rotten Rutabagas score
ratings_combined = rotten_rutabagas.merge(movie_details, how="inner", on="movie")
display(ratings_combined)

# 3 Add movie details to box office results
boxoffice_details = movie_boxoffice.merge(movie_details, how="left", on="movie")
display(boxoffice_details)

# 4 Add all movies in box office, details and Rotten Rutabagas together into all_movies
all_movies = (
    movie_boxoffice
    .merge(movie_details, how="outer", on="movie")
    .merge(rotten_rutabagas, how="outer", on="movie")
)
display(all_movies)

# 5 Add US economy and movie economy into all_movies
all_movies_economy = (
    all_movies
    .merge(economy, how="left", on="year")
    .merge(movie_economy, how="left", on="year")
)
all_movies_economy

Unnamed: 0,movie,boxoffice,MPAA_rating,year,rating,SP500,USTotalBoxOffice
0,The Force,944.0,PG-13,2015.0,,2028.18,11.065
1,Revengers,853.0,PG-13,2019.0,96.0,2607.39,11.259
2,Revengers,616.0,PG-13,2019.0,96.0,2607.39,11.259
3,Icon,760.0,,,,,
4,Dark Puma,691.0,PG-13,2018.0,98.0,2789.8,11.9
5,Iceberg,670.0,,,,,
6,Revengers: Beginning,668.0,,,,,
7,Paleolithic Park,664.0,,,,,
8,Amazon Girl,437.0,PG-13,2017.0,93.0,2275.12,11.019
9,Bad Kids,191.0,R,2020.0,79.0,,11.738


In [None]:
# CHUNK 11: Exercise 2.4.1: Solution
#
# The CSVs are read from the "Data/" folder, the same location the executed
# exercise cell reads them from. Intermediate results are shown with
# display(): a bare name in the middle of a cell renders nothing, because only
# a cell's last expression is displayed automatically.

movie_boxoffice = pd.read_csv("Data/movie_boxoffice.csv")
economy = pd.read_csv("Data/economy.csv")
movie_details = pd.read_csv("Data/movie_details.csv")
rotten_rutabagas = pd.read_csv("Data/rotten_rutabagas.csv")
movie_economy = pd.read_csv("Data/movie_economy.csv")

# 1 Combine US economy and movie economy
df1 = economy.merge(movie_economy, on="year", how="inner")
display(df1)

# 2 Combine MPAA Rating and Rotten Rutabagas score
df2 = movie_details.merge(rotten_rutabagas, on="movie", how="inner")
display(df2)

# 3 Add movie details to box office results
df3 = movie_boxoffice.merge(movie_details, on="movie", how="left")
display(df3)

# 4 Add all movies in box office, details and Rotten Rutabagas together into all_movies
df4 = movie_boxoffice.merge(movie_details, on="movie", how="outer")
all_movies = df4.merge(rotten_rutabagas, on="movie", how="outer")
display(all_movies)

# 5 Add US economy and movie economy into all_movies
df5 = (
    all_movies
    .merge(economy, on="year", how="left")
    .merge(movie_economy, on="year", how="left")
)
df5
