In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from config import password

## Merge netflix and omdb csv files

In [2]:
file = "netflix_titles.csv"
netflix_df = pd.read_csv(file)
file = "omdb_list.csv"
omdb_df = pd.read_csv(file)

In [3]:
omdb_df.head()

Unnamed: 0.1,Unnamed: 0,title,genre,runtime,imdbRating,imdbVotes,poster,awards,boxoffice,language
0,0,Norm of the North: King Sized Adventure,"Animation, Adventure, Comedy, Family",90 min,3.3,311,https://m.media-amazon.com/images/M/MV5BNjMwZD...,,,English
1,1,Jandino: Whatever it Takes,Comedy,95 min,4.8,23,https://m.media-amazon.com/images/M/MV5BMWE3MG...,,,"English, Dutch"
2,2,Transformers Prime,"Animation, Action, Adventure, Comedy, Drama, F...",30 min,7.9,5454,https://m.media-amazon.com/images/M/MV5BMTczND...,14 wins & 26 nominations.,,English
3,3,Transformers: Robots in Disguise,"Animation, Action, Adventure, Comedy, Sci-Fi",22 min,6.0,842,https://m.media-amazon.com/images/M/MV5BMjMwNT...,2 wins & 11 nominations.,,English
4,4,Apaches,Drama,82 min,5.9,292,https://m.media-amazon.com/images/M/MV5BODYyOT...,3 nominations.,,"French, Arabic"


In [4]:
omdb_unique_df = omdb_df.drop_duplicates(subset=['title'])
print(len(omdb_df))
print(len(omdb_unique_df))

5473
5412


In [5]:
merged_df = netflix_df.merge(omdb_unique_df, how="left", on="title")
merged_df['imdbVotes']= merged_df['imdbVotes'].str.replace(',', '')
merged_df['imdbVotes']= merged_df['imdbVotes'].fillna(0)
merged_df['imdbVotes']= merged_df['imdbVotes'].astype(int)
print(len(merged_df))
merged_df.head()

6234


Unnamed: 0.1,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,description,Unnamed: 0,genre,runtime,imdbRating,imdbVotes,poster,awards,boxoffice,language
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,...,Before planning an awesome wedding for his gra...,0.0,"Animation, Adventure, Comedy, Family",90 min,3.3,311,https://m.media-amazon.com/images/M/MV5BNjMwZD...,,,English
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,...,Jandino Asporaat riffs on the challenges of ra...,1.0,Comedy,95 min,4.8,23,https://m.media-amazon.com/images/M/MV5BMWE3MG...,,,"English, Dutch"
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,...,"With the help of three human allies, the Autob...",2.0,"Animation, Action, Adventure, Comedy, Drama, F...",30 min,7.9,5454,https://m.media-amazon.com/images/M/MV5BMTczND...,14 wins & 26 nominations.,,English
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,...,When a prison ship crash unleashes hundreds of...,3.0,"Animation, Action, Adventure, Comedy, Sci-Fi",22 min,6.0,842,https://m.media-amazon.com/images/M/MV5BMjMwNT...,2 wins & 11 nominations.,,English
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,...,When nerdy high schooler Dani finally attracts...,,,,,0,,,,


## Transform netflix genre

In [6]:
# Turn listed_in into list
merged_df.listed_in = merged_df.listed_in.str.split(',').tolist()
listed_df = merged_df.dropna(subset=['listed_in'])
listed_in = listed_df["listed_in"].tolist()

In [7]:
# Loop through listed_in (netflix genre) and get unique values into netflix_genre set
netflix_genre = set()

for movie in listed_in:
    for x in movie:
        netflix_genre.add(x)
print(netflix_genre)

netflix_genre_id = pd.DataFrame(netflix_genre, columns=["netflix_genre"])
netflix_genre_id["netflix_genre"]=netflix_genre_id["netflix_genre"].str.strip()
netflix_genre_id = netflix_genre_id.drop_duplicates()
netflix_genre_id = netflix_genre_id.reset_index()
netflix_genre_id = netflix_genre_id.rename(columns={'index': 'netflix_genre_no'})
print(len(netflix_genre_id))
netflix_genre_id.head()

{'Docuseries', 'Movies', ' Science & Nature TV', 'Romantic Movies', ' Romantic TV Shows', 'Anime Series', "Kids' TV", ' Crime TV Shows', ' TV Sci-Fi & Fantasy', 'TV Dramas', ' Docuseries', ' Korean TV Shows', 'Stand-Up Comedy', 'Reality TV', ' TV Horror', 'Horror Movies', ' International Movies', 'International TV Shows', ' Anime Features', ' Classic Movies', 'Classic & Cult TV', " Kids' TV", ' Independent Movies', ' Reality TV', ' Horror Movies', ' Romantic Movies', 'Spanish-Language TV Shows', 'Cult Movies', 'Thrillers', ' Sports Movies', 'Sports Movies', ' Classic & Cult TV', ' Faith & Spirituality', 'International Movies', 'Sci-Fi & Fantasy', 'TV Sci-Fi & Fantasy', 'Documentaries', 'Children & Family Movies', ' Music & Musicals', 'Independent Movies', 'Comedies', 'TV Shows', 'Music & Musicals', 'Crime TV Shows', ' Documentaries', 'Classic Movies', ' TV Thrillers', 'British TV Shows', 'Romantic TV Shows', ' TV Comedies', ' TV Action & Adventure', ' Spanish-Language TV Shows', ' Dram

Unnamed: 0,netflix_genre_no,netflix_genre
0,0,Docuseries
1,1,Movies
2,2,Science & Nature TV
3,3,Romantic Movies
4,4,Romantic TV Shows


In [8]:
# Break out genres (listed_in) with show_id
listed_in_df = merged_df.loc[:,['show_id', 'listed_in']]
listed_in_df = listed_in_df.explode('listed_in')
listed_in_df['listed_in'] = listed_in_df['listed_in'].str.strip()
listed_in_df = listed_in_df.rename(columns={'listed_in': 'netflix_genre'})
print(len(listed_in_df))
listed_in_df.head()

13670


Unnamed: 0,show_id,netflix_genre
0,81145628,Children & Family Movies
0,81145628,Comedies
1,80117401,Stand-Up Comedy
2,70234439,Kids' TV
3,80058654,Kids' TV


In [9]:
# merge tables to create show_id and genre_id table
netflix_genre_table = listed_in_df.merge(netflix_genre_id,how = "left", on="netflix_genre")
netflix_genre_table = netflix_genre_table.loc[:,['show_id','netflix_genre_no']]
print(len(netflix_genre_table))
netflix_genre_table.head()                                      

13670


Unnamed: 0,show_id,netflix_genre_no
0,81145628,37
1,81145628,40
2,80117401,12
3,70234439,6
4,80058654,6


## Transform omdb genre

In [10]:
# Turn genre into list 
merged_df.genre = merged_df.genre.str.split(',').tolist()
genre_df = merged_df.dropna(subset=['genre'])
genre = genre_df["genre"].tolist()

In [11]:
# Loop through genre (omdb genre) and get unique values into omdb_genre set
omdb_genre = set()

for movie in genre:
    for y in movie:
        omdb_genre.add(y)
print(omdb_genre)

omdb_genre_id = pd.DataFrame(omdb_genre, columns=["omdb_genre"])
omdb_genre_id["omdb_genre"]=omdb_genre_id["omdb_genre"].str.strip()
omdb_genre_id = omdb_genre_id.drop_duplicates()
omdb_genre_id = omdb_genre_id.reset_index()
omdb_genre_id = omdb_genre_id.rename(columns={'index': 'omdb_genre_no'})
print(len(omdb_genre_id))
omdb_genre_id.head()

{' Short', 'Drama', ' Documentary', ' Comedy', 'Reality-TV', ' Sport', ' Animation', 'Sci-Fi', 'Short', ' Thriller', 'War', ' News', 'Music', 'Western', ' Family', 'Crime', 'Mystery', ' Action', 'Horror', 'Documentary', 'News', ' Biography', 'Talk-Show', ' Talk-Show', ' Film-Noir', 'Thriller', 'Family', 'Comedy', ' Romance', ' Reality-TV', 'Sport', ' Music', 'Animation', 'Action', ' War', 'Adult', ' Game-Show', ' Adventure', 'Musical', ' Drama', ' Western', ' Mystery', 'Game-Show', ' Horror', 'Fantasy', ' History', ' Crime', ' Sci-Fi', ' Fantasy', 'Biography', 'History', ' Musical', 'Adventure', 'Romance'}
28


Unnamed: 0,omdb_genre_no,omdb_genre
0,0,Short
1,1,Drama
2,2,Documentary
3,3,Comedy
4,4,Reality-TV


In [12]:
# Break out genres with show_id
genre_df = merged_df.loc[:,['show_id', 'genre']]
genre_df = genre_df.explode('genre')
genre_df['genre'] = genre_df['genre'].str.strip()
genre_df = genre_df.rename(columns={'genre': 'omdb_genre'})
genre_df.dropna(inplace = True)
print(len(genre_df))
genre_df.head()

13276


Unnamed: 0,show_id,omdb_genre
0,81145628,Animation
0,81145628,Adventure
0,81145628,Comedy
0,81145628,Family
1,80117401,Comedy


In [13]:
# merge tables to create show_id and genre_id table
omdb_genre_table = genre_df.merge(omdb_genre_id,how = "left", on="omdb_genre")
omdb_genre_table = omdb_genre_table.loc[:,['show_id','omdb_genre_no']]
print(len(omdb_genre_table))
omdb_genre_table.head()

13276


Unnamed: 0,show_id,omdb_genre_no
0,81145628,6
1,81145628,37
2,81145628,3
3,81145628,14
4,80117401,3


## Transform language table

In [14]:
# Turn language into list 
merged_df.language = merged_df.language.str.split(',').tolist()
lan_df = merged_df.dropna(subset=['language'])
language = lan_df["language"].tolist()
print(len(language))

5320


[['English'],
 ['English', ' Dutch'],
 ['English'],
 ['English'],
 ['French', ' Arabic'],
 ['English'],
 ['Spanish'],
 ['English'],
 ['English'],
 ['English', ' Dutch', ' German'],
 ['Hindi', ' English'],
 ['English', ' French'],
 ['English'],
 ['English'],
 ['Spanish', ' English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English', ' Mandarin'],
 ['English'],
 ['French'],
 ['French'],
 ['Hindi'],
 ['Telugu'],
 ['English', ' Arabic'],
 ['English'],
 ['Hindi'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['Thai'],
 ['Thai'],
 ['English'],
 ['Thai'],
 ['Thai'],
 ['Thai'],
 ['English'],
 ['English'],
 ['English'],
 ['Urdu'],
 ['English'],
 ['English'],
 ['Tamil'],
 ['English', ' Swahili', ' Nama', ' Xhosa', ' Korean'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English'],
 ['English', ' Italian'],
 ['English'],
 ['English'],
 ['Engl

In [16]:
# Loop through language (omdb language) and get unique values into language_set set
language_set = set()

for row in language:
    for i in row:
        language_set.add(i)
print(language_set)

language_df = pd.DataFrame(language_set, columns=["language"])
language_df["language"]=language_df["language"].str.strip()
language_df = language_df.drop_duplicates()
language_df = language_df.reset_index()
language_df = language_df.rename(columns={'index': 'language_no'})
print(len(language_df))
language_df.head()

{'Kikuyu', ' Mende', ' Hindi', 'American Sign Language', ' Croatian', 'Pushto', ' Latin', 'Yoruba', ' Awadhi', ' Serbian', ' Lithuanian', 'Marathi', ' Assyrian Neo-Aramaic', ' Romanian', 'Korean', ' Saami', 'Panjabi', ' Tarahumara', ' Indian Sign Language', ' Dari', 'Sanskrit', 'Gujarati', 'Mandarin', 'Kannada', 'Akan', ' Washoe', 'Tagalog', ' Aragonese', ' Chechen', 'Indonesian', ' Polynesian', ' Berber languages', 'Ukrainian Sign Language', 'Latin', ' Sinhalese', ' Polish', ' Basque', ' Cheyenne', ' Punjabi', 'Flemish', 'Icelandic', 'Nepali', 'Japanese', ' Min Nan', ' Tamil', ' Kriolu', ' Minangkabau', ' Zulu', ' Aboriginal', 'Filipino', 'Assamese', ' Finnish', 'Welsh', ' Mixtec', 'Wolof', ' Teochew', 'French', 'Hindi', ' Pushto', 'Polish', 'Cantonese', 'Hokkien', 'Danish', 'Afrikaans', 'Romanian', 'Khmer', ' Akan', ' Czech', ' Tibetan', ' Norwegian', ' Afrikaans', 'Serbian', ' Manipuri', ' Mohawk', ' Yoruba', ' Ewe', ' Flemish', 'Norwegian', ' Tajik', 'Sinhalese', ' Mandarin', ' Tha

Unnamed: 0,language_no,language
0,0,Kikuyu
1,1,Mende
2,2,Hindi
3,3,American Sign Language
4,4,Croatian


In [17]:
# Break out languages with show_id
language_all_df = merged_df.loc[:,['show_id', 'language']]
language_all_df = language_all_df.explode('language')
language_all_df['language'] = language_all_df['language'].str.strip()
language_all_df.dropna(inplace = True)
language_all_df.head()

Unnamed: 0,show_id,language
0,81145628,English
1,80117401,English
1,80117401,Dutch
2,70234439,English
3,80058654,English


In [18]:
# merge tables to create show_id and language_no table
language_table = language_all_df.merge(language_df,how = "left", on="language")
language_table = language_table.loc[:,['show_id','language_no']]
print(len(language_table))
language_table.head()

7253


Unnamed: 0,show_id,language_no
0,81145628,114
1,80117401,114
2,80117401,124
3,70234439,114
4,80058654,114


In [19]:
title_df = merged_df.drop(columns=["listed_in","genre","language","Unnamed: 0"])
title_df_show = title_df.loc[title_df.show_id == 80057969]
title_df_show
print(len(title_df))
title_df.head()

6234


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,description,runtime,imdbRating,imdbVotes,poster,awards,boxoffice
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,Before planning an awesome wedding for his gra...,90 min,3.3,311,https://m.media-amazon.com/images/M/MV5BNjMwZD...,,
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Jandino Asporaat riffs on the challenges of ra...,95 min,4.8,23,https://m.media-amazon.com/images/M/MV5BMWE3MG...,,
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,"With the help of three human allies, the Autob...",30 min,7.9,5454,https://m.media-amazon.com/images/M/MV5BMTczND...,14 wins & 26 nominations.,
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,When a prison ship crash unleashes hundreds of...,22 min,6.0,842,https://m.media-amazon.com/images/M/MV5BMjMwNT...,2 wins & 11 nominations.,
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,When nerdy high schooler Dani finally attracts...,,,0,,,


## Load data into postgres using sqlalchemy

In [20]:
# create connection to ETL_project_DB in postgres
engine = create_engine('postgresql://postgres:'+ password + '@localhost:5432/Netflix_movies')
connection = engine.connect()

In [21]:
# check table names in database
engine.table_names()

['OMDB_genre',
 'OMDB_title_genre',
 'Title',
 'Netflix_Listed_in',
 'Netflix_title_Listed_in',
 'OMDB_language',
 'OMDB_title_language']

In [22]:
# load dataframes into postgres using pandas
title_df.to_sql(name='Title', con=engine, if_exists='append', index=False)

netflix_genre_id.to_sql(name='Netflix_Listed_in', con=engine, if_exists='append', index=False)
netflix_genre_table.to_sql(name='Netflix_title_Listed_in', con=engine, if_exists='append', index=False)

omdb_genre_id.to_sql(name='OMDB_genre', con=engine, if_exists='append', index=False)
omdb_genre_table.to_sql(name='OMDB_title_genre', con=engine, if_exists='append', index=False)

language_df.to_sql(name='OMDB_language', con=engine, if_exists='append', index=False)
language_table.to_sql(name='OMDB_title_language', con=engine, if_exists='append', index=False)