In [1]:
import pandas as pd
from os import listdir
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
from datetime import datetime

In [2]:
# Folder (relative to the working directory) that holds the raw title CSVs.
path = 'drive/MyDrive/Ciencia de Datos/2do Semestre/Proyecto Terminal/netflix_project/netflixData/titles/'

# Empty placeholder frame; it is reassigned by the first merge further down.
joined_data = pd.DataFrame()

# Enumerate the files found in the data folder and show them as cell output.
files = listdir(path)
files

['Netflix Shows.csv',
 'netflix-rotten-tomatoes-metacritic-imdb.csv',
 'NetflixOriginals.csv',
 'netflix_titles.csv']

In [3]:
file = 'Netflix Shows.csv'

# Load -> discard rows with any missing value -> normalise header names
# (trim + lowercase) -> keep one row per title -> drop the unneeded columns.
netflix_shows_data = (
    pd.read_csv(path + file, encoding='latin-1')
    .dropna()
    .rename(columns=lambda name: name.strip().lower())
    .drop_duplicates(subset=['title'])
    .drop(columns=['ratinglevel', 'ratingdescription', 'user rating size', 'release year'])
)
netflix_shows_data

Unnamed: 0,title,rating,user rating score
0,White Chicks,PG-13,82.0
2,Grey's Anatomy,TV-14,98.0
3,Prison Break,TV-14,98.0
4,How I Met Your Mother,TV-PG,94.0
5,Supernatural,TV-14,95.0
...,...,...,...
966,Blank Check,PG,93.0
967,Heavyweights,PG,74.0
972,D2: The Mighty Ducks,PG,70.0
973,"Honey, I Shrunk the Kids",PG,80.0


In [4]:
file = 'netflix-rotten-tomatoes-metacritic-imdb.csv'

# Columns that are removed right after loading.
delete = [
    'tags', 'hidden gem score', 'runtime', 'rotten tomatoes score',
    'metacritic score', 'awards received', 'awards nominated for',
    'netflix link', 'summary', 'image', 'poster', 'tmdb trailer',
    'trailer site',
]

# Load -> discard rows with any missing value -> normalise header names
# (trim + lowercase) -> drop unneeded columns -> keep one row per title.
rotten_tomatoes_data = (
    pd.read_csv(path + file, encoding='latin-1')
    .dropna()
    .rename(columns=lambda name: name.strip().lower())
    .drop(columns=delete)
    .drop_duplicates(subset=['title'])
)

# Preview the first record to check the remaining columns.
display(rotten_tomatoes_data.iloc[0])

title                                                               Joker
genre                                              Crime, Drama, Thriller
languages                                                         English
series or movie                                                     Movie
country availability    Lithuania,Poland,France,Italy,Spain,Greece,Bel...
director                                                    Todd Phillips
writer                  Scott Silver, Jerry Robinson, Todd Phillips, B...
actors                  Joaquin Phoenix, Zazie Beetz, Robert De Niro, ...
view rating                                                             R
imdb score                                                            8.4
boxoffice                                                    $335,451,311
release date                                                  04 Oct 2019
netflix release date                                           2021-03-03
production house        Bron Studios, 

In [5]:
# Outer join on the title so that rows present in only one source are kept;
# columns missing on one side are filled with NaN.
joined_data = netflix_shows_data.merge(
    rotten_tomatoes_data,
    on='title',
    how='outer',
)

display(joined_data)

Unnamed: 0,title,rating,user rating score,genre,languages,series or movie,country availability,director,writer,actors,view rating,imdb score,boxoffice,release date,netflix release date,production house,imdb link,imdb votes
0,White Chicks,PG-13,82.0,"Comedy, Crime",English,Movie,"South Korea,Romania,United Kingdom,Mexico,Cana...",Keenen Ivory Wayans,"Michael Anthony Snowden, Keenen Ivory Wayans, ...","Frankie Faison, Shawn Wayans, Jaime King, Marl...",PG-13,5.6,"$70,831,760",23 Jun 2004,2015-04-14,"Wayan Bros. Entertainment, Gone North Producti...",https://www.imdb.com/title/tt0381707,131407.0
1,Grey's Anatomy,TV-14,98.0,,,,,,,,,,,,,,,
2,Prison Break,TV-14,98.0,,,,,,,,,,,,,,,
3,How I Met Your Mother,TV-PG,94.0,,,,,,,,,,,,,,,
4,Supernatural,TV-14,95.0,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2278,50 First Dates,,,"Comedy, Drama, Romance","English, Hawaiian, Mandarin",Movie,"Hong Kong,South Korea,Japan,United Kingdom,Tha...",Peter Segal,George Wing,"Sean Astin, Adam Sandler, Rob Schneider, Drew ...",PG-13,6.8,"$120,908,074",13 Feb 2004,2015-04-14,"Columbia Pictures Corporation, Flower Films, A...",https://www.imdb.com/title/tt0343660,327916.0
2279,21,,,"Crime, Drama, History, Thriller",English,Movie,"South Korea,Russia,Hong Kong,Portugal,Belgium,...",Robert Luketic,"Allan Loeb, Ben Mezrich, Peter Steinfeld","Kevin Spacey, Aaron Yoo, Kate Bosworth, Jim St...",PG-13,6.8,"$81,159,365",28 Mar 2008,2015-04-14,"Trigger Street Productions, Michael De Luca",https://www.imdb.com/title/tt0478087,230337.0
2280,One Chance,,,"Biography, Comedy, Drama, Music","English, Italian",Movie,"Japan,Malaysia",David Frankel,Justin Zackham,"James Corden, Colm Meaney, Alexandra Roach, Ju...",PG-13,6.8,"$101,196",25 Oct 2013,2015-04-14,"Relevant Entertainment, Weston Pictures",https://www.imdb.com/title/tt1196956,11536.0
2281,The Twilight Saga: Breaking Dawn: Part 1,,,"Adventure, Drama, Fantasy, Romance, Thriller","English, Portuguese",Movie,"Canada,Romania,Japan,Switzerland,United Kingdo...",Bill Condon,"Stephenie Meyer, Melissa Rosenberg","Billy Burke, Taylor Lautner, Sarah Clarke, Gil...",PG-13,4.9,"$281,287,133",18 Nov 2011,2015-04-14,Temple Hill,https://www.imdb.com/title/tt1324999,224372.0


In [6]:
# Coalesce the two rating columns: keep `rating` where present, otherwise
# fall back to `view rating`. Using fillna (instead of casting to str and
# comparing against the text 'nan') keeps a row whose two ratings are both
# missing as a real missing value rather than the literal string 'nan'.
joined_data['rating'] = joined_data['rating'].fillna(joined_data['view rating'])

# `view rating` is now redundant.
joined_data = joined_data.drop(columns=['view rating'])

# Align column names with the other title datasets.
joined_data = joined_data.rename(columns={
    'actors': 'cast',
    'country availability': 'countries'
})

# Parse both date columns into datetime values.
for date_column in ('netflix release date', 'release date'):
    joined_data[date_column] = pd.to_datetime(joined_data[date_column])

# Preview the first record.
joined_data.iloc[0]

title                                                        White Chicks
rating                                                              PG-13
user rating score                                                    82.0
genre                                                       Comedy, Crime
languages                                                         English
series or movie                                                     Movie
countries               South Korea,Romania,United Kingdom,Mexico,Cana...
director                                              Keenen Ivory Wayans
writer                  Michael Anthony Snowden, Keenen Ivory Wayans, ...
cast                    Frankie Faison, Shawn Wayans, Jaime King, Marl...
imdb score                                                            5.6
boxoffice                                                     $70,831,760
release date                                          2004-06-23 00:00:00
netflix release date                  

In [7]:
file = 'netflix_titles.csv'

# Columns that are removed right after loading.
delete = ['release_year', 'description', 'show_id']

# Load -> discard rows with any missing value -> normalise header names
# (trim + lowercase) -> drop unneeded columns -> keep one row per title ->
# rename columns to the names used in joined_data.
netflix_titles = (
    pd.read_csv(path + file, encoding='latin-1')
    .dropna()
    .rename(columns=lambda name: name.strip().lower())
    .drop(columns=delete)
    .drop_duplicates(subset=['title'])
    .rename(columns={
        'country': 'countries',
        'date_added': 'netflix release date',
        'listed_in': 'genre',
    })
)

# Parse the date the title was added into datetime values.
netflix_titles['netflix release date'] = pd.to_datetime(netflix_titles['netflix release date'])

# Preview the first record.
display(netflix_titles.iloc[0])

type                                                                Movie
title                                                             Sankofa
director                                                     Haile Gerima
cast                    Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...
countries               United States, Ghana, Burkina Faso, United Kin...
netflix release date                                  2021-09-24 00:00:00
rating                                                              TV-MA
duration                                                          125 min
genre                    Dramas, Independent Movies, International Movies
Name: 7, dtype: object

In [8]:
# Columns that exist in both frames. All of them are used as join keys, so a
# row from one side only pairs with a row from the other side when every one
# of these values is equal; unmatched rows from both sides are kept (outer).
same_columns = ['title', 'countries', 'rating', 'genre', 'director', 'cast', 'netflix release date']
joined_data = joined_data.merge(
    netflix_titles,
    on=same_columns,
    how='outer',
)

# Preview the first record.
joined_data.iloc[0]

title                                                        White Chicks
rating                                                              PG-13
user rating score                                                    82.0
genre                                                       Comedy, Crime
languages                                                         English
series or movie                                                     Movie
countries               South Korea,Romania,United Kingdom,Mexico,Cana...
director                                              Keenen Ivory Wayans
writer                  Michael Anthony Snowden, Keenen Ivory Wayans, ...
cast                    Frankie Faison, Shawn Wayans, Jaime King, Marl...
imdb score                                                            5.6
boxoffice                                                     $70,831,760
release date                                          2004-06-23 00:00:00
netflix release date                  

In [9]:
# Show the title column of the netflix_titles frame.
netflix_titles.loc[:, 'title']

7                             Sankofa
8       The Great British Baking Show
9                        The Starling
12                       Je Suis Karl
24                              Jeans
                    ...              
8801                          Zinzana
8802                           Zodiac
8804                       Zombieland
8805                             Zoom
8806                           Zubaan
Name: title, Length: 5332, dtype: object

In [10]:
file = 'NetflixOriginals.csv'

# Columns that are removed right after loading.
delete = ['imdb score']

# Load -> discard rows with any missing value -> normalise header names
# (trim + lowercase) -> drop unneeded columns -> keep one row per title ->
# rename columns to the names used in joined_data.
netflix_originals = (
    pd.read_csv(path + file, encoding='latin-1')
    .dropna()
    .rename(columns=lambda name: name.strip().lower())
    .drop(columns=delete)
    .drop_duplicates(subset=['title'])
    .rename(columns={
        'premiere': 'release date',
        'runtime': 'duration',
        'language': 'languages',
    })
)

# Express the duration as text with a ' min' suffix (e.g. '119 min').
netflix_originals['duration'] = netflix_originals['duration'].astype(str) + ' min'

# Parse the premiere date into datetime values.
netflix_originals['release date'] = pd.to_datetime(netflix_originals['release date'])

# Preview one record.
display(netflix_originals.iloc[100])

title                        Guilty
genre                      Thriller
release date    2020-03-06 00:00:00
duration                    119 min
languages                     Hindi
Name: 100, dtype: object

In [11]:
# Columns that exist in both frames; all of them act as join keys, and rows
# without a full match on either side are kept (outer join).
same_columns = ['title', 'genre', 'release date', 'duration', 'languages']
joined_data = joined_data.merge(
    netflix_originals,
    on=same_columns,
    how='outer',
)



In [12]:
# Keep a single row per title (the first occurrence wins).
joined_data = joined_data.drop_duplicates(subset=['title'])

# Normalise the language lists to a plain comma separator with no spaces.
# Both steps must go through the .str accessor: a bare Series.replace only
# substitutes values that are equal to ', ' as a whole, so it would leave
# ', ' inside strings such as 'English, Hawaiian, Mandarin' untouched.
joined_data['languages'] = (
    joined_data['languages']
    .str.replace('/', ',', regex=False)
    .str.replace(', ', ',', regex=False)
)

# Spot-check one record by position.
joined_data.iloc[7266]

title                   Winter on Fire: Ukraine's Fight for Freedom
rating                                                          NaN
user rating score                                               NaN
genre                                                   Documentary
languages                                  English,Ukranian,Russian
series or movie                                                 NaN
countries                                                       NaN
director                                                        NaN
writer                                                          NaN
cast                                                            NaN
imdb score                                                      NaN
boxoffice                                                       NaN
release date                                    2015-10-09 00:00:00
netflix release date                                            NaT
production house                                

In [16]:
# Persist the merged table. The row index is not written, and the field
# separator is a single backslash character.
joined_data.to_csv(
    path_or_buf='drive/MyDrive/Ciencia de Datos/2do Semestre/Proyecto Terminal/netflix_project/netflixData/titles/joined_data.csv',
    sep="\\",
    index=False,
)