In [1]:
import pandas as pd
import json
from ast import literal_eval
from sqlalchemy import create_engine
from config import password
import wikipedia
import numpy as np



# Netflix titles csv

In [2]:
csv_path = 'kaggle_netflix_titles.csv'
netflix_titles_df = pd.read_csv(csv_path)
netflix_titles_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...


In [3]:
netflix_titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [4]:
netflix_titles_df[['show_id', 'type', 'title', 'country']].head()

Unnamed: 0,show_id,type,title,country
0,s1,TV Show,3%,Brazil
1,s2,Movie,7:19,Mexico
2,s3,Movie,23:59,Singapore
3,s4,Movie,9,United States
4,s5,Movie,21,United States


In [5]:
movie_country_df = netflix_titles_df[['title' , 'country']].dropna()
movie_country_df.head()

Unnamed: 0,title,country
0,3%,Brazil
1,7:19,Mexico
2,23:59,Singapore
3,9,United States
4,21,United States


# Movie Matadata csv

In [2]:
csv_path = 'movies_metadata.csv'
movies_metadata_df = pd.read_csv(csv_path, low_memory=False)
movies_metadata_df.head(2)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [3]:
movies_metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

# List of movie genre

In [8]:
genre = [
        "Action",
        "Adventure",
        "Animated",
        "Biography",
        "Comedy",
        "Crime",
        "Dance",
        "Disaster",
        "Documentary",
        "Drama",
        "Erotic",
        "Family",
        "Fantasy",
        "Found Footage",
        "Historical",
        "Horror",
        "Independent",
        "Legal",
        "Live Action",
        "Martial Arts",
        "Musical",
        "Mystery",
        "Noir",
        "Performance",
        "Political",
        "Romance",
        "Satire",
        "Science Fiction",
        "Short",
        "Silent",
        "Slasher",
        "Sports",
        "Spy",
        "Superhero",
        "Supernatural",
        "Suspense",
        "Teen",
        "Thriller",
        "War",
        "Western"
]

# Cleaning database

## Creating a database with kaggle_id and movie_title columns

In [4]:
movie_df = movies_metadata_df[['id', 'original_title']]
movie_df.rename(columns={'id':'movie_id', 'original_title':'movie_title'}, inplace=True)
movie_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,movie_id,movie_title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
45461,439050,رگ خواب
45462,111109,Siglo ng Pagluluwal
45463,67758,Betrayal
45464,227506,Satana likuyushchiy


In [5]:
# Removing rows with inappropriate 'movie_id'
clean_df = movie_df.loc[movie_df['movie_id'].str.contains('-')==False]

# Converting string to integer
clean_df['movie_id'] = clean_df['movie_id'].astype(int)

# Dropping duplicates
clean_df.drop_duplicates(keep = False)

# Removing commas from movie titles and replacing them with space 
# clean_df['movie_title'].replace({',': ''}, inplace=True)
clean_df["movie_title"] = clean_df["movie_title"].str.replace(",","")
clean_df["movie_title"] = clean_df["movie_title"].str.replace('"','')
clean_df["movie_title"] = clean_df["movie_title"].str.replace("'","")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

In [6]:
# Saving database to a csv file
clean_df.to_csv('movie_df_for_SQL.csv', index=None)


## Creating a database with imdb_id and movie_title columns

In [7]:
imdb_movie_df = movies_metadata_df[['imdb_id', 'original_title']]
imdb_movie_df.rename(columns={'original_title':'movie_title'}, inplace=True)

# movie_df.rename(columns={'id':'movie_id', 'original_title':'movie_title'}, inplace=True)
imdb_movie_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,imdb_id,movie_title
0,tt0114709,Toy Story
1,tt0113497,Jumanji
2,tt0113228,Grumpier Old Men
3,tt0114885,Waiting to Exhale
4,tt0113041,Father of the Bride Part II


In [8]:
# Cleaning dataframe by dropping NAN values
imdb_movie_df = imdb_movie_df.dropna()

# Dropping duplicates
clean_imdb_movie = imdb_movie_df.drop_duplicates(keep=False)
clean_imdb_movie.head()

# Removing commas from movie titles and replacing them with space 
# clean_df['movie_title'].replace({',': ''}, inplace=True)
clean_imdb_movie["movie_title"] = clean_imdb_movie["movie_title"].str.replace(",","")
clean_imdb_movie["movie_title"] = clean_imdb_movie["movie_title"].str.replace('"','')
clean_imdb_movie["movie_title"] = clean_imdb_movie["movie_title"].str.replace("'","")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [9]:
# Saving database to a csv file
clean_imdb_movie.to_csv('imdb_movie_df_for_SQL.csv', index=None)

In [10]:
# IMDB URLs
for id in clean_imdb_movie["imdb_id"]:
    clean_imdb_movie["imdb_url"] = f"https://www.imdb.com/title/{id}"

clean_imdb_movie

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,imdb_id,movie_title,imdb_url
0,tt0114709,Toy Story,https://www.imdb.com/title/tt6980792
1,tt0113497,Jumanji,https://www.imdb.com/title/tt6980792
2,tt0113228,Grumpier Old Men,https://www.imdb.com/title/tt6980792
3,tt0114885,Waiting to Exhale,https://www.imdb.com/title/tt6980792
4,tt0113041,Father of the Bride Part II,https://www.imdb.com/title/tt6980792
...,...,...,...
45461,tt6209470,رگ خواب,https://www.imdb.com/title/tt6980792
45462,tt2028550,Siglo ng Pagluluwal,https://www.imdb.com/title/tt6980792
45463,tt0303758,Betrayal,https://www.imdb.com/title/tt6980792
45464,tt0008536,Satana likuyushchiy,https://www.imdb.com/title/tt6980792


In [12]:
wikipedia.WikipediaPage.links

<property at 0x1faac56a868>

In [13]:
movies_metadata_df['genres'][19]

"[{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}]"

In [14]:
movie_genre = []
genre=[]
total_genre_column=[]
for i in range(len(movies_metadata_df['original_title'])):

    movie_genre.clear()

    for genre in literal_eval(movies_metadata_df['genres'][i]):
        movie_genre.append(genre['name'])

    genre[i]=movie_genre
    print(movies_metadata_df['original_title'][i], genre[i])
    genre_dict = {movies_metadata_df['original_title'][i]:genre[i]}
    total_genre_column.append(genre_dict)
    # total_genre_column.append(genre[i])
# total_genre_column   
    # movies_metadata_df["movie_genre"]=genre[i]


# movies_metadata_df['genres'] = movie_genre
# literal_eval(movies_metadata_df['genres'][0])[1]['name']
# literal_eval(movies_metadata_df['genres'][0])

Toy Story ['Animation', 'Comedy', 'Family']
Jumanji ['Adventure', 'Fantasy', 'Family']
Grumpier Old Men ['Romance', 'Comedy']
Waiting to Exhale ['Comedy', 'Drama', 'Romance']
Father of the Bride Part II ['Comedy']
Heat ['Action', 'Crime', 'Drama', 'Thriller']
Sabrina ['Comedy', 'Romance']
Tom and Huck ['Action', 'Adventure', 'Drama', 'Family']
Sudden Death ['Action', 'Adventure', 'Thriller']
GoldenEye ['Adventure', 'Action', 'Thriller']
The American President ['Comedy', 'Drama', 'Romance']
Dracula: Dead and Loving It ['Comedy', 'Horror']
Balto ['Family', 'Animation', 'Adventure']
Nixon ['History', 'Drama']
Cutthroat Island ['Action', 'Adventure']
Casino ['Drama', 'Crime']
Sense and Sensibility ['Drama', 'Romance']
Four Rooms ['Crime', 'Comedy']
Ace Ventura: When Nature Calls ['Crime', 'Comedy', 'Adventure']
Money Train ['Action', 'Comedy', 'Crime']
Get Shorty ['Comedy', 'Thriller', 'Crime']
Copycat ['Drama', 'Thriller']
Assassins ['Action', 'Adventure', 'Crime', 'Thriller']
Powder ['Dr

# Movie Genre Dataframe

In [15]:
movie_genre_df = pd.read_csv('movie_genre.csv',index_col=None)
movie_genre_df.head()

Unnamed: 0,movie_name,genre_id1,genre_1,genre_id2,genre_2,genre_id3,genre_3,genre_id4,genre_4,genre_id5,genre_5,genre_id6,genre_6,genre_id7,genre_7,genre_id8,genre_8
0,Toy Story,16,'Animation',35.0,'Comedy',10751.0,'Family',,,,,,,,,,
1,Jumanji,12,'Adventure',14.0,'Fantasy',10751.0,'Family',,,,,,,,,,
2,Grumpier Old Men,10749,'Romance',35.0,'Comedy',,,,,,,,,,,,
3,Waiting to Exhale,35,'Comedy',18.0,'Drama',10749.0,'Romance',,,,,,,,,,
4,Father of the Bride Part II,35,'Comedy',,,,,,,,,,,,,,


In [16]:
movie_genre_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   movie_name  45460 non-null  object
 1   genre_id1   43024 non-null  object
 2   genre_1     43024 non-null  object
 3   genre_id2   28465 non-null  object
 4   genre_2     28465 non-null  object
 5   genre_id3   13994 non-null  object
 6   genre_3     13985 non-null  object
 7   genre_id4   4426 non-null   object
 8   genre_4     4400 non-null   object
 9   genre_id5   1039 non-null   object
 10  genre_5     1020 non-null   object
 11  genre_id6   195 non-null    object
 12  genre_6     185 non-null    object
 13  genre_id7   31 non-null     object
 14  genre_7     27 non-null     object
 15  genre_id8   4 non-null      object
 16  genre_8     3 non-null      object
dtypes: object(17)
memory usage: 5.9+ MB


## Cleaning dataframe

In [17]:
# Renaming columns
movie_genre_df.rename(columns={'movie_name':'movie_title'},inplace=True)
movie_genre_df.head()

Unnamed: 0,movie_title,genre_id1,genre_1,genre_id2,genre_2,genre_id3,genre_3,genre_id4,genre_4,genre_id5,genre_5,genre_id6,genre_6,genre_id7,genre_7,genre_id8,genre_8
0,Toy Story,16,'Animation',35.0,'Comedy',10751.0,'Family',,,,,,,,,,
1,Jumanji,12,'Adventure',14.0,'Fantasy',10751.0,'Family',,,,,,,,,,
2,Grumpier Old Men,10749,'Romance',35.0,'Comedy',,,,,,,,,,,,
3,Waiting to Exhale,35,'Comedy',18.0,'Drama',10749.0,'Romance',,,,,,,,,,
4,Father of the Bride Part II,35,'Comedy',,,,,,,,,,,,,,


In [60]:
movie_genre_df = movie_genre_df.replace("'","")
movie_genre_df = movie_genre_df.fillna('')
movie_genre_df.head()

Unnamed: 0,movie_title,genre_id1,genre_1,genre_id2,genre_2,genre_id3,genre_3,genre_id4,genre_4,genre_id5,genre_5,genre_id6,genre_6,genre_id7,genre_7,genre_id8,genre_8
0,Toy Story,16,Animation,35.0,Comedy,10751.0,Family,,,,,,,,,,
1,Jumanji,12,Adventure,14.0,Fantasy,10751.0,Family,,,,,,,,,,
2,Grumpier Old Men,10749,Romance,35.0,Comedy,,,,,,,,,,,,
3,Waiting to Exhale,35,Comedy,18.0,Drama,10749.0,Romance,,,,,,,,,,
4,Father of the Bride Part II,35,Comedy,,,,,,,,,,,,,,


In [64]:
movie_genre_dict = []
genre_list = []


movie_genre_df['genre_1'][0]

for i in range(len(movie_genre_df['movie_title'])):
        key = movie_genre_df['movie_title'][i]
        
        for j in np.arange(1,9): 
                if movie_genre_df[f"genre_id{j}"][i]:
                        genre_list.append(movie_genre_df[f"genre_id{j}"][i])
                
        genre_dict = {key:genre_list}
        print(genre_dict)

        movie_genre_dict.append(str(genre_dict))
        genre_list.clear()

 

{'Toy Story': ['16', '35', '10751']}
{'Jumanji': ['12', '14', '10751']}
{'Grumpier Old Men': ['10749', '35']}
{'Waiting to Exhale': ['35', '18', '10749']}
{'Father of the Bride Part II': ['35']}
{'Heat': ['28', '80', '18', '53']}
{'Sabrina': ['35', '10749']}
{'Tom and Huck': ['28', '12', '18', '10751']}
{'Sudden Death': ['28', '12', '53']}
{'GoldenEye': ['12', '28', '53']}
{'The American President': ['35', '18', '10749']}
{'Dracula: Dead and Loving It': ['35', '27']}
{'Balto': ['10751', '16', '12']}
{'Nixon': ['36', '18']}
{'Cutthroat Island': ['28', '12']}
{'Casino': ['18', '80']}
{'Sense and Sensibility': ['18', '10749']}
{'Four Rooms': ['80', '35']}
{'Ace Ventura: When Nature Calls': ['80', '35', '12']}
{'Money Train': ['28', '35', '80']}
{'Get Shorty': ['35', '53', '80']}
{'Copycat': ['18', '53']}
{'Assassins': ['28', '12', '80', '53']}
{'Powder': ['18', '14', '878', "''"]}
{'Leaving Las Vegas': ['18', '10749']}
{'Othello': ['18']}
{'Now and Then': ['35', '18', '10751']}
{'Persuasi

In [65]:
movie_genre_dict.str.re

["{'Toy Story': ['16', '35', '10751']}",
 "{'Jumanji': ['12', '14', '10751']}",
 "{'Grumpier Old Men': ['10749', '35']}",
 "{'Waiting to Exhale': ['35', '18', '10749']}",
 "{'Father of the Bride Part II': ['35']}",
 "{'Heat': ['28', '80', '18', '53']}",
 "{'Sabrina': ['35', '10749']}",
 "{'Tom and Huck': ['28', '12', '18', '10751']}",
 "{'Sudden Death': ['28', '12', '53']}",
 "{'GoldenEye': ['12', '28', '53']}",
 "{'The American President': ['35', '18', '10749']}",
 "{'Dracula: Dead and Loving It': ['35', '27']}",
 "{'Balto': ['10751', '16', '12']}",
 "{'Nixon': ['36', '18']}",
 "{'Cutthroat Island': ['28', '12']}",
 "{'Casino': ['18', '80']}",
 "{'Sense and Sensibility': ['18', '10749']}",
 "{'Four Rooms': ['80', '35']}",
 "{'Ace Ventura: When Nature Calls': ['80', '35', '12']}",
 "{'Money Train': ['28', '35', '80']}",
 "{'Get Shorty': ['35', '53', '80']}",
 "{'Copycat': ['18', '53']}",
 "{'Assassins': ['28', '12', '80', '53']}",
 '{\'Powder\': [\'18\', \'14\', \'878\', "\'\'"]}',
 "{