**Importing relevant libraries**

In [1]:
import os

import numpy as np
import pandas as pd
import pymysql
pymysql.install_as_MySQLdb()

from sqlalchemy import create_engine, text
from sqlalchemy_utils import create_database, database_exists
# SQLAlchemy URL of the MySQL database that the notebook writes to.
# Set the MOVIES_DB_URL environment variable to supply real credentials without
# editing the notebook; when it is unset the local development URL is used.
connection_string = os.environ.get(
    "MOVIES_DB_URL", "mysql+pymysql://root:root@localhost/MOVIES"
)

**Loading the title_basics data**

In [2]:
# Read the title basics file and preview its first rows.
df = pd.read_csv(filepath_or_buffer="Movies_files/Title_Basics.csv.gz")
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020.0,,74,"Horror,Music,Thriller"


**Converting the single string of genres into 2 new tables**

In [3]:
# Split each comma-separated genres string into a list, stored in a new column.
df["genres_to_split"] = df["genres"].str.split(pat=",")
df.head(n=2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_to_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama,[Drama]


In [4]:
# explode() produces one row per list element, repeating the other columns.
genres_exploded = df.explode(column="genres_to_split")
genres_exploded.head(n=2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_to_split
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Comedy
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",Fantasy


**Creating the title_genres table, comprising two columns**

In [5]:
# Sorted list of the distinct genre names; kept in a variable for later use.
unique_genres = list(genres_exploded["genres_to_split"].unique())
unique_genres.sort()

In [6]:
# Keep only the title key and its exploded genre, as an independent copy.
title_genres = genres_exploded.loc[:, ["tconst", "genres_to_split"]].copy()
title_genres.head(n=5)

Unnamed: 0,tconst,genres_to_split
0,tt0035423,Comedy
0,tt0035423,Fantasy
0,tt0035423,Romance
1,tt0062336,Drama
2,tt0069049,Drama


**Creating a mapper dictionary to replace the string genres with integers**

In [7]:
# Number the genres in their sorted order: {genre name: integer id}.
genre_map = {genre: genre_id for genre_id, genre in enumerate(unique_genres)}
genre_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Short': 20,
 'Sport': 21,
 'Talk-Show': 22,
 'Thriller': 23,
 'War': 24,
 'Western': 25}

In [8]:
# Add a genre_id column by looking each genre name up in genre_map.
title_genres = title_genres.assign(
    genre_id=title_genres["genres_to_split"].map(genre_map)
)
title_genres.head(n=2)

Unnamed: 0,tconst,genres_to_split,genre_id
0,tt0035423,Comedy,5
0,tt0035423,Fantasy,9


In [9]:
# The genre name is now represented by genre_id, so remove the name column.
title_genres = title_genres.drop(labels="genres_to_split", axis="columns")


**Creating a new table for genres by converting the mapper dictionary into a dataframe, and naming it genres**

In [10]:
# Lookup table pairing every genre name with its integer id.
genres = pd.DataFrame(list(genre_map.items()), columns=["genre_name", "genre_id"])

**Dropping all the unwanted columns from the title basics table**

In [11]:
# Columns left out of the title_basics table, including both genre columns.
unwanted_columns = ["originalTitle", "isAdult", "titleType", "genres", "genres_to_split"]
title_basics = df.drop(columns=unwanted_columns)

**Loading the previous data sets on title_ratings and tmdb**

In [12]:
# Read the title ratings file.
title_ratings = pd.read_csv(filepath_or_buffer="Movies_files/Title_Ratings.csv.gz")

In [13]:
# Read the combined TMDB results file.
tmdb_data = pd.read_csv(filepath_or_buffer="Movies_files/tmdb_results_combined.csv.gz")

**Filtering only the required columns**

In [14]:
# Restrict the TMDB results to the id, revenue, budget and certification columns.
tmdb_columns = ["imdb_id", "revenue", "budget", "certification"]
tmdb_data = tmdb_data.loc[:, tmdb_columns]
tmdb_data.head(n=5)

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0113026,0.0,10000000.0,
2,tt0113092,0.0,0.0,
3,tt0116391,0.0,0.0,
4,tt0118694,12854953.0,150000.0,PG


In [15]:
# Count fully duplicated rows; duplicates would get in the way of later steps,
# so they are checked for early.
tmdb_data.duplicated(keep="first").sum()

1

In [16]:
# Remove repeated rows, keeping the first occurrence of each.
tmdb_data = tmdb_data.loc[~tmdb_data.duplicated()]

# Verify that no duplicated rows remain.
tmdb_data.duplicated().sum()

0

**Checking all the required tables to form the database**

### title_basics

In [17]:
# First row of the title_basics frame, to check its columns before loading.
title_basics.head(n=1)

Unnamed: 0,tconst,primaryTitle,startYear,endYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001.0,,118


### genres

In [18]:
# First row of the genres lookup frame.
genres.head(n=1)

Unnamed: 0,genre_name,genre_id
0,Action,0


### title_genres

In [19]:
# First row of the title-to-genre frame.
title_genres.head(n=1)

Unnamed: 0,tconst,genre_id
0,tt0035423,5


### title_ratings

In [20]:
# First row of the title_ratings frame.
title_ratings.head(n=1)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,83564


### tmdb_data

In [21]:
# First two rows of the filtered TMDB frame.
tmdb_data.head(n=2)

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0113026,0.0,10000000.0,


## Creating the database

In [22]:
# Engine used for every database write and read in the cells that follow.
engine = create_engine(connection_string)

In [23]:
# Create the target database only when it does not exist yet.
if not database_exists(connection_string):
    create_database(connection_string)


**Creating a datatype schema for the database conversion**

In [24]:
# Explicit names instead of `import *`, so it is clear where each type comes from.
from sqlalchemy.types import BigInteger, Float, Integer, String, Text

# --- Column types for the title_basics table ---------------------------------
# Widths are measured from the data (+1 for headroom). The key column gets a
# sized String (VARCHAR) because MySQL cannot use an unsized TEXT column as a
# primary key.
tconst_len = int(title_basics["tconst"].fillna("").map(len).max())
primary_title_len = int(title_basics["primaryTitle"].fillna("").map(len).max())

df_schema = {
    "tconst": String(tconst_len + 1),
    # Same width as tconst; lets this mapping also type an `imdb_id` key column.
    "imdb_id": String(tconst_len + 1),
    "primaryTitle": Text(primary_title_len + 1),
    "startYear": Float(),
    "endYear": Float(),
    "runtimeMinutes": Integer(),
}

# --- Column types for the tmdb_data table ------------------------------------
imdb_id_len = int(tmdb_data["imdb_id"].fillna("").map(len).max())
certification_len = int(tmdb_data["certification"].fillna("").map(len).max())

tmdb_schema = {
    "imdb_id": String(imdb_id_len + 1),
    # Sized from the certification values themselves (previously the imdb_id
    # length was used here, which could be too short for the longest value).
    "certification": String(certification_len + 1),
    # MySQL INT is a signed 32-bit value (max 2,147,483,647); BigInteger keeps
    # larger revenue/budget figures from going out of range.
    "revenue": BigInteger(),
    "budget": BigInteger(),
}

**Converting all the dataframe tables into database tables**

When converting dataframes with integer key columns into database tables, the key column can be assigned in the same line of code (by setting it as the DataFrame index before calling `to_sql`).

For dataframes with an object (string) column as the primary key, the dataframe needs to be converted to a database table first; the key can then be assigned with a separate line of code.

**Converting dataframes with an integer key column into database tables, assigning the key column at the same time.**

In [25]:
# Write the dataframes whose key is an integer column; `genre_id` is set as the
# DataFrame index so that to_sql stores it as the table's index column.

title_genres.set_index("genre_id").to_sql(
    name="title_genres", con=engine, if_exists="replace"
)

genres.set_index("genre_id").to_sql(
    name="genres", con=engine, if_exists="replace"
)

**Converting dataframes with an object column as the primary key into database tables; the primary keys will be assigned separately in the next cell**

In [34]:
# Write the dataframes whose key is a string column. index=False keeps the
# DataFrame index out of the tables.
# NOTE(review): all three calls pass df_schema, so presumably only the columns
# it names receive explicit types; tmdb_schema is not used here. Confirm that
# this is intended.

title_basics.to_sql(
    name="title_basics", con=engine, dtype=df_schema, if_exists="replace", index=False
)

title_ratings.to_sql(
    name="title_ratings", con=engine, dtype=df_schema, if_exists="replace", index=False
)

tmdb_data.to_sql(
    name="tmdb_data", con=engine, dtype=df_schema, if_exists="replace", index=False
)

**Assigning primary keys to tables**

In [35]:
# Primary-key column for each table that was written without an index.
primary_keys = {
    "title_basics": "tconst",
    "title_ratings": "tconst",
    "tmdb_data": "imdb_id",
}

# `Engine.execute` was removed in SQLAlchemy 2.0. Running the statements on a
# connection from `engine.begin()` works on both 1.4 and 2.x, and the block
# commits when it exits. Table and column names are the fixed values above,
# never user input, so formatting them into the statement is safe.
with engine.begin() as conn:
    for table_name, key_column in primary_keys.items():
        conn.execute(
            text(f"ALTER TABLE {table_name} ADD PRIMARY KEY (`{key_column}`);")
        )

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1593adc9a48>

**A view of the first 5 rows of each table in the database**

**Title_basics**

In [36]:
# Read back the first five rows stored in title_basics.
title_basics_query = """ 
SELECT *
FROM  title_basics
LIMIT 5;
 """
pd.read_sql(sql=title_basics_query, con=engine)

Unnamed: 0,tconst,primaryTitle,startYear,endYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001.0,,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,,70
2,tt0069049,The Other Side of the Wind,2018.0,,122
3,tt0088751,The Naked Monster,2005.0,,100
4,tt0093119,Grizzly II: Revenge,2020.0,,74


**Title_ratings**

In [29]:
# Read back the first five rows stored in title_ratings.
title_ratings_query = """ 
SELECT *
FROM  title_ratings
LIMIT 5;
 """
pd.read_sql(sql=title_ratings_query, con=engine)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0035423,6.4,83564
1,tt0062336,6.4,161
2,tt0069049,6.7,7193
3,tt0079644,7.3,35
4,tt0088751,5.2,319


**Title_genres**

In [30]:
# Read back the first five rows stored in title_genres.
title_genres_query = """ 
SELECT *
FROM  title_genres
LIMIT 5;
 """
pd.read_sql(sql=title_genres_query, con=engine)


Unnamed: 0,genre_id,tconst
0,5,tt0035423
1,9,tt0035423
2,18,tt0035423
3,7,tt0062336
4,7,tt0069049


**genres**

In [31]:
# Read back the first five rows stored in genres.
genres_query = """ 
SELECT *
FROM  genres
LIMIT 5;
 """
pd.read_sql(sql=genres_query, con=engine)


Unnamed: 0,genre_id,genre_name
0,0,Action
1,1,Adult
2,2,Adventure
3,3,Animation
4,4,Biography


**tmdb_data**

In [32]:
# Read back the first five rows stored in tmdb_data.
tmdb_data_query = """ 
SELECT *
FROM  tmdb_data
LIMIT 5;
 """
pd.read_sql(sql=tmdb_data_query, con=engine)

Unnamed: 0,imdb_id,revenue,budget,certification
0,0,,,
1,tt0035423,76019048.0,48000000.0,PG-13
2,tt0113026,0.0,10000000.0,
3,tt0113092,0.0,0.0,
4,tt0114447,0.0,0.0,


**List of all tables in the database**

In [33]:
# List every table that now exists in the database.
show_tables_query = """ SHOW TABLES; """
pd.read_sql(sql=show_tables_query, con=engine)

Unnamed: 0,Tables_in_movies
0,genres
1,title_basics
2,title_genres
3,title_ratings
4,tmdb_data
