In [1]:
# All required imports
import pandas as pd
from sqlalchemy_utils import database_exists, create_database
import pymysql
pymysql.install_as_MySQLdb()
import os
from sqlalchemy.types import VARCHAR, CHAR, DECIMAL, DATE, DATETIME
from urllib.parse import quote_plus as urlquote
from sqlalchemy import create_engine
import json

In [2]:
with open("/Users/James/OneDrive/Desktop/secret.json") as f:
    login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [3]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/movies"
engine = create_engine(connection)
conn = engine.connect()

Loading in Data

In [4]:
# Loading in data
df_2001 = pd.read_csv("Data/final_tmdb_data_2001.csv.gz")
df_2002 = pd.read_csv("Data/final_tmdb_data_2002.csv.gz")
df_ratings = pd.read_csv('ratings.csv')
df_basics = pd.read_csv("bascis.csv")

In [5]:
df_2001_2002 = pd.concat([df_2001,df_2002])

In [6]:
df_2001_2002 = df_2001_2002.drop_duplicates(subset=['imdb_id'])

In [7]:
df_2001_2002 = df_2001_2002[['imdb_id','budget','revenue','certification']]

In [8]:
df_2001_2002.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6768 entries, 0 to 3529
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        6768 non-null   object 
 1   budget         6767 non-null   float64
 2   revenue        6767 non-null   float64
 3   certification  1957 non-null   object 
dtypes: float64(2), object(2)
memory usage: 264.4+ KB


In [9]:
df_2001_2002 = df_2001_2002.loc[df_2001_2002['imdb_id']!='0']

In [10]:
# Normalizing genres column in basics dataframe
df_basics['genres_split'] = df_basics['genres'].str.split(',')
exploded_genres = df_basics.explode('genres_split')
unique_genres = sorted(exploded_genres['genres_split'].unique())
title_genres = exploded_genres[['tconst', 'genres_split']].rename(columns={'genres_split': 'genre'})

title_genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162600 entries, 0 to 86978
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tconst  162600 non-null  object
 1   genre   162600 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [11]:
df_2001_2002.head()

Unnamed: 0,imdb_id,budget,revenue,certification
1,tt0034413,0.0,0.0,
2,tt0035423,48000000.0,76019048.0,PG-13
3,tt0114447,0.0,0.0,
4,tt0116916,0.0,0.0,PG
5,tt0118154,0.0,0.0,


In [12]:
# transfroming basics to fit database
selected_columns = ['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes']
df_basics = df_basics[selected_columns]

# Renaming the columns
df_basics = df_basics.rename(columns={
    'tconst': 'tconst',
    'primaryTitle': 'primary_title',
    'startYear': 'start_year',
    'runtimeMinutes': 'runtime'
})

In [13]:
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86979 entries, 0 to 86978
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tconst         86979 non-null  object
 1   primary_title  86978 non-null  object
 2   start_year     86979 non-null  int64 
 3   runtime        86979 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.7+ MB


In [14]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370233 entries, 0 to 370232
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         370233 non-null  object 
 1   averageRating  370233 non-null  float64
 2   numVotes       370233 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 8.5+ MB


Normalizing Dataframe

In [15]:
# Create Genre Mapper Dictionary and Replace String Genres with Integer IDs:
genre_ints = range(len(unique_genres))
genre_map = dict(zip(unique_genres, genre_ints))

title_genres['genre_id'] = title_genres['genre'].map(genre_map)
title_genres = title_genres.drop(columns='genre')

# title_genres after adding genre_id
title_genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162600 entries, 0 to 86978
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tconst    162600 non-null  object
 1   genre_id  162600 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


In [16]:
genres_df = pd.DataFrame(list(genre_map.items()), columns=['genre_name', 'genre_id'])

# Display the genres_df
genres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   genre_name  25 non-null     object
 1   genre_id    25 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 532.0+ bytes


In [17]:
# show tables
tables = pd.read_sql("SHOW TABLES", con=engine)
tables

Unnamed: 0,Tables_in_movies
0,genres
1,imdb_data
2,ratings
3,title_basics
4,title_genres


Loading data into tables

In [18]:
genres_df.to_sql('genres', con=engine, index=False, if_exists='replace')

25

In [19]:
df_2001_2002.to_sql('imdb_data', con=engine, if_exists='replace', index=False)

6767

In [20]:
df_ratings.to_sql('ratings', con=engine, index=False, if_exists='replace')

370233

In [21]:
title_genres.to_sql('title_genres', con=engine, index=False, if_exists='replace')

162600

In [22]:
df_basics.to_sql('title_basics', con=engine, index=False, if_exists='replace')

86979

Querying DataBase

Genres

In [30]:
q = pd.read_sql("""DESCRIBE genres""", con=engine)
q

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,genre_name,text,YES,,,
1,genre_id,bigint,YES,,,


In [35]:
q = pd.read_sql("""SELECT * FROM genres LIMIT 5""", con=engine)
q

Unnamed: 0,genre_name,genre_id
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


IMDB Data

In [38]:
q = pd.read_sql("""DESCRIBE imdb_data""", con=engine)
q

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,imdb_id,text,YES,,,
1,budget,double,YES,,,
2,revenue,double,YES,,,
3,certification,text,YES,,,


In [36]:
q = pd.read_sql("""SELECT * FROM imdb_data LIMIT 5""", con=engine)
q

Unnamed: 0,imdb_id,budget,revenue,certification
0,tt0034413,0.0,0.0,
1,tt0035423,48000000.0,76019048.0,PG-13
2,tt0114447,0.0,0.0,
3,tt0116916,0.0,0.0,PG
4,tt0118154,0.0,0.0,


Ratings

In [32]:
q = pd.read_sql("""ratings""", con=engine)
q

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000005,6.2,2632
3,tt0000006,5.1,182
4,tt0000007,5.4,825
...,...,...,...
370228,tt9916220,7.4,21
370229,tt9916272,6.8,7
370230,tt9916460,9.4,18
370231,tt9916580,7.4,8


In [37]:
q = pd.read_sql("""SELECT * FROM ratings LIMIT 5""", con=engine)
q

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000005,6.2,2632
3,tt0000006,5.1,182
4,tt0000007,5.4,825


Title Genres

In [33]:
q = pd.read_sql("""title_genres""", con=engine)
q

Unnamed: 0,tconst,genre_id
0,tt0035423,5
1,tt0035423,9
2,tt0035423,18
3,tt0062336,7
4,tt0069049,7
...,...,...
162595,tt9916190,0
162596,tt9916190,2
162597,tt9916190,22
162598,tt9916362,7


In [40]:
q = pd.read_sql("""SELECT * FROM title_genres LIMIT 5""", con=engine)
q

Unnamed: 0,tconst,genre_id
0,tt0035423,5
1,tt0035423,9
2,tt0035423,18
3,tt0062336,7
4,tt0069049,7


Title Basics

In [34]:
q = pd.read_sql("""title_basics""", con=engine)
q

Unnamed: 0,tconst,primary_title,start_year,runtime
0,tt0035423,Kate & Leopold,2001,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70
2,tt0069049,The Other Side of the Wind,2018,122
3,tt0088751,The Naked Monster,2005,100
4,tt0096056,Crime and Punishment,2002,126
...,...,...,...,...
86974,tt9914942,Life Without Sara Amat,2019,74
86975,tt9915872,The Last White Witch,2019,97
86976,tt9916170,The Rehearsal,2019,51
86977,tt9916190,Safeguard,2020,95


In [41]:
q = pd.read_sql("""SELECT * FROM title_basics LIMIT 5""", con=engine)
q

Unnamed: 0,tconst,primary_title,start_year,runtime
0,tt0035423,Kate & Leopold,2001,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70
2,tt0069049,The Other Side of the Wind,2018,122
3,tt0088751,The Naked Monster,2005,100
4,tt0096056,Crime and Punishment,2002,126


In [42]:
q = pd.read_sql('''SHOW TABLES''', con=engine)
q

Unnamed: 0,Tables_in_movies
0,genres
1,imdb_data
2,ratings
3,title_basics
4,title_genres
