In [1]:
# All required imports
import pandas as pd
from sqlalchemy_utils import database_exists, create_database
import pymysql
pymysql.install_as_MySQLdb()
import os
from sqlalchemy.types import VARCHAR, CHAR, DECIMAL, DATE, DATETIME
from urllib.parse import quote_plus as urlquote
from sqlalchemy import create_engine

In [2]:
# Connection to MySQL
#
# Credentials are taken from the environment (MYSQL_USER / MYSQL_PASSWORD) so
# real secrets never need to be saved in the notebook. The "root"/"root"
# fallbacks keep the original local-development behaviour when the variables
# are not set.
username = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD", "root")

# Quote BOTH URL components: characters such as "@", ":" or "/" in either the
# user name or the password would otherwise corrupt the connection URL.
connection = f"mysql+pymysql://{urlquote(username)}:{urlquote(password)}@localhost/movies"
engine = create_engine(connection)

# Create the `movies` database on first run instead of failing on connect.
if not database_exists(engine.url):
    create_database(engine.url)

conn = engine.connect()

Loading the Data

In [3]:
# Read the raw inputs.
# The two yearly TMDB extracts are gzip-compressed CSVs under Data/.
df_2001, df_2002 = (
    pd.read_csv(path)
    for path in (
        "Data/final_tmdb_data_2001.csv.gz",
        "Data/final_tmdb_data_2002.csv.gz",
    )
)

# Ratings and title basics are read from the working directory.
# NOTE(review): "bascis.csv" looks like a misspelling of "basics.csv"; the
# name has to match the file on disk, so it is kept exactly as it was.
df_ratings = pd.read_csv('ratings.csv')
df_basics = pd.read_csv("bascis.csv")

In [4]:
# Stack the 2001 and 2002 TMDB extracts row-wise into one frame
# (original row labels are kept, so index values repeat across the two years).
df_2001_2002 = pd.concat(objs=[df_2001, df_2002], axis=0)

In [5]:
# Keep only the first row encountered for each imdb_id
df_2001_2002 = df_2001_2002.drop_duplicates(subset='imdb_id', keep='first')

In [6]:
# Restrict the frame to the four columns that are later written to MySQL
df_2001_2002 = df_2001_2002.loc[:, ['imdb_id', 'budget', 'revenue', 'certification']]

In [7]:
# Inspect dtypes and non-null counts of the combined TMDB frame
df_2001_2002.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6768 entries, 0 to 3529
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        6768 non-null   object 
 1   budget         6767 non-null   float64
 2   revenue        6767 non-null   float64
 3   certification  1957 non-null   object 
dtypes: float64(2), object(2)
memory usage: 264.4+ KB


In [8]:
# Drop rows whose imdb_id is the literal string '0'
# (presumably a placeholder record in the extracts — TODO confirm).
df_2001_2002 = df_2001_2002[df_2001_2002['imdb_id'].ne('0')]

In [9]:
# Normalize the comma-separated `genres` column of the basics frame into a
# long (tconst, genre) table with one row per title/genre pair.
df_basics['genres_split'] = df_basics['genres'].str.split(',')
exploded_genres = df_basics.explode(column='genres_split')

title_genres = (
    exploded_genres
    .loc[:, ['tconst', 'genres_split']]
    .rename(columns={'genres_split': 'genre'})
)

# Alphabetically sorted list of the distinct genre names
unique_genres = sorted(title_genres['genre'].unique())

title_genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162600 entries, 0 to 86978
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tconst  162600 non-null  object
 1   genre   162600 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [10]:
# Preview the first rows of the combined 2001-2002 frame
df_2001_2002.head()

Unnamed: 0,imdb_id,budget,revenue,certification
1,tt0034413,0.0,0.0,
2,tt0035423,48000000.0,76019048.0,PG-13
3,tt0114447,0.0,0.0,
4,tt0116916,0.0,0.0,PG
5,tt0118154,0.0,0.0,


In [11]:
# Transforming basics to fit the database:
# keep only the columns that are loaded, then switch them to snake_case names.
selected_columns = ['tconst', 'primaryTitle', 'startYear', 'runtimeMinutes']

df_basics = df_basics.loc[:, selected_columns].rename(
    columns={
        # `tconst` already has its final name, so it needs no entry here.
        'primaryTitle': 'primary_title',
        'startYear': 'start_year',
        'runtimeMinutes': 'runtime',
    }
)

In [12]:
# Confirm the renamed columns and their dtypes / non-null counts
df_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86979 entries, 0 to 86978
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tconst         86979 non-null  object
 1   primary_title  86978 non-null  object
 2   start_year     86979 non-null  int64 
 3   runtime        86979 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.7+ MB


In [13]:
# Inspect the ratings frame (columns, dtypes, non-null counts)
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370233 entries, 0 to 370232
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         370233 non-null  object 
 1   averageRating  370233 non-null  float64
 2   numVotes       370233 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 8.5+ MB


Normalizing the DataFrames

In [14]:
# Build the genre-name -> integer-id lookup and swap the string genre in
# title_genres for its integer id.
genre_ints = range(len(unique_genres))
genre_map = {name: genre_id for name, genre_id in zip(unique_genres, genre_ints)}

title_genres = (
    title_genres
    .assign(genre_id=title_genres['genre'].map(genre_map))
    .drop(columns='genre')
)

# title_genres now holds (tconst, genre_id)
title_genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162600 entries, 0 to 86978
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tconst    162600 non-null  object
 1   genre_id  162600 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


In [15]:
# Lookup table with one row per genre: its name and its integer id
genres_df = pd.DataFrame(
    {
        'genre_name': list(genre_map.keys()),
        'genre_id': list(genre_map.values()),
    }
)

# Display a summary of genres_df
genres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   genre_name  25 non-null     object
 1   genre_id    25 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 532.0+ bytes


In [16]:
# show tables
tables = pd.read_sql("SHOW TABLES", con=engine)
tables

Unnamed: 0,Tables_in_movies
0,genres
1,imdb_data
2,ratings
3,title_basics
4,title_genres


Loading data into tables

In [17]:
# Write the genre lookup table, replacing any existing `genres` table.
# pandas maps object columns to unbounded TEXT by default; sizing genre_name
# from the data gives MySQL a VARCHAR column that can be indexed or keyed.
genre_name_len = int(genres_df['genre_name'].str.len().max())
genres_df.to_sql(
    'genres',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={'genre_name': VARCHAR(genre_name_len)},
)

25

In [18]:
# Write the TMDB financial/certification data, replacing any existing
# `imdb_data` table. The id column is stored as a data-sized VARCHAR rather
# than pandas' default TEXT so that MySQL can index or key on it.
imdb_id_len = int(df_2001_2002['imdb_id'].str.len().max())
df_2001_2002.to_sql(
    'imdb_data',
    con=engine,
    if_exists='replace',
    index=False,
    dtype={'imdb_id': VARCHAR(imdb_id_len)},
)

6767

In [19]:
# Write the ratings table, replacing any existing `ratings` table.
# tconst is stored as a data-sized VARCHAR rather than pandas' default TEXT
# so that MySQL can index or key on it.
ratings_key_len = int(df_ratings['tconst'].str.len().max())
df_ratings.to_sql(
    'ratings',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={'tconst': VARCHAR(ratings_key_len)},
)

370233

In [20]:
# Write the title/genre junction table, replacing any existing `title_genres`
# table. tconst is stored as a data-sized VARCHAR rather than pandas' default
# TEXT so that MySQL can index or key on it.
title_genres_key_len = int(title_genres['tconst'].str.len().max())
title_genres.to_sql(
    'title_genres',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={'tconst': VARCHAR(title_genres_key_len)},
)

162600

In [21]:
# Write the title basics table, replacing any existing `title_basics` table.
# tconst is stored as a data-sized VARCHAR rather than pandas' default TEXT
# so that MySQL can index or key on it; primary_title keeps the default type.
basics_key_len = int(df_basics['tconst'].str.len().max())
df_basics.to_sql(
    'title_basics',
    con=engine,
    index=False,
    if_exists='replace',
    dtype={'tconst': VARCHAR(basics_key_len)},
)

86979

Querying the Database

In [25]:
# Titles in title_basics whose start_year is later than 2015
q = pd.read_sql("""SELECT * 
FROM title_basics
WHERE start_year
>2015""", con=engine)
q

Unnamed: 0,tconst,primary_title,start_year,runtime
0,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70
1,tt0069049,The Other Side of the Wind,2018,122
2,tt0100275,The Wandering Soap Opera,2017,80
3,tt0119830,One Dog Day,2022,101
4,tt0120589,A Dangerous Practice,2022,108
...,...,...,...,...
37802,tt9914942,Life Without Sara Amat,2019,74
37803,tt9915872,The Last White Witch,2019,97
37804,tt9916170,The Rehearsal,2019,51
37805,tt9916190,Safeguard,2020,95


In [26]:
# Rows in imdb_data with revenue above 100,000,000 and budget below 100,000,000
q = pd.read_sql("""SELECT * 
FROM imdb_data
WHERE revenue
>100000000
AND budget <100000000""", con=engine)
q

Unnamed: 0,imdb_id,budget,revenue,certification
0,tt0120737,93000000.0,871368364.0,PG-13
1,tt0126029,60000000.0,487853320.0,PG
2,tt0139654,45000000.0,104900000.0,R
3,tt0163025,93000000.0,368780809.0,
4,tt0164334,60000000.0,105200000.0,R
...,...,...,...,...
67,tt0298203,41000000.0,242875078.0,
68,tt0299658,45000000.0,306776732.0,PG-13
69,tt0299977,31000000.0,177394432.0,PG-13
70,tt0304669,65000000.0,172842355.0,G
