In [2]:
# imports
from sqlalchemy.engine import create_engine
from sqlalchemy.types import *
from sqlalchemy import text
import pymysql
import pandas as pd

In [3]:
# load data
df_basics = pd.read_csv("Data/title_basics_filtred.csv")
df_tmdb = pd.read_csv("Data/tmdb_results_combined.csv.gz")

In [4]:
df_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [5]:
# creatre a column with a list of genres
df_basics["genres_split"] = df_basics["genres"].str.split(",")
# create an array with unique genres
df_explode = df_basics.explode("genres_split")
genres = df_explode.genres_split.unique()

In [6]:
# create title-genres dataframe
df_title_genres = df_explode[["tconst", "genres_split"]]
df_title_genres.rename(columns={"genres_split":"genre_id"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_genres.rename(columns={"genres_split":"genre_id"}, inplace=True)


In [7]:
# create a genre mapper dictionary
genres_ints = range(len(genres))
genre_id_map = dict(zip(genres, genres_ints))

In [8]:
# create genres lookup dataframe
df_genre_lookup = pd.DataFrame(dict(genre_id=genre_id_map.values(), genre_name=genre_id_map.keys()))

In [9]:
# replace genres by their ids
df_title_genres.genre_id = df_title_genres.genre_id.map(genre_id_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_title_genres.genre_id = df_title_genres.genre_id.map(genre_id_map)


In [10]:
# Create the sqlalchemy engine and connection
pymysql.install_as_MySQLdb()
username = "root"
password = "azerty121"
db_name = "movies"
connection = f"mysql+pymysql://{username}:{password}@localhost/{db_name}"
engine = create_engine(connection)
conn = engine.connect()

In [11]:
# create data types dictionnary
types_dict = dict(imdb_id=VARCHAR(45), revenue=FLOAT, 
                  budget=FLOAT, certification=VARCHAR(45))

In [12]:
# Save to sql with dtype and index=False
df_tmdb[["imdb_id", "revenue", "budget", "certification"]].to_sql('tmdb_data',conn, 
                                                                  dtype=types_dict, if_exists="replace", 
                                                                  index=False)

2663

In [13]:
# add a primary key
conn.execute(text("ALTER TABLE tmdb_data ADD PRIMARY KEY (`imdb_id`);"))

<sqlalchemy.engine.cursor.CursorResult at 0x258d863c7c0>

In [14]:
# insert values into "genres" table
df_genre_lookup.to_sql("genres", conn, index=False, if_exists="append")
# insert values into "title_genres" table
df_title_genres.to_sql("title_genres", conn, index=False, if_exists="append")

162600

In [16]:
# show tables
q = "show tables;"
pd.read_sql(q, conn)

Unnamed: 0,Tables_in_movies
0,genres
1,ratings
2,title_basics
3,title_genres
4,tmdb_data


In [17]:
# tables descriptions
tables_list = ["genres", "title_genres", "tmdb_data"]
for table in tables_list:
    q = f"DESCRIBE {table};"
    print(f"* {table}\n{pd.read_sql(q, conn)}\n")

* genres
        Field         Type Null  Key Default           Extra
0    genre_id          int   NO  PRI    None  auto_increment
1  genre_name  varchar(45)  YES         None                

* title_genres
      Field         Type Null  Key Default Extra
0  genre_id          int   NO  PRI    None      
1    tconst  varchar(45)   NO  PRI    None      

* tmdb_data
           Field         Type Null  Key Default Extra
0        imdb_id  varchar(45)   NO  PRI    None      
1        revenue        float  YES         None      
2         budget        float  YES         None      
3  certification  varchar(45)  YES         None      



In [18]:
# display the 5 first rows of each table
for table in tables_list:
    q = f"""select *
    from {table}
    limit 5"""
    print(f"* {table}\n{pd.read_sql(q, conn)}\n")

* genres
   genre_id genre_name
0         1    Fantasy
1         2    Romance
2         3      Drama
3         4     Horror
4         5     Sci-Fi

* title_genres
   genre_id     tconst
0         0  tt0035423
1         0  tt0088751
2         0  tt0100275
3         0  tt0108549
4         0  tt0118652

* tmdb_data
     imdb_id     revenue      budget certification
0  tt0035423  76019000.0  48000000.0         PG-13
1  tt0096056         0.0         0.0          None
2  tt0114447         0.0         0.0          None
3  tt0116916         0.0         0.0            PG
4  tt0118589   5271670.0  22000000.0         PG-13

