In [1]:
import pandas as pd
import os, time,json
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

from sqlalchemy import create_engine
import pymysql
pymysql.install_as_MySQLdb()

FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title_basics_cleaned.csv.gz',
 'title.akas.tsv.gz',
 'years_to_get.csv.gz',
 'ratings.akas.tsv.gz',
 'tmdb_api_results_2000.json',
 'final_tmdb_data_2000.csv.gz',
 'title_basics.csv.gz',
 'tmdb_api_results_[2000, 2001].json',
 'final_tmdb_data_[2000, 2001].csv.gz',
 'ratings_cleaned.akas.tsv.gz',
 "tmdb_api_results_['2000'].json"]

In [2]:
import json
with open('/Users/christinab/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['client-id', 'Api-key'])

In [3]:
#make a movie object using the .Movies function from tmdb
tmdb.API_Key = login['Api-key']

In [4]:
# list of tables needed for imports
# title_basics
# title_ratings
# title_genres
# genres
# tmdb_data

#### Table 1

In [5]:
#load dataframe from project part 1 as basics:
basics = pd.read_csv('Data/ratings_cleaned.akas.tsv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy"
1,tt0120589,movie,A Dangerous Practice,A Dangerous Practice,0,2022,,108,Drama
2,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,"Comedy,Sci-Fi"
3,tt0265705,movie,Saurians,Saurians,0,2022,,83,"Action,Sci-Fi"
4,tt0283145,movie,Wielka droga,Wielka droga,0,2022,,87,"Drama,War"


**List of Unique Genres**


In [6]:
# create a col with a list of genres
basics['genres_split'] = basics['genres'].str.split(',')
basics.head(3)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy","[Action, Comedy]"
1,tt0120589,movie,A Dangerous Practice,A Dangerous Practice,0,2022,,108,Drama,[Drama]
2,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,"Comedy,Sci-Fi","[Comedy, Sci-Fi]"


In [7]:
# separate the genres col- variables into there individual row 
exploded_genres = basics.explode('genres_split')
exploded_genres.head(2)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_split
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy",Action
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy",Comedy


In [8]:
# Get the string column converted to a list of strings in each row
genres_split = basics['genres'].str.split(',')

## Explore the series using.explore() and take the .unique() entires only.
unique_genres = genres_split.explode().unique()
unique_genres

array(['Action', 'Comedy', 'Drama', 'Sci-Fi', 'War', 'Crime', 'Thriller',
       'Horror', 'Mystery', 'Romance', 'Adventure', 'Animation',
       'History', 'Biography', 'Family', 'Musical', 'Music', 'Fantasy',
       'Western', 'Sport', 'Reality-TV', 'News', 'Talk-Show', 'Adult',
       'Game-Show'], dtype=object)

In [9]:
# use the unique genres from the genres_split colummn 
unique_genres = sorted(exploded_genres['genres_split'].unique())

**Create a new title table**

In [10]:
# create a new title_genres table with only the 'tconst & genres_split' columns
# save as a new df

title_genres = exploded_genres[['tconst', 'genres_split']].copy()
title_genres.head()

Unnamed: 0,tconst,genres_split
0,tt0119830,Action
0,tt0119830,Comedy
1,tt0120589,Drama
2,tt0200940,Comedy
2,tt0200940,Sci-Fi


**Create a genre mapper dictionary to replace string genres with INT**

In [11]:
## Making the genre mapper dictionary
genre_ints = range(len(unique_genres))
genre_map = dict(zip(unique_genres, genre_ints))
genre_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Sport': 20,
 'Talk-Show': 21,
 'Thriller': 22,
 'War': 23,
 'Western': 24}

In [12]:
# Make a dictionary with list of unique genres as the key and the new interger id as values
genre_id_map = dict(zip(unique_genres, range(len(unique_genres))))
genre_id_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Sport': 20,
 'Talk-Show': 21,
 'Thriller': 22,
 'War': 23,
 'Western': 24}

In [13]:
# make new interger genre_id and drop string genres
basics["genre_id"] = basics["genres"].map(genre_map)
basics = basics.drop(columns="genres")

In [14]:
## Manaully make dataframe with named cols from the .keyd and .values
genre_lookup = pd.DataFrame({'Genre_Name': genre_id_map.keys(),
                            'Genre_ID': genre_id_map.values()})
genre_lookup.head()

Unnamed: 0,Genre_Name,Genre_ID
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


#### Table 2  

In [15]:
# load ratings dataframe 

ratings = pd.read_csv('Data/ratings_cleaned.akas.tsv.gz')
ratings.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy"
1,tt0120589,movie,A Dangerous Practice,A Dangerous Practice,0,2022,,108,Drama
2,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,"Comedy,Sci-Fi"
3,tt0265705,movie,Saurians,Saurians,0,2022,,83,"Action,Sci-Fi"
4,tt0283145,movie,Wielka droga,Wielka droga,0,2022,,87,"Drama,War"


In [16]:
ratings['genres_splitr']= ratings['genres'].str.split(',')
ratings 

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_splitr
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy","[Action, Comedy]"
1,tt0120589,movie,A Dangerous Practice,A Dangerous Practice,0,2022,,108,Drama,[Drama]
2,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,"Comedy,Sci-Fi","[Comedy, Sci-Fi]"
3,tt0265705,movie,Saurians,Saurians,0,2022,,83,"Action,Sci-Fi","[Action, Sci-Fi]"
4,tt0283145,movie,Wielka droga,Wielka droga,0,2022,,87,"Drama,War","[Drama, War]"
...,...,...,...,...,...,...,...,...,...,...
7435,tt9893130,movie,"2025: Blood, White & Blue","2025: Blood, White & Blue",0,2022,,120,Comedy,[Comedy]
7436,tt9893158,movie,Clowning,Clowning,0,2022,,96,"Crime,Romance","[Crime, Romance]"
7437,tt9893160,movie,No Way Out,No Way Out,0,2022,,89,"Action,Crime,Thriller","[Action, Crime, Thriller]"
7438,tt9894000,movie,Twice As Strong: Made of Fire,Twice As Strong: Made of Fire,0,2022,,122,Drama,[Drama]


In [17]:
exploded_gr = ratings.explode('genres_splitr')
exploded_gr

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,genres_splitr
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy",Action
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy",Comedy
1,tt0120589,movie,A Dangerous Practice,A Dangerous Practice,0,2022,,108,Drama,Drama
2,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,"Comedy,Sci-Fi",Comedy
2,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,"Comedy,Sci-Fi",Sci-Fi
...,...,...,...,...,...,...,...,...,...,...
7437,tt9893160,movie,No Way Out,No Way Out,0,2022,,89,"Action,Crime,Thriller",Action
7437,tt9893160,movie,No Way Out,No Way Out,0,2022,,89,"Action,Crime,Thriller",Crime
7437,tt9893160,movie,No Way Out,No Way Out,0,2022,,89,"Action,Crime,Thriller",Thriller
7438,tt9894000,movie,Twice As Strong: Made of Fire,Twice As Strong: Made of Fire,0,2022,,122,Drama,Drama


In [18]:
# Get the string column converted to a list of strings in each row
genres_sr = ratings['genres'].str.split(',')

## Explore the series using.explore() and take the .unique() entires only.
unique_gr = genres_sr.explode().unique()
unique_gr

array(['Action', 'Comedy', 'Drama', 'Sci-Fi', 'War', 'Crime', 'Thriller',
       'Horror', 'Mystery', 'Romance', 'Adventure', 'Animation',
       'History', 'Biography', 'Family', 'Musical', 'Music', 'Fantasy',
       'Western', 'Sport', 'Reality-TV', 'News', 'Talk-Show', 'Adult',
       'Game-Show'], dtype=object)

In [19]:
# save a list of unique genres 
unique_gr = sorted(exploded_gr['genres_splitr'].unique())

In [20]:
# Ratings table 

title_ratings = exploded_gr[['tconst', 'genres_splitr']]
title_ratings.head(3)

Unnamed: 0,tconst,genres_splitr
0,tt0119830,Action
0,tt0119830,Comedy
1,tt0120589,Drama


In [21]:
# making the genre mapper dictionary 
genre_intsr= range(len(unique_gr))
genre_mapr = dict(zip(unique_gr, genre_ints))
genre_mapr

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Sport': 20,
 'Talk-Show': 21,
 'Thriller': 22,
 'War': 23,
 'Western': 24}

In [22]:
# make a dictionary with list of unique genres as the key and the new interger id as values

genre_id_mapr = dict(zip(unique_gr, range(len(unique_gr))))
genre_id_mapr

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Sport': 20,
 'Talk-Show': 21,
 'Thriller': 22,
 'War': 23,
 'Western': 24}

In [23]:
# make new interger genre_id and drop string genres
ratings["genre_id"] = ratings["genres"].map(genre_mapr)
ratings = ratings.drop(columns="genres")

In [24]:
# Manaually make dataframe with named cols from the .keys and .values
genre_lookupr = pd.DataFrame({'Genre_Name': genre_id_mapr.keys(),
                            'Genre_ID': genre_id_mapr.values()})
genre_lookupr.head()

Unnamed: 0,Genre_Name,Genre_ID
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


#### Table 3

#### DataFrame to SQL 

In [25]:
## loading mysql credentials 
from sqlalchemy import create_engine
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy_utils import create_database, database_exists

## Change username and password to match your personal MySQL Server settings
username = 'root' # default username for MySQL db is root
password = 'Ao9!#Gt8' # whatever password you chose during MySQL installation.

connection = f'mysql+pymysql://{username}:{password}@localhost/movies_yr'
engine = create_engine(connection)
engine

Engine(mysql+pymysql://root:***@localhost/movies_yr)

In [26]:
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created.')

It exists!


In [27]:
## Example 
from sqlalchemy.types import *
## Calculate max string lengths for object columns
key_len = basics['tconst'].fillna('').map(len).max()
title_len = basics['primaryTitle'].fillna('').map(len).max()
## Create a schema dictionary using Sqlalchemy datatype objects
basics_schema = {
    'tconst': String(key_len+1),
    'primaryTitle': Text(title_len+1),
    'StartYear':Float(),
    'endYear':Float(),
    'runtimeMinutes': Integer()}

In [28]:
basics = basics.drop(columns=['genres_split'])

In [29]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7440 entries, 0 to 7439
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          7440 non-null   object 
 1   titleType       7440 non-null   object 
 2   primaryTitle    7440 non-null   object 
 3   originalTitle   7440 non-null   object 
 4   isAdult         7440 non-null   int64  
 5   startYear       7440 non-null   int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  7440 non-null   int64  
 8   genre_id        4254 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 523.2+ KB


In [30]:
# Save to sql with dtype and index=False
basics.to_sql('title_basics',engine,dtype=basics_schema,if_exists='replace',index=False)


7440

In [31]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7440 entries, 0 to 7439
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          7440 non-null   object 
 1   titleType       7440 non-null   object 
 2   primaryTitle    7440 non-null   object 
 3   originalTitle   7440 non-null   object 
 4   isAdult         7440 non-null   int64  
 5   startYear       7440 non-null   int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  7440 non-null   int64  
 8   genres_splitr   7440 non-null   object 
 9   genre_id        4254 non-null   float64
dtypes: float64(2), int64(3), object(5)
memory usage: 581.4+ KB


In [32]:
ratings = ratings.drop(columns=['genres_splitr'])

In [33]:
## Example 
from sqlalchemy.types import *
## Calculate max string lengths for object columns
key_len = ratings['tconst'].fillna('').map(len).max()
title_len = ratings['primaryTitle'].fillna('').map(len).max()
## Create a schema dictionary using Sqlalchemy datatype objects
ratings_schema = {
    'tconst': String(key_len+1),
    'primaryTitle': Text(title_len+1),
    'StartYear':Float(),
    'endYear':Float(),
    'runtimeMinutes': Integer()}

In [34]:
# Save to sql with dtype and index=False
ratings.to_sql('title_ratings',engine,dtype=ratings_schema,if_exists='replace',index=False)

7440

In [35]:
q= '''SHOW TABLES'''
pd.read_sql(q, engine)

Unnamed: 0,Tables_in_movies_yr
0,title_basics
1,title_ratings


In [36]:
q= '''SELECT * FROM title_basics'''
pd.read_sql(q, engine)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genre_id
0,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,
1,tt0120589,movie,A Dangerous Practice,A Dangerous Practice,0,2022,,108,7.0
2,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,
3,tt0265705,movie,Saurians,Saurians,0,2022,,83,
4,tt0283145,movie,Wielka droga,Wielka droga,0,2022,,87,
...,...,...,...,...,...,...,...,...,...
7435,tt9893130,movie,"2025: Blood, White & Blue","2025: Blood, White & Blue",0,2022,,120,5.0
7436,tt9893158,movie,Clowning,Clowning,0,2022,,96,
7437,tt9893160,movie,No Way Out,No Way Out,0,2022,,89,
7438,tt9894000,movie,Twice As Strong: Made of Fire,Twice As Strong: Made of Fire,0,2022,,122,7.0


In [37]:
engine.execute("ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7feff21479d0>

In [38]:
q= '''SHOW TABLES'''
pd.read_sql(q, engine)

Unnamed: 0,Tables_in_movies_yr
0,title_basics
1,title_ratings
