## Basics data frame

In [1]:
import pandas as pd
# load data - change the path to match your drive!
basics = pd.read_csv('Data/title_basics.csv.gz')
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86707 non-null  object 
 1   titleType       86707 non-null  object 
 2   primaryTitle    86707 non-null  object 
 3   originalTitle   86707 non-null  object 
 4   isAdult         86707 non-null  int64  
 5   startYear       86707 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86707 non-null  int64  
 8   genres          86707 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


### Removing unwanted columns from Basics

In [2]:
basics = basics.drop(columns=['titleType', 'originalTitle','isAdult','endYear'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86707 non-null  object 
 1   primaryTitle    86707 non-null  object 
 2   startYear       86707 non-null  float64
 3   runtimeMinutes  86707 non-null  int64  
 4   genres          86707 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.3+ MB


### Changing the name of column 'tconst' to Movie_ID

In [3]:
basics = basics.rename(columns ={'tconst':'Movie_ID'})
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Movie_ID        86707 non-null  object 
 1   primaryTitle    86707 non-null  object 
 2   startYear       86707 non-null  float64
 3   runtimeMinutes  86707 non-null  int64  
 4   genres          86707 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.3+ MB


Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama
2,tt0069049,The Other Side of the Wind,2018.0,122,Drama
3,tt0088751,The Naked Monster,2005.0,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,Crime and Punishment,2002.0,126,Drama


In [4]:
## get the unique ids
unique_basics = sorted(basics['Movie_ID'].unique())
unique_basics[:2]

['tt0035423', 'tt0062336']

In [5]:
## make integers for each id
int_basics = range(len(unique_basics))
int_basics

range(0, 86707)

In [6]:
## Converting our range to a list and showing the first 10 values
example_range = list(int_basics)
example_range[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [7]:
# Zip together the unique_ids as the keys and the int_ids as the values
id_map = dict(zip(unique_basics,int_basics))
id_map

{'tt0035423': 0,
 'tt0062336': 1,
 'tt0069049': 2,
 'tt0088751': 3,
 'tt0096056': 4,
 'tt0100275': 5,
 'tt0103340': 6,
 'tt0108549': 7,
 'tt0113026': 8,
 'tt0113092': 9,
 'tt0114447': 10,
 'tt0115937': 11,
 'tt0116391': 12,
 'tt0116628': 13,
 'tt0116916': 14,
 'tt0116991': 15,
 'tt0117743': 16,
 'tt0118141': 17,
 'tt0118589': 18,
 'tt0118652': 19,
 'tt0118694': 20,
 'tt0118710': 21,
 'tt0118852': 22,
 'tt0118926': 23,
 'tt0119004': 24,
 'tt0119231': 25,
 'tt0119273': 26,
 'tt0119495': 27,
 'tt0119806': 28,
 'tt0119830': 29,
 'tt0119966': 30,
 'tt0119970': 31,
 'tt0119980': 32,
 'tt0120166': 33,
 'tt0120202': 34,
 'tt0120263': 35,
 'tt0120467': 36,
 'tt0120589': 37,
 'tt0120607': 38,
 'tt0120624': 39,
 'tt0120626': 40,
 'tt0120630': 41,
 'tt0120667': 42,
 'tt0120673': 43,
 'tt0120679': 44,
 'tt0120681': 45,
 'tt0120698': 46,
 'tt0120733': 47,
 'tt0120737': 48,
 'tt0120753': 49,
 'tt0120755': 50,
 'tt0120804': 51,
 'tt0120807': 52,
 'tt0120824': 53,
 'tt0120903': 54,
 'tt0120912': 55,
 '

In [8]:
## demonstrating using id_map to get iteger id 
basics_str_id ="tt0118141"
id_map[basics_str_id]

17

In [9]:
basics['Movie_ID'].map(id_map)

0            0
1            1
2            2
3            3
4            4
         ...  
86702    86702
86703    86703
86704    86704
86705    86705
86706    86706
Name: Movie_ID, Length: 86707, dtype: int64

In [10]:
## overwriting the original id column 
basics['Movie_ID'] = basics['Movie_ID'].replace(id_map)

basics.head(3)

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance"
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama
2,2,The Other Side of the Wind,2018.0,122,Drama


In [11]:
# Using pd.DataFrame and a dictionary
id_lookup = pd.DataFrame({'str_id': id_map.keys(),
                         'int_id':id_map.values()})
id_lookup.head(3)

Unnamed: 0,str_id,int_id
0,tt0035423,0
1,tt0062336,1
2,tt0069049,2


### Genres separation and creating columns

In [12]:
## adding expand=True
basics['genre_split'] = basics['genres'].str.split(',')

In [13]:
## exploding the column of lists
exploded = basics.explode('genre_split')
exploded.head()

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres,genre_split
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Comedy
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Fantasy
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Romance
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama,Drama
2,2,The Other Side of the Wind,2018.0,122,Drama,Drama


In [14]:
## saving the unique values from the exploded column
cols_to_make = exploded['genre_split'].dropna().unique()
cols_to_make

array(['Comedy', 'Fantasy', 'Romance', 'Drama', 'Horror', 'Sci-Fi',
       'Biography', 'Mystery', 'Musical', 'Action', 'Adventure', 'Crime',
       'Thriller', 'Music', 'Animation', 'Family', 'History', 'War',
       'Sport', 'Western', 'Adult', 'Short', 'Reality-TV', 'News',
       'Talk-Show', 'Game-Show'], dtype=object)

In [15]:
for col in cols_to_make:
    basics[col] = basics['genres'].str.contains(col)
basics.head()

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres,genre_split,Comedy,Fantasy,Romance,Drama,...,History,War,Sport,Western,Adult,Short,Reality-TV,News,Talk-Show,Game-Show
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]",True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama,[Drama],False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,2,The Other Side of the Wind,2018.0,122,Drama,[Drama],False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,3,The Naked Monster,2005.0,100,"Comedy,Horror,Sci-Fi","[Comedy, Horror, Sci-Fi]",True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,Crime and Punishment,2002.0,126,Drama,[Drama],False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [16]:
# drop transactions clumns
basics = basics.drop(columns=['genres','genre_split'])
basics.head()

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,Comedy,Fantasy,Romance,Drama,Horror,Sci-Fi,...,History,War,Sport,Western,Adult,Short,Reality-TV,News,Talk-Show,Game-Show
0,0,Kate & Leopold,2001.0,118,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,The Other Side of the Wind,2018.0,122,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,The Naked Monster,2005.0,100,True,False,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
4,4,Crime and Punishment,2002.0,126,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [17]:
## save data for next lesson
basics.to_csv('basics.csv', index=False)

## Ratings Dataframe

In [18]:
ratings = pd.read_csv('Data/title_ratings.csv.gz')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499155 entries, 0 to 499154
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         499155 non-null  object 
 1   averageRating  499155 non-null  float64
 2   numVotes       499155 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.4+ MB


### Changing the name of column 'tconst' to Movie_ID

In [19]:
ratings = ratings.rename(columns ={'tconst':'Movie_ID'})
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499155 entries, 0 to 499154
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Movie_ID       499155 non-null  object 
 1   averageRating  499155 non-null  float64
 2   numVotes       499155 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.4+ MB


In [20]:
## save data for next lesson
ratings.to_csv('ratings.csv', index=False)

### TMDB_API Dataframe

In [21]:
tmdb_api = pd.read_csv('Data/tmdb_results_combined.csv.gz')
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2569 non-null   object 
 1   adult                  2568 non-null   float64
 2   backdrop_path          1400 non-null   object 
 3   belongs_to_collection  205 non-null    object 
 4   budget                 2568 non-null   float64
 5   genres                 2568 non-null   object 
 6   homepage               172 non-null    object 
 7   id                     2568 non-null   float64
 8   original_language      2568 non-null   object 
 9   original_title         2568 non-null   object 
 10  overview               2518 non-null   object 
 11  popularity             2568 non-null   float64
 12  poster_path            2310 non-null   object 
 13  production_companies   2568 non-null   object 
 14  production_countries   2568 non-null   object 
 15  rele

### Removing unwanted columns from Ratings

In [22]:
tmdb_api = tmdb_api.drop(columns=['adult', 'backdrop_path', 'belongs_to_collection', 'genres','homepage', 'id',
                                  'original_language', 'original_title', 'overview','popularity','poster_path',
                                  'production_companies', 'production_countries','release_date','runtime','spoken_languages',
                                  'status','tagline','title','video','vote_average','vote_count'])
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        2569 non-null   object 
 1   budget         2568 non-null   float64
 2   revenue        2568 non-null   float64
 3   certification  816 non-null    object 
dtypes: float64(2), object(2)
memory usage: 80.4+ KB


In [23]:
tmdb_api.head(2)

Unnamed: 0,imdb_id,budget,revenue,certification
0,0,,,
1,tt0113026,10000000.0,0.0,


In [24]:
tmdb_api = tmdb_api.rename(columns ={'imdb_id':'Movie_ID'})
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2569 entries, 0 to 2568
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Movie_ID       2569 non-null   object 
 1   budget         2568 non-null   float64
 2   revenue        2568 non-null   float64
 3   certification  816 non-null    object 
dtypes: float64(2), object(2)
memory usage: 80.4+ KB


In [25]:
## save data for next lesson
tmdb_api.to_csv('tmdb_api.csv', index=False)

## SQL

In [26]:
# imports
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote
# Create connection string using credentials following this format

In [27]:
import json
with open('/Users/miran/.secret/mysql.json') as f:
    login = json.load(f)
login.keys()

dict_keys(['username', 'password'])

In [28]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/movie"

In [29]:
# create sqlite engine for a database called schools
engine = create_engine(connection)
# create connection to engine
conn = engine.connect()

In [30]:
database_exists(connection)

True

In [31]:
movie = pd.read_csv('Data/title_basics.csv.gz')
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86707 entries, 0 to 86706
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86707 non-null  object 
 1   titleType       86707 non-null  object 
 2   primaryTitle    86707 non-null  object 
 3   originalTitle   86707 non-null  object 
 4   isAdult         86707 non-null  int64  
 5   startYear       86707 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  86707 non-null  int64  
 8   genres          86707 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.0+ MB


In [32]:
## Check if database exists, if not, create it
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created!')

It exists!


In [33]:
movie.to_sql('movie', engine, if_exists = 'replace')

86707

In [34]:
q = """SELECT * FROM movie;"""
pd.read_sql(q, engine)

Unnamed: 0,index,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
4,4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...,...
86702,86702,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
86703,86703,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
86704,86704,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
86705,86705,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [35]:
q = '''SHOW TABLES'''
pd.read_sql(q,engine)


Unnamed: 0,Tables_in_movie
0,movie
