## Basics data frame

In [3]:
import pandas as pd
# load data - change the path to match your drive!
basics = pd.read_csv('Data/title_basics.csv.gz')
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147425 entries, 0 to 147424
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          147425 non-null  object 
 1   titleType       147425 non-null  object 
 2   primaryTitle    147425 non-null  object 
 3   originalTitle   147425 non-null  object 
 4   isAdult         147425 non-null  int64  
 5   startYear       147425 non-null  float64
 6   endYear         0 non-null       float64
 7   runtimeMinutes  147425 non-null  int64  
 8   genres          147425 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 10.1+ MB


### Removing unwanted columns from Basics

In [4]:
basics = basics.drop(columns=['titleType', 'originalTitle','isAdult','endYear'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147425 entries, 0 to 147424
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          147425 non-null  object 
 1   primaryTitle    147425 non-null  object 
 2   startYear       147425 non-null  float64
 3   runtimeMinutes  147425 non-null  int64  
 4   genres          147425 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 5.6+ MB


### Changing the name of column 'tconst' to Movie_ID

In [5]:
basics = basics.rename(columns ={'tconst':'Movie_ID'})
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147425 entries, 0 to 147424
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Movie_ID        147425 non-null  object 
 1   primaryTitle    147425 non-null  object 
 2   startYear       147425 non-null  float64
 3   runtimeMinutes  147425 non-null  int64  
 4   genres          147425 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 5.6+ MB


Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama
2,tt0069049,The Other Side of the Wind,2018.0,122,Drama
3,tt0079644,November 1828,2001.0,140,"Drama,War"
4,tt0088751,The Naked Monster,2005.0,100,"Comedy,Horror,Sci-Fi"


In [6]:
## get the unique ids
unique_basics = sorted(basics['Movie_ID'].unique())
unique_basics[:2]

['tt0035423', 'tt0062336']

In [7]:
## make integers for each id
int_basics = range(len(unique_basics))
int_basics

range(0, 147425)

In [8]:
## Converting our range to a list and showing the first 10 values
example_range = list(int_basics)
example_range[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [9]:
# Zip together the unique_ids as the keys and the int_ids as the values
id_map = dict(zip(unique_basics,int_basics))
id_map

{'tt0035423': 0,
 'tt0062336': 1,
 'tt0069049': 2,
 'tt0079644': 3,
 'tt0088751': 4,
 'tt0096056': 5,
 'tt0100275': 6,
 'tt0102362': 7,
 'tt0103340': 8,
 'tt0108549': 9,
 'tt0110476': 10,
 'tt0111414': 11,
 'tt0112912': 12,
 'tt0113026': 13,
 'tt0113092': 14,
 'tt0114447': 15,
 'tt0114722': 16,
 'tt0115686': 17,
 'tt0115937': 18,
 'tt0116391': 19,
 'tt0116628': 20,
 'tt0116748': 21,
 'tt0116916': 22,
 'tt0116991': 23,
 'tt0117743': 24,
 'tt0118141': 25,
 'tt0118578': 26,
 'tt0118589': 27,
 'tt0118652': 28,
 'tt0118694': 29,
 'tt0118710': 30,
 'tt0118852': 31,
 'tt0118926': 32,
 'tt0119004': 33,
 'tt0119231': 34,
 'tt0119273': 35,
 'tt0119495': 36,
 'tt0119727': 37,
 'tt0119806': 38,
 'tt0119830': 39,
 'tt0119866': 40,
 'tt0119966': 41,
 'tt0119970': 42,
 'tt0119980': 43,
 'tt0120117': 44,
 'tt0120166': 45,
 'tt0120202': 46,
 'tt0120263': 47,
 'tt0120467': 48,
 'tt0120485': 49,
 'tt0120589': 50,
 'tt0120607': 51,
 'tt0120624': 52,
 'tt0120626': 53,
 'tt0120630': 54,
 'tt0120667': 55,
 '

In [10]:
## demonstrating using id_map to get iteger id 
basics_str_id ="tt0111414"
id_map[basics_str_id]

11

In [11]:
basics['Movie_ID'].map(id_map)

0              0
1              1
2              2
3              3
4              4
           ...  
147420    147420
147421    147421
147422    147422
147423    147423
147424    147424
Name: Movie_ID, Length: 147425, dtype: int64

In [12]:
## overwriting the original id column 
basics['Movie_ID'] = basics['Movie_ID'].replace(id_map)

basics.head(3)

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance"
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama
2,2,The Other Side of the Wind,2018.0,122,Drama


In [13]:
# Using pd.DataFrame and a dictionary
id_lookup = pd.DataFrame({'str_id': id_map.keys(),
                         'int_id':id_map.values()})
id_lookup.head(3)

Unnamed: 0,str_id,int_id
0,tt0035423,0
1,tt0062336,1
2,tt0069049,2


### Genres separation and creating columns

In [15]:
## adding expand=True
basics['genre_split'] = basics['genres'].str.split(',')

In [16]:
## exploding the column of lists
exploded = basics.explode('genre_split')
exploded.head()

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres,genre_split
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Comedy
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Fantasy
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Romance
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama,Drama
2,2,The Other Side of the Wind,2018.0,122,Drama,Drama


In [17]:
## saving the unique values from the exploded column
cols_to_make = exploded['genre_split'].dropna().unique()
cols_to_make

array(['Comedy', 'Fantasy', 'Romance', 'Drama', 'War', 'Horror', 'Sci-Fi',
       'Biography', 'Mystery', 'Adventure', 'Musical', 'Action', 'Crime',
       'Thriller', 'Music', 'Animation', 'Family', 'History', 'Sport',
       'Western', 'Adult', 'Short', 'Reality-TV', 'News', 'Talk-Show',
       'Game-Show'], dtype=object)

In [21]:
for col in cols_to_make:
    basics[col] = basics['genres'].str.contains(col)
basics.head()

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,genres,genre_split,Comedy,Fantasy,Romance,Drama,...,Family,History,Sport,Western,Adult,Short,Reality-TV,News,Talk-Show,Game-Show
0,0,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]",True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama,[Drama],False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
2,2,The Other Side of the Wind,2018.0,122,Drama,[Drama],False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,3,November 1828,2001.0,140,"Drama,War","[Drama, War]",False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,4,The Naked Monster,2005.0,100,"Comedy,Horror,Sci-Fi","[Comedy, Horror, Sci-Fi]",True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# drop transactions clumns
basics = basics.drop(columns=['genres','genre_split'])
basics.head()

Unnamed: 0,Movie_ID,primaryTitle,startYear,runtimeMinutes,Comedy,Fantasy,Romance,Drama,War,Horror,...,Family,History,Sport,Western,Adult,Short,Reality-TV,News,Talk-Show,Game-Show
0,0,Kate & Leopold,2001.0,118,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,The Tango of the Widower and Its Distorting Mi...,2020.0,70,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,The Other Side of the Wind,2018.0,122,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,November 1828,2001.0,140,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
4,4,The Naked Monster,2005.0,100,True,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [24]:
## save data for next lesson
basics.to_csv('basics.csv', index=False)

## Ratings Dataframe

In [25]:
ratings = pd.read_csv('Data/title_ratings.csv.gz')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1307127 entries, 0 to 1307126
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1307127 non-null  object 
 1   averageRating  1307127 non-null  float64
 2   numVotes       1307127 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.9+ MB


### Changing the name of column 'tconst' to Movie_ID

In [26]:
ratings = ratings.rename(columns ={'tconst':'Movie_ID'})
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1307127 entries, 0 to 1307126
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   Movie_ID       1307127 non-null  object 
 1   averageRating  1307127 non-null  float64
 2   numVotes       1307127 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.9+ MB


In [32]:
## save data for next lesson
ratings.to_csv('ratings.csv', index=False)

### TMDB_API Dataframe

In [27]:
tmdb_api = pd.read_csv('Data/tmdb_results_combined.csv.gz')
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4522 entries, 0 to 4521
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                4522 non-null   object 
 1   adult                  4521 non-null   float64
 2   backdrop_path          2071 non-null   object 
 3   belongs_to_collection  255 non-null    object 
 4   budget                 4521 non-null   float64
 5   genres                 4521 non-null   object 
 6   homepage               232 non-null    object 
 7   id                     4521 non-null   float64
 8   original_language      4521 non-null   object 
 9   original_title         4521 non-null   object 
 10  overview               4175 non-null   object 
 11  popularity             4521 non-null   float64
 12  poster_path            3894 non-null   object 
 13  production_companies   4521 non-null   object 
 14  production_countries   4521 non-null   object 
 15  rele

### Removing unwanted columns from Ratings

In [28]:
tmdb_api = tmdb_api.drop(columns=['adult', 'backdrop_path', 'belongs_to_collection', 'genres','homepage', 'id',
                                  'original_language', 'original_title', 'overview','popularity','poster_path',
                                  'production_companies', 'production_countries','release_date','runtime','spoken_languages',
                                  'status','tagline','title','video','vote_average','vote_count'])
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4522 entries, 0 to 4521
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   imdb_id        4522 non-null   object 
 1   budget         4521 non-null   float64
 2   revenue        4521 non-null   float64
 3   certification  841 non-null    object 
dtypes: float64(2), object(2)
memory usage: 141.4+ KB


In [29]:
tmdb_api.head(2)

Unnamed: 0,imdb_id,budget,revenue,certification
0,0,,,
1,tt1361336,50000000.0,136536687.0,PG


In [30]:
tmdb_api = tmdb_api.rename(columns ={'imdb_id':'Movie_ID'})
tmdb_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4522 entries, 0 to 4521
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Movie_ID       4522 non-null   object 
 1   budget         4521 non-null   float64
 2   revenue        4521 non-null   float64
 3   certification  841 non-null    object 
dtypes: float64(2), object(2)
memory usage: 141.4+ KB


In [31]:
## save data for next lesson
tmdb_api.to_csv('tmdb_api.csv', index=False)

## SQL

In [None]:
# imports
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote
# Create connection string using credentials following this format

In [None]:
import json
with open('/Users/miran/.secret/mysql.json') as f:
    login = json.load(f)
login.keys()

In [None]:
connection = f"mysql+pymysql://{login['username']}:{urlquote(login['password'])}@localhost/movie"

In [None]:
# create sqlite engine for a database called schools
engine = create_engine(connection)
# create connection to engine
conn = engine.connect()

In [None]:
database_exists(connection)

In [None]:
movie = pd.read_csv('Data/title_basics.csv.gz')
basics.info()

In [None]:
## Check if database exists, if not, create it
if database_exists(connection):
    print('It exists!')
else:
    create_database(connection)
    print('Database created!')

In [None]:
movie.to_sql('movie', engine, if_exists = 'replace')

In [None]:
q = """SELECT * FROM movie;"""
pd.read_sql(q, engine)

In [None]:
q = '''SHOW TABLES'''
pd.read_sql(q,engine)
