## **Movie Prediction ptoject**

Joe Lardie

2/19/2023

## **Imports**

In [1]:
# Numerical and Data Analysis Libraries
import pandas as pd
import numpy as np

# Standard Libraries
import os

## **Importing Data**

In [2]:
#Loading Basics data set
basics = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz",sep='\t', low_memory=False)

In [3]:
#Loading akas data set
akas = pd.read_csv("https://datasets.imdbws.com/title.akas.tsv.gz",sep='\t', low_memory=False)

In [4]:
#Loading ratings data set
ratings = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz",sep='\t', low_memory=False)

## **Preprocessing**

### **Data Cleaning**

#### **Akas**

In [5]:
#Replacing '\\N' with nan
akas.replace({'\\N':np.nan}, inplace = True)

In [6]:
#Keep only US movies 
akas=akas.loc[akas['region']=="US"]

#### **Basics**

In [7]:
#replace \N with np.nan
basics = basics.replace({'\\N':np.nan})

In [8]:
#drop null genre
basics = basics.dropna(axis=0,subset = 'genres')
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10568762 entries, 0 to 11065002
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 806.3+ MB


In [9]:
#keep titletype=movie
basics= basics[basics['titleType']=='movie']
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 616275 entries, 8 to 11064953
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          616275 non-null  object
 1   titleType       616275 non-null  object
 2   primaryTitle    616273 non-null  object
 3   originalTitle   616273 non-null  object
 4   isAdult         616275 non-null  object
 5   startYear       521922 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  410109 non-null  object
 8   genres          616275 non-null  object
dtypes: object(9)
memory usage: 47.0+ MB


In [10]:
#drop null startYears
basics = basics.dropna(axis=0,subset = 'startYear')
basics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 521922 entries, 8 to 11064953
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          521922 non-null  object
 1   titleType       521922 non-null  object
 2   primaryTitle    521920 non-null  object
 3   originalTitle   521920 non-null  object
 4   isAdult         521922 non-null  object
 5   startYear       521922 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  402838 non-null  object
 8   genres          521922 non-null  object
dtypes: object(9)
memory usage: 39.8+ MB


In [11]:
#convert year to an int
basics['startYear']= basics['startYear'].astype('int')

#Keep only the movies between 2000-2022
basics= basics.loc[(basics["startYear"]>= 2000) 
                        & (basics["startYear"]<= 2022)]
basics['startYear'].describe()

count    285768.000000
mean       2013.633559
std           5.923038
min        2000.000000
25%        2009.000000
50%        2014.000000
75%        2019.000000
max        2022.000000
Name: startYear, dtype: float64

In [12]:
# Exclude movies that are included in the documentary category.
documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11632,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,,,"Action,Crime"
34795,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
61105,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70.0,Drama
67657,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
80541,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94.0,Horror


In [13]:
# Filter the basics table down to only include the US by using the filter ...
#Akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
basics = basics[keepers]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34795,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61105,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67657,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
80541,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94,Horror
86783,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


#### **Ratings**

In [14]:
#replace \N with np.nan
ratings = ratings.replace({'\\N':np.nan})

In [15]:
# Filter the ratings table down to only include the US by using the filter ...
#Akas dataframe
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers]
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2083
1,tt0000002,5.6,281
4,tt0000005,6.2,2820
5,tt0000006,5.0,194
6,tt0000007,5.4,886


# **Data File Storage**

In [16]:
# creating a new folder with os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['akas.csv.gz',
 'basics.csv.gz',
 'final_tmdb_data_2000 (1).csv.gz',
 'final_tmdb_data_2000 (2).csv.gz',
 'final_tmdb_data_2000 (3).csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'final_tmdb_data_2003.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'final_tmdb_data_2005.csv.gz',
 'final_tmdb_data_2006.csv.gz',
 'final_tmdb_data_2007.csv.gz',
 'final_tmdb_data_2008.csv.gz',
 'final_tmdb_data_2009.csv.gz',
 'final_tmdb_data_2010.csv.gz',
 'final_tmdb_data_2011.csv.gz',
 'final_tmdb_data_2012.csv.gz',
 'final_tmdb_data_2013.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'final_tmdb_data_2015.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'final_tmdb_data_2017.csv.gz',
 'final_tmdb_data_2018.csv.gz',
 'final_tmdb_data_2019.csv.gz',
 'final_tmdb_data_2020.csv.gz',
 'ratings.csv.gz',
 'tmdb_api_results_2000.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_api_results_2002.json',
 'tmdb_api_results_2003.json',
 'tmdb_api

In [17]:
## Save current dataframe to file.
basics.to_csv("Data/basics.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/akas.csv.gz",compression='gzip',index=False)

In [18]:
# Open saved file and preview again
basics = pd.read_csv("Data/basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70.0,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
3,tt0082328,movie,Embodiment of Evil,Encarnação do Demônio,0,2008,,94.0,Horror
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100.0,"Comedy,Horror,Sci-Fi"


In [19]:
# Open saved file and preview again
akas = pd.read_csv("Data/akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,3,Carmencita,US,,imdbDisplay,,0
1,tt0000002,6,The Clown and His Dogs,US,,,literal English title,0
2,tt0000005,3,Blacksmith Scene,US,,imdbDisplay,,0
3,tt0000005,5,Blacksmith Scene #1,US,,alternative,,0
4,tt0000005,7,Blacksmithing,US,,,informal alternative title,0


In [20]:
# Open saved file and preview again
ratings = pd.read_csv("Data/ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2083
1,tt0000002,5.6,281
2,tt0000005,6.2,2820
3,tt0000006,5.0,194
4,tt0000007,5.4,886


### **File Info Summary**

In [21]:
# Preview of Basics data
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103737 entries, 0 to 103736
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          103737 non-null  object 
 1   titleType       103737 non-null  object 
 2   primaryTitle    103735 non-null  object 
 3   originalTitle   103735 non-null  object 
 4   isAdult         103737 non-null  int64  
 5   startYear       103737 non-null  int64  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  89371 non-null   float64
 8   genres          103737 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 7.1+ MB


In [22]:
# Preview of Ratings data
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543702 entries, 0 to 543701
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         543702 non-null  object 
 1   averageRating  543702 non-null  float64
 2   numVotes       543702 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 12.4+ MB


In [23]:
# Preview of akas data
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1554878 entries, 0 to 1554877
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1554878 non-null  object
 1   ordering         1554878 non-null  int64 
 2   title            1554875 non-null  object
 3   region           1554878 non-null  object
 4   language         7233 non-null     object
 5   types            1005739 non-null  object
 6   attributes       49454 non-null    object
 7   isOriginalTitle  1554878 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 94.9+ MB
