## **Movie Prediction Project**

Joe Lardie

2/19/2023

## **Imports**

In [1]:
import pandas as pd
import numpy as np
import os

## **Importing Data**

In [2]:
# Read the IMDb title.basics table (tab-separated) straight from the IMDb datasets URL
basics = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.basics.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [3]:
# Read the IMDb title.akas table (tab-separated) straight from the IMDb datasets URL
akas = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.akas.tsv.gz",
    sep="\t",
    low_memory=False,
)

In [4]:
# Read the IMDb title.ratings table (tab-separated) straight from the IMDb datasets URL
ratings = pd.read_csv(
    filepath_or_buffer="https://datasets.imdbws.com/title.ratings.tsv.gz",
    sep="\t",
    low_memory=False,
)

## **Preprocessing**

### **Data Cleaning**

#### **Akas**

In [5]:
# Replace the '\N' placeholder strings with NaN so pandas treats them as missing.
# The result is reassigned instead of using inplace=True: this matches how the
# basics and ratings tables are cleaned in this notebook and avoids mutating the
# loaded frame in place.
akas = akas.replace({'\\N': np.nan})

In [6]:
# Keep only the akas rows whose region is "US"
akas = akas[akas['region'].eq("US")]

#### **Basics**

In [8]:
# Turn the '\N' placeholder strings in basics into NaN
basics = basics.replace(to_replace={'\\N': np.nan})

In [9]:
# Remove rows that have no genres value, then print a summary of what is left
basics = basics.dropna(subset=['genres'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9256520 entries, 0 to 9694043
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 706.2+ MB


In [10]:
# Restrict basics to rows whose titleType is 'movie', then print a summary
basics = basics.loc[basics['titleType'].eq('movie')]
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 567090 entries, 8 to 9693994
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          567090 non-null  object
 1   titleType       567090 non-null  object
 2   primaryTitle    567090 non-null  object
 3   originalTitle   567090 non-null  object
 4   isAdult         567090 non-null  object
 5   startYear       482290 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  379101 non-null  object
 8   genres          567090 non-null  object
dtypes: object(9)
memory usage: 43.3+ MB


In [11]:
# Remove rows that have no startYear value, then print a summary of what is left
basics = basics.dropna(subset=['startYear'])
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 482290 entries, 8 to 9693994
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          482290 non-null  object
 1   titleType       482290 non-null  object
 2   primaryTitle    482290 non-null  object
 3   originalTitle   482290 non-null  object
 4   isAdult         482290 non-null  object
 5   startYear       482290 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  372703 non-null  object
 8   genres          482290 non-null  object
dtypes: object(9)
memory usage: 36.8+ MB


In [12]:
# Convert startYear to an integer column.
# `assign` returns a new frame instead of writing a column into `basics`, which
# at this point is the result of earlier row filtering; assigning into such a
# frame can raise pandas' SettingWithCopyWarning. The dtype is spelled out as
# int64 so it does not depend on the platform's default integer size.
basics = basics.assign(startYear=basics['startYear'].astype('int64'))

# Keep only the movies with a startYear from 2000 through 2022 (both ends inclusive)
basics = basics.loc[basics['startYear'].between(2000, 2022)]

# Summary statistics to confirm the year range after filtering
basics['startYear'].describe()

count    277051.000000
mean       2013.609974
std           5.901089
min        2000.000000
25%        2009.000000
50%        2014.000000
75%        2018.000000
max        2022.000000
Name: startYear, dtype: float64

In [13]:
# Flag every row whose genres string mentions "documentary" (case-insensitive),
# then keep only the rows that were not flagged.
documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~documentary]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,,,"Action,Crime"
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70.0,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001,,140.0,"Drama,War"


In [14]:
# Keep only the basics rows whose tconst also appears as a titleId in the
# (US-filtered) akas dataframe.
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[keepers]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


#### **Ratings**

In [15]:
# Turn the '\N' placeholder strings in ratings into NaN
ratings = ratings.replace(to_replace={'\\N': np.nan})

In [16]:
# Keep only the ratings rows whose tconst also appears as a titleId in the
# (US-filtered) akas dataframe.
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings.loc[keepers]
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1959
1,tt0000002,5.8,264
4,tt0000005,6.2,2596
5,tt0000006,5.1,177
6,tt0000007,5.4,815


## **Data File Storage**

In [17]:
# Make sure the output folder exists (no error if it is already there)
os.makedirs(name='Data/', exist_ok=True)
# List the folder's contents to confirm it was created
os.listdir(path="Data/")

['.ipynb_checkpoints']

In [18]:
# Write each cleaned table into the Data/ folder as a gzip-compressed CSV,
# leaving the row index out of the file.
basics.to_csv(path_or_buf="Data/basics.csv.gz", index=False, compression="gzip")
ratings.to_csv(path_or_buf="Data/ratings.csv.gz", index=False, compression="gzip")
akas.to_csv(path_or_buf="Data/akas.csv.gz", index=False, compression="gzip")

In [19]:
# Reload the saved basics file and preview the first rows to check the round trip
basics = pd.read_csv(filepath_or_buffer="Data/basics.csv.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118.0,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70.0,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122.0,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100.0,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126.0,Drama


In [20]:
# Reload the saved akas file and preview the first rows to check the round trip
akas = pd.read_csv(filepath_or_buffer="Data/akas.csv.gz", low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [21]:
# Reload the saved ratings file and preview the first rows to check the round trip
ratings = pd.read_csv(filepath_or_buffer="Data/ratings.csv.gz", low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1959
1,tt0000002,5.8,264
2,tt0000005,6.2,2596
3,tt0000006,5.1,177
4,tt0000007,5.4,815


### **File Info Summary**

In [22]:
# Print a summary of the reloaded basics table (row count, column dtypes, non-null counts)
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100527 entries, 0 to 100526
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          100527 non-null  object 
 1   titleType       100527 non-null  object 
 2   primaryTitle    100527 non-null  object 
 3   originalTitle   100527 non-null  object 
 4   isAdult         100527 non-null  int64  
 5   startYear       100527 non-null  int64  
 6   endYear         0 non-null       float64
 7   runtimeMinutes  86015 non-null   float64
 8   genres          100527 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 6.9+ MB


In [23]:
# Print a summary of the reloaded ratings table (row count, column dtypes, non-null counts)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492216 entries, 0 to 492215
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         492216 non-null  object 
 1   averageRating  492216 non-null  float64
 2   numVotes       492216 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.3+ MB


In [24]:
# Print a summary of the reloaded akas table (row count, column dtypes, non-null counts)
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1422072 entries, 0 to 1422071
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1422072 non-null  object 
 1   ordering         1422072 non-null  int64  
 2   title            1422072 non-null  object 
 3   region           1422072 non-null  object 
 4   language         3853 non-null     object 
 5   types            975811 non-null   object 
 6   attributes       46196 non-null    object 
 7   isOriginalTitle  1420727 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 86.8+ MB
