## **Movie Prediction ptoject**

Joe Lardie

2/19/2023

## **Imports**

In [1]:
import pandas as pd
import numpy as np

## **Importing Data**

In [2]:
#Loading Basics data set
basics_url = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz",sep='\t', low_memory=False)

In [3]:
#Loading askas data set
askas_url = pd.read_csv("https://datasets.imdbws.com/title.akas.tsv.gz",sep='\t', low_memory=False)

In [4]:
#Loading ratings data set
ratings_url = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz",sep='\t', low_memory=False)

In [5]:
basics_url.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## **Preprocessing**

### **Data Cleaning**

In [6]:
#Replacing '\\N' with nan
basics_url.replace({'\\N':np.nan}, inplace = True)

In [7]:
#Checking it worked
basics_url.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [8]:
#Replacing '\\N' with nan
askas_url.replace({'\\N':np.nan}, inplace = True)

In [9]:
#Checking it worked
askas_url.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [10]:
#Replacing '\\N' with nan
ratings_url.replace({'\\N':np.nan}, inplace = True)

In [11]:
#Checking it worked
ratings_url.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1954
1,tt0000002,5.8,263
2,tt0000003,6.5,1787
3,tt0000004,5.6,179
4,tt0000005,6.2,2589


## **Eliminate movies that are null for runtimeminutes**

In [12]:
basics_url = basics_url.dropna(subset=["runtimeMinutes"])

## **Eliminate movies that are null for genre**

In [13]:
basics_url = basics_url.dropna(subset=['genres'])

## **Keep only titleType==Movie**

In [14]:
basics_url = basics_url[basics_url['titleType'].str.lower() == 'movie']

## **Keep startYear 200-2022**

In [15]:
basics_url.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6381
endYear           377695
runtimeMinutes         0
genres                 0
dtype: int64

In [16]:
#Fill the missing values with (0) and convert the column to integer type
basics_url['startYear']=basics_url['startYear'].fillna(0).astype(float)

In [17]:
basics_url.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377695 entries, 8 to 9638170
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          377695 non-null  object 
 1   titleType       377695 non-null  object 
 2   primaryTitle    377695 non-null  object 
 3   originalTitle   377695 non-null  object 
 4   isAdult         377695 non-null  object 
 5   startYear       377695 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  377695 non-null  object 
 8   genres          377695 non-null  object 
dtypes: float64(1), object(8)
memory usage: 28.8+ MB


In [18]:
#include only movies that were released between 2000 - 2021(include 2000 & 2021)
filtered_movies = basics_url[(basics_url['startYear'] >= 2000)&(basics_url['startYear'] <= 2021)]

In [19]:
print(filtered_movies)

            tconst titleType  \
13082    tt0013274     movie   
34803    tt0035423     movie   
61116    tt0062336     movie   
67669    tt0069049     movie   
77964    tt0079644     movie   
...            ...       ...   
9637986  tt9916362     movie   
9638070  tt9916538     movie   
9638111  tt9916622     movie   
9638138  tt9916680     movie   
9638170  tt9916754     movie   

                                              primaryTitle  \
13082                          Istoriya grazhdanskoy voyny   
34803                                       Kate & Leopold   
61116    The Tango of the Widower and Its Distorting Mi...   
67669                           The Other Side of the Wind   
77964                                        November 1828   
...                                                    ...   
9637986                                              Coven   
9638070                                Kuambil Lagi Hatiku   
9638111        Rodolpho Teóphilo - O Legado de um Pioneir

In [20]:
#Checking that only movies between 2000-2021 remain.
filtered_movies['startYear'].value_counts()

2017.0    14313
2018.0    14266
2019.0    13983
2016.0    13913
2015.0    13429
2014.0    13051
2013.0    12350
2021.0    12167
2012.0    11605
2020.0    11454
2011.0    10747
2010.0    10181
2009.0     9325
2008.0     8128
2007.0     6940
2006.0     6486
2005.0     5801
2004.0     5181
2003.0     4567
2002.0     4116
2001.0     3846
2000.0     3630
Name: startYear, dtype: int64

## **Include only fictional Movies**

In [21]:
# Exclude movies that are included in the documentary category.
is_documentary = filtered_movies['genres'].str.contains('documentary',case=False)
filtered_movies = filtered_movies[~is_documentary]

## **Include only movies that were released in the US**

In [22]:
usfilter = askas_url['region']=='US'
askas_url = askas_url[usfilter]

In [23]:
askas_url = askas_url.rename(columns={'titleId' : 'tconst'})
merged_url = pd.merge(filtered_movies, askas_url, on='tconst')

In [24]:
merged_url.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",37,Kate and Leopold,US,,,alternative spelling,0
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance",39,Kate & Leopold,US,,imdbDisplay,,0
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama,6,The Tango of the Widower and Its Distorting Mi...,US,,imdbDisplay,,0
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama,3,The Other Side of the Wind,US,,imdbDisplay,,0
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi",1,Attack of the B-Movie Monster,US,,working,,0


In [25]:
basics_url.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 377695 entries, 8 to 9638170
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          377695 non-null  object 
 1   titleType       377695 non-null  object 
 2   primaryTitle    377695 non-null  object 
 3   originalTitle   377695 non-null  object 
 4   isAdult         377695 non-null  object 
 5   startYear       377695 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  377695 non-null  object 
 8   genres          377695 non-null  object 
dtypes: float64(1), object(8)
memory usage: 28.8+ MB


In [26]:
askas_url.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1416568 entries, 5 to 35026261
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   tconst           1416568 non-null  object
 1   ordering         1416568 non-null  int64 
 2   title            1416568 non-null  object
 3   region           1416568 non-null  object
 4   language         3833 non-null     object
 5   types            974118 non-null   object
 6   attributes       46043 non-null    object
 7   isOriginalTitle  1415223 non-null  object
dtypes: int64(1), object(7)
memory usage: 97.3+ MB


In [27]:
ratings_url.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1282932 entries, 0 to 1282931
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1282932 non-null  object 
 1   averageRating  1282932 non-null  float64
 2   numVotes       1282932 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.4+ MB
