# IMDb Movies

- Kevin Ridge

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Root address of the dataset host (not referenced again in the cells shown here).
main_url = "https://datasets.imdbws.com/"

# One gzipped, tab-separated file per table loaded below.
basics_url, akas_url, ratings_url = (
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
)

In [3]:
# Download and parse the title-basics and title-akas tables (tab-separated, gzipped).
# Both reads use the same options; basics is read first, then akas.
basics, akas = (
    pd.read_csv(url, sep='\t', low_memory=False)
    for url in (basics_url, akas_url)
)


In [33]:
# Download and parse the title-ratings table (tab-separated, gzipped).
ratings = pd.read_csv(
    filepath_or_buffer=ratings_url,
    sep='\t',
    low_memory=False,
)

## Basics dataset filtering

#### Filtering/Cleaning Steps: 
- Title Basics:
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- Keep only titleType == "movie"
- Keep only startYear 2000-2022 (inclusive)
- Eliminate movies that include "Documentary" in genre (see tip below)
- Keep only US movies (use the AKAs table; see the "Filtering one dataframe based on another" section below).

In [4]:
# Turn every literal "\N" cell in the basics table into a real missing value (np.nan),
# then preview the first rows.
basics = basics.replace({'\\N': np.nan})
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [5]:
# Discard titles that are missing either a runtime or a genre list.
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(subset=required_columns)

In [6]:
# Keep only rows whose titleType is exactly 'movie', then preview the result.
is_movie = basics['titleType'] == 'movie'
basics = basics.loc[is_movie]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


In [7]:
# Count the missing values (NaN) remaining in each column of basics.
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6549
endYear           385568
runtimeMinutes         0
genres                 0
dtype: int64

In [8]:
# Remove titles without a startYear so the column can be cast and range-filtered next.
basics = basics.dropna(subset=['startYear'])

In [9]:
# Cast startYear to integers so numeric comparisons can be used in the year filter.
start_year_as_int = basics['startYear'].astype(int)
basics = basics.assign(startYear=start_year_as_int)

In [10]:
# Restrict to titles released from 2000 through 2022 (both bounds included).
in_year_range = basics['startYear'].between(2000, 2022)
basics = basics.loc[in_year_range]

In [11]:
# Verify the startYear filter: count titles per year; only 2000-2022 should appear.
basics['startYear'].value_counts()

2017    14404
2018    14385
2019    14138
2016    13993
2015    13502
2014    13141
2022    13023
2021    12471
2013    12407
2012    11661
2020    11628
2011    10790
2010    10219
2009     9378
2008     8170
2007     6979
2006     6545
2005     5860
2004     5219
2003     4606
2002     4145
2001     3886
2000     3655
Name: startYear, dtype: int64

In [12]:
# Collect the names of the columns that pandas stores with the generic 'object' dtype.
dtypes = basics.dtypes
is_object_dtype = dtypes == 'object'
typ_obj = dtypes.loc[is_object_dtype].index
typ_obj

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [13]:
# For each object-typed column, print its name, the frequency of every value
# (missing values included), and a blank-line separator.
for col in typ_obj:
    counts = basics[col].value_counts(dropna=False)
    print(f'-Column= {col}', counts, '\n', sep='\n')

-Column= tconst
tt0013274     1
tt3392324     1
tt3391692     1
tt3391710     1
tt3391782     1
             ..
tt1458550     1
tt14585564    1
tt14585862    1
tt14585902    1
tt9916754     1
Name: tconst, Length: 224205, dtype: int64


-Column= titleType
movie    224205
Name: titleType, dtype: int64


-Column= primaryTitle
Home                              31
Broken                            26
Alone                             24
Metamorphosis                     23
Homecoming                        21
                                  ..
Love Sorries                       1
Lords of Fuzz                      1
The Last Christeros                1
Love Khichdi                       1
Chico Albuquerque - Revelações     1
Name: primaryTitle, Length: 204284, dtype: int64


-Column= originalTitle
Home                              26
Broken                            25
Run                               19
Alone                             19
Gone                              16
        

In [14]:
# Drop every title whose genres string mentions "documentary" (case-insensitive match).
genre_strings = basics['genres'].str
is_documentary = genre_strings.contains(pat='documentary', case=False)
basics = basics.loc[~is_documentary]

In [15]:
# Verify the documentary filter: list the remaining genre combinations and their counts.
basics['genres'].value_counts()

Drama                        36122
Comedy                       13470
Comedy,Drama                  6459
Horror                        5900
Drama,Romance                 4324
                             ...  
Action,Fantasy,Western           1
Family,Musical,Sport             1
Comedy,History,Mystery           1
Animation,Biography,Sport        1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 955, dtype: int64

## AKA dataset filtering

In [16]:
# Show the title-akas table's schema and memory usage (printed by info()),
# followed by its first rows (the cell's displayed value).
akas.info()
akas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36573690 entries, 0 to 36573689
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [17]:
# Turn every literal "\N" cell in the akas table into a real missing value (np.nan).
akas = akas.replace({'\\N': np.nan})

In [18]:
# Keep only the alternate-title rows whose region is 'US'.
is_us_region = akas['region'] == 'US'
akas = akas.loc[is_us_region]

In [19]:
# Build a boolean mask over basics: True where the title's tconst appears among
# the titleId values of the (US-filtered) akas table.
us_title_ids = akas['titleId']
keepers = basics['tconst'].isin(us_title_ids)
keepers

34802        True
61114        True
67666        True
86793        True
93930        True
            ...  
10014414     True
10014453    False
10014498     True
10014582    False
10014672    False
Name: tconst, Length: 148019, dtype: bool

In [20]:
# Apply the `keepers` mask so basics holds only titles with a US akas entry, then show it.
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
10013870,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
10014265,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
10014405,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
10014414,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [21]:
# Summarise the akas table after the US filter: row count, non-null counts, dtypes, memory.
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1452045 entries, 5 to 36573434
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452045 non-null  object
 1   ordering         1452045 non-null  int64 
 2   title            1452045 non-null  object
 3   region           1452045 non-null  object
 4   language         4015 non-null     object
 5   types            981406 non-null   object
 6   attributes       46999 non-null    object
 7   isOriginalTitle  1450703 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.7+ MB


## Ratings dataset filtering

In [34]:
# Summarise the ratings table as loaded: row count, non-null counts, dtypes, memory.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330841 entries, 0 to 1330840
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1330841 non-null  object 
 1   averageRating  1330841 non-null  float64
 2   numVotes       1330841 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


In [35]:
# Build a boolean mask over ratings: True where the rated title's tconst appears
# among the titleId values of the (US-filtered) akas table.
us_title_ids = akas['titleId']
US = ratings['tconst'].isin(us_title_ids)
US

0           True
1           True
2          False
3          False
4           True
           ...  
1330836    False
1330837    False
1330838    False
1330839    False
1330840    False
Name: tconst, Length: 1330841, dtype: bool

In [36]:
# Replace "\N" with np.nan (if any).
# DataFrame.replace returns a new frame unless inplace=True is passed, so the result
# is assigned back to `ratings`; without the assignment the replacement is discarded.
# The row index is unchanged, so a boolean mask built from `ratings` earlier still aligns.
ratings = ratings.replace({'\\N': np.nan})
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1987
1,tt0000002,5.8,265
2,tt0000003,6.5,1848
3,tt0000004,5.5,178
4,tt0000005,6.2,2631
...,...,...,...
1330836,tt9916730,8.3,10
1330837,tt9916766,7.0,21
1330838,tt9916778,7.2,36
1330839,tt9916840,7.5,7


In [37]:
# Apply the `US` mask so ratings holds only titles with a US akas entry, then show it.
ratings = ratings.loc[US]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1987
1,tt0000002,5.8,265
4,tt0000005,6.2,2631
5,tt0000006,5.1,182
6,tt0000007,5.4,824
...,...,...,...
1330802,tt9916200,8.1,231
1330803,tt9916204,8.2,264
1330810,tt9916348,8.3,18
1330811,tt9916362,6.4,5420


In [38]:
# Summarise the ratings table after the US filter: row count, non-null counts, dtypes, memory.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 503742 entries, 0 to 1330816
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         503742 non-null  object 
 1   averageRating  503742 non-null  float64
 2   numVotes       503742 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.4+ MB


In [None]:
# Make sure the output folder exists before the cleaned tables are written into it.
# exist_ok=True keeps this cell re-runnable when the folder is already present.
# `os` is imported with the other modules in the notebook's first code cell.
os.makedirs('Data/', exist_ok=True)
# Confirm the folder was created by listing its contents
os.listdir('Data/')

In [None]:
# Write the cleaned basics table to the Data folder as a gzip-compressed CSV,
# leaving the row index out of the file.
basics.to_csv(
    path_or_buf='Data/title_basics_cleaned.csv.gz',
    index=False,
    compression='gzip',
)

In [None]:
# Write the US-filtered akas table to the Data folder as a gzip-compressed CSV,
# leaving the row index out of the file.
akas.to_csv(
    path_or_buf='Data/title_akas_cleaned.csv.gz',
    index=False,
    compression='gzip',
)

In [39]:
# Write the US-filtered ratings table to the Data folder as a gzip-compressed CSV,
# leaving the row index out of the file.
ratings.to_csv(
    path_or_buf='Data/title_ratings_cleaned.csv.gz',
    index=False,
    compression='gzip',
)