# IMDb Movies

- Kevin Ridge

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Base location of the IMDb dataset dumps; every file URL is derived from it
# so the host only has to be changed in one place.
main_url = "https://datasets.imdbws.com/"

# Title basics: type, titles, years, runtime and genres for each title.
basics_url = f"{main_url}title.basics.tsv.gz"

# Alternate titles (AKAs), including the region each title was released in.
akas_url = f"{main_url}title.akas.tsv.gz"

# Average rating and vote count for each title.
ratings_url = f"{main_url}title.ratings.tsv.gz"

In [3]:
# Load each gzipped, tab-separated IMDb file into its own DataFrame.
# low_memory=False parses every file in a single pass, so column dtypes are
# inferred from the whole column rather than chunk by chunk.
read_opts = {'sep': '\t', 'low_memory': False}
basics = pd.read_csv(basics_url, **read_opts)
akas = pd.read_csv(akas_url, **read_opts)
ratings = pd.read_csv(ratings_url, **read_opts)

## Basics dataset filtering

#### Filtering/Cleaning Steps: 
- Title Basics:
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- Keep only titleType == "movie"
- keep startYear 2000-2022
- Eliminate movies that include "Documentary" in genre (see tip below)
- Keep only US movies (use the AKAs table; see the "Filtering one dataframe based on another" section below).

In [4]:
# IMDb marks missing values with the literal string "\N"; convert those to
# real NaN so pandas' missing-data tools recognise them.
basics = basics.replace('\\N', np.nan)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [5]:
# Drop every title that has no runtimeMinutes or no genres value
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [6]:
# Restrict the table to rows whose titleType is exactly 'movie'
basics = basics.loc[basics['titleType'].eq('movie')]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"


In [7]:
# Count the missing values remaining in each column
basics.isnull().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6520
endYear           384809
runtimeMinutes         0
genres                 0
dtype: int64

In [8]:
# Rows without a startYear cannot be compared against the year range,
# so remove them before filtering
basics = basics.dropna(subset=['startYear'])

In [9]:
# Convert startYear to integers so it can be compared numerically.
# assign() returns a new DataFrame rather than writing a column into a frame
# that came from boolean slicing, which avoids pandas'
# SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(int))

In [12]:
# Keep only movies released from 2000 through 2022 (both ends inclusive)
basics = basics[basics['startYear'].between(2000, 2022)]

In [13]:
# Sanity check: every remaining startYear should fall within 2000-2022
year_counts = basics['startYear'].value_counts()
year_counts

2017    14393
2018    14369
2019    14119
2016    13981
2015    13493
2014    13134
2022    12982
2021    12440
2013    12404
2012    11661
2020    11609
2011    10787
2010    10217
2009     9375
2008     8168
2007     6975
2006     6537
2005     5856
2004     5219
2003     4606
2002     4140
2001     3882
2000     3651
Name: startYear, dtype: int64

In [14]:
# Collect the names of the columns that are stored with the object dtype
dtypes = basics.dtypes
is_object_col = (dtypes == 'object').to_numpy()
typ_obj = basics.columns[is_object_col]
typ_obj

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [15]:
# Print the frequency of every value (NaN included) for each object column
for col in typ_obj:
    counts = basics[col].value_counts(dropna=False)
    print(f'-Column= {col}', counts, '\n', sep='\n')

-Column= tconst
tt0013274    1
tt3395356    1
tt3395118    1
tt3395128    1
tt3395166    1
            ..
tt1457764    1
tt1457765    1
tt1457766    1
tt1457767    1
tt9916754    1
Name: tconst, Length: 223998, dtype: int64


-Column= titleType
movie    223998
Name: titleType, dtype: int64


-Column= primaryTitle
Home                              31
Broken                            26
Alone                             24
Metamorphosis                     23
Homecoming                        21
                                  ..
Waster                             1
Il Gomorrista                      1
Petolinie na pametta               1
I Was There in Color               1
Chico Albuquerque - Revelações     1
Name: primaryTitle, Length: 204085, dtype: int64


-Column= originalTitle
Home                                  26
Broken                                25
Alone                                 19
Run                                   19
Homecoming                            16

In [16]:
# Flag every movie whose genres string mentions "documentary" in any casing,
# then keep only the rows that are not flagged.
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]

In [17]:
# Confirm that no genre combination containing Documentary is left
genre_counts = basics['genres'].value_counts()
genre_counts

Drama                        36115
Comedy                       13477
Comedy,Drama                  6455
Horror                        5819
Drama,Romance                 4317
                             ...  
Family,Musical,Sport             1
Horror,Music,Mystery             1
Comedy,History,Mystery           1
Animation,Biography,Sport        1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 954, dtype: int64

## AKA dataset filtering

In [18]:
# Summarise the akas columns and dtypes, then preview the first five rows
akas.info()
akas.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36450954 entries, 0 to 36450953
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [19]:
# Convert IMDb's "\N" missing-value marker to real NaN
akas = akas.replace('\\N', np.nan)

In [20]:
# Keep only the alternate-title rows whose region is 'US'
akas = akas.loc[akas['region'].eq('US')]

In [21]:
# Build a boolean mask over basics: True where the movie's tconst appears
# among the titleIds of the US-only akas table.
us_title_ids = akas['titleId']
keepers = basics['tconst'].isin(us_title_ids)
keepers

34803       True
61115       True
67667       True
86799       True
93936       True
           ...  
9983200     True
9983209     True
9983248    False
9983293     True
9983377    False
Name: tconst, Length: 147872, dtype: bool

In [22]:
# Apply the US mask to basics and show the resulting table
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67667,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86799,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93936,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9982665,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9983060,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9983200,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9983209,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [23]:
# Summarise the akas table (row count, non-null counts, dtypes, memory use)
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1449308 entries, 5 to 36450698
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1449308 non-null  object
 1   ordering         1449308 non-null  int64 
 2   title            1449308 non-null  object
 3   region           1449308 non-null  object
 4   language         3981 non-null     object
 5   types            980948 non-null   object
 6   attributes       46927 non-null    object
 7   isOriginalTitle  1447966 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.5+ MB


## Ratings dataset filtering

In [24]:
# Summarise the ratings table (row count, non-null counts, dtypes, memory use)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326959 entries, 0 to 1326958
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1326959 non-null  object 
 1   averageRating  1326959 non-null  float64
 2   numVotes       1326959 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.4+ MB


In [25]:
# Convert any "\N" missing-value markers in ratings to real NaN
ratings = ratings.replace('\\N', np.nan)

In [26]:
# Keep only US movies 
ratings = ratings['tconst'].isin(akas['titleId'])
ratings

0           True
1           True
2          False
3          False
4           True
           ...  
1326954    False
1326955    False
1326956    False
1326957    False
1326958    False
Name: tconst, Length: 1326959, dtype: bool

In [27]:
# Display the filtered data
ratings = ratings[ratings]
ratings

0          True
1          True
4          True
5          True
6          True
           ... 
1326920    True
1326921    True
1326928    True
1326929    True
1326934    True
Name: tconst, Length: 502824, dtype: bool

In [28]:
# Print the info summary of `ratings` after the US filter to check the result
ratings.info()

<class 'pandas.core.series.Series'>
Int64Index: 502824 entries, 0 to 1326934
Series name: tconst
Non-Null Count   Dtype
--------------   -----
502824 non-null  bool 
dtypes: bool(1)
memory usage: 4.3 MB


In [29]:
# Create the output folder; exist_ok=True makes this a no-op when the folder
# is already present.
import os

output_dir = 'Data/'
os.makedirs(output_dir, exist_ok=True)
# List the folder contents to confirm that it exists
os.listdir(output_dir)

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'title_akas_cleaned.csv.gz',
 'title_basics_cleaned.csv.gz',
 'title_ratings_cleaned.csv.gz',
 'tmdb_api_results_2000.json']

In [30]:
# Write the cleaned basics table as a gzip-compressed CSV, omitting the index
basics.to_csv(
    'Data/title_basics_cleaned.csv.gz',
    index=False,
    compression='gzip',
)

In [31]:
# Write the US-only akas table as a gzip-compressed CSV, omitting the index
akas.to_csv(
    'Data/title_akas_cleaned.csv.gz',
    index=False,
    compression='gzip',
)

In [32]:
# Write `ratings` as a gzip-compressed CSV, omitting the index
ratings.to_csv(
    'Data/title_ratings_cleaned.csv.gz',
    index=False,
    compression='gzip',
)