# IMBD Movies

- Kevin Ridge

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
main_url = "https://datasets.imdbws.com/"

basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
        
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
            
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

## Basics dataset filtering

#### Filtering/Cleaning Steps: 
- Title Basics:
- Replace "\N" with np.nan
- Eliminate movies that are null for runtimeMinutes
- Eliminate movies that are null for genre
- keep only titleType==Movie
- keep startYear 2000-2022
- Eliminate movies that include "Documentary" in genre (see tip below)
- Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below.

In [45]:
# Replace "\N" with np.nan
basics.replace({'\\N':np.nan}, inplace=True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67667,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86799,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [46]:
# Eliminate movies that are null for runtimeMinutes, and genres
basics.dropna(subset=['runtimeMinutes','genres'], inplace=True)

In [47]:
# keep only titleType==Movie
basics = basics[basics['titleType']=='movie']
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67667,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86799,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [48]:
# Check for Nan's
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           147865
runtimeMinutes         0
genres                 0
dtype: int64

In [49]:
# drop the NaN's so filter can be used
basics.dropna(subset = ['startYear'], inplace=True)

In [50]:
# Change startYear datatype to integer so filter can be used
basics['startYear'] = basics['startYear'].astype(int)

In [51]:
# Keep only movies with startYear 2000-2022
basics = basics.loc[(df_bas['startYear']>=2000)&(basics['startYear']<=2022)]

In [52]:
# Check filter for startYear
basics['startYear'].value_counts()

2018    9724
2017    9505
2019    9478
2022    9131
2016    9076
2015    8641
2021    8381
2014    8229
2013    7834
2020    7672
2012    7341
2011    6792
2010    6391
2009    6001
2008    5235
2007    4643
2006    4418
2005    3930
2004    3564
2003    3254
2002    3000
2001    2878
2000    2747
Name: startYear, dtype: int64

In [53]:
# Create object data type variable
dtypes = basics.dtypes
typ_obj = dtypes[dtypes== 'object'].index
typ_obj

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'runtimeMinutes', 'genres'],
      dtype='object')

In [54]:
# Inspect the object data types
for col in typ_obj:
  print(f'-Column= {col}')
  print(basics[col].value_counts(dropna=False))
  print('\n')

-Column= tconst
tt0035423     1
tt3255606     1
tt3262728     1
tt3262740     1
tt3262822     1
             ..
tt14019648    1
tt14020182    1
tt14020276    1
tt14020762    1
tt9916538     1
Name: tconst, Length: 147865, dtype: int64


-Column= titleType
movie    147865
Name: titleType, dtype: int64


-Column= primaryTitle
Broken                 25
Alone                  21
Homecoming             20
Run                    17
Home                   17
                       ..
Chal Mera Putt 3        1
Khesarot                1
L'amour impossible      1
Nature of the dream     1
Kuambil Lagi Hatiku     1
Name: primaryTitle, Length: 133183, dtype: int64


-Column= originalTitle
Broken                 24
Alone                  18
Run                    17
Homecoming             16
Gone                   15
                       ..
Coupled with Love       1
Almaz Black Box         1
Klyuch salamandry       1
Trader                  1
Kuambil Lagi Hatiku     1
Name: originalTitle, Length:

In [55]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [56]:
# Check remove documentary filter
basics['genres'].value_counts()

Drama                        36113
Comedy                       13477
Comedy,Drama                  6455
Horror                        5817
Drama,Romance                 4317
                             ...  
Family,Musical,Sport             1
Horror,Music,Mystery             1
Comedy,History,Mystery           1
Animation,Biography,Sport        1
Crime,Fantasy,Sci-Fi             1
Name: genres, Length: 954, dtype: int64

## AKA dataset filtering

In [57]:
# Display title akas info
akas.info()
akas.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1449100 entries, 5 to 36441940
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1449100 non-null  object
 1   ordering         1449100 non-null  int64 
 2   title            1449100 non-null  object
 3   region           1449100 non-null  object
 4   language         3979 non-null     object
 5   types            980896 non-null   object
 6   attributes       46915 non-null    object
 7   isOriginalTitle  1447758 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.5+ MB


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [58]:
# Replace "\N" with np.nan
akas.replace({'\\N': np.nan}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  akas.replace({'\\N': np.nan}, inplace=True)


In [59]:
# Keep only the movies from US region
akas = akas[akas['region']=='US']

In [60]:
# Filter the basics table down to only include the US by using the filter akas
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34803       True
42384       True
61115       True
67667       True
86799       True
           ...  
9980958     True
9980967     True
9981006    False
9981051     True
9981135    False
Name: tconst, Length: 147865, dtype: bool

In [61]:
# Display the filtered data
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
42384,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
61115,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67667,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86799,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9980423,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9980818,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9980958,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9980967,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [66]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1449100 entries, 5 to 36441940
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1449100 non-null  object
 1   ordering         1449100 non-null  int64 
 2   title            1449100 non-null  object
 3   region           1449100 non-null  object
 4   language         3979 non-null     object
 5   types            980896 non-null   object
 6   attributes       46915 non-null    object
 7   isOriginalTitle  1447758 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.5+ MB


## Ratings dataset filtering

In [62]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326238 entries, 0 to 1326237
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1326238 non-null  object 
 1   averageRating  1326238 non-null  float64
 2   numVotes       1326238 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.4+ MB


In [63]:
# Replace "\N" with np.nan (if any)
ratings.replace({'\\N': np.nan}, inplace=True)

In [64]:
# Keep only US movies 
ratings = ratings['tconst'].isin(akas['titleId'])
ratings

0           True
1           True
2          False
3          False
4           True
           ...  
1326233    False
1326234    False
1326235    False
1326236    False
1326237    False
Name: tconst, Length: 1326238, dtype: bool

In [65]:
# Display the filtered data
ratings = ratings[ratings]
ratings

0          True
1          True
4          True
5          True
6          True
           ... 
1326199    True
1326200    True
1326207    True
1326208    True
1326213    True
Name: tconst, Length: 502679, dtype: bool

In [68]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['.ipynb_checkpoints']

In [69]:
# Save basics file to data folder
basics.to_csv('Data/title_basics_cleaned.csv.gz',compression='gzip',index=False)

In [70]:
# Save akas file to data folder
akas.to_csv('Data/title_akas_cleaned.csv.gz',compression='gzip',index=False)

In [71]:
# Save ratings file to data folder
ratings.to_csv('Data/title_ratings_cleaned.csv.gz',compression='gzip',index=False)

In [None]:
# Eliminate movies that include "Documentary" in genre
#df_bas1 = df_bas[~df_bas['genres'].isin(['Documentary'])]