# Data Enrichment Project 3 

# Part 1: Download several files from IMDB’s movie data set and filter out the subset of moves requested by the stakeholder.

In [1]:
# Impors
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
basics_url= "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url= "https://datasets.imdbws.com/title.ratings.tsv.gz"
aka_url= "https://datasets.imdbws.com/title.akas.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
aka = pd.read_csv(aka_url, sep='\t', low_memory=False)

In [4]:
# Mount and loading: Akas Dataset

In [5]:
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [6]:
# Checking for duplicates
aka.duplicated().sum()

0

In [7]:
# Identify and address missing values 
aka.isna().sum()

titleId              0
ordering             0
title                5
region             105
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [8]:
aka = aka[(aka['region']=='US')]

In [9]:
## Replace "\N" with np.nan.
aka.replace({'\\N':np.nan},inplace=True)

In [10]:
aka.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [11]:
# Mount and loading: Basics Dataset

In [12]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [13]:
# Checking for duplicates
basics.duplicated().sum()

0

In [14]:
# Identify and address missing values 
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [15]:
 # Replace "\N" with np.nan.
basics.replace({'\\N':np.nan},inplace=True)

In [16]:
# Eliminate movies that are null for runtimeMinute & genres.
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [17]:
# Keep only titleType==Movie.
basics = basics[(basics['titleType']=='movie')]

In [18]:
# Convert startyear to numeric for slicing.
basics['startYear'] = basics['startYear'].astype(float)

In [19]:
basics = basics[(basics['startYear']>=2000)&(basics['startYear']<2022)]

In [20]:
# Eliminate movies that include "Documentary" in genre.
is_documentary = basics['genres'].str.contains('Documentary',case=False)
basics = basics[~is_documentary]

In [21]:
# Keep only US movies.
# Create the filter.
keep_US_movies = basics['tconst'].isin(aka['titleId'])
# Apply the filter to the dataset.
basics = basics[keep_US_movies]

In [22]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86771,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93907,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama


In [23]:
# Mount and loading: Ratings Dataset

In [24]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1914
1,tt0000002,5.8,259
2,tt0000003,6.5,1720
3,tt0000004,5.6,172
4,tt0000005,6.2,2537


In [25]:
# Checking for duplicates
ratings.duplicated().sum()

0

In [26]:
# Identify and address missing values 
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [27]:
# Replace "\N" with np.nan
ratings.replace({'\\N':np.nan}, inplace=True)

In [28]:
# Keep only US movies.
# Create the filter.
keep_US_movies = ratings['tconst'].isin(aka['titleId'])
# Apply the filter to the dataset.
ratings = ratings[keep_US_movies]

In [29]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1914
1,tt0000002,5.8,259
4,tt0000005,6.2,2537
5,tt0000006,5.1,175
6,tt0000007,5.4,793


In [30]:
aka.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1356655 entries, 5 to 33413442
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1356655 non-null  object
 1   ordering         1356655 non-null  int64 
 2   title            1356655 non-null  object
 3   region           1356655 non-null  object
 4   language         3685 non-null     object
 5   types            964524 non-null   object
 6   attributes       45129 non-null    object
 7   isOriginalTitle  1355280 non-null  object
dtypes: int64(1), object(7)
memory usage: 93.2+ MB


In [31]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79709 entries, 34792 to 9282652
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          79709 non-null  object 
 1   titleType       79709 non-null  object 
 2   primaryTitle    79709 non-null  object 
 3   originalTitle   79709 non-null  object 
 4   isAdult         79709 non-null  object 
 5   startYear       79709 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  79709 non-null  object 
 8   genres          79709 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.1+ MB


In [32]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 477742 entries, 0 to 1261835
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         477742 non-null  object 
 1   averageRating  477742 non-null  float64
 2   numVotes       477742 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.6+ MB


In [33]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [34]:
## Save current dataframe to file.
aka.to_csv("Data/title_aka.csv.gz",compression='gzip',index=False)

In [35]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)