# Project 3 Part 1 (CORE)

**Marco Jimenez**

**6/21/2022**

In [1]:
import pandas as pd
import numpy as np

In [2]:
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [4]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Processing Basics Data

In [5]:
# Checking basic column info
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9009150 entries, 0 to 9009149
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 618.6+ MB


In [6]:
# Checking for duplicates
basics.duplicated().sum()

0

In [7]:
# Checking for missing values
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [8]:
# replacing all '\N' values with np.nan
basics = basics.replace({'\\N':np.nan})
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1198933
endYear           8917253
runtimeMinutes    6583717
genres             410970
dtype: int64

In [9]:
# Eliminating movies that are null for 'runtimeMinutes' and 'genre'
basics = basics.dropna(subset=['runtimeMinutes','genres'])
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear           35709
endYear           2314908
runtimeMinutes          0
genres                  0
dtype: int64

In [10]:
# Excluding movies without the titleType movie 
basics = basics[basics['titleType'] == 'movie']
basics['titleType'].value_counts()

movie    362545
Name: titleType, dtype: int64

In [11]:
# Confirming new number of columns
basics.shape

(362545, 9)

In [12]:
# Keeping start year 2000-2022
basics = basics[(basics['startYear'] >='2000') & (basics['startYear'] <= '2022')]
basics['startYear'].value_counts()

2017    14167
2018    14072
2016    13803
2019    13657
2015    13317
2014    12970
2013    12245
2012    11516
2021    11478
2020    11063
2011    10668
2010    10094
2009     9241
2008     8047
2007     6856
2006     6406
2005     5749
2022     5613
2004     5111
2003     4519
2002     4079
2001     3804
2000     3584
Name: startYear, dtype: int64

In [13]:
# Checking new dataframe shape
basics.shape

(212059, 9)

In [14]:
# Eliminating movies that include 'Documentary' in genre
docs_list = basics['genres'].str.contains('documentary', case=False)
basics = basics[~docs_list]
basics.shape

(140151, 9)

# Processing AKAs Data

In [15]:
# Checking basic column info
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32359997 entries, 0 to 32359996
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 1.9+ GB


In [16]:
# Keeping only US entries
akas = akas[akas['region'] == 'US']
akas['region'].value_counts()

US    1329213
Name: region, dtype: int64

In [17]:
keeps_list = basics['tconst'].isin(akas['titleId'])
keeps_list

34805       True
61119       True
67672       True
77968      False
86806       True
           ...  
9008822     True
9008831     True
9008870    False
9008915     True
9008999    False
Name: tconst, Length: 140151, dtype: bool

In [18]:
basics = basics[keeps_list]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
91077,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"
...,...,...,...,...,...,...,...,...,...
9008286,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9008682,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9008822,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9008831,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"


In [19]:
akas = akas.replace({'\\N':np.nan})
akas.isna().sum()

titleId                  0
ordering                 0
title                    0
region                   0
language           1325623
types               303016
attributes         1285006
isOriginalTitle       1375
dtype: int64

# Processing Ratings Data

In [20]:
ratings = ratings.replace({'\\N':np.nan})
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

# Saving Data

In [21]:
import os
# Creating new folder for processed dataframes
os.makedirs('Data/', exist_ok=True)
# Confirming new folder was created
os.listdir("Data/")

['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

In [22]:
# Saving basics dataframe
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
# Opening saved file and previewing it
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,0,2020,,74,"Horror,Music,Thriller"


In [23]:
# Saving akas dataframe
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)
# Opening saved file and previewing it
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0


In [24]:
# Saving ratings dataframe
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)
# Opening saved file and previewing it
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1891
1,tt0000002,5.9,252
2,tt0000003,6.5,1681
3,tt0000004,5.7,165
4,tt0000005,6.2,2499
