In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Confirm needed files are present: list the contents of the relative 'Data/'
# directory so the input files read in the following cells can be checked by eye.
# The path is relative, so this depends on the kernel's working directory.
os.listdir('Data/')

['title-akas-us-only.csv', 'title.basics.tsv.gz', 'title.ratings.tsv.gz']

In [3]:
# Read the US-only AKAs table; its 'titleId' column is used further down
# to restrict the other tables to titles released in the US.
us_only_url = 'Data/title-akas-us-only.csv'
us_only = pd.read_csv(
    filepath_or_buffer=us_only_url,
    low_memory=False,
)

# Title Basics

In [4]:
# Read the gzip-compressed, tab-separated title basics file.
basics_url = 'Data/title.basics.tsv.gz'
basics = pd.read_csv(
    filepath_or_buffer=basics_url,
    delimiter='\t',  # alias of `sep`
    low_memory=False,
)

In [5]:
# Restrict basics to US titles.
# A row is kept when its 'tconst' appears among the 'titleId' values
# of the US-only AKAs dataframe.
filter_us_titles = basics['tconst'].isin(us_only['titleId'])
basics = basics.loc[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [6]:
# The raw file marks missing values with the literal string '\N';
# swap every occurrence for a real NaN so pandas treats it as missing.
basics = basics.replace(to_replace='\\N', value=np.nan)

In [7]:
# Keep only rows that have a value in both runtimeMinutes and genres
# (i.e. drop any row missing either one).
basics = basics.loc[basics[['runtimeMinutes', 'genres']].notna().all(axis=1)]

In [8]:
# Retain full-length movies only: rows whose titleType is exactly 'movie'.
movies_only_filter = basics['titleType'].eq('movie')
basics = basics.loc[movies_only_filter]

In [9]:
# Convert 'startYear' to a float.
# `basics` at this point is the result of boolean-mask filtering, and assigning
# a column on such a frame raises pandas' SettingWithCopyWarning. Building a new
# frame with .assign() performs the same conversion without writing into a
# filtered slice; column order and all other columns are unchanged.
basics = basics.assign(startYear=basics['startYear'].astype(float))

In [16]:
# Keep movies released from 2000 through 2022, both endpoints included.
# Rows with a missing startYear compare False on both sides and are dropped.
startYear_filter = (basics['startYear'] >= 2000) & (basics['startYear'] <= 2022)
basics = basics.loc[startYear_filter]

In [19]:
# Exclude documentaries: flag every row whose comma-separated 'genres'
# string mentions 'Documentary', then keep the rows that are not flagged.
docu_filter = basics['genres'].str.contains('Documentary')
basics = basics.loc[~docu_filter]

In [21]:
# Sanity-check the cleaned basics table: print the column dtypes and non-null
# counts, then show the first ten rows as the cell output.
basics.info()
basics.head(n=10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
98035,tt0100275,movie,The Wandering Soap Opera,La Telenovela Errante,0,2017.0,,80,"Comedy,Drama,Fantasy"
101033,tt0103340,movie,Life for Life: Maximilian Kolbe,Zycie za zycie. Maksymilian Kolbe,0,2006.0,,90,"Biography,Drama"
106097,tt0108549,movie,West from North Goes South,West from North Goes South,0,2004.0,,96,"Comedy,Mystery"
110468,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000.0,,86,"Musical,Romance"
110531,tt0113092,movie,For the Cause,For the Cause,0,2000.0,,100,"Action,Adventure,Drama"


In [28]:
# Save the cleaned basics table as CSV in the repository's data folder.
# The directory is spelled 'Data/' exactly as in the cells that read the input
# files, so the write also works on case-sensitive filesystems where 'DATA/'
# would be a different (possibly non-existent) directory.
# The DataFrame index is written as the first column (to_csv default).
basics.to_csv('Data/movie_basics.csv')

# Title Ratings

In [22]:
# Read the gzip-compressed, tab-separated title ratings file.
ratings_url = 'Data/title.ratings.tsv.gz'
ratings = pd.read_csv(
    filepath_or_buffer=ratings_url,
    delimiter='\t',  # alias of `sep`
    low_memory=False,
)

In [23]:
# Keep only ratings for titles that survived the Title Basics cleaning,
# matching on the shared 'tconst' identifier.
filtered_ratings = ratings['tconst'].isin(basics['tconst'])
ratings = ratings.loc[filtered_ratings]

In [25]:
# Turn any literal '\N' missing-value placeholders into real NaN values.
ratings = ratings.replace(to_replace='\\N', value=np.nan)

In [26]:
# Final check of the ratings table: print dtypes and non-null counts,
# then show the first ten rows as the cell output.
ratings.info()
ratings.head(n=10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71900 entries, 17961 to 1331462
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         71900 non-null  object 
 1   averageRating  71900 non-null  float64
 2   numVotes       71900 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


Unnamed: 0,tconst,averageRating,numVotes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846
73596,tt0100275,6.5,347
76168,tt0103340,6.3,354
80465,tt0108549,7.7,33
84153,tt0113026,5.6,1406
84204,tt0113092,3.4,837


In [29]:
# Save the filtered ratings table as CSV in the repository's data folder.
# The directory is spelled 'Data/' exactly as in the cells that read the input
# files, so the write also works on case-sensitive filesystems where 'DATA/'
# would be a different (possibly non-existent) directory.
# The DataFrame index is written as the first column (to_csv default).
ratings.to_csv('Data/movie_ratings.csv')