In [1]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 

In [2]:
# Confirm folder was created and files added successfully
os.listdir("Data/")

['.ipynb_checkpoints',
 'basics-data.tsv',
 'basics.csv',
 'ratings-data.tsv',
 'title-akas-us-only.csv']

In [3]:
# Load importsa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [4]:
# Load basics
basics = pd.read_csv("Data/basics-data.tsv", sep='\t', low_memory = False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10017006,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10017007,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10017008,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10017009,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [5]:
# Load akas
akas = pd.read_csv('Data/title-akas-us-only.csv')
akas

  akas = pd.read_csv('Data/title-akas-us-only.csv')


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
1452559,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
1452560,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
1452561,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
1452562,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


In [6]:
# Filter the basics table down to only include the US by using the filter akas dataframe
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [7]:
# Replace '\N' to 'nan' in basics
basics = basics.replace({'\\N':np.nan})

In [8]:
# Check nulls in 'runtimeMinutes'
basics['runtimeMinutes'].isna().sum()


503119

In [9]:
# Drop rows with nulls on 'runtimeMinutes'
basics.dropna(subset = ['runtimeMinutes'], inplace = True)

In [10]:
# Check nulls in 'runtimeMinutes'
basics['runtimeMinutes'].isna().sum()

0

In [11]:
# Filter to keep only full-length movies (titleType == Movie)
basics = basics.loc[basics['titleType'] == 'movie']

In [12]:
# Convert startYear to a float dtype 
basics['startYear'] = basics['startYear'].astype(float)

In [13]:
# Filter to keep movies with startYear that are >= 2000 and <= 2022
basics = basics.loc[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2022)]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
10016366,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019.0,,70,Documentary
10016544,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
10016684,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
10016693,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [14]:
# Eliminate movies that include 'Documentary' in genre 
filter_documentaries = basics['genres'].str.contains('Documentary', na = False)

# Exclude movies that include 'Documentary' in genre
basics = basics[~filter_documentaries]

In [15]:
# Display a final preview of your filtered title basiscs and save to a csv 
basics.info()
basics.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88546 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          88546 non-null  object 
 1   titleType       88546 non-null  object 
 2   primaryTitle    88546 non-null  object 
 3   originalTitle   88546 non-null  object 
 4   isAdult         88546 non-null  object 
 5   startYear       88546 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  88546 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.8+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61114,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67666,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86793,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93930,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
98035,tt0100275,movie,The Wandering Soap Opera,La Telenovela Errante,0,2017.0,,80,"Comedy,Drama,Fantasy"
101033,tt0103340,movie,Life for Life: Maximilian Kolbe,Zycie za zycie. Maksymilian Kolbe,0,2006.0,,90,"Biography,Drama"
106097,tt0108549,movie,West from North Goes South,West from North Goes South,0,2004.0,,96,"Comedy,Mystery"
110468,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000.0,,86,"Musical,Romance"
110531,tt0113092,movie,For the Cause,For the Cause,0,2000.0,,100,"Action,Adventure,Drama"


In [16]:
# Save the basics data to a CSV file in Data folder
#basics.to_csv('D:\My Documents\GitHub\Project-2')

In [17]:
# Load the title rating data into a dataframe
ratings = pd.read_csv('Data/ratings-data.tsv', sep='\t')
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1331492 entries, 0 to 1331491
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1331492 non-null  object 
 1   averageRating  1331492 non-null  float64
 2   numVotes       1331492 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [24]:
# Keep only movies that are included in final title basics dataframe 
filter_basics = ratings['tconst'].isin(basics['tconst'])
ratings = ratings.loc[filter_basics]

In [30]:
# Final preview of ratings dataframe 
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72482 entries, 17961 to 1331462
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         72482 non-null  object 
 1   averageRating  72482 non-null  float64
 2   numVotes       72482 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [37]:
# Save the ratings data to a CSV file in Data folder
#ratings.to_csv('D:\\My Documents\\GitHub\\Project-2\\Data\\ratings.csv')