# Movie Success Analysis

Author: Lerato Matlala

## Data Loading

***Import Libraries***

In [78]:
#Import libraries
import pandas as pd
import numpy as np
import os

***Load Data***

In [37]:
# Read in title Basics.tsv
basics_url ="https://datasets.imdbws.com/title.basics.tsv.gz"

title_basic_df = pd.read_csv(basics_url, sep='\t', low_memory=False)
title_basic_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [38]:
# Read in title.akas.tsv
title_akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
title_akas_df = pd.read_csv(title_akas_url, sep='\t', low_memory=False)
title_akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [39]:
# Read in title.akas.tsv
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
title_ratings_df = pd.read_csv(ratings_url, sep='\t', low_memory=False)
title_ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000003,6.5,1894
3,tt0000004,5.5,178
4,tt0000005,6.2,2677


## Data Cleaning

### ***Handling \N Placeholder Values***

In [43]:
#Count null values in title basic before handling \N placeholder values
title_basic_df.isnull().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            18
dtype: int64

In [44]:
#Count null values in title akas before handling \N placeholder values
title_akas_df.isnull().sum()

titleId              0
ordering             0
title                5
region             117
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [45]:
#Count null values in title ratings before handling \N placeholder values
title_ratings_df.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [48]:
# Replace '\N' with np.nan for each DataFrame 
title_basic_df.replace({'\\N': np.nan}, inplace=True)
title_akas_df.replace({'\\N': np.nan}, inplace=True)
title_ratings_df.replace({'\\N': np.nan}, inplace=True)

In [49]:
#Count null values in title basic after handling \N placeholder values
title_basic_df.isnull().sum()

tconst                   0
titleType                0
primaryTitle            11
originalTitle           11
isAdult                  1
startYear          1370949
endYear           10137108
runtimeMinutes     7159618
genres              458543
dtype: int64

In [50]:
#Count null values in title akas after handling \N placeholder values
title_akas_df.isnull().sum()

titleId                   0
ordering                  0
title                     5
region              1905307
language            6779972
types              31843105
attributes         37267747
isOriginalTitle        2077
dtype: int64

In [51]:
#Count null values in title ratings after handling \N placeholder values
title_ratings_df.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

### ***Filter Out Movies***

In [52]:
# title_basic_df Dataframe shape before filtering out movies
title_basic_df.shape

(10250934, 9)

In [53]:
#title_akas_df Dataframe shape before filtering out movies
title_akas_df.shape

(37535727, 8)

In [54]:
# title_ratings_df Dataframe shape before filtering out movies
title_ratings_df.shape

(1360911, 3)

-  **Exclude any movie with missing values for genre or runtime**

In [55]:
# Remove rows where 'genres' or 'runtimeMinutes' is missing
filtered_title_basic_df = title_basic_df.dropna(subset=['genres', 'runtimeMinutes'])

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]

In [56]:
# title_basic_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_basic_df.shape

(3010579, 9)

In [57]:
# title_akas_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_akas_df.shape

(10156165, 8)

In [58]:
#title_ratings_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_ratings_df.shape

(956992, 3)

- **Include only full-length movies (titleType = "movie")**

In [59]:
# Include only rows where 'titleType' is 'movie'
filtered_title_basic_df = filtered_title_basic_df[filtered_title_basic_df['titleType'] == 'movie']

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]


In [60]:
# filtered_title_basic_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_basic_df.shape

(390879, 9)

In [61]:
#filtered_title_akas_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_akas_df.shape

(2448076, 8)

In [62]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_ratings_df.shape

(263403, 3)

- **Include only fictional movies (not from documentary genre)**

In [63]:
# Include only rows where 'genres' does not contain 'Documentary'
filtered_title_basic_df = filtered_title_basic_df[(~filtered_title_basic_df['genres'].str.contains('Documentary',case=False))]

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]


In [64]:
#filtered_title_basic_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_basic_df.shape

(295875, 9)

In [65]:
#filtered_title_akas_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_akas_df.shape

(2155237, 8)

In [66]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_ratings_df.shape

(220376, 3)

- **Include only movies that were released 2000 - 2021 (include 2000 and 2021)**

In [67]:
# Convert 'startYear' to numeric (integers) if it's in object format
filtered_title_basic_df['startYear'] = pd.to_numeric(filtered_title_basic_df['startYear'], errors='coerce')

# Filter 'startYear' values between 2000 and 2022
filtered_title_basic_df = filtered_title_basic_df[(filtered_title_basic_df['startYear'] >= 2000) & (filtered_title_basic_df['startYear'] <= 2022)]


# Include only rows where 'startYear' is either 2000 or 2001
#filtered_title_basic_df = filtered_title_basic_df[((filtered_title_basic_df['startYear'] == '2000') | (filtered_title_basic_df['startYear'] == '2001'))].copy()

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]

In [68]:
#filtered_title_basic_df Dataframe shape after including only rows where 'startYear' is between 2000 and 2022
filtered_title_basic_df.shape

(148584, 9)

In [69]:
#filtered_title_akas_df Dataframe shape after including only rows where 'startYear' is between 2000 and 2022
filtered_title_akas_df.shape

(1021307, 8)

In [70]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'startYear' is between 2000 and 2022
filtered_title_ratings_df.shape

(111756, 3)

- **Include only movies that were released in the United States**

In [71]:
# Include only rows where 'country' is 'USA'
filtered_title_akas_df = filtered_title_akas_df[filtered_title_akas_df['region'] == 'US']

# Collect the IDs of the filtered rows
filtered_titleId = filtered_title_akas_df['titleId'].tolist()

# Filter corresponding rows in filtered_title_basic_df and filtered_title_ratings_df based on the collected IDs
filtered_title_basic_df = filtered_title_basic_df[filtered_title_basic_df['tconst'].isin(filtered_titleId)]
filtered_title_ratings_df = filtered_title_ratings_df[filtered_title_ratings_df['tconst'].isin(filtered_titleId)]


In [72]:
#filtered_title_basic_df Dataframe shape after filtering out movies that were released in the US
filtered_title_basic_df.shape

(87356, 9)

In [73]:
#filtered_title_akas_df Dataframe shape after filtering out movies that were released in the US
filtered_title_akas_df.shape

(97168, 8)

In [74]:
#filtered_title_ratings_df Dataframe shape after filtering out movies that were released in the US
filtered_title_ratings_df.shape

(72418, 3)

### ***Dataframes Summaries***

**title_basic_df**

In [75]:
filtered_title_basic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87356 entries, 34800 to 10250700
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          87356 non-null  object 
 1   titleType       87356 non-null  object 
 2   primaryTitle    87356 non-null  object 
 3   originalTitle   87356 non-null  object 
 4   isAdult         87356 non-null  object 
 5   startYear       87356 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  87356 non-null  object 
 8   genres          87356 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.7+ MB


**title_akas_df**

In [76]:
filtered_title_akas_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97168 entries, 203545 to 37534986
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   titleId          97168 non-null  object
 1   ordering         97168 non-null  int64 
 2   title            97168 non-null  object
 3   region           97168 non-null  object
 4   language         960 non-null    object
 5   types            89431 non-null  object
 6   attributes       4366 non-null   object
 7   isOriginalTitle  97168 non-null  object
dtypes: int64(1), object(7)
memory usage: 6.7+ MB


**title_ratings_df**

In [77]:
filtered_title_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72418 entries, 17958 to 1360882
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         72418 non-null  object 
 1   averageRating  72418 non-null  float64
 2   numVotes       72418 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


### ***Save Dataframes***

In [79]:
#Create Data folder
os.makedirs('Data/',exist_ok=True) 

# Confirm folder created
os.listdir("Data/")

[]

In [80]:
# Save filtered_title_basic_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_basic_df.to_csv('Data/filtered_title_basic.csv.gz', index=False, compression='gzip')

In [81]:
# Save filtered_title_akas_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_akas_df.to_csv('Data/filtered_title_akas.csv.gz', index=False, compression='gzip')

In [82]:
# Save filtered_title_ratings_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_ratings_df.to_csv('Data/filtered_title_ratings.csv.gz', index=False, compression='gzip')