# Movie Success Analysis

Author: Lerato Matlala

## Data Loading

***Import Libraries***

In [1]:
#Import libraries
import pandas as pd
 

***Load Data***

In [2]:
# Read in title Basics.tsv
title_basic_df = pd.read_csv('title.basics.tsv', sep='\t', low_memory=False)
title_basic_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# Read in title.akas.tsv
title_akas_df = pd.read_csv('title.akas.tsv', sep='\t', low_memory=False)
title_akas_df.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [4]:
# Read in title.akas.tsv
title_ratings_df = pd.read_csv('title.ratings.tsv', sep='\t', low_memory=False)
title_ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000003,6.5,1893
3,tt0000004,5.5,178
4,tt0000005,6.2,2678


## Data Cleaning

### ***Filter Out Movies***

In [5]:
# title_basic_df Dataframe shape before filtering out movies
title_basic_df.shape

(10248781, 9)

In [6]:
#title_akas_df Dataframe shape before filtering out movies
title_akas_df.shape

(37527210, 8)

In [7]:
# title_ratings_df Dataframe shape before filtering out movies
title_ratings_df.shape

(1360168, 3)

-  **Exclude any movie with missing values for genre or runtime**

In [8]:
# Remove rows where 'genres' or 'runtimeMinutes' is missing
filtered_title_basic_df = title_basic_df.dropna(subset=['genres', 'runtimeMinutes'])

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]

In [9]:
# title_basic_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_basic_df.shape

(10248763, 9)

In [10]:
# title_akas_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_akas_df.shape

(37521841, 8)

In [11]:
#title_ratings_df Dataframe shape after filtering out movies with missing values for genre or runtime
filtered_title_ratings_df.shape

(1360158, 3)

- **Include only full-length movies (titleType = "movie")**

In [12]:
# Include only rows where 'titleType' is 'movie'
filtered_title_basic_df = filtered_title_basic_df[filtered_title_basic_df['titleType'] == 'movie']

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]


In [14]:
# filtered_title_basic_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_basic_df.shape

(660334, 9)

In [15]:
#filtered_title_akas_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_akas_df.shape

(2939315, 8)

In [16]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'titleType' is 'movie'
filtered_title_ratings_df.shape

(298622, 3)

- **Include only fictional movies (not from documentary genre)**

In [17]:
# Include only rows where 'genres' does not contain 'Documentary'
filtered_title_basic_df = filtered_title_basic_df[(~filtered_title_basic_df['genres'].str.contains('Documentary'))]

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]


In [18]:
#filtered_title_basic_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_basic_df.shape

(534403, 9)

In [19]:
#filtered_title_akas_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_akas_df.shape

(2590728, 8)

In [20]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'genres' does not contain 'Documentary'
filtered_title_ratings_df.shape

(252229, 3)

- **Include only movies that were released 2000 - 2021 (include 2000 and 2021)**

In [21]:
# Include only rows where 'startYear' is either 2000 or 2001
filtered_title_basic_df = filtered_title_basic_df[((filtered_title_basic_df['startYear'] == '2000') | (filtered_title_basic_df['startYear'] == '2001'))].copy()

# Collect the IDs of the filtered rows
filtered_tconst = filtered_title_basic_df['tconst'].tolist()

# Filter corresponding rows in title_akas_df and title_ratings_df based on the collected IDs
filtered_title_akas_df = title_akas_df[title_akas_df['titleId'].isin(filtered_tconst)]
filtered_title_ratings_df = title_ratings_df[title_ratings_df['tconst'].isin(filtered_tconst)]

In [22]:
#filtered_title_basic_df Dataframe shape after including only rows where 'startYear' is either 2000 or 2001
filtered_title_basic_df.shape

(8467, 9)

In [23]:
#filtered_title_akas_df Dataframe shape after including only rows where 'startYear' is either 2000 or 2001
filtered_title_akas_df.shape

(53170, 8)

In [24]:
#filtered_title_ratings_df Dataframe shape after including only rows where 'startYear' is either 2000 or 2001
filtered_title_ratings_df.shape

(5863, 3)

- **Include only movies that were released in the United States**

In [25]:
# Include only rows where 'country' is 'USA'
filtered_title_akas_df = filtered_title_akas_df[filtered_title_akas_df['region'] == 'US']

# Collect the IDs of the filtered rows
filtered_titleId = filtered_title_akas_df['titleId'].tolist()

# Filter corresponding rows in filtered_title_basic_df and filtered_title_ratings_df based on the collected IDs
filtered_title_basic_df = filtered_title_basic_df[filtered_title_basic_df['tconst'].isin(filtered_titleId)]
filtered_title_ratings_df = filtered_title_ratings_df[filtered_title_ratings_df['tconst'].isin(filtered_titleId)]


In [26]:
#filtered_title_basic_df Dataframe shape after filtering out movies that were released in the US
filtered_title_basic_df.shape

(3872, 9)

In [27]:
#filtered_title_akas_df Dataframe shape after filtering out movies that were released in the US
filtered_title_akas_df.shape

(4591, 8)

In [28]:
#filtered_title_ratings_df Dataframe shape after filtering out movies that were released in the US
filtered_title_ratings_df.shape

(3216, 3)

### ***Dataframes Summaries***

**title_basic_df**

In [29]:
filtered_title_basic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3872 entries, 34800 to 10178703
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          3872 non-null   object
 1   titleType       3872 non-null   object
 2   primaryTitle    3872 non-null   object
 3   originalTitle   3872 non-null   object
 4   isAdult         3872 non-null   object
 5   startYear       3872 non-null   object
 6   endYear         3872 non-null   object
 7   runtimeMinutes  3872 non-null   object
 8   genres          3872 non-null   object
dtypes: object(9)
memory usage: 302.5+ KB


**title_akas_df**

In [30]:
filtered_title_akas_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4591 entries, 203542 to 37244981
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   titleId          4591 non-null   object
 1   ordering         4591 non-null   int64 
 2   title            4591 non-null   object
 3   region           4591 non-null   object
 4   language         4591 non-null   object
 5   types            4591 non-null   object
 6   attributes       4591 non-null   object
 7   isOriginalTitle  4591 non-null   object
dtypes: int64(1), object(7)
memory usage: 322.8+ KB


**title_ratings_df**

In [31]:
filtered_title_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3216 entries, 17958 to 1337246
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         3216 non-null   object 
 1   averageRating  3216 non-null   float64
 2   numVotes       3216 non-null   int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 100.5+ KB


### ***Save Dataframes***

In [34]:
# Save filtered_title_basic_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_basic_df.to_csv('Data/filtered_title_basic.csv.gz', index=False, compression='gzip')


In [35]:
# Save filtered_title_akas_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_akas_df.to_csv('Data/filtered_title_akas.csv.gz', index=False, compression='gzip')

In [36]:
# Save filtered_title_ratings_df DataFrame to a compressed CSV file in the "Data/" folder
filtered_title_ratings_df.to_csv('Data/filtered_title_ratings.csv.gz', index=False, compression='gzip')