In [2]:
import pandas as pd

# Load the IMDB data into Pandas dataframes
title_basics = pd.read_csv("title.basics.tsv", sep="\t", low_memory=False)
title_ratings = pd.read_csv("title.ratings.tsv", sep="\t")
title_akas = pd.read_csv("title.akas.tsv", sep="\t", low_memory=False)

In [3]:
# Filter the dataframes based on the stakeholder's specifications
filtered_title_basics = title_basics[
    (title_basics['titleType'] == 'movie') &
    (title_basics['startYear'].apply(lambda x: x.isnumeric() and 2000 <= int(x) <= 2021)) &
    (~title_basics['genres'].str.contains("Documentary", na=True)) &
    (~title_basics['genres'].isna()) &
    (~title_basics['runtimeMinutes'].isna())
]

filtered_title_akas = title_akas[
    (title_akas['titleId'].isin(filtered_title_basics['tconst'])) &
    (title_akas['region'] == 'US')
]

filtered_title_ratings = title_ratings[
    title_ratings['tconst'].isin(filtered_title_basics['tconst'])
]


In [4]:
# Display .info() for the filtered dataframes
filtered_title_basics.info()
filtered_title_akas.info()
filtered_title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 185925 entries, 11636 to 9806237
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          185925 non-null  object
 1   titleType       185925 non-null  object
 2   primaryTitle    185925 non-null  object
 3   originalTitle   185925 non-null  object
 4   isAdult         185925 non-null  object
 5   startYear       185925 non-null  object
 6   endYear         185925 non-null  object
 7   runtimeMinutes  185925 non-null  object
 8   genres          185925 non-null  object
dtypes: object(9)
memory usage: 14.2+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 107178 entries, 200648 to 35735965
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   titleId          107178 non-null  object
 1   ordering         107178 non-null  int64 
 2   title            107178 non-null  object
 3

In [6]:
# Save the filtered dataframes as gzip-compressed csv files in your repository
filtered_title_basics.to_csv("Data/filtered_title_basics.csv.gz", index=False, compression="gzip")
filtered_title_akas.to_csv("Data/filtered_title_akas.csv.gz", index=False, compression="gzip")
filtered_title_ratings.to_csv("Data/filtered_title_ratings.csv.gz", index=False, compression="gzip")