In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download locations of the three IMDb dataset dumps used in this notebook.
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
rating_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [None]:
# Read each tab-separated dump directly from its URL.
# low_memory=False makes pandas parse every file in a single pass rather than
# in chunks, so each column gets one consistently inferred dtype.
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
rating = pd.read_csv(rating_url, sep='\t', low_memory=False)

In [None]:
# Preview the first five rows of the title basics table.
basics.head(5)

In [None]:
# Preview the first five rows of the akas (alternate titles) table.
akas.head(5)

In [None]:
# Preview the first five rows of the ratings table.
rating.head(5)

# Data Cleaning

In [None]:
# The raw table uses the literal string "\N" as its missing-value marker;
# swap it for np.nan so pandas' null handling (dropna, etc.) recognises it.
basics = basics.replace('\\N', np.nan)

In [None]:
# Eliminate movies that are null in the runtimeMinutes column.
# subset is passed as a list: a bare string is only accepted by pandas >= 1.5,
# while a list of labels works on every pandas version.
basics = basics.dropna(subset=['runtimeMinutes'])

In [None]:
# Eliminate movies that are null in the genres column.
# subset is passed as a list: a bare string is only accepted by pandas >= 1.5,
# while a list of labels works on every pandas version.
basics = basics.dropna(subset=['genres'])

In [None]:
# Restrict the table to rows whose titleType is exactly 'movie'.
basics = basics[basics['titleType'].eq('movie')]

In [None]:
# Keep only titles with startYear in 2000-2022 (both ends inclusive).
# errors='coerce' turns any value that cannot be parsed as a number into NaN;
# NaN fails the range check below, so those rows are dropped as well.
# assign() returns a new frame instead of writing a column into the frame
# produced by the earlier filtering cells, which can otherwise trigger
# pandas' SettingWithCopyWarning.
basics = basics.assign(
    startYear=pd.to_numeric(basics['startYear'], errors='coerce')
)
basics = basics.loc[basics['startYear'].between(2000, 2022)]

In [None]:
# Drop every title whose genres string mentions "documentary"
# (matched case-insensitively).
is_documentaries = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentaries]

In [None]:
# Convert the "\N" missing-value marker to np.nan in the akas table.
akas = akas.replace("\\N", np.nan)

In [None]:
# Convert the "\N" missing-value marker to np.nan in the rating table.
rating = rating.replace("\\N", np.nan)

In [None]:
# Keep only the akas rows whose region is 'US'.
akas = akas[akas['region'].eq('US')]

In [None]:
# Boolean mask over basics: True where the title's tconst also appears as a
# titleId in the (US-only) akas table.
keepers = basics['tconst'].isin(akas['titleId'])
keepers

In [None]:
# Keep only the movies whose tconst appears as a titleId in akas
# (akas has already been restricted to region == 'US').
# The mask is computed here from the current frames instead of being read from
# the shared `keepers` variable: a later cell rebinds `keepers` to a mask over
# `rating`, so relying on it makes this cell fail when it is re-run or run out
# of order.
basics = basics[basics['tconst'].isin(akas['titleId'])]
basics

In [None]:
# Boolean mask over rating: True where the rated title's tconst also appears
# as a titleId in the akas table. Note this rebinds `keepers`.
keepers = rating['tconst'].isin(akas['titleId'])
keepers

In [None]:
# Keep only the ratings whose tconst appears as a titleId in akas
# (akas has already been restricted to region == 'US').
# The mask is computed here from the current frames rather than read from the
# shared `keepers` variable, which is also used as the mask for `basics`; this
# keeps the cell correct regardless of which cell last assigned `keepers`.
rating = rating[rating['tconst'].isin(akas['titleId'])]
rating

# Saving to data folder

In [None]:
import os
# Create the Data/ output folder; exist_ok=True makes this a no-op when the
# folder is already there, so the cell can be re-run safely.
os.makedirs('Data/', exist_ok=True)
# Show what the folder currently contains.
os.listdir ("Data/")

In [None]:
# Write the cleaned basics table as a gzip-compressed CSV, without the index.
basics.to_csv(
    "Data/title_basics.csv.gz",
    index=False,
    compression='gzip',
)

In [None]:
# Write the US-only akas table as a gzip-compressed CSV, without the index.
akas.to_csv(
    "Data/title_akas.csv.gz",
    index=False,
    compression='gzip',
)

In [None]:
# Write the filtered rating table as a gzip-compressed CSV, without the index.
rating.to_csv(
    "Data/title_rating.csv.gz",
    index=False,
    compression='gzip',
)

In [None]:
# Summarise the final akas table (columns, non-null counts, dtypes, memory).
akas.info()

In [None]:
# Summarise the final basics table (columns, non-null counts, dtypes, memory).
basics.info()

In [None]:
# Summarise the final rating table (columns, non-null counts, dtypes, memory).
rating.info()