In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# uploaded file
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
df_b = basics
df_b.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# replace missing values with NaN
df_b.replace({'\\N':np.nan}, inplace=True)

In [4]:
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9369947 entries, 0 to 9369946
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 643.4+ MB


In [5]:
# Check for missing values 
df_b.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1257162
endYear           9271369
runtimeMinutes    6788761
genres             430247
dtype: int64

In [6]:
# drop missing values from StartYear, Genres, runtimeMinutes
df_b = df_b.dropna(subset=['startYear', 'genres', 'runtimeMinutes'])

In [7]:
# convert startYear dtype(object) to dytpe(string)
# df_b['startYear'] = df_b['startYear'].astype(float)

# df_b.infer_objects().dtypes

In [8]:
# check for dropna
df_b.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear               0
endYear           2390107
runtimeMinutes          0
genres                  0
dtype: int64

In [9]:
#Keep only titleType_movie & Keep startYear 2000-2022
keepTs_df= df_b[(df_b['startYear']>='2000') & (df_b['startYear']>='2022') & (df_b['titleType']=='movie')]
keepTs_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
116973,tt0119830,movie,One Dog Day,One Dog Day,0,2022,,101,"Action,Comedy"
117694,tt0120589,movie,A Dangerous Practice,A Dangerous Practice,0,2022,,108,Drama
193058,tt0200940,movie,Over-sexed Rugsuckers from Mars,Over-sexed Rugsuckers from Mars,0,2022,,87,"Comedy,Sci-Fi"
202636,tt0211266,movie,Black Star: Autobiography of a Close Friend,Black Star: Autobiography of a Close Friend,0,2022,,85,Documentary
271075,tt0283145,movie,Wielka droga,Wielka droga,0,2022,,87,"Drama,War"
...,...,...,...,...,...,...,...,...,...
9358923,tt9893130,movie,"2025: Blood, White & Blue","2025: Blood, White & Blue",0,2022,,135,"Action,Comedy,Horror"
9358937,tt9893158,movie,Clowning,Clowning,0,2022,,96,"Crime,Romance"
9358938,tt9893160,movie,No Way Out,No Way Out,0,2022,,89,"Action,Crime,Thriller"
9359311,tt9894000,movie,Twice As Strong: Made of Fire,Twice As Strong: Made of Fire,0,2022,,122,Drama


In [10]:
new_df = keepTs_df

In [11]:
# Exclude movies that are included in the documentary category.
is_documentary = new_df['genres'].str.contains('Documentary',case=False)
df = new_df[~is_documentary]

In [12]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
df_akas = akas
df_akas.head()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
df.head()

In [None]:
# replace missing values with NaN
df_akas.replace({'\\N':np.nan}, inplace=True)

In [None]:
df_akas

In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = df['tconst'].isin(akas['titleId'])
keepers

In [None]:
#Cleaned basics_df
basics_df = df[keepers]
basics_df

In [None]:
ratings_url="Data/title.ratings (1).tsv"
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
df_ratings = ratings
df_ratings.head()

In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_ratings = basics_df['tconst'].isin(akas['titleId'])
keepers_ratings

In [None]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

In [None]:
## Save current dataframe to file.
basics_df.to_csv("Data/title_basics_cleaned_1.csv.gz",compression='gzip',index=False)

In [None]:
## Save current dataframe to file.
akas.to_csv("Data/title.akas_cleaned_1.tsv.gz", compression='gzip', index=False)

In [None]:
## Save current dataframe to file.
df_ratings.to_csv("Data/ratings_cleaned_1.akas.tsv.gz", compression='gzip', index=False)