In [149]:
import pandas as pd
import numpy as np
import seaborn as sns

In [150]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
df_b = basics
df_b.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,\N
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,\N
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,\N
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,\N
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,\N


In [151]:
# replace missing values with NaN
df_b.replace({'\\N':np.nan}, inplace=True)

In [152]:
# Check for missing values 
df_b.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1245164
endYear           9211067
runtimeMinutes    6795228
genres            9308649
dtype: int64

In [153]:
# drop missing values from StartYear, Genres, runtimeMinutes
df_b = df_b.dropna(subset=['startYear', 'genres', 'runtimeMinutes'])

In [154]:
# check for dropna
df_b.isna().sum()

tconst            0.0
titleType         0.0
primaryTitle      0.0
originalTitle     0.0
isAdult           0.0
startYear         0.0
endYear           0.0
runtimeMinutes    0.0
genres            0.0
dtype: float64

In [155]:
#Keep only titleType_movie & Keep startYear 2000-2022
keepTs_df= df_b[(df_b['startYear']>='2000') & (df_b['startYear']>='2022') & (df_b['titleType']=='movie')]
keepTs_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


In [156]:
# Exclude movies that are included in the documentary category.
is_documentary = keepTs_df['genres'].str.contains('Documentary',case=False)
df = keepTs_df[~is_documentary]

AttributeError: Can only use .str accessor with string values!

In [157]:
new_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77934,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86770,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [158]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
df_akas = akas
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [159]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = new_df['tconst'].isin(akas['titleId'])
keepers

34792      True
61094      True
67640      True
77934      True
86770      True
           ... 
9305772    True
9305781    True
9305820    True
9305865    True
9305949    True
Name: tconst, Length: 144368, dtype: bool

In [160]:
#Filter basics
basics = new_df[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77934,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86770,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9305772,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9305781,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9305820,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9305865,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [161]:
# replace missing values with NaN
basics.replace({'\\N':np.nan}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics.replace({'\\N':np.nan}, inplace=True)


In [162]:
ratings_url="https://datasets.imdbws.com/ratings.akas.tsv.gz"
ratings = pd.read_csv(akas_url, sep='\t', low_memory=False)
df_ratings = ratings
df_ratings.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [163]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers_ratings = new_df['tconst'].isin(ratings['titleId'])
keepers_ratings

34792      True
61094      True
67640      True
77934      True
86770      True
           ... 
9305772    True
9305781    True
9305820    True
9305865    True
9305949    True
Name: tconst, Length: 144368, dtype: bool

In [164]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title.akas.tsv.gz', 'ratings.akas.tsv.gz', 'title_basics.csv.gz']

In [165]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [None]:
## Save current dataframe to file.
akas.to_csv("Data/title.akas.tsv.gz", compression='gzip', index=False)

In [None]:
## Save current dataframe to file.
ratings.to_csv("Data/ratings.akas.tsv.gz", compression='gzip', index=False)