In [1]:
import pandas as pd
import numpy as np

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9875263 entries, 0 to 9875262
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 678.1+ MB


In [4]:
basics.shape

(9875263, 9)

In [5]:
basics.duplicated().sum()

0

In [6]:
basics.isna().sum().sum()

37

In [7]:
basics.replace({'\\N':np.nan}, inplace = True)

In [8]:
basics['runtimeMinutes'].value_counts()

30      218155
60      160790
22      160449
15       76181
44       75872
         ...  
961          1
641          1
964          1
616          1
2088         1
Name: runtimeMinutes, Length: 889, dtype: int64

In [9]:
basics.dropna(subset= ['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear          170693
endYear           2860262
runtimeMinutes          0
genres              76915
dtype: int64

In [10]:
basics.dropna(subset= ['genres'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          165739
endYear           2784940
runtimeMinutes          0
genres                  0
dtype: int64

In [11]:
basics['titleType'].value_counts()

tvEpisode       1442149
short            602313
movie            382856
video            180670
tvMovie           91688
tvSeries          90640
tvSpecial         18222
tvMiniSeries      17240
tvShort            8819
videoGame           321
Name: titleType, dtype: int64

In [12]:
basics['titleType']=='movie'
FLMovie = basics['titleType']=='movie'

In [13]:
basics.loc[FLMovie,:]
basics = basics.loc[FLMovie,:]

In [14]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 382856 entries, 8 to 9875213
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          382856 non-null  object
 1   titleType       382856 non-null  object
 2   primaryTitle    382856 non-null  object
 3   originalTitle   382856 non-null  object
 4   isAdult         382856 non-null  object
 5   startYear       376396 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  382856 non-null  object
 8   genres          382856 non-null  object
dtypes: object(9)
memory usage: 29.2+ MB


In [15]:
#year  change to a float- .astype(float)
basics['startYear'].value_counts().astype(float)

2017    14376.0
2018    14341.0
2019    14082.0
2016    13963.0
2015    13482.0
         ...   
1899        1.0
1904        1.0
1897        1.0
1896        1.0
1894        1.0
Name: startYear, Length: 130, dtype: float64

In [16]:
basics = basics[(basics['startYear']>='2000') & (basics['startYear']<'2022')]

In [17]:
# Exclude movies that are included in the documentary category.
#Check order in LP
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [18]:
akas="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas, sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [19]:
akas['region'].value_counts()

DE    4311479
FR    4307420
JP    4305902
IN    4247206
ES    4227222
       ...   
FM          2
TV          1
PW          1
NR          1
NU          1
Name: region, Length: 248, dtype: int64

In [20]:
akas.replace({'\\N':np.nan})

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0
...,...,...,...,...,...,...,...,...
35994981,tt9916852,5,Episódio #3.20,PT,pt,,,0
35994982,tt9916852,6,Episodio #3.20,IT,it,,,0
35994983,tt9916852,7,एपिसोड #3.20,IN,hi,,,0
35994984,tt9916856,1,The Wind,DE,,imdbDisplay,,0


In [21]:
akas = akas[(akas['region'] == 'US')]

In [22]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics['tconst'].isin(akas['titleId'])
basics =  basics[keepers]

In [23]:
akas['region'].value_counts()

US    1438680
Name: region, dtype: int64

In [24]:
ratings="https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1977
1,tt0000002,5.8,264
2,tt0000003,6.5,1824
3,tt0000004,5.6,178
4,tt0000005,6.2,2617


In [25]:
ratings.replace({'\\N':np.nan})

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1977
1,tt0000002,5.8,264
2,tt0000003,6.5,1824
3,tt0000004,5.6,178
4,tt0000005,6.2,2617
...,...,...,...
1314581,tt9916730,8.3,10
1314582,tt9916766,7.0,21
1314583,tt9916778,7.2,36
1314584,tt9916840,7.5,7


In [26]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

In [27]:
['title_basics.csv.gz']

['title_basics.csv.gz']

In [28]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [29]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
