# # Project 3 - Part 1


In [1]:
# %%
import pandas as pd 
import numpy as np
import seaborn as sns


# BASICS DATAFRAME

In [2]:
# 
# Download the IMDb title.basics dataset directly from its public URL.
# The file is a gzip-compressed, tab-separated table; pandas infers the
# compression from the ".gz" suffix.
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, delimiter='\t', low_memory=False)
# Preview the first five rows.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# Tally the number of titles per titleType to see which categories are present.
basics['titleType'].value_counts()

tvEpisode       7104917
short            901753
movie            628502
video            267907
tvSeries         234839
tvMovie          138461
tvMiniSeries      46167
tvSpecial         39288
videoGame         32919
tvShort           10760
tvPilot               2
Name: titleType, dtype: int64

In [4]:
# %%
# Replace the '\N' missing-value placeholder with NaN throughout the frame.
# NOTE: inplace=True mutates `basics` directly instead of returning a new frame.
basics.replace({'\\N':np.nan}, inplace=True)

In [5]:
# %%
# Summarise the basics DataFrame: number of entries, column names, dtypes
# and memory usage.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9405515 entries, 0 to 9405514
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 645.8+ MB


In [6]:
# 
# Count the missing (NaN) entries in every column of basics.
basics.isnull().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1262586
endYear           9306349
runtimeMinutes    6767435
genres             431198
dtype: int64

In [7]:
###
# Discard every row that lacks a genre or a runtime; the other columns are
# allowed to stay incomplete.
required_columns = ['genres', 'runtimeMinutes']
basics = basics.dropna(subset=required_columns)

In [8]:
# Re-count missing values per column to confirm that genres and
# runtimeMinutes no longer contain any.
basics.isnull().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           98273
endYear           2522452
runtimeMinutes          0
genres                  0
dtype: int64

In [9]:
# Keep only the rows whose titleType is exactly 'movie'.
is_movie = basics['titleType'].eq('movie')
basics = basics[is_movie]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
9405280,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
9405364,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
9405405,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9405432,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary


In [10]:
# Flag every title whose genres string mentions "Documentary"
# (case=False makes the match case-insensitive) ...
is_documentary = basics['genres'].str.contains('Documentary', case=False)
# ... and keep only the titles that are NOT flagged.
df = basics.loc[~is_documentary]


In [11]:
# Convert startYear to float (float rather than int so that any remaining NaN
# values survive the cast). `.assign` returns a new DataFrame instead of writing
# a column into a slice of `basics`, which is what triggered pandas'
# SettingWithCopyWarning with the direct `df['startYear'] = ...` assignment.
df = df.assign(startYear=df['startYear'].astype(float))

# Keep only titles released from 2000 through 2021, both ends inclusive.
# Rows with a missing startYear evaluate to False here and are dropped.
# (The restriction to titleType == 'movie' was already applied to `basics`.)
df = df[df['startYear'].between(2000, 2021)]
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['startYear']= df['startYear'].astype(float)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34793,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61095,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67643,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77937,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86773,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9405187,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9405196,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"
9405235,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
9405280,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"



# AKAS DATAFRAME / EDA 

In [12]:
#
# Download the second dataset, IMDb title.akas, from its public URL
# (gzip-compressed, tab-separated).
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, delimiter='\t', low_memory=False)
# df_akas is a second name for the same object, not a copy.
df_akas = akas
# Preview the first five rows.
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [13]:
# Replace the '\N' missing-value placeholder with NaN throughout the frame.
# NOTE: this runs in place, and df_akas was bound to the same object as `akas`
# without copying, so `akas` sees the change as well.
df_akas.replace({'\\N':np.nan}, inplace=True)

In [14]:
# Keep only the alternate-title rows whose region is exactly 'US'.
is_us_region = df_akas['region'].eq('US')
df_filter_akas = df_akas.loc[is_us_region]
df_filter_akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
33999069,tt9916702,1,Loving London: The Playground,US,,,,0
33999106,tt9916720,10,The Demonic Nun,US,,tv,,0
33999108,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
33999126,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


# RATINGS DATAFRAME

In [15]:
#
# Download the IMDb title.ratings dataset from its public URL
# (gzip-compressed, tab-separated).
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings_url, delimiter='\t', low_memory=False)
# df_ratings is a second name for the same object, not a copy.
df_ratings = ratings
# Preview the first five rows.
df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1924
1,tt0000002,5.8,260
2,tt0000003,6.5,1736
3,tt0000004,5.6,175
4,tt0000005,6.2,2550


In [16]:
#
# Replace the '\N' missing-value placeholder with NaN throughout the frame.
# NOTE: this runs in place, and df_ratings was bound to the same object as
# `ratings` without copying, so `ratings` sees the change as well.
df_ratings.replace({'\\N':np.nan}, inplace=True)

In [17]:
# Build a boolean mask over the ratings table: True where the title's id
# also appears among the US rows of the akas table.
us_title_ids = df_filter_akas['titleId']
keeper = df_ratings['tconst'].isin(us_title_ids)
keeper

0           True
1           True
2          False
3          False
4           True
           ...  
1251737    False
1251738     True
1251739    False
1251740    False
1251741    False
Name: tconst, Length: 1251742, dtype: bool

In [18]:
# 
# Apply the mask so that only ratings for US-listed titles remain.
df_ratings_clean = df_ratings.loc[keeper]
df_ratings_clean

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1924
1,tt0000002,5.8,260
4,tt0000005,6.2,2550
5,tt0000006,5.1,175
6,tt0000007,5.4,798
...,...,...,...
1251716,tt9916204,8.2,250
1251722,tt9916348,8.5,17
1251723,tt9916362,6.4,5018
1251727,tt9916428,3.8,14


In [19]:
# Build a boolean mask over the filtered basics table: True where the
# title's id also appears among the US rows of the akas table.
us_akas_ids = df_filter_akas['titleId']
keeper1 = df['tconst'].isin(us_akas_ids)
keeper1

34793       True
61095       True
67643       True
77937      False
86773       True
           ...  
9405187     True
9405196     True
9405235    False
9405280     True
9405364    False
Name: tconst, Length: 137297, dtype: bool

In [20]:
# Apply the mask so that only US-listed titles remain in df.
df = df.loc[keeper1]
df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34793,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61095,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67643,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
86773,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
93909,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002.0,,126,Drama
...,...,...,...,...,...,...,...,...,...
9404651,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019.0,,74,Drama
9405047,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019.0,,97,"Comedy,Drama,Fantasy"
9405187,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
9405196,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [21]:
# Make sure the output folder exists before anything is written to it.
import os
output_dir = 'data/'
# exist_ok=True turns this into a no-op when the folder is already present.
os.makedirs(output_dir, exist_ok=True)
# List the folder's contents to confirm it is there.
os.listdir("data/")

['basics_clean_df.csv.gz',
 'tmdb_api_results_[2000, 2001].json',
 'ratings_clean_df.csv.gz',
 'akas_clean_df.csv.gz',
 'final_tmdb_data_[2000, 2001].csv.gz',
 '.ipynb_checkpoints']

In [22]:
# Write the filtered basics table to a gzip-compressed CSV.
# index=False leaves the DataFrame's row index out of the file.
df.to_csv('data/basics_clean_df.csv.gz', index=False, compression='gzip')

In [23]:
# Write the US-only akas table to a gzip-compressed CSV.
# index=False leaves the DataFrame's row index out of the file.
df_filter_akas.to_csv('data/akas_clean_df.csv.gz', index=False, compression='gzip')

In [24]:
# Write the US-only ratings table to a gzip-compressed CSV.
# index=False leaves the DataFrame's row index out of the file.
df_ratings_clean.to_csv('data/ratings_clean_df.csv.gz', index=False, compression='gzip')