# # Project 3 _ Part 1


In [1]:
# %%
import pandas as pd 
import numpy as np
import seaborn as sns


# BASIC DATAFRAME

In [2]:
# 
## uploaded file 
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
df_b = basics
df_b.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# %%
# replace missing values with NaN
df_b.replace({'\\N':np.nan}, inplace=True)

In [4]:
# %%
# get basic's dataframe info
df_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9372141 entries, 0 to 9372140
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 643.5+ MB


In [5]:
# 
# Check for missing values 
df_b.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1257693
endYear           9273539
runtimeMinutes    6783224
genres             430259
dtype: int64

In [6]:
###
# drop missing values from startYear, genres, runtimeMinutes

df_b = df_b.dropna(subset=['startYear', 'genres', 'runtimeMinutes', 'endYear'])

In [7]:
##
df_b = df_b.infer_objects().convert_dtypes()


In [8]:
#
# check for missing values 
df_b.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [9]:
#
#Keep only titleType_movie & Keep startYear 2000-2022

keepTs_df= df_b[(df_b['startYear']>='2000') & (df_b['startYear']<='2022') & (df_b['titleType']=='movie')]
keepTs_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


In [10]:
#
new_df = keepTs_df

In [11]:
# Exclude movies that are included in the documentary category. 
is_documentary = new_df['genres'].str.contains('Documentary', case=False)
df = new_df[~is_documentary]



# AKAS DATAFRAME / EDA 

In [12]:
#
# Upload 2nd dataframe
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
df_akas = akas
df_akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [13]:
# replace missing values with NaN
df_akas.replace({'\\N':np.nan}, inplace=True)

In [14]:
# US Filter 
df_filter_akas = df_akas[df_akas['region'] == 'US']
df_filter_akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
33840262,tt9916702,1,Loving London: The Playground,US,,,,0
33840299,tt9916720,10,The Demonic Nun,US,,tv,,0
33840301,tt9916720,12,The Nun 2,US,,imdbDisplay,,0
33840318,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


# RATINGS DATAFRAME

In [15]:
#
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
df_ratings = ratings
df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1922
1,tt0000002,5.8,259
2,tt0000003,6.5,1735
3,tt0000004,5.6,174
4,tt0000005,6.2,2548


In [16]:
#
# replace missing values with NaN
df_ratings.replace({'\\N':np.nan}, inplace=True)

In [17]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keeper = df_ratings['tconst'].isin(df_filter_akas['titleId'])
keeper

0           True
1           True
2          False
3          False
4           True
           ...  
1247788    False
1247789     True
1247790    False
1247791    False
1247792    False
Name: tconst, Length: 1247793, dtype: bool

In [18]:
# 
df_ratings_clean = df_ratings[keeper]
df_ratings_clean

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1922
1,tt0000002,5.8,259
4,tt0000005,6.2,2548
5,tt0000006,5.1,175
6,tt0000007,5.4,797
...,...,...,...
1247767,tt9916204,8.2,247
1247773,tt9916348,8.5,17
1247774,tt9916362,6.4,4989
1247778,tt9916428,3.8,14


In [19]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title_akas_clean_df.csv.gz',
 'ratings_clean_df.csv.gz',
 'title_basics_clean_df.csv.gz']

In [20]:
## Save current dataframe to file.
df.to_csv('Data/title_basics_clean_df.csv.gz', compression='gzip', index=False)

In [21]:
## Save current dataframe to file. 
df_filter_akas.to_csv('Data/title_akas_clean_df.csv.gz', compression='gzip', index=False)

In [22]:
## Save current dataframe to file.
df_ratings_clean.to_csv('Data/ratings_clean_df.csv.gz', compression='gzip',  index=False)
