## Imports

In [1]:
# Imports 
import os
import pandas as pd
import numpy as np

## Create Data Folder

In [2]:
# Make sure the 'Data' folder exists (no error if it is already there),
# then list what it currently contains
os.makedirs(name='Data/', exist_ok=True)
os.listdir(path="Data/")

['.ipynb_checkpoints',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

## Load Dataset

In [3]:
# Read the US-only AKAs file into a DataFrame and preview the first rows
akas = pd.read_csv(
    filepath_or_buffer='Data/title-akas-us-only.csv',
    low_memory=False,
)
akas.head(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [4]:
# Inspect column dtypes, non-null counts and memory usage
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


## Filtering/Cleaning Steps:
- AKAs:
  - Keep only US movies.
  - Replace `"\N"` with `np.nan`.

In [5]:
# Keep only US movies.
# No filtering is done here: the file loaded above (title-akas-us-only.csv) is already the US-only subset.

In [6]:
# Replace the "\N" placeholder with np.nan so pandas treats those cells as missing.
# Reassign instead of using inplace=True: the result is the same, but the step
# stays explicit and chainable.
akas = akas.replace({'\\N': np.nan})

In [7]:
# Confirm the replacement persisted: preview the first rows, where "\N" should no longer appear
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [8]:
# Write the cleaned AKAs table to a gzip-compressed CSV, leaving out the row index
akas.to_csv(
    path_or_buf="Data/title_akas.csv.gz",
    index=False,
    compression='gzip',
)

### IMDB Data Sources:
- Downloads Page - https://developer.imdb.com/non-commercial-datasets/
- Data Dictionary - https://developer.imdb.com/non-commercial-datasets/
- TMDB Logo - https://www.themoviedb.org/about/logos-attribution

![blue_square_1-5bdc75aaebeb75dc7ae79426ddd9be3b2be1e342510f8202baf6bffa71d7f5c4.svg](attachment:blue_square_1-5bdc75aaebeb75dc7ae79426ddd9be3b2be1e342510f8202baf6bffa71d7f5c4.svg)