# Part 1

## Filtering/Cleaning

In [1]:
import os

import numpy as np
import pandas as pd

### Title Basics

In [2]:
# Download the IMDb title.basics dump (gzip-compressed, tab-separated).
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(
    basics_url,
    sep="\t",
    low_memory=False,
)

In [None]:
# Print a summary of the raw title.basics columns and their dtypes.
basics.info()

In [None]:
# Preview the first rows of the raw title.basics table.
basics.head()

In [None]:
# Preview the schema with `tconst` shown as `imdb id`.
# The keyword is `columns` (the original `colums` raised a TypeError).
# The renamed frame is deliberately not assigned back to `basics`: the
# filtering cells further down still read the title key as
# basics_doc['tconst'], so the stored column name has to stay `tconst`.
basics.rename(columns={'tconst': 'imdb id'}).info()

#### Replace "\N" with np.nan

In [None]:
# Mark the "\N" missing-value placeholder in runtimeMinutes as NaN.
# Series.replace needs an explicit replacement value: the one-argument call
# replace('') never produced NaN (depending on the pandas version it either
# forward-fills matches or is rejected outright).
basics['runtimeMinutes'] = basics['runtimeMinutes'].replace('\\N', np.nan)

In [None]:
# Turn every literal "\N" cell in the whole table into NaN.
basics = basics.replace(to_replace={'\\N': np.nan})

In [None]:
# Re-check the column summary after the "\N" -> NaN replacement.
basics.info()

In [None]:
# Preview the first rows after the "\N" -> NaN replacement.
basics.head()

#### Eliminate titles that are null for runtimeMinutes or genres

In [None]:
# Drop every row that is missing a runtime or a genre list.
required_cols = ['runtimeMinutes', 'genres']
basics = basics.dropna(subset=required_cols)

In [None]:
# Preview the first rows after dropping rows without runtime/genres.
basics.head()

#### Keep only titleType == 'movie'

In [None]:
# Boolean mask of the rows whose titleType is exactly 'movie'.
is_movie = basics['titleType'] == 'movie'
basics_mov = basics.loc[is_movie]
basics_mov.head()

#### Keep startYear 2000-2022

In [None]:
# Count how many movies fall in each startYear value.
basics_mov['startYear'].value_counts()

In [None]:
# Cast startYear to float so it can be compared numerically below.
year_dtype = {'startYear': float}
basics_mov = basics_mov.astype(year_dtype)
basics_mov.info()

In [None]:
# Keep the movies whose startYear lies in 2000..2022 (both ends included).
start_year = basics_mov['startYear']
in_range = (start_year >= 2000) & (start_year <= 2022)
basics_year = basics_mov.loc[in_range]
basics_year

#### Eliminate movies that include "Documentary" in genres

In [None]:
# True for rows whose genres text mentions "Documentary" (case-insensitive).
doc = basics_year['genres'].str.contains('Documentary', case=False)
# Invert the mask to keep only the non-documentary movies.
basics_doc = basics_year.loc[~doc]

In [None]:
# Preview the first rows of the non-documentary movies.
basics_doc.head()

In [None]:
# Print a column/dtype summary of the filtered basics table.
basics_doc.info()

### AKA

#### Keep only US movies

In [None]:
# Download the IMDb title.akas dump (gzip-compressed, tab-separated).
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(
    akas_url,
    sep="\t",
    low_memory=False,
)

In [None]:
# Preview the first rows of the raw title.akas table.
akas.head()

In [None]:
# Boolean mask of the rows whose region is exactly 'US'.
is_us = akas['region'] == 'US'
akas_us = akas.loc[is_us]
akas_us.head()

#### Replace '\N' with np.nan

In [None]:
# Turn every literal "\N" cell in the US akas table into NaN.
akas_us = akas_us.replace(to_replace={'\\N': np.nan})

In [None]:
# Preview the first rows after the "\N" -> NaN replacement.
akas_us.head()

In [None]:
# Print a column/dtype summary of the US akas table.
akas_us.info()

### Ratings

In [None]:
# Download the IMDb title.ratings dump (gzip-compressed, tab-separated).
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(
    ratings_url,
    sep="\t",
    low_memory=False,
)

In [None]:
# Preview the first rows of the raw title.ratings table.
ratings.head()

#### Replace '\N' with np.nan

In [None]:
# Turn every literal "\N" cell in the ratings table into NaN.
ratings = ratings.replace(to_replace={'\\N': np.nan})

In [None]:
# Print a column/dtype summary of the ratings table.
ratings.info()

### Filter one dataset with another

#### Basics

In [None]:
# Flag the basics rows whose tconst appears among the titleId values of the
# US-only akas table.
us_title_ids = akas_us['titleId']
keepers1 = basics_doc['tconst'].isin(us_title_ids)
keepers1

In [None]:
# Keep only the basics rows flagged by the keepers1 mask.
basics_a = basics_doc[keepers1]
basics_a

#### Ratings

In [None]:
# Flag the ratings rows whose tconst appears among the titleId values of the
# US-only akas table.
us_title_ids = akas_us['titleId']
keepers = ratings['tconst'].isin(us_title_ids)
keepers

In [None]:
# Keep only the ratings rows flagged by the keepers mask.
# Note: despite its name, basics_n holds rows of the ratings table.
basics_n = ratings[keepers]
basics_n

## Export data

In [None]:
## Save the filtered title basics to a gzip-compressed CSV.
# to_csv does not create missing parent folders, so make sure Data/ exists
# before writing.
os.makedirs("Data/", exist_ok=True)
basics_a.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)

In [None]:
# Read the saved basics file back in and preview it.
basics_f = pd.read_csv(
    "Data/title_basics.csv.gz",
    low_memory=False,
)
basics_f.head()

In [None]:
## Save the US-only akas table to a gzip-compressed CSV.
# to_csv does not create missing parent folders, so make sure Data/ exists
# before writing.
os.makedirs("Data/", exist_ok=True)
akas_us.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)

In [None]:
# Read the saved akas file back in and preview it.
akas_f = pd.read_csv(
    "Data/title_akas.csv.gz",
    low_memory=False,
)
akas_f.head()

In [None]:
## Save the filtered ratings rows (held in basics_n) to a gzip-compressed CSV.
# to_csv does not create missing parent folders, so make sure Data/ exists
# before writing.
os.makedirs("Data/", exist_ok=True)
basics_n.to_csv("Data/title_ratings.csv.gz", compression='gzip', index=False)

In [None]:
# Read the saved ratings file back in and preview it.
ratings_f = pd.read_csv(
    "Data/title_ratings.csv.gz",
    low_memory=False,
)
ratings_f.head()

In [None]:
# Example: create the output folder with os (no error if it already exists).
import os

os.makedirs("Data/", exist_ok=True)
# List the folder contents to confirm it is there.
os.listdir("Data/")

## Data was collected from TMDB
https://www.themoviedb.org/about/logos-attribution