Milene Carmes Vallejo
12/13/2022

# Import

In [9]:
import pandas as pd
import numpy as np
import os, time, json
import tmdbsimple as tmdb


# Downloads the Files

In [10]:
# IMDb dataset endpoints (gzipped TSV): title basics, alternate titles (AKAs), ratings.
basics_url, akas_url, ratings_url = (
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
)

# Loading TSVs with Pandas

In [14]:
# Read the gzipped, tab-separated basics file straight from its URL.
basics = pd.read_csv(
    basics_url,
    sep='\t',
    low_memory=False,  # infer each column's dtype from the whole file, not chunk by chunk
)
# Display the loaded table
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9461877,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9461878,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9461879,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9461880,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [15]:
# Read the gzipped, tab-separated AKAs (alternate titles) file straight from its URL.
akas = pd.read_csv(
    akas_url,
    sep='\t',
    low_memory=False,  # infer each column's dtype from the whole file, not chunk by chunk
)
# Display the loaded table
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
34251229,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
34251230,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
34251231,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
34251232,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


In [16]:
# Read the gzipped, tab-separated ratings file straight from its URL.
ratings = pd.read_csv(
    ratings_url,
    sep='\t',
    low_memory=False,  # infer each column's dtype from the whole file, not chunk by chunk
)
# Display the loaded table
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1929
1,tt0000002,5.8,261
2,tt0000003,6.5,1745
3,tt0000004,5.6,176
4,tt0000005,6.2,2559
...,...,...,...
1258675,tt9916690,7.4,6
1258676,tt9916720,5.4,287
1258677,tt9916730,8.0,8
1258678,tt9916766,6.7,21


# Required Preprocessing - Details


According to the data dictionary, null values have been encoded as \N.

You will want to find those and replace them with np.nan.

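However, the backslash (\) is the escape character in Python strings: it changes how the character that follows it is interpreted.

So if we were to say df.replace({'\N':np.nan}), Python would not see a literal backslash followed by N; it would treat \N as the start of an escape sequence (\N{...} names a Unicode character) and raise a SyntaxError.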

To fix this, add a second backslash character, which will tell the computer that you actually WANTED to use a literal \.

df.replace({'\\N':np.nan})


In [17]:
# Turn the literal \N null marker into np.nan in each of the three tables.
# The key is written '\\N' so Python reads a literal backslash followed by N.
basics= basics.replace({'\\N':np.nan})
akas = akas.replace({'\\N':np.nan})
ratings =ratings.replace({'\\N':np.nan})

## Basics

In [18]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
basics.duplicated().sum()

0

In [19]:
# Replace "\N" with np.nan
# (repeats the replacement already applied to basics in the preprocessing cell above)
basics= basics.replace({'\\N':np.nan})

In [20]:
# Count missing values in each column of basics
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1266972
endYear           9361754
runtimeMinutes    6748869
genres             431846
dtype: int64

#### Eliminate movies that are null for runtimeMinutes and genre

In [21]:
# Drop every row that is missing runtimeMinutes or genres (either one is enough to drop it)
basics = basics.dropna(subset=['runtimeMinutes', 'genres'])

In [22]:
# Re-count missing values per column to confirm runtimeMinutes and genres are now complete
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          122606
endYear           2590792
runtimeMinutes          0
genres                  0
dtype: int64

In [None]:
# Count how many rows fall under each titleType value
basics['titleType'].value_counts()

#### Keep only titleType==Movie

In [None]:
# Restrict the table to rows whose titleType is exactly 'movie'
basics = basics[basics['titleType'].eq('movie')]
# Only 'movie' should remain in the counts
basics['titleType'].value_counts()

#### filter startYear 2000-2022

In [None]:
# Count titles per startYear value
basics['startYear'].value_counts()

In [None]:
# Inspect the dtype of startYear before converting it
basics['startYear'].dtype

In [None]:
# startYear must be an integer to filter on 2000-2022, so drop rows with a missing year first
basics = basics.dropna(subset=['startYear'])

In [None]:
# Cast startYear to integer.
# Build a new frame with .assign instead of writing into a column of `basics`:
# at this point `basics` is the result of earlier row filtering, and in-place
# column assignment on such a frame can raise pandas' SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(int))
# Confirm the new dtype
basics['startYear'].dtype

In [None]:
# Keep titles whose startYear lies in 2000-2022 (between() includes both endpoints)
basics = basics[basics['startYear'].between(2000, 2022)]
# Titles per remaining year
basics['startYear'].value_counts()

#### Exclude movies that are included in the documentary category.

In [None]:
# Flag every title whose genres string mentions "documentary" (case-insensitive) ...
is_documentary = basics['genres'].str.contains('documentary', case=False)
# ... and keep only the rows that are not flagged.
basics = basics.loc[~is_documentary]



#### Keep only US movies (Use AKAs table)

## AKAs

#### keep only US movies.

In [5]:
# Keep only the alternate-title rows whose region is 'US'
akas = akas[akas['region'].eq('US')]
# 'US' should be the only region left in the counts
akas['region'].value_counts()


NameError: name 'akas' is not defined

In [None]:
# Boolean mask: True where the title's tconst appears as a titleId in the US-filtered akas table
keepers = basics['tconst'].isin(akas['titleId'])
# Restrict basics to those titles and display the result
basics = basics.loc[keepers]
basics




In [None]:
# Replace "\N" with np.nan
# (repeats the replacement already applied to akas in the preprocessing cell near the top)
akas= akas.replace({'\\N':np.nan})

## Ratings

In [None]:
# Replace "\N" with np.nan
# (repeats the replacement already applied to ratings in the preprocessing cell near the top)
ratings= ratings.replace({'\\N':np.nan})

In [None]:
# Filter the ratings table down to titles whose tconst appears in the US-filtered akas dataframe
keepers1 =ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers1]
# Display the filtered ratings
ratings

# Summary

In [None]:
# Summarize the ratings table: columns, non-null counts, dtypes and memory usage
ratings.info()

In [None]:
# Summarize the akas table: columns, non-null counts, dtypes and memory usage
akas.info()

In [None]:
# Summarize the basics table: columns, non-null counts, dtypes and memory usage
basics.info()

# downloading data

In [None]:
# Display the filtered basics table
basics

In [None]:
# Display the filtered akas table
akas

In [None]:
# Display the filtered ratings table
ratings

# creating a 'data' folder

In [None]:
# Create the Data/ folder if it does not exist yet (os is imported in the first cell).
os.makedirs('Data/', exist_ok=True)
# Save the filtered tables as gzipped CSVs. The "Open saved file" cells below read
# exactly these paths, so without this step a fresh Restart & Run All would fail
# with FileNotFoundError. index=False keeps the row index out of the file so it
# does not come back as an extra "Unnamed: 0" column on reload.
basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)
akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)
ratings.to_csv("Data/title_ratings.csv.gz", compression='gzip', index=False)
# Confirm the folder exists and list the saved files
os.listdir("Data/")


# Open saved file

In [None]:
# Reload the basics table from its gzipped CSV in Data/ (the file must already exist)
# and preview the first rows.
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()



In [None]:
# Reload the akas table from its gzipped CSV in Data/ (the file must already exist)
# and preview the first rows.
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()

In [None]:
# Reload the ratings table from its gzipped CSV in Data/ (the file must already exist)
# and preview the first rows.
# NOTE(review): this binds `rating` (singular) while earlier cells use `ratings` —
# confirm which name later cells expect before renaming.
rating = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
rating.head()

# Part 2 

## 1 - Designate a folder

In [None]:
# Folder used for the notebook's data files
folder = "Data/"
# Create it if it is missing (no error when it already exists)
os.makedirs(folder, exist_ok = True)
# Show what the folder currently contains
os.listdir(folder)

## 2 - Define Your Functions

In [None]:
def write_json(new_data, filename):
    """Append new records to the JSON array stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: Either a list of records (each element is added individually)
            or a single JSON-serializable record (added as one element).
        filename: Path to an existing file whose content is a JSON array.

    Raises:
        FileNotFoundError: If filename does not exist (it is opened in 'r+' mode).
        json.JSONDecodeError: If the file does not contain valid JSON.
        AttributeError: If the file's top-level JSON value is not a list.
    """
    with open(filename, 'r+') as file:
        # Load the records already stored in the file.
        file_data = json.load(file)
        # A list of new records is merged element-wise; anything else is one record.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file with the combined records.
        file.seek(0)
        json.dump(file_data, file)
        # Cut off any leftover bytes from the previous content: if the old file was
        # longer than the new dump (e.g. it was pretty-printed), the stale tail
        # would otherwise remain and make the file invalid JSON.
        file.truncate()
