## Connection to Google Drive

In [1]:
# Importing datasets from shared folder
# from google.colab import drive
# drive.mount("/content/drive/")

In [2]:
# File location variable 
# fileloc = '/content/drive/My Drive/DSI_16/Capstone/datasets/'

# File @ local machine
fileloc = 'datasets/'

# Introduction
As the trend of on-demand entertainment becoming more prevelant, there is a change in the trend of content creation being less exclusive to a particular network and more avaiable across various content providers. Due to this, there is a competiton across various entertainment provider to provide the right content to the right user to ensure that their users are engaged and reduce their needs to move to another content provider. 1 key factor that can help in pushing the right content to the right users is to have a robust recommendation system.

A singapore start-up which envision to be the Netflix of Singapore, has hired me as a Data scientist to assist their team in creating a recommendation system. They have kindly provided movie rating data from their system. Due to PDPA, no user's profiles are provided.

## Problem Statement
1. To create a recommendation system based on collaborative filtering to recommend movies to users based on similar movie watched & ratings.
2. Recommendation system should provide new contents (w/o rating) users based on user profile to user.

# Codes Book

## Import Library

In [3]:
import numpy as np
import pandas as pd

# Library for importing and saving large datasets
import glob
import json
import h5py

# For dealing with iterables
from itertools import chain
from collections import ChainMap

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# API Call
import requests

from datetime import datetime
import time
import random
import ast

# Additional Setting
pd.set_option("display.max_rows", 201)

## User Defined Functions

### Time Function

In [4]:
# Time function

def t_now(string):
    '''
    Insert Docstring
    '''
    time = datetime.now()
    current_time = time.strftime("%H:%M:%S")
    return f"{string} = {current_time}"

### Title Pull

In [5]:
# Create a function for API call for different movie titles

def title_pull(item_ls, item_release_year):
    '''
    Function: This is 
    '''
    
    global REQUEST_TOKEN
    
    item_name = ''
    
    # Loop to create a string to search movie database
    for i in range(len(item_ls)):
        if i != 0:
            item_name += ":" + item_ls[i]
        else:
            item_name = item_ls[i]
        
        # replace whitespace with %20
        search_item = item_name.replace(" ", "%20")
            
        # Try to search based on movie & release year
        try:
            url = f"https://api.themoviedb.org/3/search/movie?api_key={REQUEST_TOKEN}&query={search_item}&primary_release_year={item_release_year}&page=1" 
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'movie'
            return res
        except:
            pass

        # Try to search based on TV name & first air date
        try:
            url = f"https://api.themoviedb.org/3/search/tv?api_key={REQUEST_TOKEN}&query={search_item}&first_air_date_year={item_release_year}&page=1"
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'tv'
            return res
        except:
            pass

        # Try to search based on Movie title only
        try:
            url = f"https://api.themoviedb.org/3/search/movie?api_key={REQUEST_TOKEN}&query={search_item}&page=1" 
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'movie'
            return res
        except:
            pass

        # Try to search based on TV name only
        try:
            url = f"https://api.themoviedb.org/3/search/tv?api_key={REQUEST_TOKEN}&query={search_item}&page=1"
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'tv'
            return res
        except:
            return 'NA'

### Metadata API

In [6]:
# Create a function for API call for different movie titles

def movieid_pull(index, item_id, title_type):
    '''
    Function: Perform API call to movie db and extract relevant information base on movie_id

    arg1 (int): moive_id unique to the movie db

    arg2 (str): Whether the title is a movie or a tv show
    '''
    
    global REQUEST_TOKEN, chunk_size

    #Create empty dict
    res = dict()
    sq_dict = dict()

    # # Loop to create a string to search movie database
    # if index % chunk_size == 0:
    #     print(f"{chunk_size}{t_now(' rows processed')}")

    #Check title type
    if title_type == 'movie':
        # Find the top actor/actress of the moive - api order = 0
        url = f"https://api.themoviedb.org/3/movie/{item_id}/credits?api_key={REQUEST_TOKEN}" 
        req = requests.get(url)
        sq_dict = req.json()

        # Find actor/actress
        if sq_dict['cast']:
            res = {key: sq_dict['cast'][0][key] for key in sq_dict['cast'][0].keys() & {'name'}}
        
        # Find Associated Keywords with titles
        url = f"https://api.themoviedb.org/3/movie/{item_id}/keywords?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        if sq_dict['keywords']:
            res['keywords'] = sq_dict['keywords']

        # Find titles details
        url = f"https://api.themoviedb.org/3/movie/{item_id}?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        for i in ['genres', 'popularity','vote_count','vote_average']:
            if sq_dict[i]:
                res[i] = sq_dict[i]
        return res
    
    elif title_type == 'tv':
        # Find the top actor/actress/host of the moive - api order = 0
        url = f"https://api.themoviedb.org/3/tv/{item_id}/credits?api_key={REQUEST_TOKEN}" 
        req = requests.get(url)
        sq_dict = req.json()
        if sq_dict['cast']:
            res = {key: sq_dict['cast'][0][key] for key in sq_dict['cast'][0].keys() & {'name'}}

        # Find Associated Keywords with titles
        url = f"https://api.themoviedb.org/3/tv/{item_id}/keywords?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        if sq_dict['results']:
            res['keywords'] = sq_dict['results']

        # Find titles details
        url = f"https://api.themoviedb.org/3/tv/{item_id}?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        for i in ['genres', 'popularity','vote_count','vote_average']:
            if sq_dict[i]:
                res[i] = sq_dict[i]
        return res

### make dict

In [7]:
def make_dict(x):
    '''
    Function: Return a dict of 2 vals within a dict = {val1: val2}
    
    arg1 (list): A list of dictionaries

    '''
    
    final_dict = dict()
    try:
        for item in x:
            final_dict.update({item['name'] : item['id']})
        return final_dict
    except:
        return final_dict

### Chunking

In [8]:
def chunking(chunk_size, df):
    '''
    Function return a lsit of chunked dataframe
    
    arg1 (int): Chunk size of each chunk
    arg2 (df) : Dataframe to chunk
    
    return (list): A list of chunked dataframe
    '''
    
    chunk_list = []
    for i in range(0, len(titles),chunk_size):
        if i == 0:
            start_point = 0
        else:
            start_point = i

        df = titles.loc[start_point: start_point + chunk_size -1 ,:]
        chunk_list.append(df)
        
    return chunk_list

## Import Dataset

### Movie Title Dataset

In [9]:
# Import Movie titles file
titles = pd.read_csv(fileloc+ 'movie_titles.txt', sep = ',', names=['movie_id','year_of_release','title'], encoding='Latin_1')

In [10]:
# convert all movie title to lower case
titles['title'] = titles['title'].map(lambda x: str.lower(x))

In [11]:
print(f'shape of titles {titles.shape}')
titles.head()

shape of titles (17770, 3)


Unnamed: 0,movie_id,year_of_release,title
0,1,2003.0,dinosaur planet
1,2,2004.0,isle of man tt 2004 review
2,3,1997.0,character
3,4,1994.0,paula abdul's get up & dance
4,5,2004.0,the rise and fall of ecw


#### Delimit Title for search in MovieDB

In [12]:
# Split title into a list of words - delimiter ":"
titles['split_title'] = titles['title'].str.split(':')
titles.head(10)

Unnamed: 0,movie_id,year_of_release,title,split_title
0,1,2003.0,dinosaur planet,[dinosaur planet]
1,2,2004.0,isle of man tt 2004 review,[isle of man tt 2004 review]
2,3,1997.0,character,[character]
3,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance]
4,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw]
5,6,1997.0,sick,[sick]
6,7,1992.0,8 man,[8 man]
7,8,2004.0,what the #$*! do we know!?,[what the #$*! do we know!?]
8,9,1991.0,class of nuke 'em high 2,[class of nuke 'em high 2]
9,10,2001.0,fighter,[fighter]


In [13]:
# Create Chunk Size
chunk_size = 3000

In [14]:
chunk_list = chunking(chunk_size, titles)

#### Invoke API call to extract title id from MovieDB

In [15]:
# API key
REQUEST_TOKEN = 'd8f46f139abc47c1f048f3efc486fe53'

# Target web page:

# To authenticate api call session
url = "https://www.themoviedb.org/authenticate/"+ REQUEST_TOKEN

# Establishing the connection to the web page:
response = requests.get(url)

# You can use status codes to understand how the target server responds to your request.
# Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
print(response.status_code)

200


In [None]:
%%time

print(t_now('Start Time'))
print(f"There are {len(chunk_list)} batches to process")

# Saving of titles files
for index, df in enumerate(chunk_list):
    df['moviedb_result'] = df.apply(lambda x: title_pull(x['split_title'], x['year_of_release']),axis=1)
    print(f"{index} batch: {t_now('run time')}")
    df.to_csv(f'{fileloc}titles_added/ta_{index}.csv', index=False)
        
    # Check if url time-out
    if response.status_code != 200:
        print(f'Connection Error, error type: {response.status_code}')
        url = "https://www.themoviedb.org/authenticate/"+ REQUEST_TOKEN
        print(response.status_code)
        
    #generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(0,2)
    print(f'Sleep Duration: {sleep_duration}')
    print('-' * 50)
    time.sleep(sleep_duration) 

Start Time = 17:31:20
There are 6 batches to process


In [56]:
%%time
# Globbing of saved Chunks
# Import titles chunks into dataframe
ls_of_titles = []

# Specify the pattern matching for glob
rating_files = glob.glob(fileloc + 'titles_added/*')

# Loop to get dataframe into list
for filename in rating_files:
    df = pd.read_csv(filename, sep=',')
    ls_of_titles.append(df)

# Concat dataframe together
titles = pd.concat(ls_of_titles,ignore_index=True)

print('glob completed')

glob completed
Wall time: 70.8 ms


In [57]:
print(titles.shape)

(17770, 5)


In [58]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result
0,1,2003.0,dinosaur planet,['dinosaur planet'],"{'name': 'Dinosaur Planet', 'id': 11710, 'sear..."
1,2,2004.0,isle of man tt 2004 review,['isle of man tt 2004 review'],
2,3,1997.0,character,['character'],"{'id': 17139, 'title': 'Character', 'search_ti..."
3,4,1994.0,paula abdul's get up & dance,"[""paula abdul's get up & dance""]","{'id': 274766, 'title': ""Paula Abdul's Get Up ..."
4,5,2004.0,the rise and fall of ecw,['the rise and fall of ecw'],"{'id': 33209, 'title': 'The Rise & Fall of ECW..."


#### Cleaning of Title Dataset

##### Dropping of Null Values

In [59]:
titles.isnull().sum()[titles.isnull().sum()>0]

year_of_release      7
moviedb_result     803
dtype: int64

In [60]:
# Drop null values in moviedb_result and reassigned back to titles
print(f'Shape of titles before dropping: {titles.shape}')
titles = titles.dropna(axis=0, how='all', subset=['moviedb_result'])
print(f'Shape of titles after dropping: {titles.shape}')

Shape of titles before dropping: (17770, 5)
Shape of titles after dropping: (16967, 5)


In [61]:
# Change the wordings from API function call
titles['moviedb_result'] = titles['moviedb_result'].map(lambda x : x.replace('name', 'title'))
titles.reset_index(drop=True,inplace=True)
print(titles.shape)
titles.head()

(16967, 5)


Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result
0,1,2003.0,dinosaur planet,['dinosaur planet'],"{'title': 'Dinosaur Planet', 'id': 11710, 'sea..."
1,3,1997.0,character,['character'],"{'id': 17139, 'title': 'Character', 'search_ti..."
2,4,1994.0,paula abdul's get up & dance,"[""paula abdul's get up & dance""]","{'id': 274766, 'title': ""Paula Abdul's Get Up ..."
3,5,2004.0,the rise and fall of ecw,['the rise and fall of ecw'],"{'id': 33209, 'title': 'The Rise & Fall of ECW..."
4,6,1997.0,sick,['sick'],"{'id': 35638, 'title': 'Sick: The Life and Dea..."


In [62]:
titles[titles['year_of_release'].isnull()]

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result
4191,4388,,ancient civilizations: rome and pompeii,"['ancient civilizations', ' rome and pompeii']","{'id': 503681, 'title': 'Ancient Civilizations..."
4577,4794,,ancient civilizations: land of the pharaohs,"['ancient civilizations', ' land of the pharao...","{'id': 503681, 'title': 'Ancient Civilizations..."
6898,7241,,ancient civilizations: athens and greece,"['ancient civilizations', ' athens and greece']","{'id': 503681, 'title': 'Ancient Civilizations..."
10300,10782,,roti kapada aur makaan,['roti kapada aur makaan'],"{'id': 125292, 'title': 'Roti Kapada Aur Makaa..."
15932,16678,,jimmy hollywood,['jimmy hollywood'],"{'id': 31643, 'title': 'Jimmy Hollywood', 'sea..."


In [63]:
# Filter for years of releases that are missing
print(f'Shape of titles before dropping: {titles.shape}')
titles = titles[~(titles['year_of_release'].isnull())]
print(f'Shape of titles before dropping: {titles.shape}')

# Reset titles index after dropping of rows
titles.reset_index(drop=True,inplace=True)

Shape of titles before dropping: (16967, 5)
Shape of titles before dropping: (16962, 5)


<span style='color:magenta'>Remarks:</span> I have removed 808 titles from the Netflix datasets. This amount to about 4.5% loss in original data. This is acceptable as the cleanliness of the dataset is important to model a good recommendation system.

##### Change column datatype using ast.literal.eval

In [64]:
# Check the data type of the 2 columns
print(type(titles.loc[0,'split_title']))
print(type(titles.loc[0,'moviedb_result']))

<class 'str'>
<class 'str'>


In [65]:
# Convert, evaluate stirng representation into actual type
titles['split_title']  = titles['split_title'].apply(ast.literal_eval)
titles['moviedb_result']  = titles['moviedb_result'].apply(ast.literal_eval)

In [66]:
print(type(titles.loc[0,'split_title']))
print(type(titles.loc[0,'moviedb_result']))

<class 'list'>
<class 'dict'>


In [67]:
# Sample of an element
titles.loc[0,'moviedb_result']

{'title': 'Dinosaur Planet',
 'id': 11710,
 'search_title': 'dinosaur planet',
 'type': 'tv'}

In [68]:
# Unpack dictionary into dataframe columns
titles[['moviedb_search_title', 'moviedb_id', 'result_title','type']] =  pd.DataFrame(titles['moviedb_result'].tolist(), index=titles.index)
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type
0,1,2003.0,dinosaur planet,[dinosaur planet],"{'title': 'Dinosaur Planet', 'id': 11710, 'sea...",Dinosaur Planet,11710,dinosaur planet,tv
1,3,1997.0,character,[character],"{'id': 17139, 'title': 'Character', 'search_ti...",Character,17139,character,movie
2,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance],"{'id': 274766, 'title': 'Paula Abdul's Get Up ...",Paula Abdul's Get Up & Dance,274766,paula abdul's get up & dance,movie
3,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw],"{'id': 33209, 'title': 'The Rise & Fall of ECW...",The Rise & Fall of ECW,33209,the rise and fall of ecw,movie
4,6,1997.0,sick,[sick],"{'id': 35638, 'title': 'Sick: The Life and Dea...","Sick: The Life and Death of Bob Flanagan, Supe...",35638,sick,movie


##### Filter Netflix title = MovieDB titles

In [69]:
%%time
# 1 if exact match for each item of list in split_title == moviedb_search_title
titles['match'] = [[1 if element in titles.loc[i, 'moviedb_search_title'].lower() else 0 for element in titles.loc[i,'split_title']][0] for i in range(len(titles))]
titles.head()

Wall time: 391 ms


Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
0,1,2003.0,dinosaur planet,[dinosaur planet],"{'title': 'Dinosaur Planet', 'id': 11710, 'sea...",Dinosaur Planet,11710,dinosaur planet,tv,1
1,3,1997.0,character,[character],"{'id': 17139, 'title': 'Character', 'search_ti...",Character,17139,character,movie,1
2,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance],"{'id': 274766, 'title': 'Paula Abdul's Get Up ...",Paula Abdul's Get Up & Dance,274766,paula abdul's get up & dance,movie,1
3,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw],"{'id': 33209, 'title': 'The Rise & Fall of ECW...",The Rise & Fall of ECW,33209,the rise and fall of ecw,movie,0
4,6,1997.0,sick,[sick],"{'id': 35638, 'title': 'Sick: The Life and Dea...","Sick: The Life and Death of Bob Flanagan, Supe...",35638,sick,movie,1


<span style='color:magenta'>Remarks:</span>
To find match as there are instances where the moviedb is returning a wrong moive. This happens as the API call do not have an exact match function and only the top result is returned

In [70]:
# Filter for those that are wrongly match
titles[titles['match']==0].shape[0]

1590

In [71]:
# Filter for those that are match
titles = titles[titles['match']!=0]
titles.reset_index(drop=True, inplace=True)
print(titles.shape)
titles.head()

(15372, 10)


Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
0,1,2003.0,dinosaur planet,[dinosaur planet],"{'title': 'Dinosaur Planet', 'id': 11710, 'sea...",Dinosaur Planet,11710,dinosaur planet,tv,1
1,3,1997.0,character,[character],"{'id': 17139, 'title': 'Character', 'search_ti...",Character,17139,character,movie,1
2,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance],"{'id': 274766, 'title': 'Paula Abdul's Get Up ...",Paula Abdul's Get Up & Dance,274766,paula abdul's get up & dance,movie,1
3,6,1997.0,sick,[sick],"{'id': 35638, 'title': 'Sick: The Life and Dea...","Sick: The Life and Death of Bob Flanagan, Supe...",35638,sick,movie,1
4,7,1992.0,8 man,[8 man],"{'id': 196685, 'title': '8 Man - For All Lonel...",8 Man - For All Lonely Nights,196685,8 man,movie,1


##### Check if there are duplicated movies

In [72]:
titles[titles.duplicated(subset='title', keep='first')]

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
267,305,1996.0,jack,[jack],"{'id': 7095, 'title': 'Jack', 'search_title': ...",Jack,7095,jack,movie,1
327,379,1996.0,crash dive,[crash dive],"{'id': 160019, 'title': 'Crash Dive', 'search_...",Crash Dive,160019,crash dive,movie,1
888,1015,1996.0,dr. quinn,[dr. quinn],"{'id': 25101, 'title': 'Dr. Quinn Medicine Wom...",Dr. Quinn Medicine Woman: The Movie,25101,dr. quinn,movie,1
1107,1260,1999.0,journey to the center of the earth,[journey to the center of the earth],"{'id': 732427, 'title': 'Journey to the Center...",Journey to the Center of the Earth,732427,journey to the center of the earth,movie,1
1322,1505,1964.0,hamlet,[hamlet],"{'id': 261439, 'title': 'Hamlet at Elsinore', ...",Hamlet at Elsinore,261439,hamlet,movie,1
...,...,...,...,...,...,...,...,...,...,...
15274,17658,1994.0,the chase,[the chase],"{'id': 10694, 'title': 'The Chase', 'search_ti...",The Chase,10694,the chase,movie,1
15297,17685,1988.0,alice,[alice],"{'id': 114364, 'title': 'Alice in Wonderland',...",Alice in Wonderland,114364,alice,movie,1
15315,17704,1999.0,taboo,[taboo],"{'id': 20617, 'title': 'Taboo', 'search_title'...",Taboo,20617,taboo,movie,1
15330,17721,1998.0,the love letter,[the love letter],"{'id': 57943, 'title': 'The Love Letter', 'sea...",The Love Letter,57943,the love letter,movie,1


In [73]:
# Check Sample 1
titles[titles['title']=='journey to the center of the earth']

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
415,486,1959.0,journey to the center of the earth,[journey to the center of the earth],"{'id': 11571, 'title': 'Journey to the Center ...",Journey to the Center of the Earth,11571,journey to the center of the earth,movie,1
1107,1260,1999.0,journey to the center of the earth,[journey to the center of the earth],"{'id': 732427, 'title': 'Journey to the Center...",Journey to the Center of the Earth,732427,journey to the center of the earth,movie,1


In [74]:
# Check Sample 2
titles[titles['title']=='crash dive']

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
51,63,1943.0,crash dive,[crash dive],"{'id': 74012, 'title': 'Crash Dive', 'search_t...",Crash Dive,74012,crash dive,movie,1
327,379,1996.0,crash dive,[crash dive],"{'id': 160019, 'title': 'Crash Dive', 'search_...",Crash Dive,160019,crash dive,movie,1


<span style='color:magenta'>Remarks:</span>
It seems like there are some movies that are a re-make of the original. This may 'pollute' our model when we pull additional data from MovieDB especially when we are creating an title-based recommendation system later on. There are 2 treatments for this:
1. Create a unique Name that is a combination of title & year_of_release. This will helps us to retain these old movies and may improve our recommendation system.
2. Remove them as the improvement may be insignificant. 

I will choose option (2) for the following reason:
- These old titles may have missing information when I perfrom an API call from MovieDB. This will not value-add to my features in my recommendation system.
- Users tends to watch and rate more recent movies hence if we ignore the materiality of less features, the low ratings of these movies may mean that they will very unlikely to be recommended.

In [75]:
# Sort titles based on latest year_of_release
titles.sort_values(by='year_of_release',ascending=False, inplace=True)
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
10733,12379,2005.0,national geographic: king tut's final secrets,"[national geographic, king tut's final secrets]","{'id': 312215, 'title': 'National Geographic E...",National Geographic Explorer: King Tut's Final...,312215,national geographic,movie,1
10476,12078,2005.0,care bears: big wish movie,"[care bears, big wish movie]","{'id': 19018, 'title': 'Care Bears: Big Wish M...",Care Bears: Big Wish Movie,19018,care bears,movie,1
10863,12534,2005.0,steamboy,[steamboy],"{'id': 8953, 'title': 'Steamboy', 'search_titl...",Steamboy,8953,steamboy,movie,1
930,1060,2005.0,king's ransom,[king's ransom],"{'id': 27360, 'title': 'King's Ransom', 'searc...",King's Ransom,27360,king's ransom,movie,1
4388,5064,2005.0,left behind: world at war,"[left behind, world at war]","{'id': 38828, 'title': 'Left Behind: World at ...",Left Behind: World at War,38828,left behind,movie,1


In [76]:
# Drop all duplicate titles
print(f'Shape of titles before dropping: {titles.shape}')
titles = titles[~(titles.duplicated(subset='title', keep='first'))]
print(f'Shape of titles after dropping: {titles.shape}')

Shape of titles before dropping: (15372, 10)
Shape of titles after dropping: (14923, 10)


In [77]:
# Sample 1
titles[titles['title']=='journey to the center of the earth']

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
1107,1260,1999.0,journey to the center of the earth,[journey to the center of the earth],"{'id': 732427, 'title': 'Journey to the Center...",Journey to the Center of the Earth,732427,journey to the center of the earth,movie,1


In [78]:
# Check Sample 2
titles[titles['title']=='crash dive']

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_search_title,moviedb_id,result_title,type,match
327,379,1996.0,crash dive,[crash dive],"{'id': 160019, 'title': 'Crash Dive', 'search_...",Crash Dive,160019,crash dive,movie,1


<span style='color:magenta'>Remarks:</span>
From the sample after dropping duplicates, The older titles are dropped while the latest titles are retained.

In total, I have dropped 2,847 titles. This amount to 16% reduction in original data. However, this is still acceptable as this implies that our dataset is as clean as possible for modeling.

In [None]:
# Save dataframe to HDF5 
hf = pd.HDFStore(fileloc + 'movie_raw.hdf5')
hf['titles_search_mvid'] = titles
hf.close()

#### Invoke Metadata API Call

In [79]:
# API key
REQUEST_TOKEN = 'd8f46f139abc47c1f048f3efc486fe53'

# Target web page:

# To authenticate api call session
url = "https://www.themoviedb.org/authenticate/"+ REQUEST_TOKEN

# Establishing the connection to the web page:
response = requests.get(url)

# You can use status codes to understand how the target server responds to your request.
# Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
print(response.status_code)

200


In [None]:
# Chunking the files
chunk_list = chunking(chunk_size,titles)
print(len(chunk_list))

In [None]:
# Perform API call on chunked dataframe & save as csv file
for index, df in enumerate(chunk_list):
    df['search_result'] = df.apply(lambda x: movieid_pull(x.name, x['moviedb_id'], x['type']), axis=1)
    df.to_csv(f'{fileloc}Metadata_added/md_{index}.csv', index=False)
    print(f"{index} batch: {t_now('run time')}")