## Connection to Google Drive

In [None]:
# Importing datasets from shared folder
# from google.colab import drive
# drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# File location variable 
#fileloc = '/content/drive/My Drive/DSI_16/Capstone/datasets/'

# File @ local machine
fileloc = 'datasets/'

# Introduction
As the trend of on-demand entertainment becoming more prevelant, there is a change in the trend of content creation being less exclusive to a particular network and more avaiable across various content providers. Due to this, there is a competiton across various entertainment provider to provide the right content to the right user to ensure that their users are engaged and reduce their needs to move to another content provider. 1 key factor that can help in pushing the right content to the right users is to have a robust recommendation system.

A singapore start-up which envision to be the Netflix of Singapore, has hired me as a Data scientist to assist their team in creating a recommendation system. They have kindly provided movie rating data from their system. Due to PDPA, no user's profiles are provided.

## Problem Statement
1. To create a recommendation system based on collaborative filtering to recommend movies to users based on similar movie watched & ratings.
2. Recommendation system should provide new contents (w/o rating) users based on user profile to user.

# Codes Book

## Import Library

In [3]:
import numpy as np
import pandas as pd

# Library for importing and saving large datasets
import glob
import json
import h5py

# For dealing with iterables
from itertools import chain
from collections import ChainMap

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# API Call
import requests

from datetime import datetime
import time
import random
import ast

# Additional Setting
pd.set_option("display.max_rows", 201)


## User Defined Functions

In [3]:
# Time function

def t_now(string):
    '''
    Insert Docstring
    '''
    time = datetime.now()
    current_time = time.strftime("%H:%M:%S")
    return f"{string} = {current_time}"

In [4]:
# Create a function for API call for different movie titles

def title_pull(item_ls, item_release_year):
    '''
    Function: This is 
    '''
    
    global REQUEST_TOKEN
    
    item_name = ''
    
    # Loop to create a string to search movie database
    for i in range(len(item_ls)):
        if i != 0:
            item_name += ":" + item_ls[i]
        else:
            item_name = item_ls[i]
        
        # replace whitespace with %20
        search_item = item_name.replace(" ", "%20")
            
        # Try to search based on movie & release year
        try:
            url = f"https://api.themoviedb.org/3/search/movie?api_key={REQUEST_TOKEN}&query={search_item}&primary_release_year={item_release_year}&page=1" 
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'movie'
            return res
        except:
            pass

        # Try to search based on TV name & first air date
        try:
            url = f"https://api.themoviedb.org/3/search/tv?api_key={REQUEST_TOKEN}&query={search_item}&first_air_date_year={item_release_year}&page=1"
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'tv'
            return res
        except:
            pass

        # Try to search based on Movie title only
        try:
            url = f"https://api.themoviedb.org/3/search/movie?api_key={REQUEST_TOKEN}&query={search_item}&page=1" 
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'movie'
            return res
        except:
            pass

        # Try to search based on TV name only
        try:
            url = f"https://api.themoviedb.org/3/search/tv?api_key={REQUEST_TOKEN}&query={search_item}&page=1"
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name'}}
            #res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name','genre_ids'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'tv'
            return res
        except:
            return 'NA'

In [5]:
# Create a function for API call for different movie titles

def movieid_pull(index, item_id, title_type):
    '''
    Function: Perform API call to movie db and extract relevant information base on movie_id

    arg1 (int): moive_id unique to the movie db

    arg2 (str): Whether the title is a movie or a tv show
    '''
    
    global REQUEST_TOKEN, chunk_size

    #Create empty dict
    res = dict()
    sq_dict = dict()

    # Loop to create a string to search movie database
    if index % chunk_size == 0:
        print(f"{chunk_size}{t_now(' rows processed')}")

    #Check title type
    if title_type == 'movie':
        # Find the top actor/actress of the moive - api order = 0
        url = f"https://api.themoviedb.org/3/movie/{item_id}/credits?api_key={REQUEST_TOKEN}" 
        req = requests.get(url)
        sq_dict = req.json()

        # Find actor/actress
        if sq_dict['cast']:
            res = {key: sq_dict['cast'][0][key] for key in sq_dict['cast'][0].keys() & {'name'}}
        
        # Find Associated Keywords with titles
        url = f"https://api.themoviedb.org/3/movie/{item_id}/keywords?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        if sq_dict['keywords']:
            res['keywords'] = sq_dict['keywords']

        # Find titles details
        url = f"https://api.themoviedb.org/3/movie/{item_id}?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        for i in ['genres', 'popularity','vote_count','vote_average']:
            if sq_dict[i]:
                res[i] = sq_dict[i]
        return res
    
    elif title_type == 'tv':
        # Find the top actor/actress/host of the moive - api order = 0
        url = f"https://api.themoviedb.org/3/tv/{item_id}/credits?api_key={REQUEST_TOKEN}" 
        req = requests.get(url)
        sq_dict = req.json()
        if sq_dict['cast']:
            res = {key: sq_dict['cast'][0][key] for key in sq_dict['cast'][0].keys() & {'name'}}

        # Find Associated Keywords with titles
        url = f"https://api.themoviedb.org/3/tv/{item_id}/keywords?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        if sq_dict['results']:
            res['keywords'] = sq_dict['results']

        # Find titles details
        url = f"https://api.themoviedb.org/3/tv/{item_id}?api_key={REQUEST_TOKEN}"
        req = requests.get(url)
        sq_dict = req.json()
        for i in ['genres', 'popularity','vote_count','vote_average']:
            if sq_dict[i]:
                res[i] = sq_dict[i]
        return res

In [6]:
def make_dict(x):
    '''
    Function: Return a dict of 2 vals within a dict = {val1: val2}
    
    arg1 (list): A list of dictionaries

    '''
    
    final_dict = dict()
    try:
        for item in x:
            final_dict.update({item['name'] : item['id']})
        return final_dict
    except:
        return final_dict

## Import Dataset

### Movie Title Dataset

In [None]:
# Import Movie titles file
titles = pd.read_csv(fileloc+ 'movie_titles.txt', sep = ',', names=['movie_id','year_of_release','title'], encoding='Latin_1')

In [None]:
# convert all movie title to lower case
titles['title'] = titles['title'].map(lambda x: str.lower(x))

In [None]:
print(f'shape of titles {titles.shape}')
titles.head()

shape of titles (17770, 3)


Unnamed: 0,movie_id,year_of_release,title
0,1,2003.0,dinosaur planet
1,2,2004.0,isle of man tt 2004 review
2,3,1997.0,character
3,4,1994.0,paula abdul's get up & dance
4,5,2004.0,the rise and fall of ecw


#### Cleaning Movie Title dataframe

In [None]:
# Split title into a list of words - delimiter ":"
titles['split_title'] = titles['title'].str.split(':')
titles.head(10)

Unnamed: 0,movie_id,year_of_release,title,split_title
0,1,2003.0,dinosaur planet,[dinosaur planet]
1,2,2004.0,isle of man tt 2004 review,[isle of man tt 2004 review]
2,3,1997.0,character,[character]
3,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance]
4,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw]
5,6,1997.0,sick,[sick]
6,7,1992.0,8 man,[8 man]
7,8,2004.0,what the #$*! do we know!?,[what the #$*! do we know!?]
8,9,1991.0,class of nuke 'em high 2,[class of nuke 'em high 2]
9,10,2001.0,fighter,[fighter]


In [None]:
# Create Chunk Size
chunk_size = 1000

In [None]:
# Create Chunk list
chunk_list = []
for i in range(0, len(titles),chunk_size):
    if i == 0:
        start_point = 0
    else:
        start_point = i

    df = titles.loc[start_point: start_point + chunk_size -1 ,:]
    chunk_list.append(df)

#### API Call - Extraction of Moive Details

In [None]:
# API key
REQUEST_TOKEN = 'd8f46f139abc47c1f048f3efc486fe53'

# Target web page:

# To authenticate api call session
url = "https://www.themoviedb.org/authenticate/"+ REQUEST_TOKEN

# Establishing the connection to the web page:
response = requests.get(url)

# You can use status codes to understand how the target server responds to your request.
# Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
print(response.status_code)

200


In [None]:
%%time

print(t_now('Start Time'))
print(f"There are {len(chunk_list)} batches to process")

for index, df in enumerate(chunk_list):
    df['moviedb_result'] = df.apply(lambda x: title_pull(x['split_title'], x['year_of_release']),axis=1)
    print(f"{index} batch: {t_now('run time')}")
    df.to_csv(f'{fileloc}titles_added/ta_{index}.csv', index=False)
        
    # Check if url time-out
    if response.status_code != 200:
        print(f'Connection Error, error type: {response.status_code}')
        url = "https://www.themoviedb.org/authenticate/"+ REQUEST_TOKEN
        print(response.status_code)
        
    #generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(0,2)
    print(f'Sleep Duration: {sleep_duration}')
    print('-' * 50)
    time.sleep(sleep_duration) 

Start Time = 02:53:51
There are 18 batches to process


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0 batch: run time = 03:02:59
Sleep Duration: 2
--------------------------------------------------
1 batch: run time = 03:12:10
Sleep Duration: 1
--------------------------------------------------
2 batch: run time = 03:22:00
Sleep Duration: 1
--------------------------------------------------
3 batch: run time = 03:31:24
Sleep Duration: 2
--------------------------------------------------
4 batch: run time = 03:40:34
Sleep Duration: 1
--------------------------------------------------
5 batch: run time = 03:49:35
Sleep Duration: 1
--------------------------------------------------
6 batch: run time = 03:58:45
Sleep Duration: 2
--------------------------------------------------
7 batch: run time = 04:08:09
Sleep Duration: 1
--------------------------------------------------
8 batch: run time = 04:17:11
Sleep Duration: 2
--------------------------------------------------
9 batch: run time = 04:26:15
Sleep Duration: 1
--------------------------------------------------
10 batch: run time =

In [None]:
# Globbing of saved Chunks
%%time
# Import titles chunks into dataframe
ls_of_titles = []

# Specify the pattern matching for glob
rating_files = glob.glob(fileloc + 'titles_added/*')

# Loop to get dataframe into list
for filename in rating_files:
    df = pd.read_csv(filename, sep=',')
    ls_of_titles.append(df)

# Concat dataframe together
titles = pd.concat(ls_of_titles,ignore_index=True)

print('glob completed')

glob completed
CPU times: user 144 ms, sys: 29.6 ms, total: 173 ms
Wall time: 252 ms


In [None]:
print(titles.shape)

(17770, 5)


In [None]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result
0,1,2003.0,dinosaur planet,['dinosaur planet'],"{'id': 11710, 'name': 'Dinosaur Planet', 'sear..."
1,2,2004.0,isle of man tt 2004 review,['isle of man tt 2004 review'],
2,3,1997.0,character,['character'],"{'id': 17139, 'title': 'Character', 'search_ti..."
3,4,1994.0,paula abdul's get up & dance,"[""paula abdul's get up & dance""]","{'id': 274766, 'title': ""Paula Abdul's Get Up ..."
4,5,2004.0,the rise and fall of ecw,['the rise and fall of ecw'],"{'id': 33209, 'title': 'The Rise & Fall of ECW..."


In [None]:
titles['moviedb_result'].isnull().sum()

801

In [None]:
# Drop null values in moviedb_result and reassigned back to titles
titles = titles.dropna(axis=0, how='all', subset=['moviedb_result'])
titles['moviedb_result'] = titles['moviedb_result'].map(lambda x : x.replace('name', 'title'))
titles.reset_index(drop=True,inplace=True)
print(titles.shape)
titles.head()

(16969, 5)


Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result
0,1,2003.0,dinosaur planet,['dinosaur planet'],"{'id': 11710, 'title': 'Dinosaur Planet', 'sea..."
1,3,1997.0,character,['character'],"{'id': 17139, 'title': 'Character', 'search_ti..."
2,4,1994.0,paula abdul's get up & dance,"[""paula abdul's get up & dance""]","{'id': 274766, 'title': ""Paula Abdul's Get Up ..."
3,5,2004.0,the rise and fall of ecw,['the rise and fall of ecw'],"{'id': 33209, 'title': 'The Rise & Fall of ECW..."
4,6,1997.0,sick,['sick'],"{'id': 35638, 'title': 'Sick: The Life and Dea..."


<span style='color:magenta'>Remarks:</span> Given that the null values in the moviedb pull is less than 5% (~4.5%),
I choose to drop the data as it will not affect the integrity of the data too much.

In [None]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result
0,1,2003.0,dinosaur planet,['dinosaur planet'],"{'id': 11710, 'title': 'Dinosaur Planet', 'sea..."
1,3,1997.0,character,['character'],"{'id': 17139, 'title': 'Character', 'search_ti..."
2,4,1994.0,paula abdul's get up & dance,"[""paula abdul's get up & dance""]","{'id': 274766, 'title': ""Paula Abdul's Get Up ..."
3,5,2004.0,the rise and fall of ecw,['the rise and fall of ecw'],"{'id': 33209, 'title': 'The Rise & Fall of ECW..."
4,6,1997.0,sick,['sick'],"{'id': 35638, 'title': 'Sick: The Life and Dea..."


In [6]:
# Check the data type of the 2 columns
print(type(titles.loc[0,'split_title']))
print(type(titles.loc[0,'moviedb_result']))

NameError: name 'titles' is not defined

In [None]:
# Convert, evaluate stirng representation into actual type
titles['split_title']  = titles['split_title'].apply(ast.literal_eval)
titles['moviedb_result']  = titles['moviedb_result'].apply(ast.literal_eval)

In [7]:
print(type(titles.loc[0,'split_title']))
print(type(titles.loc[0,'moviedb_result']))

NameError: name 'titles' is not defined

In [None]:
titles.loc[0,'moviedb_result']

{'id': 11710,
 'search_title': 'dinosaur planet',
 'title': 'Dinosaur Planet',
 'type': 'tv'}

In [None]:
# Unpack dictionary into dataframe columns
titles[['moviedb_id','moviedb_search_title','result_title','type']] =  pd.DataFrame(titles['moviedb_result'].tolist(), index=titles.index)

In [None]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_id,moviedb_search_title,result_title,type
0,1,2003.0,dinosaur planet,[dinosaur planet],"{'id': 11710, 'title': 'Dinosaur Planet', 'sea...",11710,Dinosaur Planet,dinosaur planet,tv
1,3,1997.0,character,[character],"{'id': 17139, 'title': 'Character', 'search_ti...",17139,Character,character,movie
2,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance],"{'id': 274766, 'title': 'Paula Abdul's Get Up ...",274766,Paula Abdul's Get Up & Dance,paula abdul's get up & dance,movie
3,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw],"{'id': 33209, 'title': 'The Rise & Fall of ECW...",33209,The Rise & Fall of ECW,the rise and fall of ecw,movie
4,6,1997.0,sick,[sick],"{'id': 35638, 'title': 'Sick: The Life and Dea...",35638,"Sick: The Life and Death of Bob Flanagan, Supe...",sick,movie


In [None]:
%%time
# 1 if exact match for each item of list in split_title == moviedb_search_title
titles['match'] = [[1 if element in titles.loc[i, 'moviedb_search_title'].lower() else 0 for element in titles.loc[i,'split_title']][0] for i in range(len(titles))]

CPU times: user 323 ms, sys: 969 µs, total: 324 ms
Wall time: 326 ms


In [None]:
%%html
<span style='color:orange'>Remarks:</span>
To find match as there are instances where the moviedb is returning a wrong moive. This happens as the API call do not have an exact match function and only the top result is returned

In [None]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_id,moviedb_search_title,result_title,type,match
0,1,2003.0,dinosaur planet,[dinosaur planet],"{'id': 11710, 'title': 'Dinosaur Planet', 'sea...",11710,Dinosaur Planet,dinosaur planet,tv,1
1,3,1997.0,character,[character],"{'id': 17139, 'title': 'Character', 'search_ti...",17139,Character,character,movie,1
2,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance],"{'id': 274766, 'title': 'Paula Abdul's Get Up ...",274766,Paula Abdul's Get Up & Dance,paula abdul's get up & dance,movie,1
3,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw],"{'id': 33209, 'title': 'The Rise & Fall of ECW...",33209,The Rise & Fall of ECW,the rise and fall of ecw,movie,0
4,6,1997.0,sick,[sick],"{'id': 35638, 'title': 'Sick: The Life and Dea...",35638,"Sick: The Life and Death of Bob Flanagan, Supe...",sick,movie,1


In [None]:
# Filter for those that are wrongly match
titles[titles['match']==0].shape[0]

1602

In [None]:
%%html
<span style='color:orange'>Remarks:</span>
1602 titles will be dropped. In total, I will be dropping 2403 titles (1602 + 801) which will result in approximately 13.5% loss of original data.
This is acceptable as the accuracy of the match of Netflix titles to pulled data from moivedb is important as it will affect the predictiability and accuracy of the recommendation

In [None]:
# Filter for those that are match
titles = titles[titles['match']!=0]
titles.reset_index(drop=True, inplace=True)
print(titles.shape)
titles.head()

(15367, 10)


Unnamed: 0,movie_id,year_of_release,title,split_title,moviedb_result,moviedb_id,moviedb_search_title,result_title,type,match
0,1,2003.0,dinosaur planet,[dinosaur planet],"{'id': 11710, 'title': 'Dinosaur Planet', 'sea...",11710,Dinosaur Planet,dinosaur planet,tv,1
1,3,1997.0,character,[character],"{'id': 17139, 'title': 'Character', 'search_ti...",17139,Character,character,movie,1
2,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance],"{'id': 274766, 'title': 'Paula Abdul's Get Up ...",274766,Paula Abdul's Get Up & Dance,paula abdul's get up & dance,movie,1
3,6,1997.0,sick,[sick],"{'id': 35638, 'title': 'Sick: The Life and Dea...",35638,"Sick: The Life and Death of Bob Flanagan, Supe...",sick,movie,1
4,7,1992.0,8 man,[8 man],"{'id': 196685, 'title': '8 Man - For All Lonel...",196685,8 Man - For All Lonely Nights,8 man,movie,1


In [None]:
# Remove non-relevant columns
titles.drop(columns=['split_title', 'moviedb_result', 'moviedb_search_title','result_title','match'], inplace=True, errors='ignore')
titles.head(20)

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type
0,1,2003.0,dinosaur planet,11710,tv
1,3,1997.0,character,17139,movie
2,4,1994.0,paula abdul's get up & dance,274766,movie
3,6,1997.0,sick,35638,movie
4,7,1992.0,8 man,196685,movie
5,9,1991.0,class of nuke 'em high 2,26230,movie
6,11,1999.0,full frame: documentary shorts,413109,movie
7,12,1947.0,my favorite brunette,18649,movie
8,13,2003.0,lord of the rings: the return of the king: ext...,122,movie
9,14,1982.0,nature: antarctica,14951,tv


In [None]:
# Save data to File
titles.to_csv(fileloc + 'new_titles.csv',index=False)

In [None]:
# API Call to get associated keywords & top actor/actress/host
# x.name is to return the index of the dataframe
print(t_now('function start'))
titles['search_result'] = titles.apply(lambda x: movieid_pull(x.name, x['moviedb_id'], x['type']), axis=1)
print(t_now('function end'))

function start = 06:19:05
1000 rows processed = 06:19:05
1000 rows processed = 06:35:55
1000 rows processed = 06:52:49
1000 rows processed = 07:09:24
1000 rows processed = 07:26:00
1000 rows processed = 07:41:54
1000 rows processed = 07:57:59
1000 rows processed = 08:14:27
1000 rows processed = 08:30:30
1000 rows processed = 08:46:25
1000 rows processed = 09:02:01
1000 rows processed = 09:18:04
1000 rows processed = 09:33:38
1000 rows processed = 09:49:19
1000 rows processed = 10:05:13
1000 rows processed = 10:20:36
function end = 10:26:20


In [None]:
titles.head(200)

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type,search_result
0,1,2003.0,dinosaur planet,11710,tv,"{'name': 'Scott Sampson', 'keywords': [{'name'..."
1,3,1997.0,character,17139,movie,"{'name': 'Jan Decleir', 'keywords': [{'id': 36..."
2,4,1994.0,paula abdul's get up & dance,274766,movie,"{'name': 'Paula Abdul', 'popularity': 0.6}"
3,6,1997.0,sick,35638,movie,"{'name': 'Bob Flanagan', 'keywords': [{'id': 2..."
4,7,1992.0,8 man,196685,movie,"{'name': 'Kai Shishido', 'popularity': 0.6, 'v..."
5,9,1991.0,class of nuke 'em high 2,26230,movie,"{'name': 'Brick Bronsky', 'keywords': [{'id': ..."
6,11,1999.0,full frame: documentary shorts,413109,movie,"{'name': 'Frankie Murphy-Giesing', 'genres': [..."
7,12,1947.0,my favorite brunette,18649,movie,"{'name': 'Bob Hope', 'keywords': [{'id': 1735,..."
8,13,2003.0,lord of the rings: the return of the king: ext...,122,movie,"{'name': 'Elijah Wood', 'keywords': [{'id': 60..."
9,14,1982.0,nature: antarctica,14951,tv,"{'keywords': [{'name': 'nature documentary', '..."


In [None]:
# Return result of TV
titles.loc[0,'search_result']

{'genres': [{'id': 99, 'name': 'Documentary'}],
 'keywords': [{'id': 10506, 'name': 'prehistoric'},
  {'id': 11162, 'name': 'miniseries'},
  {'id': 12616, 'name': 'dinosaur'},
  {'id': 166958, 'name': 'prehistoric creature'},
  {'id': 183414, 'name': 'prehistoric times'}],
 'name': 'Scott Sampson',
 'popularity': 1.086,
 'vote_average': 7.4,
 'vote_count': 5}

In [None]:
# Return results of movie
titles.loc[1,'search_result']

{'genres': [{'id': 36, 'name': 'History'}, {'id': 18, 'name': 'Drama'}],
 'keywords': [{'id': 3683, 'name': 'law and ethics'}],
 'name': 'Jan Decleir',
 'popularity': 3.635,
 'vote_average': 7.5,
 'vote_count': 59}

In [None]:
# Save dataframe to HDF5 
#titles.to_csv(fileloc + 'new_titles2.csv',index=False)
save_hdf5 = pd.HDFStore('datasets/movie.hdf5')
save_hdf5['titles_search_mvid'] = titles
save_hdf5.close()

#### Cleaning of Keywords & Genres

In [47]:
titles = pd.read_csv(fileloc + 'new_titles2.csv')

with pd.HDFStore('datasets/movie.hdf5') as store:
    titles = store['titles_search_mvid']

In [48]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type,search_result
0,1,2003.0,dinosaur planet,11710,tv,"{'name': 'Scott Sampson', 'keywords': [{'name'..."
1,3,1997.0,character,17139,movie,"{'name': 'Jan Decleir', 'keywords': [{'id': 36..."
2,4,1994.0,paula abdul's get up & dance,274766,movie,"{'name': 'Paula Abdul', 'popularity': 0.6}"
3,6,1997.0,sick,35638,movie,"{'name': 'Bob Flanagan', 'keywords': [{'id': 2..."
4,7,1992.0,8 man,196685,movie,"{'name': 'Kai Shishido', 'popularity': 0.6, 'v..."


In [49]:
# Check the data type of the element
type(titles.loc[0,'search_result'])

str

In [50]:
# Convert, evaluate stirng representation into actual type
titles['search_result']  = titles['search_result'].apply(ast.literal_eval)

In [51]:
# Re-check data type of the element
type(titles.loc[0,'search_result'])

dict

In [52]:
# Example of returned result
titles.loc[0,'search_result']

{'name': 'Scott Sampson',
 'keywords': [{'name': 'prehistoric', 'id': 10506},
  {'name': 'miniseries', 'id': 11162},
  {'name': 'dinosaur', 'id': 12616},
  {'name': 'prehistoric creature', 'id': 166958},
  {'name': 'prehistoric times', 'id': 183414}],
 'genres': [{'id': 99, 'name': 'Documentary'}],
 'popularity': 1.086,
 'vote_count': 5,
 'vote_average': 7.4}

In [53]:
# Convert search_result columns into multiple columns based on dictionary
titles[['featured_actor', 'keywords', 'genre','popularity', 'vote_count', 'vote_average']] =  pd.DataFrame(titles['search_result'].tolist(), index=titles.index)

In [54]:
titles.loc[0,'keywords']

[{'name': 'prehistoric', 'id': 10506},
 {'name': 'miniseries', 'id': 11162},
 {'name': 'dinosaur', 'id': 12616},
 {'name': 'prehistoric creature', 'id': 166958},
 {'name': 'prehistoric times', 'id': 183414}]

In [55]:
titles.head(10)

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type,search_result,featured_actor,keywords,genre,popularity,vote_count,vote_average
0,1,2003.0,dinosaur planet,11710,tv,"{'name': 'Scott Sampson', 'keywords': [{'name'...",Scott Sampson,"[{'name': 'prehistoric', 'id': 10506}, {'name'...","[{'id': 99, 'name': 'Documentary'}]",1.086,5.0,7.4
1,3,1997.0,character,17139,movie,"{'name': 'Jan Decleir', 'keywords': [{'id': 36...",Jan Decleir,"[{'id': 3683, 'name': 'law and ethics'}]","[{'id': 36, 'name': 'History'}, {'id': 18, 'na...",3.635,59.0,7.5
2,4,1994.0,paula abdul's get up & dance,274766,movie,"{'name': 'Paula Abdul', 'popularity': 0.6}",Paula Abdul,,,0.6,,
3,6,1997.0,sick,35638,movie,"{'name': 'Bob Flanagan', 'keywords': [{'id': 2...",Bob Flanagan,"[{'id': 2843, 'name': 'fetishism'}, {'id': 373...","[{'id': 99, 'name': 'Documentary'}]",4.868,18.0,6.7
4,7,1992.0,8 man,196685,movie,"{'name': 'Kai Shishido', 'popularity': 0.6, 'v...",Kai Shishido,,,0.6,1.0,1.0
5,9,1991.0,class of nuke 'em high 2,26230,movie,"{'name': 'Brick Bronsky', 'keywords': [{'id': ...",Brick Bronsky,"[{'id': 207679, 'name': 'troma'}]","[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",5.677,33.0,4.3
6,11,1999.0,full frame: documentary shorts,413109,movie,"{'name': 'Frankie Murphy-Giesing', 'genres': [...",Frankie Murphy-Giesing,,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",0.976,1.0,
7,12,1947.0,my favorite brunette,18649,movie,"{'name': 'Bob Hope', 'keywords': [{'id': 1735,...",Bob Hope,"[{'id': 1735, 'name': 'amateur detective'}, {'...","[{'id': 35, 'name': 'Comedy'}, {'id': 53, 'nam...",2.605,24.0,7.1
8,13,2003.0,lord of the rings: the return of the king: ext...,122,movie,"{'name': 'Elijah Wood', 'keywords': [{'id': 60...",Elijah Wood,"[{'id': 603, 'name': 'elves'}, {'id': 604, 'na...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",54.024,16266.0,8.5
9,14,1982.0,nature: antarctica,14951,tv,"{'keywords': [{'name': 'nature documentary', '...",,"[{'name': 'nature documentary', 'id': 221355}]","[{'id': 99, 'name': 'Documentary'}, {'id': 107...",30.758,12.0,8.6


In [56]:
# Return a dictionary from a list of dictionary
titles['keywords'] = titles['keywords'].apply(make_dict)
titles['genre'] = titles['genre'].apply(make_dict)

In [57]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type,search_result,featured_actor,keywords,genre,popularity,vote_count,vote_average
0,1,2003.0,dinosaur planet,11710,tv,"{'name': 'Scott Sampson', 'keywords': [{'name'...",Scott Sampson,"{'prehistoric': 10506, 'miniseries': 11162, 'd...",{'Documentary': 99},1.086,5.0,7.4
1,3,1997.0,character,17139,movie,"{'name': 'Jan Decleir', 'keywords': [{'id': 36...",Jan Decleir,{'law and ethics': 3683},"{'History': 36, 'Drama': 18}",3.635,59.0,7.5
2,4,1994.0,paula abdul's get up & dance,274766,movie,"{'name': 'Paula Abdul', 'popularity': 0.6}",Paula Abdul,{},{},0.6,,
3,6,1997.0,sick,35638,movie,"{'name': 'Bob Flanagan', 'keywords': [{'id': 2...",Bob Flanagan,"{'fetishism': 2843, 'funeral': 3739, 'sadomaso...",{'Documentary': 99},4.868,18.0,6.7
4,7,1992.0,8 man,196685,movie,"{'name': 'Kai Shishido', 'popularity': 0.6, 'v...",Kai Shishido,{},{},0.6,1.0,1.0


In [58]:
# Check for null values
titles.isnull().sum()[titles.isnull().sum()!=0]

year_of_release       5
featured_actor     1174
vote_count         1526
vote_average       1528
dtype: int64

In [59]:
titles[titles['year_of_release'].isnull()]

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type,search_result,featured_actor,keywords,genre,popularity,vote_count,vote_average
3805,4388,,ancient civilizations: rome and pompeii,503681,movie,{'popularity': 0.6},,{},{},0.6,,
4158,4794,,ancient civilizations: land of the pharaohs,503681,movie,{'popularity': 0.6},,{},{},0.6,,
6278,7241,,ancient civilizations: athens and greece,503681,movie,{'popularity': 0.6},,{},{},0.6,,
9345,10782,,roti kapada aur makaan,125292,movie,"{'name': 'Shashi Kapoor', 'popularity': 0.6, '...",Shashi Kapoor,{},{},0.6,1.0,8.0
14428,16678,,jimmy hollywood,31643,movie,"{'name': 'Joe Pesci', 'genres': [{'id': 35, 'n...",Joe Pesci,{},{'Comedy': 35},4.604,35.0,5.2


In [60]:
titles = titles[titles['year_of_release'].notnull()]
titles.reset_index(drop=True, inplace=True)
print(titles.shape)

(15362, 12)


<span style='color:magenta'>Remarks:</span>
Dropping these 5 rows of titles as they are missing most of the essential information ~ most variables are missing. In total, we dropped 2408 rows of titles.

In [61]:
# Fillna for missing values
titles['featured_actor'].fillna('NA', inplace=True)
titles['vote_count'].fillna(0, inplace=True)
titles['vote_average'].fillna(0, inplace=True)

In [62]:
# Recheck if there are any null values
titles.isnull().sum()[titles.isnull().sum()!=0]

Series([], dtype: int64)

In [63]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type,search_result,featured_actor,keywords,genre,popularity,vote_count,vote_average
0,1,2003.0,dinosaur planet,11710,tv,"{'name': 'Scott Sampson', 'keywords': [{'name'...",Scott Sampson,"{'prehistoric': 10506, 'miniseries': 11162, 'd...",{'Documentary': 99},1.086,5.0,7.4
1,3,1997.0,character,17139,movie,"{'name': 'Jan Decleir', 'keywords': [{'id': 36...",Jan Decleir,{'law and ethics': 3683},"{'History': 36, 'Drama': 18}",3.635,59.0,7.5
2,4,1994.0,paula abdul's get up & dance,274766,movie,"{'name': 'Paula Abdul', 'popularity': 0.6}",Paula Abdul,{},{},0.6,0.0,0.0
3,6,1997.0,sick,35638,movie,"{'name': 'Bob Flanagan', 'keywords': [{'id': 2...",Bob Flanagan,"{'fetishism': 2843, 'funeral': 3739, 'sadomaso...",{'Documentary': 99},4.868,18.0,6.7
4,7,1992.0,8 man,196685,movie,"{'name': 'Kai Shishido', 'popularity': 0.6, 'v...",Kai Shishido,{},{},0.6,1.0,1.0


In [64]:
# Generate a keyword dictionary
keywords_dict = dict()
titles['keywords'].apply(keywords_dict.update);

# Change keyword column into list of keywords only
titles['keywords'] = titles['keywords'].apply(lambda x: list(chain(x.keys())))

In [65]:
def genre_dict_update(x,key_1,key_dict):
    if x != dict():
        if key_1 == 'tv':
            key_dict['tv'].update(x)
        if key_1 == 'movie':
            key_dict['movie'].update(x)

In [66]:
# Create a genre dictionary, The genre is split into 2 - tv & movie due to the way the API is strcutre and I am not able to confirm if the same id is used.
g_dict = {'tv':{}, 'movie':{}}

In [67]:
# update genre_dict based on titles type
for i in range(len(titles)):
    genre_dict_update(titles.loc[i,'genre'],titles.loc[i,'type'],g_dict)

In [68]:
g_dict

{'tv': {'Documentary': 99,
  'Family': 10751,
  'Animation': 16,
  'Comedy': 35,
  'Action & Adventure': 10759,
  'Sci-Fi & Fantasy': 10765,
  'Reality': 10764,
  'Drama': 18,
  'Crime': 80,
  'Mystery': 9648,
  'Western': 37,
  'Soap': 10766,
  'Kids': 10762,
  'War & Politics': 10768,
  'Adventure': 12,
  'Science Fiction': 878,
  'Horror': 27,
  'News': 10763,
  'War': 10752,
  'Music': 10402,
  'Talk': 10767},
 'movie': {'History': 36,
  'Drama': 18,
  'Documentary': 99,
  'Comedy': 35,
  'Horror': 27,
  'Science Fiction': 878,
  'Thriller': 53,
  'Crime': 80,
  'Mystery': 9648,
  'Romance': 10749,
  'Adventure': 12,
  'Fantasy': 14,
  'Action': 28,
  'Music': 10402,
  'TV Movie': 10770,
  'Family': 10751,
  'Animation': 16,
  'War': 10752,
  'Western': 37}}

<span style='color:magenta'>Remarks:</span>
From the result, it seems that both TV & Moives are using the same id for the same genre. I will be combining them together.

In [69]:
genre_dict = dict()
genre_dict.update(g_dict['tv'])
genre_dict.update(g_dict['movie'])

In [70]:
# Change keyword column into list of genre id only
titles['genre'] = titles['genre'].apply(lambda x: list(chain(x.keys())))

In [71]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,moviedb_id,type,search_result,featured_actor,keywords,genre,popularity,vote_count,vote_average
0,1,2003.0,dinosaur planet,11710,tv,"{'name': 'Scott Sampson', 'keywords': [{'name'...",Scott Sampson,"[prehistoric, miniseries, dinosaur, prehistori...",[Documentary],1.086,5.0,7.4
1,3,1997.0,character,17139,movie,"{'name': 'Jan Decleir', 'keywords': [{'id': 36...",Jan Decleir,[law and ethics],"[History, Drama]",3.635,59.0,7.5
2,4,1994.0,paula abdul's get up & dance,274766,movie,"{'name': 'Paula Abdul', 'popularity': 0.6}",Paula Abdul,[],[],0.6,0.0,0.0
3,6,1997.0,sick,35638,movie,"{'name': 'Bob Flanagan', 'keywords': [{'id': 2...",Bob Flanagan,"[fetishism, funeral, sadomasochism, terminal i...",[Documentary],4.868,18.0,6.7
4,7,1992.0,8 man,196685,movie,"{'name': 'Kai Shishido', 'popularity': 0.6, 'v...",Kai Shishido,[],[],0.6,1.0,1.0


In [72]:
# Remove search_result column
titles.drop(columns=['search_result','moviedb_id'], inplace=True, errors='ignore')

In [73]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,type,featured_actor,keywords,genre,popularity,vote_count,vote_average
0,1,2003.0,dinosaur planet,tv,Scott Sampson,"[prehistoric, miniseries, dinosaur, prehistori...",[Documentary],1.086,5.0,7.4
1,3,1997.0,character,movie,Jan Decleir,[law and ethics],"[History, Drama]",3.635,59.0,7.5
2,4,1994.0,paula abdul's get up & dance,movie,Paula Abdul,[],[],0.6,0.0,0.0
3,6,1997.0,sick,movie,Bob Flanagan,"[fetishism, funeral, sadomasochism, terminal i...",[Documentary],4.868,18.0,6.7
4,7,1992.0,8 man,movie,Kai Shishido,[],[],0.6,1.0,1.0


In [74]:
# Save dataframe to HDF5 files
#titles.to_csv('datasets/new_titles_cleaned.csv')
save_hdf5 = pd.HDFStore('datasets/movie.hdf5')
save_hdf5['titles_cleaned'] = titles
save_hdf5.close()

### Users Rating

In [5]:
# Import rating files into dataframe
ls_of_ratings = []

# Specify the pattern matching for glob
rating_files = glob.glob('datasets/training_set/*')

# Loop to get dataframe into list
for filename in rating_files:
    df = pd.read_csv(filename, sep=',', names=['customer_id','rating','date'],skiprows=1)
    df['movie_id'] = int(filename.split('mv_')[1].split('.')[0])
    ls_of_ratings.append(df)

# Concat dataframe together
ratings = pd.concat(ls_of_ratings,ignore_index=True)


# covert dataframe into memory-saving datatype
ratings['date'] = pd.to_datetime(ratings['date'])
ratings[['customer_id','rating', 'rating']].astype('int32')

# Save file to HDF5
save_hdf5 = pd.HDFStore('datasets/movie.hdf5')
save_hdf5['full_ratings'] = ratings
save_hdf5.close()

print('glob & HDF5 saving completed')

glob completed


In [6]:
# Check if the dataframe is saved.
hf = h5py.File('datasets/movie.hdf5','r')
hf.keys()

<KeysViewHDF5 ['full_ratings', 'users', 'users_sim']>