# Introduction

As the trend of on-demand entertainment becoming more prevelant, there is a change in the trend of content creation being less exclusive to a particular network and more avaiable across various content providers. Due to this, there is a competiton across various entertainment provider to provide the right content to the right user to ensure that their users are engaged and reduce their needs to move to another content provider. 1 key factor that can help in pushing the right content to the right users is to have a robust recommendation system.

A singapore start-up which envision to be the Netflix of Singapore, has hired me as a Data scientist to assist their team in creating a recommendation system. They have kindly provided movie rating data from their system. Due to PDPA, no user's profiles are provided.

## Problem Statement
1. To create a recommendation system based on collaborative filtering to recommend movies to users based on similar movie watched & ratings.
2. Recommendation system should provide new contents (w/o rating) users based on user profile to user.

## Import Library

In [1]:
import numpy as np
import pandas as pd
import glob
import csv

import matplotlib.pyplot as plt
import seaborn as sns

# API Call
import requests

from datetime import datetime


# Additional Setting
pd.set_option("display.max_rows", 201)

## Import Data sets 

### Import Movie Rating File by Users

In [2]:
# %%time
# # Import rating files into dataframe
# ls_of_ratings = []

# # Specify the pattern matching for glob
# rating_files = glob.glob('datasets/training_set/*')

# # Loop to get dataframe into list
# for filename in rating_files:
#     df = pd.read_csv(filename, sep=',', names=['customer_id','rating','date'],skiprows=1)
#     df['movie_id'] = int(filename.split('mv_')[1].split('.')[0])
#     ls_of_ratings.append(df)

# # Concat dataframe together
# ratings = pd.concat(ls_of_ratings,ignore_index=True)

# print('glob completed')

In [3]:
# print(ratings.shape)
# ratings.head()

### Importing of Movie title file

In [4]:
# Import Movie titles file
titles = pd.read_csv('datasets/movie_titles.txt', sep = ',', names=['movie_id','year_of_release','title'], encoding='Latin_1')

In [5]:
# convert all movie title to lower case
titles['title'] = titles['title'].map(lambda x: str.lower(x))

In [6]:
print(f'shape of titles {titles.shape}')
titles.head()

shape of titles (17770, 3)


Unnamed: 0,movie_id,year_of_release,title
0,1,2003.0,dinosaur planet
1,2,2004.0,isle of man tt 2004 review
2,3,1997.0,character
3,4,1994.0,paula abdul's get up & dance
4,5,2004.0,the rise and fall of ecw


#### Cleaning of titles dataframe

In [7]:
# Split title into a list of words - delimiter ":"
titles['split_title'] = titles['title'].map(lambda x : [x for x in x.split(':')])
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title
0,1,2003.0,dinosaur planet,[dinosaur planet]
1,2,2004.0,isle of man tt 2004 review,[isle of man tt 2004 review]
2,3,1997.0,character,[character]
3,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance]
4,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw]


In [8]:
print(titles.info())
print('-' * 50)
print(titles.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   movie_id         17770 non-null  int64  
 1   year_of_release  17763 non-null  float64
 2   title            17770 non-null  object 
 3   split_title      17770 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 555.4+ KB
None
--------------------------------------------------
(17770, 4)


In [17]:
chunk_size = 1000

In [18]:
chunk_list = []
for i in range(chunk_size, len(titles),500):
    start_point = i - chunk_size
    df = titles.loc[start_point:i,:]
    #print(type(df))
    chunk_list.append(df)

In [19]:
# chunk_list = chunk_list[0:2]

In [20]:
chunk_list[0].head()

Unnamed: 0,movie_id,year_of_release,title,split_title
0,1,2003.0,dinosaur planet,[dinosaur planet]
1,2,2004.0,isle of man tt 2004 review,[isle of man tt 2004 review]
2,3,1997.0,character,[character]
3,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance]
4,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw]


## API Call

### Connect to TheMoiveDB.org

In [21]:
# API key
REQUEST_TOKEN = 'd8f46f139abc47c1f048f3efc486fe53'

# Target web page:

# To authenticate api call session
url = "https://www.themoviedb.org/authenticate/"+ REQUEST_TOKEN

# Establishing the connection to the web page:
response = requests.get(url)

# You can use status codes to understand how the target server responds to your request.
# Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
print(response.status_code)

200


In [43]:
# Create a function for API call for different movie titles

def title_pull(item_ls, item_release_year):
    '''
    Insert docstring
    '''
    
    global REQUEST_TOKEN
    
    item_name = ''
    
    # Loop to create a string to search movie database
    for i in range(len(item_ls)):
        if i != 0:
            item_name += ":" + item_ls[i]
        else:
            item_name = item_ls[i]
        
        # replace whitespace with %20
        search_item = item_name.replace(" ", "%20")
            
        # Try to search based on movie & release year
        try:
            url = f"https://api.themoviedb.org/3/search/movie?api_key={REQUEST_TOKEN}&query={search_item}&primary_release_year={item_release_year}&page=1" 
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'movie'
            return res
        except:
            pass

        # Try to search based on TV name & first air date
        try:
            url = f"https://api.themoviedb.org/3/search/tv?api_key={REQUEST_TOKEN}&query={search_item}&first_air_date_year={item_release_year}&page=1"
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'tv'
            return res
        except:
            pass

        # Try to search based on Movie title only
        try:
            url = f"https://api.themoviedb.org/3/search/movie?api_key={REQUEST_TOKEN}&query={search_item}&page=1" 
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'title'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'movie'
            return res
        except:
            pass

        # Try to search based on TV name only
        try:
            url = f"https://api.themoviedb.org/3/search/tv?api_key={REQUEST_TOKEN}&query={search_item}&page=1"
            req = requests.get(url)
            sq_dict = req.json()
            res = {key: sq_dict['results'][0][key] for key in sq_dict['results'][0].keys() & {'id', 'name'}}
            res['search_title'] = search_item.replace("%20", ' ')
            res['type'] = 'tv'
            return res
        except:
            return 'NA'
        
    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,5)
    print(f'Sleep Duration: {sleep_duration}')
    print('-' * 50)
    time.sleep(sleep_duration)    
    
    # Check if url time-out
    if response.status_code != 200:
        url = "https://www.themoviedb.org/authenticate/"+ REQUEST_TOKEN
    else:
        print(f'Connection Error, error type: {response.status_code}')

        

In [50]:
def t_now(string):
    time = datetime.now()
    current_time = start.strftime("%H:%M:%S")
    return f"{string} = {current_time}"

In [52]:
%%time

print(t_now('Start Time'))

for index, df in enumerate(chunk_list):
    print(f"{index} batch: {t_now('run time')}")
    df['moviedb_result'] = df.apply(lambda x: title_pull(x['split_title'], x['year_of_release']),axis=1)
    df.to_csv(f'datasets/titles_added/ta_{index}.csv')

Start Time = 21:10:58
0 batch: run time = 21:10:58


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1 batch: run time = 21:10:58
Wall time: 9.38 s


In [None]:
%%time

start = datetime.now()

current_time = start.strftime("%H:%M:%S")
print("Start Time =", current_time)


# Create a column of pulled result based on title & release_year
titles['moviedb_result'] = titles.apply(lambda x: loop_title_pull(x['movie_id'], x['split_title'], x['year_of_release']),axis=1)

# Create a column of pulled result based on title & release_year
# test['moviedb_result'] = test.apply(lambda x: loop_title_pull(x['split_title'], x['year_of_release']),axis=1)

In [19]:
titles.head()

Unnamed: 0,movie_id,year_of_release,title,split_title
0,1,2003.0,dinosaur planet,[dinosaur planet]
1,2,2004.0,isle of man tt 2004 review,[isle of man tt 2004 review]
2,3,1997.0,character,[character]
3,4,1994.0,paula abdul's get up & dance,[paula abdul's get up & dance]
4,5,2004.0,the rise and fall of ecw,[the rise and fall of ecw]


In [None]:
%%time
titles[['search_result', 'search_title', 'type']] = pd.DataFrame(titles['moviedb_result'].tolist(), index=test.index)


## test[['search_result', 'search_title', 'type']] = pd.DataFrame(test['moviedb_result'].tolist(), index=test.index)

# Slower method but may be more readable
#test['search_result'], test['search_title'], test['type'] = zip(*test['moviedb_result'])

In [None]:
# Change html whitespace to normal whitespace
titles['search_title'] = titles['search_title'].map(lambda x: x.replace('%20', " "))

# test['search_title'] = test['search_title'].map(lambda x: x.replace('%20', " "))

In [None]:
titles.head()

In [None]:
titles[titles['search_result']=='NA']['type'].count()

### Merging of Movie title & Rating DataFrame

In [None]:
ratings_set = set(ratings['movie_id'])
titles_set = set(titles['movie_id'])
print(f" elements present in title df but not in rating df : {titles_set.difference(ratings_set)}")
print(f" elements present in rating df but not in title df : {ratings_set.difference(titles_set)}")

In [None]:
# Merge movie title with rating
df = pd.merge(ratings, titles, on='movie_id', how='inner')
df.shape

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df['title'].nunique()