<p style="background-color:#368f8b;font-family:Trebuchet MS;font-weight:bold;color:#eff7f6;font-size:40px;text-align:center;border-radius:100px 100px">Table of Contents</p>

**In this notebook, we will cover:**
* [Overview](#0)
* [Exploratory Data Analysis](#1)
* [Recommender System](#2)
    1. [Hybrid Recommendation](#3)
    2. [Deep Learning (Tensorflow)](#4)

<a id="0"></a>
<p style="background-color:#368f8b;font-family:Trebuchet MS;font-weight:bold;color:#eff7f6;font-size:40px;text-align:center;border-radius:100px 100px">Overview</p>

<a id="1"></a>
<p style="background-color:#368f8b;font-family:Trebuchet MS;font-weight:bold;color:#eff7f6;font-size:40px;text-align:center;border-radius:100px 100px">Exploratory Data Analysis</p>

First things first: we start with an EDA to get a sense of the data we are dealing with. It is also useful for uncovering insights, information, and even mistakes in the data.

In [None]:
# Install the extra packages on a fresh runtime (-q keeps pip's output quiet).
# NOTE(review): `surprise` is installed here but not imported in the cells visible in this notebook.
!pip install -q tensorflow-recommenders
!pip install -q surprise

In [None]:
import re
import string
import warnings
from ast import literal_eval
from collections import Counter
from datetime import datetime
from typing import Dict, Text

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_recommenders as tfrs
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from wordcloud import WordCloud

warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /gdrive so the CSV files under /gdrive/MyDrive can be read (Colab only).
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# First look at the raw movie metadata: column names, dtypes and non-null counts.
movies = pd.read_csv('/gdrive/MyDrive/CSE6740/movies_metadata.csv')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [None]:
# Load the credits and keywords tables.
credits = pd.read_csv('/gdrive/MyDrive/CSE6740/credits.csv')
keywords = pd.read_csv('/gdrive/MyDrive/CSE6740/keywords.csv')

# Columns removed from the metadata, and row labels removed for having an incorrect data type.
unused_columns = ['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video']
bad_rows = [19730, 29503, 35587]  # Incorrect data type

movies = pd.read_csv('/gdrive/MyDrive/CSE6740/movies_metadata.csv')
movies = movies.drop(unused_columns, axis=1).drop(bad_rows)
movies['id'] = movies['id'].astype('int64')

# Join the metadata with keywords and credits on the shared 'id' column.
df = movies.merge(keywords, on='id').merge(credits, on='id')

# Give these columns a neutral default instead of NaN so their rows survive the dropna below.
for column, default in {'original_language': '', 'runtime': 0, 'tagline': ''}.items():
    df[column] = df[column].fillna(default)

# Every row that still has a missing value in any column is discarded.
df = df.dropna()

In [None]:
# Pull one field out of a cell that stores a stringified list of dictionaries
def get_text(text, obj='name'):
    """Return the `obj` field of every dictionary encoded in `text`.

    Args:
        text: String holding a Python literal, e.g. "[{'id': 1, 'name': 'Drama'}]".
        obj: Key to read from each dictionary (defaults to 'name').

    Returns:
        The raw field value when the literal holds exactly one element;
        otherwise the field values joined with ', ' (an empty string when
        there are no elements).
    """
    records = literal_eval(text)

    if len(records) != 1:
        return ', '.join([record[obj] for record in records])

    # Exactly one element: hand its value back as-is, without joining.
    return next(iter(records))[obj]

# Flatten every stringified list-of-dicts column into a comma-separated string of names.
# All of these are required: later cells read 'production_companies' and 'crew' as plain
# text and select the 'characters' / 'actors' columns created below.
df['genres'] = df['genres'].apply(get_text)
df['production_companies'] = df['production_companies'].apply(get_text)
df['production_countries'] = df['production_countries'].apply(get_text)
df['crew'] = df['crew'].apply(get_text)
df['spoken_languages'] = df['spoken_languages'].apply(get_text)
df['keywords'] = df['keywords'].apply(get_text)

# New columns derived from 'cast': the character names and the actor names.
df['characters'] = df['cast'].apply(get_text, obj='character')
df['actors'] = df['cast'].apply(get_text)

# The raw 'cast' column is no longer needed once both views have been extracted.
df = df.drop('cast', axis=1)

# Remove duplicate rows based on the 'original_title' column and reset the DataFrame index
df = df[~df['original_title'].duplicated()]
df = df.reset_index(drop=True)

In [None]:
# Give the date and numeric columns proper dtypes (the CSV loader read them as objects).
df = df.assign(
    release_date=pd.to_datetime(df['release_date']),
    budget=df['budget'].astype('float64'),
    popularity=df['popularity'].astype('float64'),
)

In [None]:
df.head(83)

Unnamed: 0,adult,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,...,runtime,spoken_languages,tagline,vote_average,vote_count,keywords,crew,characters,actors,weighted_average
0,False,30000000.0,"Animation, Comedy, Family",862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,United States of America,...,81.0,English,,7.7,5415.0,"jealousy, toy, boy, friendship, friends, rival...","John Lasseter, Joss Whedon, Andrew Stanton, Jo...","Woody (voice), Buzz Lightyear (voice), Mr. Pot...","Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...",7.698867
1,False,65000000.0,"Adventure, Fantasy, Family",8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,...,104.0,"English, Français",Roll the dice and unleash the excitement!,6.9,2413.0,"board game, disappearance, based on children's...","Larry J. Franco, Jonathan Hensleigh, James Hor...","Alan Parrish, Samuel Alan Parrish / Van Pelt, ...","Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",6.898453
2,False,0.0,"Romance, Comedy",15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,"Warner Bros., Lancaster Gate",United States of America,...,101.0,English,Still Yelling. Still Fighting. Still Ready for...,6.5,92.0,"fishing, best friend, duringcreditsstinger, ol...","Howard Deutch, Mark Steven Johnson, Mark Steve...","Max Goldman, John Gustafson, Ariel Gustafson, ...","Walter Matthau, Jack Lemmon, Ann-Margret, Soph...",6.473290
3,False,16000000.0,"Comedy, Drama, Romance",31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,Twentieth Century Fox Film Corporation,United States of America,...,127.0,English,Friends are the people who let you be yourself...,6.1,34.0,"based on novel, interracial relationship, sing...","Forest Whitaker, Ronald Bass, Ronald Bass, Ezr...","Savannah 'Vannah' Jackson, Bernadine 'Bernie' ...","Whitney Houston, Angela Bassett, Loretta Devin...",6.063852
4,False,0.0,Comedy,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"Sandollar Productions, Touchstone Pictures",United States of America,...,106.0,English,Just When His World Is Back To Normal... He's ...,5.7,173.0,"baby, midlife crisis, confidence, aging, daugh...","Alan Silvestri, Elliot Davis, Nancy Meyers, Na...","George Banks, Nina Banks, Franck Eggelhoffer, ...","Steve Martin, Diane Keaton, Martin Short, Kimb...",5.699219
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,False,0.0,"Family, Drama",46785,fa,بادکنک سفید,Several people try to take advantage of a litt...,1.196256,"I.R.I.B. Channel 2, Ferdos Films",Iran,...,85.0,فارسی,,7.5,20.0,"fish, money, celebration, goldfish","Abbas Kiarostami, Jafar Panahi","Razieh, Ali, Mother, Old Lady, Soldier","Aida Mohammadkhani, Mohsen Kafili, Fereshteh S...",7.259240
79,False,8000000.0,"Drama, Crime",400,en,Things to Do in Denver When You're Dead,A mafia film in Tarantino style with a star-st...,4.486179,Miramax Films,United States of America,...,116.0,English,Protect. Love. Honor. Avenge.,6.7,87.0,"father son relationship, bounty hunter, boat, ...","Gary Fleder, Scott Rosenberg, Cary Woods, Mich...","Jimmy 'The Saint' Tosnia, Pieces, Franchise, E...","Andy García, Christopher Lloyd, William Forsyt...",6.665139
80,False,900000.0,"Drama, Comedy",880,nl,Antonia,"After World War II, Antonia and her daughter, ...",2.030174,"Bergen Film, Bard Entertainments, NPS Televisie","Netherlands, United Kingdom, Belgium",...,102.0,Nederlands,A motion picture that celebrates everything yo...,7.2,26.0,"suicide, underdog, free love, philosophy, rape...","Marleen Gorris, Marleen Gorris, Gerard Corneli...","Antonia, Danielle, Allegonde, Thérèse, Thérèse...","Willeke van Ammelrooy, Els Dottermans, Dora va...",7.040087
81,False,0.0,"Romance, Drama",146599,en,Once Upon a Time... When We Were Colored,This film relates the story of a tightly conne...,0.252287,"BET Pictures, United Image Entertainme",United States of America,...,115.0,English,,4.5,2.0,"racial segregation, family relationships, rura...","Tim Reid, Paul W. Cooper, Clifton L. Taulbert","Poppa, Ma Ponk, Ma Pearl, Miss Alice (as Salli...","Al Freeman, Jr., Phylicia Rashād, Paula Kelly,...",5.192503


<a id="2"></a>
<p style="background-color:#368f8b;font-family:Trebuchet MS;font-weight:bold;color:#eff7f6;font-size:40px;text-align:center;border-radius:100px 100px">Recommender System</p>

There are lots of methods you can use to build a recommender system. This time, we are going to explore two of them, each of which produces a list of recommended movies for users based on different features.

<a id="3"></a>
<h1 style="font-family: Trebuchet MS; font-size: 25px; color: #3a5a40; text-align: left; "><b>● Hybrid</b></h1>

For those of you who have been learning about recommender systems for a while, you might be familiar with the weighted average. The idea behind it is to give a "fair" rating to each movie. In this notebook, we will take it to the next level with the help of a bag of words.

If you look at the dataset, there is a lot of valuable information such as genre, overview, etc. Later, we are going to use this information to make our recommender system more robust: we extract it into a bag of words, and then combine the resulting similarity with the weighted average to get the final score for each movie.

![image.png](https://cdn.analyticsvidhya.com/wp-content/uploads/2019/06/bayesianParameter.jpg.jpg)

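We calculate a weighted average score for each movie from its average rating and vote count: $W = \frac{R\,v + C\,m}{v + m}$, where $R$ is the movie's average rating, $v$ its number of votes, $m$ a vote-count threshold, and $C$ the mean rating across all movies.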

This approach helps consider both the **popularity and rating quality of movies.**

In [None]:
R = df['vote_average']  # average rating of each movie
v = df['vote_count']    # number of votes each movie received

# m is the median vote count (quantile 0.5); it sets how strongly scores are pulled towards C.
# No movies are filtered out here.
# NOTE(review): the original comment spoke of an 80% cut-off -- confirm whether quantile(0.8) was intended.
m = df['vote_count'].quantile(0.5)
C = df['vote_average'].mean()  # mean rating across all movies

# Weighted rating: movies with few votes are pulled towards the global mean C.
df['weighted_average'] = (R*v + C*m)/(v+m)

In [None]:
# Rescale 'popularity' and 'weighted_average' with min-max scaling so both can be blended
# on a common scale, and key the result by movie title.
score_columns = ['popularity', 'weighted_average']

scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[score_columns])

weighted_df = pd.DataFrame(
    scaled,
    columns=score_columns,
    index=pd.Index(df['original_title']),
)

People watch a movie not just because it has a good rating, but also because of the hype around it. So, in this case, taking popularity into consideration is a wise choice.

Let's take **40% weight for the weighted average** and **60% weight for popularity**, considering that people don't want to miss a hyped movie even if its reviews and ratings are poor. You can play around with these numbers. Next up, we create a new column called `score` which stores the result.

In [None]:
# Blend the two scaled signals: 40% weighted rating, 60% popularity.
rating_share, popularity_share = 0.4, 0.6
weighted_df['score'] = (
    weighted_df['weighted_average'] * rating_share
    + weighted_df['popularity'].astype('float64') * popularity_share
)

In [None]:
# Rank the movies from highest to lowest blended score and show the ten best.
weighted_df_sorted = weighted_df.sort_values('score', ascending=False)
weighted_df_sorted.iloc[:10]

Unnamed: 0_level_0,popularity,weighted_average,score
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Minions,1.0,0.677477,0.870991
Big Hero 6,0.390602,0.847629,0.573413
Baby Driver,0.416507,0.774127,0.559555
Guardians of the Galaxy Vol. 2,0.338511,0.823218,0.532394
Deadpool,0.343132,0.79918,0.525551
Pulp Fiction,0.257449,0.908521,0.517878
Gone Girl,0.282748,0.859761,0.513553
Avatar,0.338036,0.774871,0.51277
John Wick,0.335843,0.750389,0.501662
The Dark Knight,0.224968,0.90863,0.498433


We already got the first result of our recommender system, but we don't stop right here

As mentioned earlier, **we will combine this score with the similarity score**

In [None]:
# Text columns that feed the bag of words. .copy() yields an independent frame, so the
# cleaning cell can assign columns without writing into a slice of df (which triggers
# pandas' SettingWithCopyWarning, currently hidden by the global warnings filter).
hybrid_df = df[['original_title', 'adult', 'genres', 'overview', 'production_companies', 'tagline', 'keywords', 'crew', 'characters', 'actors']].copy()

In [None]:
def separate(text):
    """Turn a comma-separated list of names into space-separated single tokens.

    Each comma-separated item loses any parenthesised text, all spaces, ASCII
    digits and ASCII punctuation, and is lower-cased, so a multi-word name
    such as "Tom Hanks" becomes the single token "tomhanks".

    Args:
        text: Comma-separated string, e.g. "Woody (voice), Buzz Lightyear (voice)".

    Returns:
        The cleaned items joined with single spaces.
    """
    # Built once per call instead of once per item; deletes digits and punctuation together.
    drop_chars = str.maketrans('', '', string.digits + string.punctuation)

    tokens = []
    for item in text.split(','):
        # Raw string: '\(' in a normal literal is an invalid escape sequence.
        # The greedy match removes everything from the first '(' to the last ')'.
        item = re.sub(r'\(.*\)', '', item)
        item = item.replace(' ', '').translate(drop_chars).lower()
        tokens.append(item)
    return ' '.join(tokens)

def remove_punc(text):
    """Return `text` lower-cased with ASCII punctuation and digits removed.

    Whitespace is left untouched, so word boundaries are preserved.
    """
    unwanted = str.maketrans('', '', string.punctuation + string.digits)
    return text.translate(unwanted).lower()

In [None]:
# Free-text columns keep their word boundaries; list-like columns are collapsed so that
# each comma-separated name becomes one token.
for text_column in ('adult', 'genres', 'overview', 'tagline'):
    hybrid_df[text_column] = hybrid_df[text_column].apply(remove_punc)

for list_column in ('production_companies', 'keywords', 'crew', 'characters', 'actors'):
    hybrid_df[list_column] = hybrid_df[list_column].apply(separate)

# Join every column except the title (position 0) into one document per movie.
# The placeholder column is created first, so it is itself part of the joined columns.
hybrid_df['bag_of_words'] = ''
feature_columns = hybrid_df.columns[1:]
hybrid_df['bag_of_words'] = hybrid_df[feature_columns].apply(lambda row: ' '.join(row), axis=1)

# Index by title and keep only the combined text.
hybrid_df = hybrid_df.set_index('original_title')
hybrid_df = hybrid_df[['bag_of_words']]
hybrid_df.head()

Unnamed: 0_level_0,bag_of_words
original_title,Unnamed: 1_level_1
Toy Story,false animation comedy family led by woody and...
Jumanji,false adventure fantasy family when siblings j...
Grumpier Old Men,false romance comedy a family wedding reignite...
Waiting to Exhale,false comedy drama romance cheated on mistreat...
Father of the Bride Part II,false comedy just when george banks has recove...


A common way to measure the similarity between two movies is **cosine similarity**. There are of course other methods you can try, such as Euclidean distance or a sigmoid kernel, to find out which one performs best.

However, calculating the similarity for all the movies is expensive. So, because we have limited memory, we **only take the first 20000 movies from `weighted_df_sorted`**.

In [None]:
# Attach each movie's bag of words to the highest-scoring rows of weighted_df_sorted (left join on title).
# NOTE(review): the slice keeps 20000 rows although the variable name says 10000, and the
# TF-IDF cell below is fitted on hybrid_df rather than on this frame -- confirm which is intended.
hybrid_df_top10000 = weighted_df_sorted[:20000].merge(hybrid_df, left_index=True, right_index=True, how='left')

In [None]:
# Turn every bag of words into a TF-IDF vector. English stop words are removed and
# terms that occur in fewer than 5 documents are ignored (min_df=5).
tfidf = TfidfVectorizer(stop_words='english', min_df=5)
tfidf_matrix = tfidf.fit_transform(hybrid_df['bag_of_words'])
tfidf_matrix.shape

(42373, 75210)

In [None]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix (one row and one column per movie).
# NOTE(review): with the ~42k rows reported by the previous cell this is presumably a very
# large dense matrix -- confirm the runtime has enough memory.
cos_sim = cosine_similarity(tfidf_matrix)
cos_sim.shape

In [None]:
def predict(title, similarity_weight=0.7, top_n=10):
    """Recommend movies similar to `title`, blending similarity with the popularity score.

    Args:
        title: Value of 'original_title' to find neighbours for.
        similarity_weight: Share of the final score taken from cosine similarity;
            the blended 'score' gets the remaining (1 - similarity_weight).
        top_n: Number of rows to return.

    Returns:
        DataFrame indexed by 'original_title' with the columns 'score',
        'similarity' and 'final_score', sorted by 'final_score' descending.

    Raises:
        ValueError: If `title` is not present in `hybrid_df`.
    """
    data = hybrid_df.reset_index()

    # hybrid_df may hold only 'bag_of_words'; in that case fetch each movie's blended
    # score from weighted_df, which is indexed by title.
    if 'score' not in data.columns:
        data['score'] = data['original_title'].map(weighted_df['score'])

    matches = data.index[data['original_title'] == title]
    if len(matches) == 0:
        raise ValueError(f"Movie title not found: {title!r}")

    # Row of the similarity matrix that belongs to the requested movie (rows of cos_sim
    # follow the row order of hybrid_df).
    data['similarity'] = np.asarray(cos_sim[matches[0]]).ravel()

    # You can also play around with the number
    data['final_score'] = data['score']*(1-similarity_weight) + data['similarity']*similarity_weight

    ranked = data.sort_values(by='final_score', ascending=False).head(top_n)
    return ranked.set_index('original_title')[['score', 'similarity', 'final_score']]

In [None]:
predict('Minions', similarity_weight=0.7, top_n=10)

Unnamed: 0_level_0,score,similarity,final_score
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Minions,0.841413,1.0,0.952424
Banana,0.273268,0.449811,0.396848
Despicable Me,0.310727,0.383789,0.361871
Despicable Me 2,0.306867,0.341491,0.331103
Minions: Orientation Day,0.255404,0.341261,0.315504
The Lorax,0.241978,0.279704,0.268386
Mower Minions,0.235257,0.273719,0.262181
The Secret Life of Pets,0.221552,0.26145,0.249481
"Monsters, Inc.",0.340782,0.208199,0.247974
The Iron Giant,0.331968,0.202154,0.241098


In [None]:
# hybrid_df.to_csv('hybrid_df.csv')

In [None]:
# import pickle
# pickle.dump(tfidf_matrix, open('cosine_similarity.pkl', 'wb'))

<a id="4"></a>
<h1 style="font-family: Trebuchet MS; font-size: 25px; color: #3a5a40; text-align: left; "><b>● Deep Learning</b></h1>

Official documentation: https://www.tensorflow.org/recommenders

Tensorflow comes with a library called **TensorFlow Recommenders (TFRS)** for building a recommender system. It's built on Keras and aims to have a gentle learning curve while still giving you the flexibility to build complex models.

This time, we use a multi-objective approach that **applies both implicit (movie watches) and explicit (ratings) signals**. In the end, we can predict which movies a user should watch, **along with the rating the user would be expected to give, based on historical data**

In [None]:
# Load the user ratings, convert their Unix timestamps to datetimes and attach movie metadata.
ratings_full = pd.read_csv('/gdrive/MyDrive/CSE6740/ratings.csv')

# Vectorised conversion of seconds-since-epoch to datetime64[ns]. This avoids calling
# datetime.fromtimestamp once per row and yields UTC values regardless of the machine's
# local time zone.
ratings_full['date'] = pd.to_datetime(ratings_full['timestamp'], unit='s')
ratings_full = ratings_full.drop('timestamp', axis=1)

# Left join keeps every rating; ratings without a matching movie get NaN in 'id'.
# NOTE(review): 'movieId' in ratings.csv and 'id' in the movie metadata may belong to
# different ID schemes (MovieLens vs. TMDB) -- verify, and map through a links table if so.
ratings_full = ratings_full.merge(df[['id', 'original_title', 'genres', 'overview']], left_on='movieId',right_on='id', how='left')

In [None]:
# Keep only the ratings that found a movie in the join, drop the helper 'id' column
# and renumber the rows.
ratings_full = (
    ratings_full
    .dropna(subset=['id'])
    .drop(columns='id')
    .reset_index(drop=True)
)

ratings_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10969295 entries, 0 to 10969294
Data columns (total 7 columns):
 #   Column          Dtype         
---  ------          -----         
 0   userId          int64         
 1   movieId         int64         
 2   rating          float64       
 3   date            datetime64[ns]
 4   original_title  object        
 5   genres          object        
 6   overview        object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 585.8+ MB


In [None]:
# Shuffle all ratings reproducibly (random_state=42) and keep the first 100,000 rows as the working sample.
ratings_df = ratings_full.sample(frac=1, random_state=42).reset_index(drop=True).iloc[:100000,:]

In [None]:
ratings_df

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,165860,218,4.0,1996-07-06 21:47:51,The Terminator,"Action, Thriller, Science Fiction","In the post-apocalyptic future, reigning tyran..."
1,40655,8487,4.0,2008-02-24 03:44:23,Wild Wild West,"Action, Adventure, Comedy, Science Fiction, We...",Legless Southern inventor Dr. Arliss Loveless ...
2,126998,2116,4.0,1999-12-16 01:26:12,Out of Time,"Thriller, Crime, Drama","Matt Lee Whitlock, respected chief of police i..."
3,196152,4993,5.0,2003-10-07 08:31:16,5 Card Stud,"Action, Western, Thriller",The players in an ongoing poker game are being...
4,56836,637,3.0,2007-11-26 21:20:12,La vita è bella,"Comedy, Drama",A touching story of an Italian book seller of ...
...,...,...,...,...,...,...,...
99995,231987,261,3.0,2005-12-27 19:49:03,Cat on a Hot Tin Roof,"Drama, Romance","Brick, an alcoholic ex-football player, drinks..."
99996,250367,288,3.0,1996-10-17 22:27:09,High Noon,Western,High Noon is about a recently freed leader of ...
99997,241155,62,4.0,1997-10-17 09:29:28,2001: A Space Odyssey,"Science Fiction, Mystery, Adventure",Humanity finds a mysterious object buried bene...
99998,80331,2028,4.0,2000-02-09 18:31:02,Say Anything...,"Comedy, Drama, Romance",A budding romance between noble underachiever ...


In [None]:
# Lookup table of movie ids and titles; 'id' is renamed to 'movieId' to match the ratings table.
movies_df = df[['id', 'original_title']].rename(columns={'id': 'movieId'})

movies_df

Unnamed: 0,movieId,original_title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II
...,...,...
42368,289923,The Burkittsville 7
42369,222848,Caged Heat 3000
42370,111109,Siglo ng Pagluluwal
42371,227506,Satana likuyushchiy


In [None]:
# Build the tf.data pipelines used for training: one of rating events, one of candidate titles.
# User ids are handled as strings from here on.
ratings_df['userId'] = ratings_df['userId'].astype(str)

# from_tensor_slices turns the selected columns into a Dataset with one element per row.
rating_columns = ['userId', 'original_title', 'rating']
ratings = tf.data.Dataset.from_tensor_slices(dict(ratings_df[rating_columns]))
movies = tf.data.Dataset.from_tensor_slices(dict(movies_df[['original_title']]))


def _to_rating_features(row):
    """Select the three model features; the rating comes out as a 32-bit float."""
    return {
        "original_title": row["original_title"],
        "userId": row["userId"],
        "rating": float(row["rating"])
    }


ratings = ratings.map(_to_rating_features)

# The candidate dataset only needs the bare title strings.
movies = movies.map(lambda row: row["original_title"])

In [None]:
# Split the rating events into a training set (80,000) and a test set (20,000).
print('Total Data: {}'.format(len(ratings)))

# Shuffle once with a fixed seed. reshuffle_each_iteration=False freezes the order, so
# take() and skip() below see the same permutation and the two splits never overlap.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# take: creates a Dataset with at most `count` elements.
# The first 80,000 shuffled elements form the training set.
train = shuffled.take(80_000)

# skip: creates a Dataset that skips `count` elements.
# The next 20,000 shuffled elements form the test set.
test = shuffled.skip(80_000).take(20_000)

Total Data: 100000


In [None]:
ratings

<_MapDataset element_spec={'original_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'userId': TensorSpec(shape=(), dtype=tf.string, name=None), 'rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>

In [None]:
# Collect the distinct movie titles and user ids; the model's lookup layers use them as vocabularies.

# batch(1_000) groups consecutive elements into chunks of up to 1,000,
# e.g. range(8).batch(3) -> [0, 1, 2], [3, 4, 5], [6, 7].
movie_titles = movies.batch(1_000)
user_ids = ratings.batch(1_000).map(lambda x: x["userId"])

# Materialise the batches, concatenate them into one array and de-duplicate.
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

Unique Movies: 42373
Unique users: 61540


In [None]:
train

<_TakeDataset element_spec={'original_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'userId': TensorSpec(shape=(), dtype=tf.string, name=None), 'rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>

This code defines a custom recommendation model using TensorFlow Recommenders (TFRS)

In [None]:
# tfrs.models.Model is the TFRS base class; subclasses supply compute_loss.
class MovieModel(tfrs.models.Model):
  """Multi-task recommender that learns rating prediction and retrieval jointly.

  User ids and movie titles are embedded into a shared 64-dimensional space.
  The embeddings feed a retrieval task directly and, concatenated, a small
  dense network that predicts the rating. The two losses are combined with
  the weights given at construction time.

  Relies on the notebook-level names `unique_movie_titles`, `unique_user_ids`
  and `movies`.
  """

  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    """Build the embedding towers, the rating network and both tasks.

    Args:
      rating_weight: Multiplier applied to the rating (ranking) loss.
      retrieval_weight: Multiplier applied to the retrieval loss.
    """
    super().__init__()

    # Size of every user and movie embedding vector.
    embedding_dimension = 64

    # Movie tower: StringLookup maps a title to an integer index using the given
    # vocabulary (no mask token); Embedding maps that index to a dense vector.
    # The embedding table has one row more than the vocabulary -- presumably for
    # the out-of-vocabulary index; confirm against the StringLookup defaults.
    self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])

    # User tower: same structure, keyed by the user-id vocabulary.
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])


    # Rating network: two hidden ReLU layers (256 and 128 units) followed by a
    # single output unit without activation, i.e. a plain regression output.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # Ranking task: mean squared error as the loss, RMSE as the reported metric.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    # Retrieval task: its top-K metrics are evaluated against all candidate
    # movies, embedded here in batches of 128 with the movie tower.
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.movie_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    """Forward pass.

    Args:
      features: Dictionary providing the "userId" and "original_title" entries.

    Returns:
      A 3-tuple (user embeddings, movie embeddings, predicted ratings). Note
      that this is a tuple even though the annotation names a single tensor.
    """
    user_embeddings = self.user_model(features["userId"])
    movie_embeddings = self.movie_model(features["original_title"])

    return (
        user_embeddings,
        movie_embeddings,
        # The rating network sees the user and movie embeddings side by side.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the weighted sum of the rating loss and the retrieval loss.

    Args:
      features: Batch dictionary with "userId", "original_title" and "rating".
        The "rating" entry is removed from this dictionary (pop mutates it).
      training: Accepted for interface compatibility; not used in this method.
    """
    # Labels are taken out so that only model inputs are passed to call().
    ratings = features.pop("rating")

    user_embeddings, movie_embeddings, rating_predictions = self(features)

    # Rating loss: observed ratings vs. predicted ratings.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    # Retrieval loss: user embeddings as queries, movie embeddings as candidates.
    retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

    # Combine losses using the specified weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)


In [None]:
# Build the model with equal weight on the rating and retrieval losses, and compile it
# with the Adagrad optimizer at a learning rate of 0.1.
model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Shuffle and batch the training data, batch the test data, and cache both pipelines.
cached_train = train.shuffle(100_000).batch(8_192).cache()
cached_test = test.batch(4_096).cache()

# Train for three epochs.
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7a3f8898f790>

In [None]:
# Evaluate on the cached test set; return_dict=True returns the metrics as a
# dictionary keyed by metric name.
metrics = model.evaluate(cached_test, return_dict=True)

# Report the two headline numbers: retrieval top-100 accuracy and rating RMSE.
print(f"\nRetrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")


Retrieval top-100 accuracy: 0.137
Ranking RMSE: 1.109


In [None]:
# model.save_weights('tfrs.h5')
metrics

{'root_mean_squared_error': 1.1086097955703735,
 'factorized_top_k/top_1_categorical_accuracy': 0.0006000000284984708,
 'factorized_top_k/top_5_categorical_accuracy': 0.008700000122189522,
 'factorized_top_k/top_10_categorical_accuracy': 0.018850000575184822,
 'factorized_top_k/top_50_categorical_accuracy': 0.07705000042915344,
 'factorized_top_k/top_100_categorical_accuracy': 0.13680000603199005,
 'loss': 30787.26171875,
 'regularization_loss': 0,
 'total_loss': 30787.26171875}

In [None]:
def predict_movie(user, top_n=3):
    """Print the ``top_n`` retrieved movie titles for ``user``.

    A brute-force nearest-neighbour index over the movie embeddings is rebuilt
    on every call, queried with the user's id, and the returned candidate
    identifiers are printed as a numbered list.

    Args:
        user: User id; converted with ``str()`` before it is fed to the model.
        top_n: Number of recommendations to retrieve and print.
    """
    # Index that embeds raw query features with the trained user tower and
    # searches the candidate embeddings exhaustively.
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

    # Candidates are (identifier, embedding) pairs: each batch of `movies` is
    # zipped with the same batch passed through the movie tower.
    # NOTE(review): `movies` is presumably a tf.data.Dataset of titles - confirm.
    index.index_from_dataset(
        tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
    )

    # Request exactly top_n candidates. Without `k` the index returns its
    # default of 10 results, so top_n > 10 printed fewer titles than announced.
    _, titles = index(tf.constant([str(user)]), k=top_n)

    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print('{}. {}'.format(i+1, title.decode("utf-8")))

def predict_rating(user, movie):
    """Print the model's predicted rating of ``movie`` for ``user``.

    Args:
        user: User id; converted with ``str()`` before it is fed to the model.
        movie: Movie title, passed under the ``original_title`` feature.
    """
    features = {
        "userId": np.array([str(user)]),
        "original_title": np.array([movie]),
    }
    # The model call returns three values; only the last one is used here.
    _, _, predicted_rating = model(features)
    rating_value = predicted_rating.numpy()[0][0]
    print("Predicted rating for {}: {}".format(movie, rating_value))

In [None]:
predict_movie(123, 10)

Top 10 recommendations for user 123:

1. The Most Dangerous Game
2. The 39 Steps
3. Live and Let Die
4. Crustacés et coquillages
5. Sleepless in Seattle
6. Jay and Silent Bob Strike Back
7. The Butterfly Effect
8. Die Frau mit den 5 Elefanten
9. Hostel
10. La Mort en direct


In [None]:
predict_rating(123,'Minions')

Predicted rating for Minions: 2.670973300933838


So, let's examine **User 123** in the historical data

In [None]:
# Historical ratings of the user discussed in the surrounding text (User 123).
ratings_df[ratings_df['userId'] == 123]

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
50033,46,587,3.0,2001-01-13 01:36:57,Big Fish,"Adventure, Fantasy, Drama",Throughout his life Edward Bloom has always be...
91598,46,39400,2.0,2006-02-15 23:59:23,Le fatiche di Ercole,"Fantasy, Adventure",In this melange of characters and events from ...


In [None]:
# Look up genres and overview for the movies retrieved for user '123'.

# Brute-force retrieval index: user tower for queries, movie tower for candidates.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)

# Nearest-neighbour search for user id '123'; only the returned identifiers are kept.
_, titles = index(tf.constant(['123']))

# First five returned titles, decoded from bytes.
top_titles = [raw.decode('utf-8') for raw in titles[0, :5].numpy()]
pred_movies = pd.DataFrame({'original_title': top_titles})

# Attach genres/overview from the ratings table, keep the first row per title,
# and number the rows from 1.
pred_df = (
    pred_movies
    .merge(ratings_df[['original_title', 'genres', 'overview']], on='original_title', how='left')
    .drop_duplicates(subset='original_title')
)
pred_df.index = np.arange(1, len(pred_df) + 1)

pred_df

Unnamed: 0,original_title,genres,overview
1,The Most Dangerous Game,"Adventure, Horror, Thriller",When legendary hunter Bob Rainsford is shipwre...
2,The 39 Steps,"Action, Thriller, Mystery","While on vacation in London, Canadian Richard ..."
3,Live and Let Die,"Adventure, Action, Thriller",James Bond must investigate a mysterious murde...
4,Crustacés et coquillages,Comedy,Crustaces et Coquillages is a fresh French com...
5,Sleepless in Seattle,"Comedy, Drama, Romance",A young boy who tries to set his dad up on a d...


At a glance, we can see that **User 123** watches Drama movies most of the time and also gives good ratings to that genre. In our recommendation, we give 5 more Drama movies that we expect him/her to enjoy in the same way as the previously watched movies.

In our dataset, we don't see any Animation movies that have been watched by **User 123**. So, it's not a surprise if the estimated rating for Minions is quite low

### XGBoost model

https://towardsdatascience.com/how-to-build-a-movie-recommendation-system-67e321339109



In [None]:
import numpy as np
from scipy.sparse import csr_matrix
# Reader, Dataset and SVD are used by the matrix-factorisation cell below.
from surprise import SVD, Dataset, Reader
# import surprise
# from surprise import SVDpp
# from surprise.model_selection import train_test_split


In [None]:
# Load the full ratings file and keep only the ratings whose movieId matches
# an id present in the metadata frame `df`.
ratings_full = pd.read_csv('/gdrive/MyDrive/CSE6740/ratings.csv')

# Left-join on the metadata ids: ratings without a match get NaN in `id`.
ratings_full = ratings_full.merge(df['id'], left_on='movieId',right_on='id', how='left')

# Keep matched rows, discard the helper column, and renumber from 0.
ratings_full = (
    ratings_full[ratings_full['id'].notna()]
    .drop(columns='id')
    .reset_index(drop=True)
)

ratings_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10969295 entries, 0 to 10969294
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 334.8 MB


In [None]:
# Take an explicit copy: the cells below assign to and drop columns of
# content_df in place, which should never act on a slice of `df`.
content_df = df[['id', 'genres', 'keywords']].copy()
content_df.head()

Unnamed: 0,id,genres,keywords
0,862,"Animation, Comedy, Family","jealousy, toy, boy, friendship, friends, rival..."
1,8844,"Adventure, Fantasy, Family","board game, disappearance, based on children's..."
2,15602,"Romance, Comedy","fishing, best friend, duringcreditsstinger, ol..."
3,31357,"Comedy, Drama, Romance","based on novel, interracial relationship, sing..."
4,11862,Comedy,"baby, midlife crisis, confidence, aging, daugh..."


In [None]:
def remove_punc(text):
    """Return ``text`` lower-cased with ASCII punctuation and digits removed.

    Whitespace is kept, so "Animation, Comedy" becomes "animation comedy".
    """
    drop_table = str.maketrans('', '', string.punctuation + string.digits)
    return text.translate(drop_table).lower()

def separate(text):
    """Collapse each comma-separated keyword into a single lower-case token.

    For every comma-separated item: text inside parentheses is removed, then
    digits, spaces and ASCII punctuation are stripped and the result is
    lower-cased. The tokens are joined with single spaces.

    Args:
        text: Comma-separated keyword string.

    Returns:
        Space-separated string with one token per input item.
    """
    strip_digits = str.maketrans('', '', string.digits)
    strip_punct = str.maketrans('', '', string.punctuation)
    clean_text = []
    for t in text.split(','):
        # Raw string: '\(' in a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        cleaned = re.sub(r'\(.*\)', '', t)  # Remove text inside parentheses
        cleaned = cleaned.translate(strip_digits)
        cleaned = cleaned.replace(' ', '')
        cleaned = cleaned.translate(strip_punct).lower()
        clean_text.append(cleaned)
    return ' '.join(clean_text)

# Normalise both text columns, then glue them into one document per movie.
content_df['genres'] = content_df['genres'].map(remove_punc)
content_df['keywords'] = content_df['keywords'].map(separate)
# content_df holds id, genres, keywords at this point, so the bag of words is
# the cleaned genres followed by the cleaned keywords.
content_df['bag_of_words'] = content_df['genres'] + ' ' + content_df['keywords']
content_df.drop(columns=['genres', 'keywords'], inplace=True)

In [None]:
# Fit TF-IDF on the combined genre/keyword text, removing English stop words
# and (min_df=8) terms that occur in fewer than 8 documents.
tfidf = TfidfVectorizer(stop_words='english', min_df=8)
tfidf_matrix = tfidf.fit_transform(content_df['bag_of_words'])
# The raw text is dropped once vectorised. This is an in-place drop, so the
# cell cannot be re-run without first re-creating 'bag_of_words'.
content_df.drop('bag_of_words', axis=1, inplace=True)
tfidf_matrix.shape

(42373, 3363)

In [None]:
# Densify the TF-IDF matrix into a DataFrame. toarray() gives a plain ndarray
# (todense() returns np.matrix), and reusing content_df's index keeps the rows
# aligned when the two frames are concatenated column-wise in the next cell.
TFIDF = pd.DataFrame(tfidf_matrix.toarray(), index=content_df.index)
TFIDF.shape

(42373, 3363)

In [None]:
content_df = pd.concat([content_df , TFIDF],axis=1)

In [None]:
# Sizes of the two splits; the sample below holds exactly their sum.
train_size = 80000
test_size = 20000

# Shuffle all ratings reproducibly and keep the first train_size + test_size rows.
ratings_df = (
    ratings_full
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .iloc[:train_size + test_size, :]
)
ratings_df['userId'] = ratings_df['userId'].astype(int)

# Attach the per-movie TF-IDF content features.
ratings_df = ratings_df.merge(content_df, left_on='movieId',right_on='id', how='left')

# First train_size rows train the models, the following test_size rows test them.
train_data = ratings_df.iloc[:train_size,:]
test_data = ratings_df.iloc[train_size:train_size + test_size,:]

test_data.reset_index(drop=True, inplace=True)

In [None]:
test_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,id,0,1,2,3,4,...,3353,3354,3355,3356,3357,3358,3359,3360,3361,3362
0,162580,966,3.0,944926328,966,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,79952,74458,2.0,1436491747,74458,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,56159,788,2.0,967597802,788,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,85115,213,5.0,939029448,213,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,198327,3638,1.0,1037385948,3638,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Tell Surprise how to read the ratings frame.
# NOTE(review): rating_scale=(1, 5) clips predictions to that range - confirm
# the ratings file contains no values below 1.
reader = Reader(rating_scale=(1,5))

# Build the Surprise training set from the training rows and fit biased SVD.
train_data_mf = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)
trainset = train_data_mf.build_full_trainset()

svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(trainset)

# Score the rows in DataFrame order. Trainset.build_testset() returns the
# ratings grouped by user, so predictions taken from it do not line up with
# train_data['rating'] / test_data['rating'] row by row. svd.test() keeps the
# order of the list it is given, so the (user, movie, rating) tuples are built
# straight from the frames instead.
train_testset = list(train_data[['userId', 'movieId', 'rating']].itertuples(index=False, name=None))
testset = list(test_data[['userId', 'movieId', 'rating']].itertuples(index=False, name=None))

# Predictions for the training rows.
train_preds = svd.test(train_testset)
train_pred_mf = np.array([pred.est for pred in train_preds])

# Predictions for the test rows.
test_preds = svd.test(testset)
test_pred_mf = np.array([pred.est for pred in test_preds])

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


In [None]:
def get_error_metrics(y_true, y_pred):
    """Return ``(rmse, mape)`` between true and predicted values.

    Both inputs are converted to flat float arrays and compared position by
    position, so a pandas index on either argument never changes which values
    are paired.

    Args:
        y_true: True values (Series, array or list).
        y_pred: Predicted values with the same number of elements.

    Returns:
        Tuple ``(rmse, mape)``; ``mape`` is a percentage. A true value of 0
        makes ``mape`` infinite, since it divides by the true value.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got {} and {}".format(
                actual.shape[0], predicted.shape[0]))

    errors = actual - predicted
    rmse = np.sqrt(np.mean(errors ** 2))
    mape = np.mean(np.abs(errors / actual)) * 100
    return rmse, mape

train_rmse, train_mape = get_error_metrics(train_data['rating'], train_pred_mf)
test_rmse, test_mape = get_error_metrics(test_data['rating'], test_pred_mf)
print(train_rmse, train_mape, test_rmse, test_mape)

1.2075542139065774 42.48565243119254 1.1589504897361724 42.08563925547099


In [None]:
# Spearman rank correlation between the true training ratings and the SVD
# predictions. Series.corr pairs values by index label.
# NOTE(review): this assumes train_pred_mf is in the same row order as train_data - verify.
train_pred_df = pd.Series(train_pred_mf)
train_rating_df = train_data['rating']

train_rating_df.corr(train_pred_df, method='spearman')

-0.0030702329552459878

Next, let’s create a function which takes the sparse matrix as input and gives the average ratings of a movie given by all users, and the average rating of all movies given by a single user.

In [None]:
# Build a (userId x movieId) sparse rating matrix from the training rows; the
# raw ids are used directly as row/column indices.
# NOTE(review): csr_matrix sums duplicate (row, col) entries, so repeated
# (userId, movieId) rows would add their ratings together - confirm none exist.
train_sparse_matrix = csr_matrix((train_data.rating.values, (train_data.userId.values, train_data.movieId.values)))

train_averages = dict()
# Global average of the training ratings: total of all entries / number of non-zero entries.
train_global_average = train_sparse_matrix.sum()/train_sparse_matrix.count_nonzero()
train_averages['global'] = train_global_average

# Same construction for the test rows.
test_sparse_matrix = csr_matrix((test_data.rating.values, (test_data.userId.values,test_data.movieId.values)))

test_averages = dict()
# Global average of the test ratings (computed from the test labels themselves).
test_global_average = test_sparse_matrix.sum()/test_sparse_matrix.count_nonzero()
test_averages['global'] = test_global_average

In [None]:

# Scratch check of the per-user sums and counts on the training matrix
# (axis=1 sums along each user row); .A1 flattens the result to a 1-D array.
sum_of_ratings = train_sparse_matrix.sum(axis=1).A1
sum_of_ratings.shape

# Boolean matrix marking the non-zero (rated) entries, and ratings per user.
is_rated = train_sparse_matrix!=0
no_of_ratings = is_rated.sum(axis=1).A1
# Matrix dimensions: rows (users) x columns (movies).
u,m = train_sparse_matrix.shape


In [None]:
train_sparse_matrix[0]

<1x173492 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [None]:
# Average rating per user or per movie, as a dict {id: average rating}.
def get_average_ratings(sparse_matrix, of_users):
  """Return the mean non-zero rating along one axis of a user x movie matrix.

  Args:
    sparse_matrix: Sparse rating matrix with users as rows and movies as columns.
    of_users: True for one average per user (row), False for one per movie (column).

  Returns:
    Dict mapping row/column index to its average rating; indices without any
    non-zero rating are left out. The matrix shape is printed as a side effect.
  """
  axis = 1 if of_users else 0
  # ".A1" flattens the matrix returned by sum() into a 1-D array.
  totals = sparse_matrix.sum(axis=axis).A1
  counts = (sparse_matrix != 0).sum(axis=axis).A1
  n_users, n_movies = sparse_matrix.shape
  print(n_users, n_movies)
  # Only indices with at least one rating get an entry, in ascending order.
  rated = np.flatnonzero(counts)
  means = totals[rated] / counts[rated]
  return dict(zip(rated.tolist(), means))

train_averages['user'] = get_average_ratings(train_sparse_matrix, of_users=True)
train_averages['movie'] = get_average_ratings(train_sparse_matrix, of_users=False)

test_averages['user'] = get_average_ratings(test_sparse_matrix, of_users=True)
test_averages['movie'] = get_average_ratings(test_sparse_matrix, of_users=False)


270897 173492
270897 173492
270856 168409
270856 168409


In [None]:
len(test_averages['movie'].keys())

2211

In [None]:
# Build the hand-crafted features for every training row:
#   UAvg / MAvg - average rating of the user / of the movie
#   sur1..sur5  - ratings given to this movie by the most similar users
#   smr1..smr5  - ratings given by this user to the most similar movies
# NOTE(review): the averages come from train_sparse_matrix, which contains the
# row's own rating, so UAvg/MAvg include the target value.
top_sim_users_result = []
top_sim_movies_result = []
avg_result = []

# Movie x user view of the matrix; loop-invariant, so it is built once.
train_sparse_matrix_T = train_sparse_matrix.T.tocsr()

# Iterate by position over the first train_size rows.
train_users = train_data['userId'].to_numpy()[:train_size]
train_movies = train_data['movieId'].to_numpy()[:train_size]

for user, movie in zip(train_users, train_movies):
    user_avg = train_averages['user'][user]
    movie_avg = train_averages['movie'][movie]
    avg_result.append([user_avg, movie_avg])

    # Users ranked by cosine similarity to this user; the first entry is
    # skipped on the assumption that it is the user itself.
    user_sim = cosine_similarity(train_sparse_matrix[user], train_sparse_matrix).ravel()
    top_sim_users = user_sim.argsort()[::-1][1:]
    # Ratings those users gave this movie; keep the first five non-zero ones
    # and pad up to five with the movie average.
    top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
    top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_users_ratings.extend([movie_avg]*(5 - len(top_sim_users_ratings)))
    top_sim_users_result.append(top_sim_users_ratings)

    # Movies ranked by cosine similarity to this movie; the first entry is
    # skipped on the assumption that it is the movie itself.
    movie_sim = cosine_similarity(train_sparse_matrix_T[movie], train_sparse_matrix_T).ravel()
    top_sim_movies = movie_sim.argsort()[::-1][1:]
    # Ratings this user gave those movies; keep the first five non-zero ones
    # and pad up to five with the user average.
    top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
    top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
    top_sim_movies_ratings.extend([user_avg]*(5 - len(top_sim_movies_ratings)))
    top_sim_movies_result.append(top_sim_movies_ratings)

avg_result_df = pd.DataFrame(avg_result, columns=['UAvg','MAvg'])
top_sim_user_df = pd.DataFrame(top_sim_users_result, columns=['sur1','sur2','sur3','sur4','sur5'])
top_sim_movie_df = pd.DataFrame(top_sim_movies_result, columns=['smr1','smr2','smr3','smr4','smr5'])
mf_df = pd.DataFrame(train_pred_mf,columns=['mf_svd'])

train_final_df = pd.concat([avg_result_df, top_sim_user_df,top_sim_movie_df, mf_df],axis=1)

In [None]:
# NOTE(review): `test_df` is only created further down, in the cell that
# concatenates train_df/test_df, so on a fresh Restart & Run All this line
# raises NameError unless that cell has already been executed.
test_df.to_excel('test.xlsx', index=False)

In [None]:
# Build the hand-crafted features for every test row.
#
# All features are derived from the TRAINING matrix and averages. Deriving
# them from test_sparse_matrix / test_averages would compute them from the
# very ratings the model is asked to predict (target leakage), which makes the
# test error look far better than it is. Users or movies never seen in
# training (cold start) fall back to the global training average.
top_sim_users_result = []
top_sim_movies_result = []
avg_result = []

# Movie x user view of the training matrix; loop-invariant, so it is built once.
train_sparse_matrix_T = train_sparse_matrix.T.tocsr()
global_avg = train_averages['global']

# Iterate by position over the first test_size rows.
test_users = test_data['userId'].to_numpy()[:test_size]
test_movies = test_data['movieId'].to_numpy()[:test_size]

for user, movie in zip(test_users, test_movies):
    # .get() returns None for ids without any training rating. Ids that do
    # have an average are guaranteed to be inside the training matrix bounds.
    user_avg = train_averages['user'].get(user)
    movie_avg = train_averages['movie'].get(movie)
    seen_in_train = user_avg is not None and movie_avg is not None
    if user_avg is None:
        user_avg = global_avg
    if movie_avg is None:
        movie_avg = global_avg
    avg_result.append([user_avg, movie_avg])

    top_sim_users_ratings = []
    top_sim_movies_ratings = []
    if seen_in_train:
        # Training users ranked by similarity to this user; the first entry is
        # skipped on the assumption that it is the user itself.
        user_sim = cosine_similarity(train_sparse_matrix[user], train_sparse_matrix).ravel()
        top_sim_users = user_sim.argsort()[::-1][1:]
        top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
        top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])

        # Training movies ranked by similarity to this movie; the first entry
        # is skipped on the assumption that it is the movie itself.
        movie_sim = cosine_similarity(train_sparse_matrix_T[movie], train_sparse_matrix_T).ravel()
        top_sim_movies = movie_sim.argsort()[::-1][1:]
        top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
        top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])

    # Pad both lists to length 5: similar-user ratings with the movie average,
    # similar-movie ratings with the user average.
    top_sim_users_ratings.extend([movie_avg]*(5 - len(top_sim_users_ratings)))
    top_sim_users_result.append(top_sim_users_ratings)
    top_sim_movies_ratings.extend([user_avg]*(5 - len(top_sim_movies_ratings)))
    top_sim_movies_result.append(top_sim_movies_ratings)

avg_result_df = pd.DataFrame(avg_result, columns=['UAvg','MAvg'])
top_sim_user_df = pd.DataFrame(top_sim_users_result, columns=['sur1','sur2','sur3','sur4','sur5'])
top_sim_movie_df = pd.DataFrame(top_sim_movies_result, columns=['smr1','smr2','smr3','smr4','smr5'])
mf_df = pd.DataFrame(test_pred_mf,columns=['mf_svd'])

test_final_df = pd.concat([avg_result_df, top_sim_user_df,top_sim_movie_df, mf_df],axis=1)

In [None]:
# Constant global-average feature, one value per row. The row counts come from
# the frames themselves, and both splits use the TRAINING global average so
# that no test label leaks into the test features.
avg_rate_train = pd.DataFrame([train_averages['global']]*len(train_data), columns=['GAvg'])
avg_rate_test = pd.DataFrame([train_averages['global']]*len(test_data), columns=['GAvg'])

# Column-wise concat (aligned on the row index) of the raw rows, the
# similarity/average features and the global average.
train_df = pd.concat([train_data, train_final_df, avg_rate_train],axis=1)
test_df = pd.concat([test_data, test_final_df, avg_rate_test],axis=1)

In [None]:
train_data[train_data['userId']==123]

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
41409,123,1994,5.0,2000-09-10 07:01:28,The Most Dangerous Game,"Adventure, Horror, Thriller",When legendary hunter Bob Rainsford is shipwre...


In [None]:
import xgboost as xgb

# XGBoost on the hand-crafted features: raw ids, user/movie/global averages,
# ratings of the five most similar users (sur*) and movies (smr*).
feature_cols = ['userId','movieId','UAvg','MAvg','GAvg','sur1','sur2','sur3','sur4','sur5','smr1','smr2','smr3','smr4','smr5']

x_train = train_df[feature_cols]
y_train = train_data['rating']

x_test = test_df[feature_cols]
y_test = test_data['rating']

# `silent` is no longer an XGBRegressor parameter in current XGBoost releases
# (`verbosity` replaced it), so it is not passed.
xgb_model = xgb.XGBRegressor(n_jobs=13,random_state=0,n_estimators=100)
xgb_model.fit(x_train, y_train)

# Predictions and error metrics on both splits.
y_train_pred = xgb_model.predict(x_train)
y_test_pred = xgb_model.predict(x_test)
xgb_train_rmse, xgb_train_mape = get_error_metrics(y_train, y_train_pred)
xgb_test_rmse, xgb_test_mape = get_error_metrics(y_test, y_test_pred)

print(xgb_train_rmse, xgb_train_mape, xgb_test_rmse, xgb_test_mape)

0.1246651306073701 2.008551476296806 0.06756645842547057 0.7522913959371665


In [None]:
# Pair each feature name (columns of x_test) with its importance from the
# fitted model and list the pairs from most to least important.
FI = xgb_model.feature_importances_.tolist()
FI_column = zip(x_test.columns, FI )
FI_column_sorted = sorted(FI_column, key=lambda x: x[1], reverse=True)
FI_column_sorted

[('UAvg', 0.6265078783035278),
 ('smr1', 0.1362086832523346),
 ('smr2', 0.08238785713911057),
 ('smr3', 0.04491965472698212),
 ('MAvg', 0.03635847941040993),
 ('smr4', 0.023386387154459953),
 ('smr5', 0.01588730327785015),
 ('sur4', 0.005519349128007889),
 ('sur5', 0.005332049913704395),
 ('sur1', 0.004942768719047308),
 ('movieId', 0.004885702393949032),
 ('sur3', 0.004569360055029392),
 ('sur2', 0.0045484318397939205),
 ('userId', 0.004546145908534527),
 ('GAvg', 0.0)]

In [None]:
# XGBoost restricted to five features: the user/movie averages and the ratings
# of the three most similar movies.
feature_cols = ['UAvg','MAvg','smr1','smr2','smr3']

x_train = train_df[feature_cols]
y_train = train_data['rating']

x_test = test_df[feature_cols]
y_test = test_data['rating']

# `silent` is no longer an XGBRegressor parameter in current XGBoost releases
# (`verbosity` replaced it), so it is not passed.
xgb_model = xgb.XGBRegressor(n_jobs=13,random_state=0,n_estimators=100)
xgb_model.fit(x_train, y_train)

# Predictions and error metrics on both splits.
y_train_pred = xgb_model.predict(x_train)
y_test_pred = xgb_model.predict(x_test)
xgb_train_rmse, xgb_train_mape = get_error_metrics(y_train, y_train_pred)
xgb_test_rmse, xgb_test_mape = get_error_metrics(y_test, y_test_pred)

print(xgb_train_rmse, xgb_train_mape, xgb_test_rmse, xgb_test_mape)

0.22411115439293539 3.4552286847322464 0.08047082552579447 0.8958205398269115


In [None]:
train_pred_df = pd.Series(train_pred_mf)
train_rating_df = train_data['rating']

train_rating_df.corr(train_pred_df, method='spearman')

In [None]:
# Baseline: XGBoost on the raw user and movie ids only.
feature_cols = ['userId', 'movieId']

x_train = train_df[feature_cols]
y_train = train_data['rating']

x_test = test_df[feature_cols]
y_test = test_data['rating']

# `silent` is no longer an XGBRegressor parameter in current XGBoost releases
# (`verbosity` replaced it), so it is not passed.
xgb_model = xgb.XGBRegressor(n_jobs=13,random_state=0,n_estimators=100)
xgb_model.fit(x_train, y_train)

# Predictions and error metrics on both splits.
y_train_pred = xgb_model.predict(x_train)
y_test_pred = xgb_model.predict(x_test)
xgb_train_rmse, xgb_train_mape = get_error_metrics(y_train, y_train_pred)
xgb_test_rmse, xgb_test_mape = get_error_metrics(y_test, y_test_pred)

print(xgb_train_rmse, xgb_train_mape, xgb_test_rmse, xgb_test_mape)

0.9921664119981856 35.85956286735998 1.0444666681635366 38.44016613589771


In [None]:
train_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,id,original_title,0,1,2,3,...,2614,2615,2616,2617,2618,2619,2620,2621,2622,2623
0,165860,218,4.0,836689671,218,The Terminator,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40655,8487,4.0,1203824663,8487,Wild Wild West,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,126998,2116,4.0,945307572,2116,Out of Time,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,196152,4993,5.0,1065515476,4993,5 Card Stud,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,56836,637,3.0,1196112012,637,La vita è bella,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Split off the target and remove the label/bookkeeping columns so that only
# feature columns remain in train_data / test_data.
# The cell is safe to re-run: the targets are only re-read while the 'rating'
# column is still present, and columns that are already gone are ignored.
_non_feature_cols = ['rating', 'timestamp', 'id']
if 'rating' in train_data.columns:
    y_train = train_data['rating']
    y_test = test_data['rating']
train_data = train_data.drop(columns=_non_feature_cols, errors='ignore')
test_data = test_data.drop(columns=_non_feature_cols, errors='ignore')

In [None]:
# XGBoost model 2: trained on every remaining column of train_data.
# `silent` is no longer an XGBRegressor parameter in current XGBoost releases
# (`verbosity` replaced it), so it is not passed.
xgb_model = xgb.XGBRegressor(n_jobs=13,random_state=0,n_estimators=100)
xgb_model.fit(train_data, y_train)

# Predictions on both splits.
y_train_pred = xgb_model.predict(train_data)
y_test_pred = xgb_model.predict(test_data)

# get_error_metrics is defined once, in the SVD evaluation section above;
# it is not redefined here.
xgb_train_rmse, xgb_train_mape = get_error_metrics(y_train, y_train_pred)
xgb_test_rmse, xgb_test_mape = get_error_metrics(y_test, y_test_pred)

print(xgb_train_rmse, xgb_train_mape, xgb_test_rmse, xgb_test_mape)

0.9556686391939742 34.10710725403304 1.0068179588854576 36.72305305921889


In [None]:
train_data.head()

Unnamed: 0,userId,movieId,0,1,2,3,4,5,6,7,...,12,13,14,15,16,17,18,19,20,21
0,165860,218,0.441426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.559793,0.422383,0.0,0.0,0.0
1,40655,8487,0.334106,0.408382,0.0,0.255347,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.423696,0.0,0.0,0.0,0.545306
2,126998,2116,0.0,0.0,0.0,0.0,0.710255,0.0,0.381911,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.591339,0.0,0.0,0.0
3,196152,4993,0.467298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.447139,0.0,0.0,0.762692
4,56836,637,0.0,0.0,0.0,0.7776,0.0,0.0,0.628759,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Pair each column of train_data (the features the model was fitted on) with
# its importance and list the pairs from most to least important.
FI = xgb_model.feature_importances_.tolist()
FI_column = zip(train_data.columns, FI )
FI_column_sorted = sorted(FI_column, key=lambda x: x[1], reverse=True)
FI_column_sorted

[(15, 0.08950704336166382),
 (9, 0.07807755470275879),
 (18, 0.07694049179553986),
 (0, 0.06137620657682419),
 ('movieId', 0.05325445532798767),
 (3, 0.05249348282814026),
 (6, 0.05194515362381935),
 (4, 0.04849595949053764),
 (16, 0.04812892526388168),
 (1, 0.046701978892087936),
 (7, 0.045905355364084244),
 (12, 0.042659126222133636),
 (20, 0.04154558479785919),
 (21, 0.04153696820139885),
 (14, 0.040531985461711884),
 (5, 0.035254038870334625),
 (11, 0.03393784165382385),
 (8, 0.031488120555877686),
 (10, 0.026519296690821648),
 (13, 0.022417258471250534),
 (2, 0.019944721832871437),
 ('userId', 0.01133846677839756),
 (17, 0.0),
 (19, 0.0)]

In [None]:
y_train

0        4.0
1        4.0
2        4.0
3        5.0
4        3.0
        ... 
79995    5.0
79996    4.0
79997    4.0
79998    5.0
79999    4.0
Name: rating, Length: 80000, dtype: float64

In [None]:
# XGBoost model 3: same as model 2 but without the title column.
# errors='ignore' keeps the cell runnable when 'original_title' is not part of
# the frame (or was already dropped by an earlier run of this cell).
train_data = train_data.drop(columns=['original_title'], errors='ignore')
test_data = test_data.drop(columns=['original_title'], errors='ignore')

# `silent` is no longer an XGBRegressor parameter in current XGBoost releases
# (`verbosity` replaced it), so it is not passed.
xgb_model = xgb.XGBRegressor(n_jobs=13,random_state=0,n_estimators=100)
xgb_model.fit(train_data, y_train)

# Predictions on both splits.
y_train_pred = xgb_model.predict(train_data)
y_test_pred = xgb_model.predict(test_data)

# get_error_metrics is defined once, in the SVD evaluation section above;
# it is not redefined here.
xgb_train_rmse, xgb_train_mape = get_error_metrics(y_train, y_train_pred)
xgb_test_rmse, xgb_test_mape = get_error_metrics(y_test, y_test_pred)

print(xgb_train_rmse, xgb_train_mape, xgb_test_rmse, xgb_test_mape)

0.9760101680017416 35.19276657545732 1.0108300520106692 37.16934085834121


In [None]:
# XGBoost model 5: trained on every remaining column of train_data.
# An earlier cell already removes 'rating', 'timestamp' and 'id' from
# train_data / test_data, so the targets are only re-read while 'rating' is
# still present and missing columns are ignored instead of raising KeyError.
_non_feature_cols = ['rating', 'timestamp', 'id']
if 'rating' in train_data.columns:
    y_train = train_data['rating']
    y_test = test_data['rating']
train_data = train_data.drop(columns=_non_feature_cols, errors='ignore')
test_data = test_data.drop(columns=_non_feature_cols, errors='ignore')

# `silent` is no longer an XGBRegressor parameter in current XGBoost releases
# (`verbosity` replaced it), so it is not passed.
xgb_model = xgb.XGBRegressor(n_jobs=13,random_state=0,n_estimators=100)
xgb_model.fit(train_data, y_train)

# Predictions on both splits.
y_train_pred = xgb_model.predict(train_data)
y_test_pred = xgb_model.predict(test_data)

# get_error_metrics is defined once, in the SVD evaluation section above;
# it is not redefined here.
xgb_train_rmse, xgb_train_mape = get_error_metrics(y_train, y_train_pred)
xgb_test_rmse, xgb_test_mape = get_error_metrics(y_test, y_test_pred)

print(xgb_train_rmse, xgb_train_mape, xgb_test_rmse, xgb_test_mape)

0.9601123177457371 34.41019697570398 1.0041091859501878 36.83605694963724


In [None]:
test_data.shape

(20000, 3365)

In [None]:
drive.flush_and_unmount()