In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
import builtins

import json
from itertools import islice

from sklearn import preprocessing
from lightfm.evaluation import auc_score, precision_at_k
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

## Model 1: Simple Recommender

In [None]:
df_movie_features = pd.read_csv('./data/data_movies_features.csv', encoding='utf_8')
df_movie_features.head()

Unnamed: 0,id,original_language,popularity,runtime,title,vote_average,vote_count,year,genres_list,production_companies_list,actor,director_name,keywords
0,862,en,21.946943,81.0,Toy Story,7.7,5415.0,1995.0,"['Animation', 'Comedy', 'Family']",Pixar Animation Studios,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",John Lasseter,"['jealousy', 'toy', 'boy', 'friendship', 'frie..."
1,8844,en,17.015539,104.0,Jumanji,6.9,2413.0,1995.0,"['Adventure', 'Fantasy', 'Family']",TriStar Pictures Teitler Film Interscope Commu...,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",Joe Johnston,"['board game', 'disappearance', ""based on chil..."
2,15602,en,11.7129,101.0,Grumpier Old Men,6.5,92.0,1995.0,"['Romance', 'Comedy']",Warner Bros. Lancaster Gate,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",Howard Deutch,"['fishing', 'best friend', 'duringcreditssting..."
3,31357,en,3.859495,127.0,Waiting to Exhale,6.1,34.0,1995.0,"['Comedy', 'Drama', 'Romance']",Twentieth Century Fox Film Corporation,"['Whitney Houston', 'Angela Bassett', 'Loretta...",Forest Whitaker,"['based on novel', 'interracial relationship',..."
4,11862,en,8.387519,106.0,Father of the Bride Part II,5.7,173.0,1995.0,['Comedy'],Sandollar Productions Touchstone Pictures,"['Steve Martin', 'Diane Keaton', 'Martin Short...",Charles Shyer,"['baby', 'midlife crisis', 'confidence', 'agin..."


\begin{equation}\large
Weighted\;Rating =(\frac{v}{v+m}.R )+(\frac{m}{v+m}.C)
\end{equation}


* v is the number of votes for the movie

* m is the minimum votes required to be listed in the chart

* R is the average rating of the movie

* C is the mean vote across the whole dataset

In [None]:
# Function caculating weighted_rating score of movie x (x is a row in the dataframe)
def weighted_rating(x):
    m = x['vote_count'].quantile(0.95)
    C = x['vote_average'].mean()
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
# Divide the list of sequences into individual components corresponding to each genre of the movie
df_movie_features['genres_list'] = df_movie_features['genres_list'].apply(eval)
s = df_movie_features.apply(lambda x: pd.Series(x['genres_list']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = df_movie_features.drop('genres_list', axis=1).join(s)

In [None]:
# Function recommend the best rated movies to the users based on the given genre
def build_chart(genre, percentile=0.85):
    # Retrieve movies by genre
    df = gen_md[gen_md['genre'] == genre]
    # the number of votes for the movies
    vote_counts = df[df['vote_count'].notnull()]['vote_count'].astype('int')
    # the average rating of the movie
    vote_averages = df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)

    # Retrieve movies that meet the conditions: m is the minimum votes required to be listed in the qualified
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # Calculate the score for every movie
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    # Sort the scores
    qualified = qualified.sort_values('wr', ascending=False).head(250)

    return qualified

In [None]:
build_chart('Animation')

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
300,The Lion King,1994.0,5520,8,21.605761,7.686761
9511,Up,2009.0,7048,7,19.330884,6.868372
17376,Inside Out,2015.0,6737,7,23.985587,6.863067
10447,Despicable Me,2010.0,6595,7,22.274502,6.860500
8920,WALL·E,2008.0,6439,7,16.088366,6.857566
...,...,...,...,...,...,...
8084,Happy Feet,2006.0,1457,5,15.088318,5.370165
6165,Shark Tale,2004.0,1612,5,17.999273,5.348022
11384,Cars 2,2011.0,2088,5,13.693002,5.294013
20066,Sausage Party,2016.0,2310,5,17.569630,5.274169


## Model 2: Content Based Recommender

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# This function takes a row from the DataFrame and creates a 'soup' string by combining the text content
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['actor']) + ' ' + x['director_name'] + ' ' + ' '.join(x['genres_list'])+ ' ' + ' '.join(x['title'])

In [None]:
# Apply the 'create_soup' function to each row in the DataFrame to create a new column 'soup'
df_movie_features['soup'] = df_movie_features.apply(create_soup, axis=1)
df_movie_features.head()

Unnamed: 0,id,original_language,popularity,runtime,title,vote_average,vote_count,year,genres_list,production_companies_list,actor,director_name,keywords,soup
0,862,en,21.946943,81.0,Toy Story,7.7,5415.0,1995.0,"[Animation, Comedy, Family]",Pixar Animation Studios,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",John Lasseter,"['jealousy', 'toy', 'boy', 'friendship', 'frie...","[ ' j e a l o u s y ' , ' t o y ' , ' b o ..."
1,8844,en,17.015539,104.0,Jumanji,6.9,2413.0,1995.0,"[Adventure, Fantasy, Family]",TriStar Pictures Teitler Film Interscope Commu...,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",Joe Johnston,"['board game', 'disappearance', ""based on chil...","[ ' b o a r d g a m e ' , ' d i s a p p e ..."
2,15602,en,11.7129,101.0,Grumpier Old Men,6.5,92.0,1995.0,"[Romance, Comedy]",Warner Bros. Lancaster Gate,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",Howard Deutch,"['fishing', 'best friend', 'duringcreditssting...","[ ' f i s h i n g ' , ' b e s t f r i e n ..."
3,31357,en,3.859495,127.0,Waiting to Exhale,6.1,34.0,1995.0,"[Comedy, Drama, Romance]",Twentieth Century Fox Film Corporation,"['Whitney Houston', 'Angela Bassett', 'Loretta...",Forest Whitaker,"['based on novel', 'interracial relationship',...","[ ' b a s e d o n n o v e l ' , ' i n t ..."
4,11862,en,8.387519,106.0,Father of the Bride Part II,5.7,173.0,1995.0,[Comedy],Sandollar Productions Touchstone Pictures,"['Steve Martin', 'Diane Keaton', 'Martin Short...",Charles Shyer,"['baby', 'midlife crisis', 'confidence', 'agin...","[ ' b a b y ' , ' m i d l i f e c r i s i ..."


\begin{equation}\large
   \cos\theta = \frac{\overrightarrow{a}.\overrightarrow{b}}{\lVert{\overrightarrow{a}}\rVert {\lVert{\overrightarrow{b}}\rVert}}
\end{equation}

\begin{equation}
   \lVert{\overrightarrow{a}}\rVert = \sqrt{a_{1}^2 + a_{2}^2 +a_{3}^2 + ... + a_{n}^2}
\end{equation}
   
\begin{equation}
   \lVert{\overrightarrow{b}}\rVert = \sqrt{b_{1}^2 + b_{2}^2 +b_{3}^2 + ... + b_{n}^2}
\end{equation}

In [None]:
# Initialize a CountVectorizer
count = CountVectorizer(stop_words='english')

# Fit the CountVectorizer to the 'soup' column in the DataFrame
# This will learn the vocabulary of the text data
count.fit(df_movie_features['soup'])

# Transform the 'soup' column into a document-term matrix
# This will represent each document as a vector in a high-dimensional space
count_matrix = count.transform(df_movie_features['soup'])

# Compute the cosine similarity between all pairs of documents in the document-term matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
df_movie_features = df_movie_features.reset_index()
indices = pd.Series(df_movie_features.index, index=df_movie_features['title'])

In [None]:
def get_recommendations(titles, cosine_sim=cosine_sim):
    id = indices.get(titles, None)
    if (id is None):
        a=df_movie_features.query('title.str.contains(@titles)').sort_values(by=['vote_average', 'vote_count'], ascending=False).reset_index()
        id = indices[a['title'][0]]
    if (id.shape != ()):
        id = id[0]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[id]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df_movie_features[['id', 'title', 'director_name', 'actor', 'genres_list', 'keywords']].iloc[movie_indices]

In [None]:
get_recommendations('Harry Potter', cosine_sim)

Unnamed: 0,id,title,director_name,actor,genres_list,keywords
10784,12444,Harry Potter and the Deathly Hallows: Part 1,David Yates,"['Daniel Radcliffe', 'Emma Watson', 'Rupert Gr...","[Adventure, Fantasy, Family]","['corruption', 'isolation', 'radio', 'magic', ..."
11417,12445,Harry Potter and the Deathly Hallows: Part 2,David Yates,"['Daniel Radcliffe', 'Rupert Grint', 'Emma Wat...","[Family, Fantasy, Adventure]","['self sacrifice', 'magic', 'frog', 'sorcerer'..."
17398,259316,Fantastic Beasts and Where to Find Them,David Yates,"['Eddie Redmayne', 'Colin Farrell', 'Katherine...","[Adventure, Family, Fantasy]","['robbery', 'magic', 'teleportation', 'suitcas..."
8423,675,Harry Potter and the Order of the Phoenix,David Yates,"['Daniel Radcliffe', 'Rupert Grint', 'Emma Wat...","[Adventure, Fantasy, Family, Mystery]","['prophecy', 'witch', 'loss of lover', 'magic'..."
20299,294272,Pete's Dragon,David Lowery,"['Bryce Dallas Howard', 'Oakes Fegley', 'Wes B...","[Adventure, Family, Fantasy]","['feral child', 'remake', 'dragon', 'orphan', ..."
8531,2274,The Seeker: The Dark Is Rising,David L. Cunningham,"['Ian McShane', 'Christopher Eccleston', 'Greg...","[Adventure, Drama, Fantasy, Family, Thriller]","['fight', 'dynasty', 'chosen one', 'earth', 'i..."
13924,18224,Bionicle 3: Web of Shadows,David Molina,"['Kathleen Barr', 'Trevor Devall', 'Brian Drum...","[Action, Adventure, Animation, Family, Fantasy]","['return', 'hero', 'enemy']"
5342,10601,Peter Pan,P.J. Hogan,"['Jeremy Sumpter', 'Jason Isaacs', 'Rachel Hur...","[Adventure, Fantasy, Family]","['flying', 'liberation', 'fairy', 'peter pan',..."
20294,258489,The Legend of Tarzan,David Yates,"['Alexander Skarsgård', 'Margot Robbie', 'Chri...","[Action, Adventure]","['africa', 'feral child', 'tarzan', 'jungle', ..."
18377,34204,Return to Halloweentown,David Jackson,"['Sara Paxton', 'Judith Hoag', 'Debbie Reynold...","[Adventure, Comedy, Family, Fantasy, TV Movie]","['holiday', 'witch', 'magic', 'disney channel']"


In [None]:
get_recommendations('Down Periscope', cosine_sim)

Unnamed: 0,id,title,director_name,actor,genres_list,keywords
3557,9942,Major League,David S. Ward,"['Tom Berenger', 'Charlie Sheen', 'Corbin Bern...",[Comedy],"['baseball', 'sport', 'sabotage', 'sombrero', ..."
6764,11067,Major League II,David S. Ward,"['Charlie Sheen', 'Tom Berenger', 'Corbin Bern...",[Comedy],"['baseball', 'sport', 'sequel', 'sports league']"
5222,10804,King Ralph,David S. Ward,"['John Goodman', ""Peter O'Toole"", 'John Hurt',...","[Comedy, Family]","['love at first sight', ""love of one's life"", ..."
433,18133,The Program,David S. Ward,"['James Caan', 'Halle Berry', 'Omar Epps', 'Cr...",[Drama],"['american football', 'sport']"
1135,9611,Romy and Michele's High School Reunion,David Mirkin,"['Mira Sorvino', 'Lisa Kudrow', 'Janeane Garof...",[Comedy],"['smoking', 'overweight child', 'graduation', ..."
1367,24560,Sour Grapes,Larry David,"['Steven Weber', 'Craig Bierko', 'Viola Harris...",[Comedy],"['greed', 'dollar', 'jackpot', 'atlantic city'..."
1535,14013,BASEketball,David Zucker,"['Trey Parker', 'Matt Stone', 'Yasmine Bleeth'...",[Comedy],"['stadium', 'invention', 'sport', 'mascot', 'p..."
3577,2171,Wet Hot American Summer,David Wain,"['Janeane Garofalo', 'David Hyde Pierce', 'Mic...",[Comedy],"['adolescence', 'summer camp', 'independent fi..."
4020,31805,Road to Morocco,David Butler,"['Bing Crosby', 'Bob Hope', 'Dorothy Lamour', ...",[Comedy],"['morocco', 'castaway', 'road movie']"
4494,25297,Cannery Row,David S. Ward,"['Nick Nolte', 'Debra Winger', 'Audra Lindley'...",[Drama],['cannery']


## Model 3 & 4: Collaborative model and Hybrid model using LightFM

In [None]:
df_rating = pd.read_csv('./data/ratings.csv')
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [None]:
df_rating.drop(columns = 'timestamp', inplace=True)
df_rating.columns = ['user_id','movie_id','rating']
df_rating.head()

Unnamed: 0,user_id,movie_id,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [None]:
df_rating.user_id.nunique(), df_rating.movie_id.nunique()

(270896, 45115)

### Prepare movie features

1. Apply the same encoder that we used to split train/test data

2. Columns refer to the column names of the item features (product_id excluded)

3. To prepare the item_features, need to use the Dataset class in LightFM API.

4. First fit the dataset instance and then call function build_item_features to generate the item features for modeling.

In [None]:
# df_movie_features = pd.read_csv('./data/data.csv', encoding='utf_8')
df_movie_features = pd.read_csv('./data/data_movies_features.csv', encoding='utf_8')
df_movie_features.rename(columns = {'id':'movie_id'}, inplace=True)
df_movie_features

Unnamed: 0,movie_id,original_language,popularity,runtime,title,vote_average,vote_count,year,genres_list,production_companies_list,actor,director_name,keywords
0,862,en,21.946943,81.0,Toy Story,7.7,5415.0,1995.0,"['Animation', 'Comedy', 'Family']",Pixar Animation Studios,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",John Lasseter,"['jealousy', 'toy', 'boy', 'friendship', 'frie..."
1,8844,en,17.015539,104.0,Jumanji,6.9,2413.0,1995.0,"['Adventure', 'Fantasy', 'Family']",TriStar Pictures Teitler Film Interscope Commu...,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",Joe Johnston,"['board game', 'disappearance', ""based on chil..."
2,15602,en,11.712900,101.0,Grumpier Old Men,6.5,92.0,1995.0,"['Romance', 'Comedy']",Warner Bros. Lancaster Gate,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",Howard Deutch,"['fishing', 'best friend', 'duringcreditssting..."
3,31357,en,3.859495,127.0,Waiting to Exhale,6.1,34.0,1995.0,"['Comedy', 'Drama', 'Romance']",Twentieth Century Fox Film Corporation,"['Whitney Houston', 'Angela Bassett', 'Loretta...",Forest Whitaker,"['based on novel', 'interracial relationship',..."
4,11862,en,8.387519,106.0,Father of the Bride Part II,5.7,173.0,1995.0,['Comedy'],Sandollar Productions Touchstone Pictures,"['Steve Martin', 'Diane Keaton', 'Martin Short...",Charles Shyer,"['baby', 'midlife crisis', 'confidence', 'agin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22452,84419,en,0.222814,65.0,House of Horrors,6.3,8.0,1946.0,"['Horror', 'Mystery', 'Thriller']",Universal Pictures,"['Rondo Hatton', 'Robert Lowery', 'Virginia Gr...",Jean Yarbrough,"['revenge', 'murder', 'serial killer', 'new yo..."
22453,289923,en,0.386450,30.0,The Burkittsville 7,7.0,1.0,2000.0,['Horror'],Neptune Salad Entertainment Pirie Productions,"['Monty Bane', 'Lucy Butler', 'David Grammer',...",Ben Rock,"['witch', 'mythology', 'legend', 'serial kille..."
22454,222848,en,0.661558,85.0,Caged Heat 3000,3.5,1.0,1995.0,['Science Fiction'],Concorde-New Horizons,"['Lisa Boyle', 'Kena Land', 'Zaneta Polard', '...",Aaron Osborne,[]
22455,30840,en,5.683753,104.0,Robin Hood,5.7,26.0,1991.0,"['Drama', 'Action', 'Romance']",Westdeutscher Rundfunk (WDR) Working Title Fil...,"['Patrick Bergin', 'Uma Thurman', 'David Morri...",John Irvin,[]


In [None]:
all_movie_ids = __builtins__.list(set(df_rating['movie_id']))

df_movie_features['movie_id'] = df_movie_features['movie_id'].apply(lambda x: 'other' if x not in all_movie_ids else x)
df_movie_features = df_movie_features[df_movie_features['movie_id'] != 'other']

In [None]:
len(__builtins__.list(set(df_movie_features['movie_id']))) == len(__builtins__.list(set(df_rating['movie_id'])))

False

In [None]:
all_movie_ids_features = __builtins__.list(set(df_movie_features['movie_id']))

df_rating['movie_id'] = df_rating['movie_id'].apply(lambda x: 'other' if x not in all_movie_ids_features else x)
df_rating = df_rating[df_rating['movie_id'] != 'other']

In [None]:
len(__builtins__.list(set(df_movie_features['movie_id']))) == len(__builtins__.list(set(df_rating['movie_id'])))

True

In [None]:
ratings = df_rating.to_dict('records')

for line in islice(ratings, 2):
    print(json.dumps(line, indent=4))

{
    "user_id": 1,
    "movie_id": 858,
    "rating": 5.0
}
{
    "user_id": 1,
    "movie_id": 1246,
    "rating": 5.0
}


### Building the ID mapping

#### CF model

In [None]:
# the Dataset class in LightFM API
dataset = Dataset()

# Fit the dataset instance
dataset.fit((x['user_id'] for x in ratings), (x['movie_id'] for x in ratings))

# quick check to determine the number of unique users and items in the data
num_users, num_movies = dataset.interactions_shape()
print(f'Num users: {num_users}, num_movies: {num_movies}.')

Num users: 261810, num_movies: 4125.


#### Hybrid model

1. Columns refer to the column names of the item features used to fit model (movies_id excluded)

2. To prepare the item_features, need to use the Dataset class in LightFM API.

3. First fit the dataset instance and then call function build_item_features to generate the item features for modeling.

In [None]:
def generate_feature_list(df, columns):
    '''
    Generate the list of features of corresponding columns to list
    In order to fit the lightfm Dataset
    '''
    features = df[columns].apply(lambda x: ','.join(x.map(str)), axis = 1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop = True)
    return features

def prepare_item_features(df, columns, id_col_name):
    '''
    Prepare the corresponding feature formats for
    the lightfm.dataset's build_item_features function
    '''
    features = df[columns].apply(lambda x: ','.join(x.map(str)), axis = 1)
    features = features.str.split(',')
    features = __builtins__.list(zip(df[id_col_name], features))
    return features

In [None]:
# Retrieve column names of the item features used to fit model (movies_id excluded)
columns = df_movie_features.columns.to_list()
columns.remove('movie_id')

In [None]:
# the Dataset class in LightFM API
dataset2 = Dataset()

fitting_item_features = generate_feature_list(df_movie_features, columns)
lightfm_features = prepare_item_features(df_movie_features, columns, 'movie_id')

# Fit the dataset instance
dataset2.fit((x['user_id'] for x in ratings), (x['movie_id'] for x in ratings), item_features = fitting_item_features)

# Call function build_item_features to generate the item features for modeling
item_features = dataset2.build_item_features(lightfm_features, normalize = True)

### Building the Interaction matrix

#### CF model

The build_interactions method returns 2 COO sparse matrices, namely the interactions and weights matrices.

In [None]:
(interactions, weights) = dataset.build_interactions(((x['user_id'], x['movie_id'], x['rating']) for x in ratings))

print(repr(interactions))

<261810x4125 sparse matrix of type '<class 'numpy.int32'>'
	with 8058110 stored elements in COOrdinate format>


Split train - test set

In [None]:
train_interactions, test_interactions = cross_validation.random_train_test_split(
    weights, test_percentage=0.2,
    random_state=np.random.RandomState(42))

In [None]:
print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (261810, 4125)
Shape of test interactions: (261810, 4125)


#### Hybrid model

In [None]:
(interactions2, weights2) = dataset2.build_interactions(((x['user_id'], x['movie_id'], x['rating']) for x in ratings))

print(repr(interactions2))

<261810x4125 sparse matrix of type '<class 'numpy.int32'>'
	with 8058110 stored elements in COOrdinate format>


In [None]:
train_interactions2, test_interactions2 = cross_validation.random_train_test_split(
    weights2, test_percentage=0.2,
    random_state=np.random.RandomState(42))

In [None]:
print(f"Shape of train interactions: {train_interactions2.shape}")
print(f"Shape of test interactions: {test_interactions2.shape}")

Shape of train interactions: (261810, 4125)
Shape of test interactions: (261810, 4125)


### Building model

#### CF model

In [None]:
model_cf = LightFM(loss = 'warp',
                   no_components = 160,
                   item_alpha = 1e-7,
                   learning_rate = 0.02,
                   max_sampled = 50)

model_cf.fit(interactions=train_interactions, epochs = 20, num_threads = 4)

<lightfm.lightfm.LightFM at 0x4b95a2dd0>

#### Hybrid model

In [None]:
model_hybrid = LightFM(loss = 'warp',
                no_components = 160,
                item_alpha = 1e-7,
                learning_rate = 0.02,
                max_sampled = 50)

model_hybrid.fit(interactions=train_interactions2, item_features = item_features, epochs = 50, num_threads = 4)

<lightfm.lightfm.LightFM at 0x4b95a2380>

### Evaluation

#### CF model

In [None]:
df_result = pd.DataFrame(columns = ['Method', 'Evaluation Metric', 'Train', 'Test'])

auc_train = auc_score(model_cf, train_interactions).mean()
auc_test = auc_score(model_cf, test_interactions).mean()

precision_train = precision_at_k(model_cf, train_interactions, k = 10).mean()
precision_test = precision_at_k(model_cf, test_interactions, k = 10).mean()

df_result = df_result.append(pd.DataFrame([['CF', 'AUC', auc_train, auc_test],
                                           ['CF', 'Precision@10', precision_train, precision_test]],
                                          columns = df_result.columns))

In [None]:
df_result

Unnamed: 0,Method,Evaluation Metric,Train,Test
0,CF,AUC,0.99433,0.962841
1,CF,Precision@10,0.594967,0.049118


#### Hybrid model

In [None]:
auc_train = auc_score(model_hybrid, train_interactions2, item_features = item_features).mean()
auc_test = auc_score(model_hybrid, test_interactions2, item_features = item_features).mean()

precision_train = precision_at_k(model_hybrid, train_interactions2, item_features = item_features, k = 10).mean()
precision_test = precision_at_k(model_hybrid, test_interactions2, item_features = item_features, k = 10).mean()

df_result = df_result.append(pd.DataFrame([['Hybrid', 'AUC', auc_train, auc_test],
                                           ['Hybrid', 'Precision@10', precision_train, precision_test]],
                                          columns = df_result.columns))

In [None]:
df_result

Unnamed: 0,Method,Evaluation Metric,Train,Test
0,CF,AUC,0.99433,0.962841
1,CF,Precision@10,0.594967,0.049118
0,Hybrid,AUC,0.994585,0.96146
1,Hybrid,Precision@10,0.599013,0.046722


### Recommendation

In [None]:
def get_positive_movie_id(weighted_interactions, user_id, movie_id_mapping):
    # list is a np.array which save IDs of books that user, whose id is user_id, rated greater than or equal to 4 stars
    list = []
    for i in weighted_interactions.tocsr()[user_id].indices:
        if weighted_interactions.tocsr()[user_id, i] >= 4:
            list.append(movie_id_mapping[i])

    return np.array(list)

# these are users we want to generate recommendations for
def sample_recommendation(model, weighted_interactions, dataset, user_ids, items_data):

    # number of users and books in data
    n_users, n_items = weighted_interactions.shape
    items_data.set_index(['movie_id'], inplace=True)

    # generate recommendations for each user we input
    for user_id in user_ids:

        # considers ratings that are 5 'positive' and ratings that are 4 or below 'negative'
        # to make the problem binary -> much simple

        # In this dictionary, the Values are the internal indices and the Keys are external ids.
        movie_id_mapping = dataset.mapping()[2]
        # Invert these to map internal indices to external ids.
        inv_movie_id_mapping = {v: k for k, v in movie_id_mapping.items()}

        # books they already like
        known_positives = get_positive_movie_id(weighted_interactions, user_id, inv_movie_id_mapping)

        # books our model predicts they will like
        user_x = dataset.mapping()[0][user_id]
        scores = model.predict(user_x, np.arange(n_items))
        # np.argsort(-score) rank them in order of most like to least
        # mapping internal indices to external ids of top_items
        top_recommendations = np.array([inv_movie_id_mapping[i] for i in np.argsort(-scores)])

        # print out the results
        print("User %s --------------------------------------- \n" % user_id)

        print("    Known positives:")
        display(items_data.loc[known_positives[:3]])

        print("    Recommended:")
        display(items_data.loc[top_recommendations[:3]])

#### CF model

In [None]:
sample_recommendation(model_cf, weights, dataset, [5, 21], df_movie_features[['movie_id', 'title', 'year', 'director_name']])

User 5 --------------------------------------- 

    Known positives:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2565,Joe Versus the Volcano,1990.0,John Patrick Shanley


    Recommended:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
541,The Man with the Golden Arm,1955.0,Otto Preminger
750,Murder She Said,1961.0,George Pollock
3114,The Searchers,1956.0,John Ford


User 21 --------------------------------------- 

    Known positives:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
260,The 39 Steps,1935.0,Alfred Hitchcock
4993,5 Card Stud,1968.0,Henry Hathaway
1610,Fever Pitch,1997.0,David Evans


    Recommended:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1645,A Time to Kill,1996.0,Joel Schumacher
2006,"Bell, Book and Candle",1958.0,Richard Quine
31696,A Kiss Before Dying,1991.0,James Dearden


#### Hybrid model

In [None]:
sample_recommendation(model_hybrid, weights2, dataset2, [5, 21], df_movie_features[['movie_id', 'title', 'year', 'director_name']])

User 5 --------------------------------------- 

    Known positives:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2565,Joe Versus the Volcano,1990.0,John Patrick Shanley


    Recommended:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
750,Murder She Said,1961.0,George Pollock
541,The Man with the Golden Arm,1955.0,Otto Preminger
4993,5 Card Stud,1968.0,Henry Hathaway


User 21 --------------------------------------- 

    Known positives:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
260,The 39 Steps,1935.0,Alfred Hitchcock
4993,5 Card Stud,1968.0,Henry Hathaway
1610,Fever Pitch,1997.0,David Evans


    Recommended:


Unnamed: 0_level_0,title,year,director_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4643,The Guardian,2006.0,Andrew Davis
31696,A Kiss Before Dying,1991.0,James Dearden
1645,A Time to Kill,1996.0,Joel Schumacher
