# Recommendation system from scratch

## Import libraries

In [7]:
# importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from tqdm import tqdm
import re
from nltk.tokenize import word_tokenize

import warnings
warnings.simplefilter("ignore")

## Read dataset

In [10]:
# Load dữ liệu movies_metadata.csv
movies = pd.read_csv('../../data_example/Movielens/ml-latest-small/movies_metadata_test.csv')
movies

Unnamed: 0.1,Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,False,"{'id': 271668, 'name': 'Caballeros Collection'...",0,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",http://movies.disney.com/the-three-caballeros,15947,tt0038166,en,The Three Caballeros,...,1944-07-21,0.0,71.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,The Three Caballeros,False,6.2,108.0
996,996,False,,3000000,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",,9078,tt0057546,en,The Sword in the Stone,...,1963-12-25,22182353.0,79.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Tired of living in a Medieval mess...Merlin us...,The Sword in the Stone,False,6.9,935.0
997,997,False,,0,"[{'id': 16, 'name': 'Animation'}, {'id': 18, '...",,29682,tt0041890,en,So Dear to My Heart,...,1948-11-29,0.0,79.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,So Dear to My Heart,False,6.3,5.0
998,998,False,,48000000,"[{'id': 12, 'name': 'Adventure'}]",,8367,tt0102798,en,Robin Hood: Prince of Thieves,...,1991-06-14,390493908.0,143.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"For the good of all men, and the love of one w...",Robin Hood: Prince of Thieves,False,6.6,937.0


In [11]:
movies[['title','overview']]

Unnamed: 0,title,overview
0,Toy Story,led woody andy toys live happily room andy bir...
1,Jumanji,siblings judy peter discover enchanted board g...
2,Grumpier Old Men,family wedding reignites ancient feud next doo...
3,Waiting to Exhale,cheated mistreated stepped women holding breat...
4,Father of the Bride Part II,george banks recovered daughter wedding receiv...
...,...,...
995,The Three Caballeros,donald birthday receives box three gifts insid...
996,The Sword in the Stone,wart young boy aspires knight squire hunting t...
997,So Dear to My Heart,tale jeremiah kincaid quest raise champion lam...
998,Robin Hood: Prince of Thieves,dastardly sheriff nottingham murders robin fat...


In [12]:
# Replace NaN with an empty string
movies['overview'] = movies['overview'].fillna('')

## Preprocessing function

In [13]:
# Viết hàm tiền xử lý dữ liệu trên cột overview
def processing_data(data):
  # chuyển về từ thường
  data = data.lower()
  # xóa dấu câu, ký tự đặc biệt
  data = re.sub('\W+',' ', data)
  # xóa khoảng trắng đầu và cuối câu
  data = data.strip()
  # Xóa stopword
  data = ' '.join([word for word in data.split() if word not in stopwords.words("english")])
  # Tách từ
  data = word_tokenize(data)
  data = ' '.join(data)
  return data

In [14]:
# Viết hàm tiền xử lý dữ liệu trên cột overview
def processing_overview(col):
  list_processed = []
  for i in tqdm(range(len(col))):
      list_processed.append(processing_data(col[i]))
  return list_processed

In [15]:
# Xử lý dữ liệu trên cột overview
processing_overview(movies['overview'])

100%|██████████| 1000/1000 [00:06<00:00, 165.32it/s]


['led woody andy toys live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plots buzz circumstances separate buzz woody owner duo eventually learns put aside differences',
 'siblings judy peter discover enchanted board game opens door magical world unwittingly invite alan adult trapped inside game 26 years living room alan hope freedom finish game proves risky three find running giant rhinoceroses evil monkeys terrifying creatures',
 'family wedding reignites ancient feud next door neighbors fishing buddies john max meanwhile sultry italian divorcée opens restaurant local bait shop alarming locals worry scare fish away less interested seafood cooking hot time max',
 'cheated mistreated stepped women holding breath waiting elusive good man break string less stellar lovers friends confidants vannah bernie glo robin talk determined find better way breathe',
 'george banks recovered daughter wedding receives news pregnant george wife nina ex

## TF-IDF

In [16]:
#sử dụng TfidfVectorizer để chuyển cột overview của các bộ phim về tf-idf và lưu vào biến overview_matrix
vectorizer = TfidfVectorizer(stop_words='english')
overview_matrix = vectorizer.fit_transform(movies['overview'])

In [27]:
# tính toán cosine giữa các bộ phim với nhau bằng linear_kernel
overview_matrix_new = overview_matrix[:int(overview_matrix.shape[0]),:int(overview_matrix.shape[1])]

In [28]:
# Tính toán cosine giữa các bộ phim với nhau bằng linear_kernel
cosine_sim = linear_kernel(overview_matrix_new, overview_matrix_new)
cosine_sim

array([[1.        , 0.01570657, 0.        , ..., 0.        , 0.        ,
        0.01234882],
       [0.01570657, 1.        , 0.05108323, ..., 0.        , 0.01578968,
        0.02378018],
       [0.        , 0.05108323, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.01578968, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.01234882, 0.02378018, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [29]:
#đánh index cho các bộ phim bằng  pd.Series() và lưu trong biến mapping
mapping = pd.Series(movies.index, index=movies['title']).drop_duplicates()
mapping

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                                ... 
The Three Caballeros             995
The Sword in the Stone           996
So Dear to My Heart              997
Robin Hood: Prince of Thieves    998
Mary Poppins                     999
Length: 1000, dtype: int64

In [30]:
#viết hàm nhận vào index của bộ phim trả về tên của bộ phim
def get_title(index):
  Title = movies.iloc[index]['title']
  return Title

In [21]:
mapping

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                                ... 
The Three Caballeros             995
The Sword in the Stone           996
So Dear to My Heart              997
Robin Hood: Prince of Thieves    998
Mary Poppins                     999
Length: 1000, dtype: int64

## Recommend function

In [31]:
# viết hàm trả và top 10 bộ phim sẽ đem đi khuyến nghị cho người dùng U.
def get_recommendations(title, cosine_sim=cosine_sim):
  movie_index = mapping[title]
  sim_scores = list(enumerate(cosine_sim[movie_index]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:10]
  movie_mapping = [i[0] for i in sim_scores]
  titles = [index for index, title in mapping.items() if title in movie_mapping]
  return titles
  # return movies['title'].iloc[movie_mapping]

In [32]:
get_recommendations('Father of the Bride Part II')

['Nine Months',
 'The Madness of King George',
 'Nina Takes a Lover',
 'The War Room',
 'Killer',
 "My Mother's Courage",
 'The Philadelphia Story',
 'Father of the Bride',
 "It's a Wonderful Life"]

In [None]:
# Lấy ra tên của một bộ phim
def get_movie_name(movieid):
  return (movies.loc[(movies.movieId==movieid), 'title'].iloc[0])

In [None]:
# Lấy ra id của một bộ phim
def get_movie_id(movie_name):
  return (movies.loc[(movies.title==movie_name), 'id'].iloc[0])

In [None]:
user201_seen = get_movieids_seen(201)
len(user201_seen)

110

In [None]:
# Cắt ra 80 bộ phim đầu tiên làm tập test.
user201_seen_80 = user201_seen[:80]
len(user201_seen_80)

80

In [None]:
user201_seen_80[:20]

[1,
 11,
 16,
 24,
 25,
 32,
 34,
 46,
 150,
 153,
 160,
 185,
 196,
 260,
 293,
 296,
 339,
 356,
 380,
 435]

In [None]:
# Sử dụng overview của 8 bộ phim còn lại để biểu diễn nội dung của người dùng userId = 201 này.
list_recom_CB =[]
for i in user201_seen[40:48]:
  # print(get_recommendations(get_title(i)).values)
  list_recom_CB.append(get_recommendations(get_title(i)).values)

In [None]:
# 80 film được recommend
user201_pred_80 = []
for i in list_recom_CB:
  for j in i:
    user201_pred_80.append(int(get_movie_id(j)))

In [None]:
user201_pred_80[:20]

[125709,
 14675,
 32044,
 53128,
 214217,
 116340,
 651,
 46741,
 56133,
 21027,
 8749,
 10549,
 141489,
 10549,
 10549,
 10549,
 19952,
 6947,
 13439,
 10549]

## Evaluation: P@K, R@K, F1@K. (K=80) of userID = 201

In [None]:
from sklearn.metrics import precision_score
precision_score(user201_seen_80, user201_pred_80, average='macro')

0.0

In [None]:
from sklearn.metrics import recall_score
recall_score(user201_seen_80, user201_pred_80, average='macro')

0.0

In [None]:
from sklearn.metrics import f1_score
recall_score(user201_seen_80, user201_pred_80, average='macro')

0.0

In [None]:
# Sắp xếp thứ tự các bộ phim trong tập test theo thứ tự rating
user201_pred_80.sort()
user201_pred_80[:20]

[220,
 299,
 651,
 831,
 3107,
 5460,
 6947,
 8749,
 9593,
 9754,
 10060,
 10128,
 10461,
 10518,
 10549,
 10549,
 10549,
 10549,
 10549,
 10647]

In [None]:
relevance = []
for i in range(len(user201_seen_80)):
  if user201_seen_80[i] == user201_pred_80[i]:
    relevance.append(1)
  else: relevance.append(0)

In [None]:
def mean_reciprocal_rank(rs):
  rs = (np.asarray(r).nonzero()[0] for r in rs)
  return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

In [None]:
# relevance của 1 user
rs = np.array([relevance])
mean_reciprocal_rank(rs)

0.0

In [None]:
def dcg_at_k(r, k):
  r = np.asfarray(r)[:k]
  if r.size:
    return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
  return 0.

def ndcg_at_k(r, k):
  dcg_max = dcg_at_k(sorted(r, reverse=True), k)
  if not dcg_max:
    return 0.
  return dcg_at_k(r, k) / dcg_max

In [None]:
# relevance của 1 user
ndcg_at_k(relevance, 80)

0.0