In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import NMF


from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix


import sklearn
import pickle



from sklearn.cluster import DBSCAN

In [3]:
ratings = pd.read_csv('web_app/ratings.csv')
movies = pd.read_csv('web_app/movies.csv')

In [4]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
#create clean title and year columns
movies[["title", "year"]] = movies["title"].str.split(r"(",1, expand=True)

#clean the year column
movies["year"] = movies["year"].str.rstrip(')')

#separate the genres in one column
movies["genres"] = movies["genres"].str.split("|")


## Clean the DataSet

In [6]:
def Model_Develpment(df, main_feature):
    # number of ratings per movie
    ratings_per_movie = df.groupby("movieId")["userId"].count()

    # filter movies with less than 20 ratings
    popular_movies = ratings_per_movie.loc[ratings_per_movie > 20].index

    # filter and only keep popular movies
    df = df.loc[df["movieId"].isin(popular_movies)]

    # initialize a sparse matrix

    R = csr_matrix((df[main_feature], (df['userId'], df['movieId'])))
    return R

R = Model_Develpment(ratings, "rating")
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

In [7]:
with open('web_app/static/r_matrix.pkl', 'wb') as file:
    pickle.dump(R, file)

___

# NMF Model

___

In [6]:
#initialize the unsupervised model

model = NMF(n_components=200, init='nndsvd', max_iter=30000, tol=0.01, verbose=2)

# fit it to the user-item rating matrix

model.fit(R)

# initialzed P, Q matrix with random values
# iterate and optimize the values stored in P and Q

violation: 1.0
violation: 0.15521533720915692
violation: 0.11293496862854785
violation: 0.08599133196741558
violation: 0.061932771539748986
violation: 0.047472596927869704
violation: 0.03907101010419858
violation: 0.034111336710117
violation: 0.030244433417048577
violation: 0.026905353157435422
violation: 0.023738101023973016
violation: 0.021004485952495957
violation: 0.018896864914307334
violation: 0.017030044521399612
violation: 0.015453859459327363
violation: 0.014541440972733615
violation: 0.01375739317723391
violation: 0.013030083250506936
violation: 0.011757552956835707
violation: 0.010477361879096356
violation: 0.008873684817100134
Converged at iteration 22


NMF(init='nndsvd', max_iter=30000, n_components=200, tol=0.01, verbose=2)

In [12]:
NMF_model = model

In [47]:
# user-'genre' matrix [611x55]
#P = model.transform(R)

# movie-'genre' matrix [55x168253]
#Q = model.components_

#P.shape, Q.shape

In [9]:
# reconstructed matrix Rhat
#R_hat = P.dot(Q)
# R -> encoding -> P -> decoding -> Rhat
#R_hat = model.inverse_transform(model.transform(R))
#R.shape, R_hat.shape

In [10]:
# reconstruction error
#np.sqrt(np.sum(np.square(R - R_hat)))

# Pickle the Model

In [13]:
with open('web_app/static/nmf_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [14]:
with open('web_app/static/nmf_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [15]:
test_query = {1:5,2:5,3:5,4:5,5:5,7:5}

In [16]:
list(test_query.values())

[5, 5, 5, 5, 5, 5]

In [48]:
R.shape[1]

168253

In [17]:
data = list(test_query.values())   # the ratings of the new user
row_ind = [0]*len(data)       # we use just a single row 0 for this user 
col_ind = list(test_query.keys())  # the columns (=movieId) of the ratings

# new user vector: needs to have the same format as the training data
user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))

# user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat
scores = NMF_model.inverse_transform(NMF_model.transform(user_vec))

# convert to a pandas series
scores = pd.Series(scores[0])

# give a zero score to movies the user has allready seen
scores[test_query.keys()] = 0

# sort the scores from high to low 
scores = scores.sort_values(ascending=False)

# get the movieIds of the top 10 entries
recommendations = scores.head(10).index

movies.set_index('movieId').loc[recommendations]

violation: 1.0
violation: 2.4118084529397152
violation: 0.07204433044028632
violation: 0.01660799305434758
violation: 0.004757178961986985
Converged at iteration 6


Unnamed: 0,title,genres,year
780,Independence Day,"[Action, Adventure, Sci-Fi, Thriller]",a.k.a. ID4) (1996
736,Twister,"[Action, Adventure, Romance, Thriller]",1996
1073,Willy Wonka & the Chocolate Factory,"[Children, Comedy, Fantasy, Musical]",1971
648,Mission: Impossible,"[Action, Adventure, Mystery, Thriller]",1996
95,Broken Arrow,"[Action, Adventure, Thriller]",1996
788,"Nutty Professor, The","[Comedy, Fantasy, Romance, Sci-Fi]",1996
733,"Rock, The","[Action, Adventure, Thriller]",1996
1356,Star Trek: First Contact,"[Action, Adventure, Sci-Fi, Thriller]",1996
786,Eraser,"[Action, Drama, Thriller]",1996
62,Mr. Holland's Opus,[Drama],1995


In [43]:
query=test_query

In [41]:
def get_NMF_recommendations(query):
    data = list(test_query.values())   # the ratings of the new user
    row_ind = [0]*len(data)       # we use just a single row 0 for this user 
    col_ind = list(test_query.keys())  # the columns (=movieId) of the ratings

    # new user vector: needs to have the same format as the training data
    user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))

    # user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat
    scores = NMF_model.inverse_transform(NMF_model.transform(user_vec))

    # convert to a pandas series
    scores = pd.Series(scores[0])

    # give a zero score to movies the user has allready seen
    scores[test_query.keys()] = 0

    # sort the scores from high to low 
    scores = scores.sort_values(ascending=False)

    # get the movieIds of the top 10 entries
    recommendations = scores.head(10).index

    result = movies.set_index('movieId').loc[recommendations]

    result_titles = result['title'].values

    print ("result_titles: ", result_titles)

    return result_titles


In [44]:
get_NMF_recommendations(query)

violation: 1.0
violation: 2.4118084529397152
violation: 0.07204433044028632
violation: 0.01660799305434758
violation: 0.004757178961986985
Converged at iteration 6
result_titles:  ['Independence Day ' 'Twister ' 'Willy Wonka & the Chocolate Factory '
 'Mission: Impossible ' 'Broken Arrow ' 'Nutty Professor, The '
 'Rock, The ' 'Star Trek: First Contact ' 'Eraser ' "Mr. Holland's Opus "]


array(['Independence Day ', 'Twister ',
       'Willy Wonka & the Chocolate Factory ', 'Mission: Impossible ',
       'Broken Arrow ', 'Nutty Professor, The ', 'Rock, The ',
       'Star Trek: First Contact ', 'Eraser ', "Mr. Holland's Opus "],
      dtype=object)

___

# Nearest Neighbors

___

In [24]:
nbrs = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors=10, radius=1.5, n_jobs=-1).fit(R)
distances, indices = nbrs.kneighbors(R)

In [25]:
#nbrs = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors=10, radius=1.5, n_jobs=-1).fit(R_m)
#distances, indices = nbrs.kneighbors(R_m)

In [26]:
#nbrs_BIG = nbrs

In [27]:
nbrs

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=10,
                 radius=1.5)

In [28]:
with open('web_app/static/nn_new_recommender.pkl', 'wb') as file:
    pickle.dump(nbrs, file)
    


In [29]:
with open('web_app/static/nn_new_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

---

#  PCA

---

In [30]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [31]:
X = ratings[["rating", "movieId", "userId"]]


In [32]:
X_pivot = X.pivot_table(index='movieId',columns='userId',values='rating').fillna(0)

In [33]:
X_pivot

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
from sklearn.decomposition import PCA

In [35]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_std = sc.fit_transform(X_pivot)

In [36]:
model = PCA(n_components=0.99, svd_solver='full')
model.fit(X_std)
print(f'No. of components explaining 99% variance: {model.n_components_}')

No. of components explaining 99% variance: 555


In [37]:
pca = PCA(n_components=518, svd_solver='full')
pca.fit_transform(X_std)

array([[ 8.25961038e+01,  1.30799134e+01, -2.24410385e+01, ...,
         1.04462822e+00, -5.64739685e-01,  1.09380075e-01],
       [ 3.47581385e+01,  1.92631736e+01,  2.65973185e-01, ...,
        -6.48117216e-01, -2.46949982e-01,  8.16396394e-01],
       [ 1.24472530e+01,  1.13459067e+01, -1.38296464e+01, ...,
         1.10971820e+00, -6.66414919e-01,  1.80250050e+00],
       ...,
       [-2.63360674e+00,  6.35556284e-01,  9.30684543e-01, ...,
        -3.43025034e-01, -2.15147177e-01, -7.10149460e-03],
       [-2.63360674e+00,  6.35556284e-01,  9.30684543e-01, ...,
        -3.43025034e-01, -2.15147177e-01, -7.10149460e-03],
       [-2.36311611e+00,  3.57193642e-01,  1.13392388e+00, ...,
         3.67330567e-02,  1.31604687e-01,  1.64545331e-01]])

In [38]:
km = KMeans(n_clusters=5)

X_clustered = km.fit_predict(X_std)
