In [88]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from dotenv import dotenv_values
from sqlalchemy import create_engine
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import pairwise
from sklearn.metrics import accuracy_score
import statsmodels.formula.api as sm
from scipy.sparse import csr_matrix
import os
import streamlit as sl
import pickle

In [2]:
movies = pd.read_csv("../data/movie_sorted.csv")
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieid  17770 non-null  int64 
 1   year     17770 non-null  int64 
 2   title    17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB


In [3]:
ratings = pd.read_csv("../../thymestamps-working-folder/data/selected-ratings.csv")
ratings.head()

Unnamed: 0,customerid,rating,movieid,title
0,332466,2.0,550,First Knight
1,815418,2.0,550,First Knight
2,2568360,1.0,550,First Knight
3,2387526,4.0,550,First Knight
4,554171,4.0,550,First Knight


In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16791531 entries, 0 to 16791530
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   customerid  int64  
 1   rating      float64
 2   movieid     int64  
 3   title       object 
dtypes: float64(1), int64(2), object(1)
memory usage: 512.4+ MB


In [5]:
ratings["movieid"].value_counts()

11283    12531
4306     12499
1905     12469
14691    12393
14410    12358
         ...  
9016       223
5657       222
14021      211
761        206
11819      193
Name: movieid, Length: 5264, dtype: int64

In [6]:
ratings['customerid'].nunique(), ratings['movieid'].nunique(), ratings['movieid'].max()

(13100, 5264, 17769)

In [51]:
user = csr_matrix((ratings['rating'], (ratings['customerid'], ratings['movieid'])))

#### like pivot_table but much more memory efficient

In [35]:
user.shape

(2649286, 17770)

In [36]:
# customerid vector
user[1,:].todense()

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [37]:
# movieid vector
user[:,1].todense().shape

(2649286, 1)

In [75]:
# collect the movie ids for later

# for calculating recommendations
liked_items = [12748,4979, 12785, 12918,13031,13313,16265,14240,14214]
# liked = [11283,4306,1905,14691,14410,9016,5657,14021,761,11819]
# for testing the recommender after getting some recommendations
relevant_items = []

In [39]:
# which metrics can we use
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [59]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')


In [60]:
# fit it to the user-item matrix
model.fit(user)

NearestNeighbors(metric='cosine')

In [61]:
# top-10 most similar users for the user with the id 1 (id1 is included in the neighborhood!)
model.kneighbors(user[1,:], n_neighbors=20)

(array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1.]]),
 array([[1766185, 1766187, 1766188, 1766189, 1766190, 1766191, 1766192,
         1766193, 1766194, 1766195, 1766196, 1766197, 1766198, 1766199,
         1766200, 1766201, 1766202, 1766184, 1766186, 1766204]],
       dtype=int64))

In [62]:
user[1,:].shape

(1, 17770)

In [63]:
# new user vector
user_vec = np.repeat(0, 17770)
user_vec.shape

(17770,)

In [76]:

# fill in some ratings
user_vec[liked_items] = 5
user_vec.shape

(17770,)

In [77]:
# find the neighborhood
distances, user_ids = model.kneighbors([user_vec], n_neighbors=10)

In [78]:
user_ids

array([[1583664, 1330226, 1843540,  989704,  815238, 1027371,  998862,
        2568366,  879221, 1631168]], dtype=int64)

In [79]:
# find the ratings for the neighbors
neighborhood = ratings.set_index('customerid').loc[user_ids[0]]
neighborhood

Unnamed: 0_level_0,rating,movieid,title
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1583664,3.0,571,American Beauty
1583664,3.0,575,Highlander: Season 4
1583664,3.0,607,Speed
1583664,5.0,708,The Perfect Storm
1583664,4.0,752,Star Trek: The Next Generation: Season 7
...,...,...,...
1631168,4.0,483,Rush Hour 2
1631168,5.0,486,Journey to the Center of the Earth
1631168,5.0,488,His Secret Life
1631168,5.0,504,The Twilight Zone: Vol. 27


In [80]:
recommendations = neighborhood.groupby('movieid')['rating'].sum().sort_values(ascending=False)
recommendations

movieid
14240    49.0
9628     49.0
7230     49.0
16265    49.0
5582     49.0
         ... 
2905      1.0
2834      1.0
8211      1.0
2791      1.0
13551     1.0
Name: rating, Length: 3355, dtype: float64

In [81]:
liked_items

[12748, 4979, 12785, 12918, 13031, 13313, 16265, 14240, 14214]

In [82]:
# filter out movies that the user allready watched
item_filter = ~recommendations.index.isin(liked_items)
recommendations = recommendations.loc[item_filter]

In [83]:
# top 10 recommendations
movies.loc[recommendations.head(10).index]

Unnamed: 0_level_0,movieid,year,title
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9628,9629,2002,Stranded
7230,7231,2005,Audioslave: Live in Cuba
5582,5583,2005,The Marksman
7057,7058,1979,Zombie
2452,2453,2001,Happy End
3523,3524,1994,Pulp Fiction: Bonus Material
11521,11522,2002,Queer as Folk: Season 2
752,753,1964,Mary Poppins: Bonus Material
10666,10667,2005,Stone Cold
2040,2041,1998,Silent Mobius: Vol. 2


In [None]:
recommended_movie_ids = recommendations.head(10).index
recommended_movie_ids

In [None]:
relevant_items

In [89]:
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [93]:
loaded_model = pickle.load(open(filename, 'rb'))
recommendations = loaded_model.
recommendations

AttributeError: 'NearestNeighbors' object has no attribute 'neighborhood'