In [1]:
from sqlalchemy import create_engine
from sklearn.decomposition import NMF

import pandas as pd
import numpy as np
import os
from joblib import load, dump
#! pip install fuzzywuzzy
from fuzzywuzzy import fuzz




#### connect to database:

In [2]:
HOST = 'localhost'
PORT = '5432'
DB = 'movieLens'

In [3]:
conn_string = f'postgres://{HOST}:{PORT}/{DB}' 

In [4]:
engine = create_engine(conn_string)

#### get tables to df's: 

In [5]:
ratings = pd.read_sql_query('SELECT * FROM ratings', con=engine)

In [6]:
ratings = ratings.set_index('userid')

In [7]:
ratings = ratings.drop('time_stamps', axis=1)

In [8]:
movies = pd.read_sql_query('SELECT * FROM movies', con=engine)

In [9]:
movies

Unnamed: 0,movieid,title,genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [10]:
ratings

Unnamed: 0_level_0,movieid,rating
userid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,4.0
1,3,4.0
1,6,4.0
1,47,5.0
1,50,5.0
...,...,...
610,166534,4.0
610,168248,5.0
610,168250,5.0
610,168252,5.0


In [11]:
# get right shape for matrix multiplication
ratings = ratings.pivot_table(index=ratings.index, values='rating', columns='movieid')

In [12]:
ratings = ratings.fillna(ratings.mean().round(2)) # !!fill with 3.0? # which mean is that now? - mean per column or what?

In [13]:
#ratings.isna().sum()

In [14]:
ratings

movieid,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00,3.43,4.00,2.36,3.07,4.00,3.19,2.88,3.12,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
2,3.92,3.43,3.26,2.36,3.07,3.95,3.19,2.88,3.12,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
3,3.92,3.43,3.26,2.36,3.07,3.95,3.19,2.88,3.12,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
4,3.92,3.43,3.26,2.36,3.07,3.95,3.19,2.88,3.12,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
5,4.00,3.43,3.26,2.36,3.07,3.95,3.19,2.88,3.12,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.50,3.43,3.26,2.36,3.07,3.95,2.50,2.88,3.12,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
607,4.00,3.43,3.26,2.36,3.07,3.95,3.19,2.88,3.12,3.5,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
608,2.50,2.00,2.00,2.36,3.07,3.95,3.19,2.88,3.12,4.0,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0
609,3.00,3.43,3.26,2.36,3.07,3.95,3.19,2.88,3.12,4.0,...,3.5,3.0,4.0,4.0,3.5,4.0,3.5,3.5,3.5,4.0


### Matrix Multiplication

In [15]:
model = NMF(n_components=20, init='random', random_state=10, max_iter=1000)

In [16]:
model.fit(ratings)



NMF(init='random', max_iter=1000, n_components=20, random_state=10)

In [17]:
def dump_model(MODEL_fitted):
    dump(MODEL_fitted, './NMF.joblib')

In [18]:
dump_model(model)

In [19]:
#Q = model.components_ 

In [20]:
#P = model.transform(ratings)

In [21]:
print(model.reconstruction_err_)

243.47673611192806


In [22]:
nR = np.dot(P, Q)
print(nR) ## The reconstructed matrix!


NameError: name 'P' is not defined

#### recommendations for one user with NMF: 

In [None]:
user = ratings.loc[[101]] # need double square brackets!

In [None]:
#user.reshape(1, -1) not necessary then

In [None]:
user.shape

In [None]:
ratings.shape

In [None]:
profile = model.transform(user)

In [None]:
model.components_.shape # this is Q or P

In [None]:
# approximate the reconstructed matrix for this user
movie_preds = np.dot(profile, model.components_)
movie_preds.round(2)   # centered around 3.0

In [None]:
movie_names = list(ratings.columns)
#movie_preds.argmax()

In [None]:
movie_names[2075]

In [None]:
s = pd.Series(movie_preds[0], index=movies)
s.sort_values(ascending=False)

In [None]:
# get names from dictionary in simple and direct way?

In [None]:
# create dic with movieIds and movie names: 
MOVIE_NAMES = dict(zip(movies['movieid'], movies['title']))

In [None]:
df_temp = pd.DataFrame(MOVIE_NAMES.items())

In [None]:
df_temp

In [None]:
#cd ~/flask-recommender/

In [None]:
#if 'NMF.joblib' in os.listdir('./'):
   # print('yes')
    #MODEL_fitted = load('./gettingstarted/NMF.joblib')

In [None]:
#os.listdir('./')

### process input user dict to array:

In [23]:
user_dict = {'movie1': 'Jurassic Park', 'rating1': '1', 'movie2': 'Up', 'rating2': '5', 'movie3': 'I, Robot', 'rating3': '3'}

In [24]:
#user_dict.get('rating1')

In [26]:
def format_dict(user_dict):

    user_dict_new = {user_dict.get('movie1'): user_dict.get('rating1'), user_dict.get('movie2'): user_dict.get('rating2'),user_dict.get('movie3'): user_dict.get('rating3')}
    return user_dict_new

In [27]:
user_dict_new = format_dict(user_dict)

In [28]:
#user_dict = {'Finding Nemo' : '4', "casablanca" : '1', "I, Robot" : '3'}

In [29]:
# create dic with movieIds and movie names: 
MOVIE_NAMES = dict(zip(movies['movieid'], movies['title']))

In [31]:
for movie in user_dict_new: 
    print(movie)


Jurassic Park
Up
I, Robot


In [32]:
def user_movie_index(user_dict):
    """compares movie titles in user input dict to titles in movie names dict with fuzzybuzzy. If sort ration > 70 original
       movie title and movie Id are appended to list user_movie_index. If doubles occur, error is thrown.
        returns list with tuples of movieId and movie title"""
    
    user_movie_index = []

    for movie in user_dict: 
        movie = str(movie).lower()
        print(' movies in user_dict: ' + movie)
        for index, moviename in MOVIE_NAMES.items(): 
            if fuzz.token_sort_ratio(movie, moviename) > 70:
                print('over 70: ' + moviename)
                user_movie_index.append([index, moviename])
                print('appended: ' + str(user_movie_index))
            
    if len(user_movie_index) != len(user_dict):
        print("sorry, this doesn't work for now, please be more precise about the year or chose another movie: " + str(dict(user_movie_index).values()))
   
        #elif fuzz.token_sort_ratio(movie, moviename) < 70:
            #print('not appended: ' + movie)
            #break
            #print(user_movie_index)
    return user_movie_index

In [33]:
user_movie_index = user_movie_index(user_dict_new)

 movies in user_dict: jurassic park
over 70: Jurassic Park (1993)
appended: [[480, 'Jurassic Park (1993)']]
over 70: Jurassic Park III (2001)
appended: [[480, 'Jurassic Park (1993)'], [4638, 'Jurassic Park III (2001)']]
 movies in user_dict: up
 movies in user_dict: i, robot
over 70: I, Robot (2004)
appended: [[480, 'Jurassic Park (1993)'], [4638, 'Jurassic Park III (2001)'], [8644, 'I, Robot (2004)']]


In [37]:
user_movie_index

[[480, 'Jurassic Park (1993)'],
 [4638, 'Jurassic Park III (2001)'],
 [8644, 'I, Robot (2004)']]

In [38]:
#user_dict

In [41]:
def to_array(user_movie_index, user_dict):
    """input user dict with moive names and ratings. Creates df out of it, formats it, joins with columns of ratings df
       and keeps only new_user row. To array, returns array in shape (1, 9724,)"""
    # to df:
    user_df = pd.DataFrame(user_movie_index)
    # add ratings from user_dict (input) to column
    user_df['rating'] = user_dict.values()
    # drop movie titles
    user_df.drop([1], axis=1, inplace=True)
    # rename Movie ID column for better overview
    user_df.rename(columns = {0:'movieId'}, inplace=True)
    # to right format:
    user_ratings = user_df.set_index(['movieId']).transpose()
    # join with ratings df and keep only 1st row
    user_ratings = pd.concat([user_ratings, ratings], axis=0, join='outer').iloc[0]
    # get rid of Nan's:
    user_ratings = user_ratings.fillna(ratings.mean().round(2))
    # to array: 
    user_array = user_ratings.to_numpy()
    # reshape: 
    user_array = user_array.reshape(1, 9724)
    
    
    
    return user_array

In [42]:
user_array = to_array(user_movie_index, user_dict_new)

In [43]:
user_array

array([[3.92, 3.43, 3.26, ..., 3.5, 3.5, 4.0]], dtype=object)

In [44]:
# get prediciton: 
# load model: 
trained_NMF = load('/Users/krystanafoh/flask-recommender/NMF.joblib')

#list for movie names. 
movie_names = list(zip(movies['movieid'], movies['title']))

def get_prediction(user_array, trained_model):
    """take in user array, make prediction, show best prediction movie title"""
    profile = trained_model.transform(user_array)
    Q = trained_model.components_ 
    movie_preds = np.dot(profile, Q)
    best_rating = movie_preds.argmax()
    movie = movie_names[best_rating]
    print(movie[1])
    
    return movie[1]
    

In [45]:
recommendation = get_prediction(user_array, trained_NMF)

Night Porter, The (Portiere di notte, Il) (1974)


### 2. cosinus similarity (3. weighted rating function)

use weighted rating/cosinus sim instead of NMF
weighted rating: m -> do we want to prefer/rate blockbusters(movies rated by a lot of people)
but recommend movies only intresting for a niche of people - maybe bad recommendations


similarity: user based filtering
loop over all users except wanted, compare user ratings to wanted user by similarity

similarity measures f.ex euklidian distance - does not work that great in higher dimensional space

--> cosine similarity (entspr. angle between two vectors) ||X|| -> norm of a vector