### Non-negative matrix factorization for movie recommendations based on the MovieLens dataset

In [None]:
# import libraries
from sqlalchemy import create_engine
from sklearn.decomposition import NMF

import pandas as pd
import numpy as np
import os
from joblib import load, dump
#! pip install fuzzywuzzy
from fuzzywuzzy import fuzz


#### connect to the Postgres database containing the MovieLens data:

In [None]:
# connection settings that are interpolated into the database URL below
HOST = 'localhost'
PORT = '5432'
DB = 'movieLens'

In [None]:
# SQLAlchemy 1.4+ no longer accepts the legacy 'postgres://' scheme;
# the dialect name in the URL has to be 'postgresql'.
conn_string = f'postgresql://{HOST}:{PORT}/{DB}'

In [None]:
# engine used by the pd.read_sql_query calls below
engine = create_engine(conn_string)

#### postgres tables to dataframes: 

In [None]:
# load the complete ratings table into a DataFrame
ratings = pd.read_sql_query('SELECT * FROM ratings', con=engine)

In [None]:
# index the rows by userid; the index is reused as the pivot index further down
ratings = ratings.set_index('userid')

In [None]:
# drop the time_stamps column; only movieid and rating are used afterwards
ratings = ratings.drop('time_stamps', axis=1)

In [None]:
# load the complete movies table into a DataFrame
movies = pd.read_sql_query('SELECT * FROM movies', con=engine)

In [None]:
# check movies df
movies

In [None]:
# check ratings table
ratings

In [None]:
# reshape to a user x movie matrix: one row per userid, one column per movieid,
# each cell holding the rating (NaN where the user has not rated the movie)
ratings = ratings.pivot_table(index=ratings.index, values='rating', columns='movieid')

In [None]:
# replace every missing rating with that movie's mean rating
# (column mean, rounded to two decimals)
ratings = ratings.fillna(ratings.mean().round(2)) 

In [None]:
# check for Nan's
ratings.isna().sum()

#### matrix multiplication:

In [None]:
# create the NMF model: 20 latent components, random initialisation with a
# fixed seed (random_state=10), at most 1000 iterations
model = NMF(n_components=20, init='random', random_state=10, max_iter=1000)

In [None]:
# fit the model on the mean-imputed user x movie rating matrix
model.fit(ratings)

In [None]:
# save model to disk
def dump_model(MODEL_fitted, path='./NMF.joblib'):
    """Serialize a fitted model to disk with joblib.

    MODEL_fitted: the fitted estimator to persist.
    path: destination file. Defaults to './NMF.joblib' (relative to the
        current working directory), which is where the model was always
        written before this parameter existed.

    Returns None; the file on disk is the only result.
    """
    dump(MODEL_fitted, path)

In [None]:
dump_model(model)

#### reconstruct matrix: 

In [None]:
# movie-genre matrix: the learned components of the fitted model
# (presumably one row per latent "genre", one column per movie - see sklearn NMF docs)
Q = model.components_ 

In [None]:
# user-genre matrix: the ratings matrix projected onto the learned components
P = model.transform(ratings)

In [None]:
# reconstruction error stored on the fitted model
print(model.reconstruction_err_)

In [None]:
# rebuild the approximated rating matrix as the product of both factors
nR = P @ Q
print(nR)

#### movie recommendation from user input: 

In [None]:
# example user input: numbered 'movieN' / 'ratingN' pairs, ratings given as strings
user_dict = {'movie1': 'Jurassic Park', 'rating1': '1', 'movie2': 'Up', 'rating2': '5', 'movie3': 'I, Robot', 'rating3': '3'}

In [None]:
# reformat dictionary to format input_movie:input_rating
def format_dict(user_dict):
    """Reformat numbered form input into a {movie title: rating} dict.

    Every key of the form 'movie<N>' (N being a decimal number) is paired
    with the value stored under 'rating<N>'. Any number of pairs is
    supported, not just three; a movie without a matching rating key gets
    None as its rating. Entries are returned in ascending order of N, so
    'movie2' comes before 'movie10'. Keys that do not follow the pattern
    are ignored, and an empty input yields an empty dict.

    user_dict: dict with 'movie<N>' / 'rating<N>' keys.
    returns: dict mapping each movie title to its rating value (unchanged).
    """
    prefix = 'movie'
    suffixes = [
        key[len(prefix):]
        for key in user_dict
        if isinstance(key, str)
        and key.startswith(prefix)
        and key[len(prefix):].isdecimal()
    ]
    # sort numerically so the result order does not depend on the input order
    suffixes.sort(key=int)
    return {
        user_dict[prefix + suffix]: user_dict.get('rating' + suffix)
        for suffix in suffixes
    }

In [None]:
user_dict_new = format_dict(user_dict)

In [None]:
# lookup table movieid -> title, built from the movies df
MOVIE_NAMES = dict(zip(movies['movieid'], movies['title']))

In [None]:
# check user dictionary: print every movie title the user entered
for movie in user_dict_new: 
    print(movie)

In [None]:
def user_movie_index(user_dict):
    """Fuzzy-match the titles in the user input against MOVIE_NAMES.

    Each key of user_dict is lower-cased and compared with every title in
    MOVIE_NAMES via fuzzywuzzy's token_sort_ratio. Every title scoring above
    70 is collected as [movieId, title], so one input title can contribute
    several entries or none. If the number of collected matches differs from
    the number of input movies, a message asking the user to be more precise
    is printed - no exception is raised.

    returns: list of [movieId, movie title] pairs
    """
    matches = []

    for entered_title in user_dict:
        query = str(entered_title).lower()
        print(' movies in user_dict: ' + query)
        for movie_id, title in MOVIE_NAMES.items():
            score = fuzz.token_sort_ratio(query, title)
            if score <= 70:
                continue
            print('over 70: ' + title)
            matches.append([movie_id, title])
            print('appended: ' + str(matches))

    ambiguous = len(matches) != len(user_dict)
    if ambiguous:
        matched_titles = dict(matches).values()
        print("sorry, this doesn't work for now, please be more precise about the year or chose another movie: " + str(matched_titles))

    return matches

In [None]:
# NOTE(review): this rebinds the name `user_movie_index` from the function to
# the list it returns, so running this cell a second time fails because the
# name no longer refers to a callable.
user_movie_index = user_movie_index(user_dict_new)

In [None]:
user_movie_index

In [None]:
def to_array(user_movie_index, user_dict, ratings_df=None):
    """Build the new user's rating vector in the column layout of the ratings matrix.

    The user's ratings are placed in the columns of the movies they belong
    to; every other column is filled with that movie's mean rating (rounded
    to two decimals), the same imputation used for the training matrix.

    The vector is aligned by reindexing onto the ratings columns, which
    guarantees the same column order and length as the matrix the model was
    fit on. Movie ids that are not a column of the ratings matrix are
    dropped, because the model has no feature for them.

    user_movie_index: sequence of [movieId, title] pairs, one per rated movie,
        in the same order as the entries of user_dict.
    user_dict: dict {movie title: rating}; ratings may be numbers or numeric
        strings.
    ratings_df: user x movie DataFrame that defines the columns. Defaults to
        the module-level `ratings` frame.

    returns: float array of shape (1, number of movie columns).
    raises: ValueError if the number of matched movies differs from the
        number of ratings, or if a rating is not numeric.
    """
    if ratings_df is None:
        ratings_df = ratings

    movie_ids = [match[0] for match in user_movie_index]
    scores = [float(score) for score in user_dict.values()]
    if len(movie_ids) != len(scores):
        raise ValueError(
            f'got {len(movie_ids)} matched movies for {len(scores)} ratings; '
            'every rated title must match exactly one movie'
        )

    user_ratings = pd.Series(scores, index=movie_ids, dtype=float)
    # reindex needs unique labels; if two inputs matched the same movie keep the later rating
    user_ratings = user_ratings[~user_ratings.index.duplicated(keep='last')]

    # align on the training columns instead of concatenating the whole frame:
    # keeps the column order the model expects and never adds extra columns
    aligned = user_ratings.reindex(ratings_df.columns)
    aligned = aligned.fillna(ratings_df.mean().round(2))

    # -1 lets numpy derive the width from the data instead of hard-coding it
    return aligned.to_numpy(dtype=float).reshape(1, -1)

In [None]:
user_array = to_array(user_movie_index, user_dict_new)

In [None]:
user_array

In [None]:
# load model: dump_model writes to './NMF.joblib', so read it back from that
# same relative path instead of a machine-specific absolute one
trained_NMF = load('./NMF.joblib')
# list of (movieid, title) pairs for looking up movie names
movie_names = list(zip(movies['movieid'], movies['title']))

def get_prediction(user_array, trained_model, movie_ids=None, titles=None):
    """Predict ratings for one user and return the title of the best-scoring movie.

    The highest predicted value is a column position in the rating matrix,
    not a row position in the movies table. It is therefore translated to a
    movie id through the rating-matrix columns first, and only then to a title.

    user_array: array of shape (1, number of movie columns), laid out like the
        matrix the model was fit on.
    trained_model: fitted model exposing `transform` and `components_`.
    movie_ids: movie id for every column of the rating matrix. Defaults to
        the columns of the module-level `ratings` frame.
    titles: mapping movie id -> title. Defaults to a dict built from the
        module-level `movie_names` pairs.

    returns: title of the movie with the highest predicted rating (also printed).
    raises: KeyError if the winning movie id has no entry in `titles`.
    """
    if movie_ids is None:
        movie_ids = ratings.columns
    if titles is None:
        titles = dict(movie_names)

    profile = trained_model.transform(user_array)   # user-genre weights of the new user
    Q = trained_model.components_                   # movie-genre matrix
    movie_preds = np.dot(profile, Q)                # predicted rating per movie column
    # argmax over the single-row prediction gives the winning column position
    best_column = movie_preds.argmax()
    title = titles[movie_ids[best_column]]
    print(title)

    return title

In [None]:
recommendation = get_prediction(user_array, trained_NMF)