In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import pickle

## Downloading the necessary models and files for making predictions

Here we need to download the validation and test datasets we made when making our models. We also need to download all the models and the data and film files.

In [2]:
X_val = pd.read_csv('cleandata/X_val.csv')
y_val =pd.read_csv('cleandata/y_val.csv') 

X_test = pd.read_csv('cleandata/X_test.csv')
y_test = pd.read_csv('cleandata/y_test.csv')

model_baseline = pickle.load(open('model_baseline.pkl', 'rb'))
model_cb = pickle.load(open('model_cb.pkl', 'rb'))
correlation_matrix = pickle.load(open('correlation_matrix.pkl', 'rb'))

data = pd.read_csv('cleandata/data.csv') 
film2 = pd.read_csv('cleandata/film2.csv')
film = pd.read_csv('cleandata/film.csv')

## Making predictions with the baseline model

The baseline model makes predictions by first checking if the movie we want to predict is in our model, if it is then it takes out the value at that index and that is the prediction. If the movie is not in our model then it takes the mean of all the values in the model and makes that the prediction.

In [3]:
def baseline_predict(userid, filmid):
    if any(model_baseline.index == filmid):
        return model_baseline.loc[filmid][0]
    else:
        return model_baseline.mean()[0]

### Calculating rmse on validation data for our baseline model

Here I will calculate the root mean squared error on our validation set. This value will come in handy when I compare the models.

In [4]:
predicted_baseline = []
for i in range(len(X_val)):
    predicted_baseline.append(baseline_predict(X_val.iloc[i,0], X_val.iloc[i,1]))

mse_baseline = mean_squared_error(y_val, predicted_baseline)
print("Root Mean Squared Error for baseline on validation data is", str(np.sqrt(mse_baseline)))

Root Mean Squared Error for baseline on validation data is 0.9920240555792099


## Making predictions with the content based model 

In [5]:
def prediction_cb(userid, filmid):
    model_user = model_cb[userid] # Extract the model from the dictionary
    X = film2.loc[film2['FilmID'] == filmid].drop(['FilmID'], axis=1) # Extract the x values for the movie
    X = np.array(X).reshape(1,-1)

    pred = model_user.predict(X) # Predict rating
    return pred[0]

### Calculating rmse on validation data for our content based model

In [6]:
predicted_cb = []
# Makes predictions for every user and movie in the validation set
for i in range(len(X_val)):
    predicted_cb.append(prediction_cb(X_val.iloc[i,0], X_val.iloc[i,1])) 

mse_cb = mean_squared_error(y_val, predicted_cb)
print("Root Mean Squared Error for content based on validation data is", str(np.sqrt(mse_cb)))

Root Mean Squared Error for content based on validation data is 1.3294757642943764


## Making predictions with the collaborative model



In [7]:
table = pd.pivot_table(data, values='Rangering', columns='FilmID', index='BrukerID') # Make a table of each known rating
bruker_mean = table.mean(axis=1)

table = table.fillna(0.1)

def prediction_collaborative(userid, filmid):
    correlation_vector = correlation_matrix.loc[userid, :] # Extract the row in the correlation matrix which represents the user
    film_vector = table.loc[:,filmid] # Extract the column in table corresponding to the filmid
    has_rated = film_vector.index[film_vector != 0.1].tolist() # Find every user that has rated the given filmid
    has_rated_corr = correlation_vector[has_rated] # Find a vector with the correlations with evry person that has rated the filmid

    k = 3
    k_largest = has_rated_corr.nlargest(k) # Find the k highest correlations
    pred = 0
    # Calculate the predictions
    for i in range(len(k_largest)):
        pred += (table.loc[k_largest.index[i], filmid]-bruker_mean[k_largest.index[i]])
    pred = bruker_mean.loc[userid] + (1/k)*pred
    return pred

### Calculating rmse on validation data for our collaborative model

In [8]:
predicted_coll = []
# Makes predictions for every user and movie in the validation set
for i in range(len(X_val)):
    predicted_coll.append(prediction_collaborative(X_val.iloc[i,0], X_val.iloc[i,1]))

mse_coll = mean_squared_error(y_val, predicted_coll)
print("Root Mean Squared Error for collaborative on validation data is", str(np.sqrt(mse_coll)))

Root Mean Squared Error for collaborative on validation data is 0.7007817124570134


## Making predictions with a combined approche

For the combined approche I have decided to weigth the two models differently. I have decided to weigth the collaborative model with 3/4 and the content based 1/4. The reason for this is that the collaborative model performed a lot better then the content based and it will make sanse to weigth this prediction more.

In [9]:
def prediction_combined(userid, filmid):
    cont = prediction_cb(userid, filmid) # Makes predictions with the content based model
    coll = prediction_collaborative(userid, filmid) # Makes predictions with the collaborative model
    pred = (0.25*cont)+(0.75*coll) # Makes a weghted prediction
    return pred

### Calculating rmse on validation data for our combined approche

Instead of using the function above I will use the predictions made on validation data from above. I will do this to save some time.

In [10]:
predicted_cb_w = [i * 0.2 for i in predicted_cb] 
predicted_coll_w = [i * 0.8 for i in predicted_coll]
predicted_comb = []
# Makes predictions for every user and movie in the validation set
for i in range(len(predicted_coll_w)):
    predicted_comb.append(predicted_cb_w[i] + predicted_coll_w[i])

mse_comb = mean_squared_error(y_val, predicted_comb)
print("Root Mean Squared Error for combiend on validation data is", str(np.sqrt(mse_comb)))

Root Mean Squared Error for combiend on validation data is 0.7403101343454669


# Comparison of the four different models based on validation data

As one can see for the rmse values for the four different models the collaborative model gained a lower rmse then the three others. And because of this result I would have choosen to use the collaborative model when making predictions. I the cell below I will test the model against the unbiased test set and estimate a perfermance.

## Test the best model

In [11]:
predicted_coll_test = []
for i in range(len(X_test)):
    predicted_coll_test.append(prediction_collaborative(X_test.iloc[i,0], X_test.iloc[i,1]))

mse_coll_test = mean_squared_error(y_test, predicted_coll_test)
print("Root Mean Squared Error for collaborative on validation data is", str(np.sqrt(mse_coll_test)))

Root Mean Squared Error for collaborative on validation data is 0.7020214854185584


As one can see from the result I estimate that my model will have and average error of 0.7 for new unsees data.

## Making predictions for the entire dataset (For all users and all movies)

Here we are supposed to make predictions for all users and all movies. Becuase of the time this take I have decided to only make predictions for the first 250 movies. One can easily change the code below such that one can make predictions for the entire dataset. 

In [12]:
# Makes a DataFrame with all the known ratings
predictions = pd.pivot_table(data, values='Rangering', columns='BrukerID', index='FilmID')
columns = list(predictions.columns)
rows = list(predictions.index)
predictions = predictions.fillna(0.1).iloc[0:250, :] # Remove the .iloc to make predictions for the entire dataset

# Predicting the unknown values with the predicted values
for i in range(len(predictions)):
    for j in range(len(predictions.iloc[0])):
        if predictions.iloc[i,j] == 0.1:
            predictions.iloc[i,j] = prediction_collaborative(columns[j], rows[i])
            
print(predictions)


BrukerID      0         1         2         3         4         5     \
FilmID                                                                 
0         1.156174  1.770670  2.448580  3.811687  2.391108  2.978528   
1         4.139513  3.899817  3.728852  4.203422  3.475259  3.361512   
2         3.160736  3.605722  3.714048  4.118542  4.010434  3.209451   
3         3.279912  3.027184  3.871290  4.899560  3.587432  2.485781   
5         3.469079  3.302847  4.485642  4.289191  3.830698  2.465458   
...            ...       ...       ...       ...       ...       ...   
267       1.674177  2.553589  3.130304  2.467914  2.934510  1.677569   
268       2.132015  1.895780  2.070686  3.025850  1.563227  1.216402   
269       3.120774  3.622281  2.490634  3.902742  3.565559  3.527635   
270       2.942445  2.000000  3.699050  4.490130  3.302005  3.175524   
271       2.470195  3.620914  3.153671  2.525830  3.606265  3.617511   

BrukerID      6         7         8         9     ...      6031

In [13]:
predictions.to_csv('cleandata/predictions.csv', index=False)

## Notes on the project

Ratings are given in integer between 1 and 5. When I make predictions I have decided not to take this into account. The reason for this is that when I make the website I give a list of 10 movies, we will get a more accurate result by using float not integer numbers.

## How I want my website to make predictions

Since I have some trouble running the website on on the local host I have decided to make a demonstration of how the code in the app2.py should work

In [12]:
Title1 = 'Autumn in New York (2000)' # First movie
Rating1 = 4 # Rating for first movie

Title2 = 'Defying Gravity (1997)' # Second movie
Rating2 = 5 # Second rating

Title3 = 'Vie est belle, La (Life is Rosey) (1987)' # Third movie
Rating3 = 2 # Third rating

Title4 = 'Ruthless People (1986)' # Fourth movie
Rating4 = 2 # Fourth rating

Title5 = 'True Lies (1994)' # Fifth movie
Rating5 = 5 # Fifth rating

feature = {'Title1':Title1, 'Title2':Title2, 'Title3':Title3, 'Title4':Title4, 'Title5':Title5,
            'Rating1':Rating1, 'Rating2':Rating2, 'Rating3':Rating3, 'Rating4':Rating4, 'Rating5':Rating5} # Make a dictionary since this is the way the feature will appare in the app


feature['Title1'] = float(film['FilmID'][film['Tittel'] == feature['Title1']]) # Find the filmid
feature['Title2'] = float(film['FilmID'][film['Tittel'] == feature['Title2']]) # Find the filmid
feature['Title3'] = float(film['FilmID'][film['Tittel'] == feature['Title3']]) # Find the filmid
feature['Title4'] = float(film['FilmID'][film['Tittel'] == feature['Title4']]) # Find the filmid
feature['Title5'] = float(film['FilmID'][film['Tittel'] == feature['Title5']]) # Find the filmid
    
filmids = [feature['Title1'], feature['Title2'], feature['Title3'], feature['Title4'], feature['Title5']]
    
df = pd.DataFrame(data.loc[data['FilmID'] == feature['Title1']].iloc[0]).transpose()
df = df.append(pd.DataFrame(data.loc[data['FilmID'] == feature['Title2']].iloc[0]).transpose(), ignore_index=True)
df = df.append(pd.DataFrame(data.loc[data['FilmID'] == feature['Title3']].iloc[0]).transpose(), ignore_index=True)
df = df.append(pd.DataFrame(data.loc[data['FilmID'] == feature['Title4']].iloc[0]).transpose(), ignore_index=True)
df = df.append(pd.DataFrame(data.loc[data['FilmID'] == feature['Title5']].iloc[0]).transpose(), ignore_index=True)
    
df['Rangering'] = [feature['Rating1'], feature['Rating2'], feature['Rating3'], feature['Rating4'], feature['Rating5']]
df['BrukerID'] = [7000, 7000, 7000, 7000, 7000]
    
data = data.append(df, ignore_index=True)

    
# Make the correlation matrix
table = pd.pivot_table(data, values='Rangering', columns='FilmID', index='BrukerID')
bruker_mean = table.mean(axis=1)
table = table.fillna(0.1)
table2 = table.copy()
table = table.transpose()
    
    
for i in table.columns:
    table[i] = table[i].replace([0.1], bruker_mean[i])
        
table = table.transpose()
    
idx = list(table.index)
correlation_matrix = pd.DataFrame(np.corrcoef(table))
correlation_matrix.index = idx
correlation_matrix.columns = idx
    
# predict
correlation_vector = correlation_matrix.loc[7000, :] # Extract the row in the correlation matrix which represents the user
predictions = []
filmid = data['FilmID'].unique().tolist()
for i in range(len(film['FilmID'])):
    if film['FilmID'].iloc[i] in filmids:
        continue
    else:
        film_vector = table2.loc[:,filmid[i]] # Extract the column in table corresponding to the filmid
        has_rated = film_vector.index[film_vector != 0.1].tolist() # Find every user that has rated the given filmid
        has_rated_corr = correlation_vector[has_rated] # Find a vector with the correlations with evry person that has rated the filmid

        k = 3
        k_largest = has_rated_corr.nlargest(k) # Find the k highest correlations
        pred = 0
    # Calculate the predictions
        for i in range(len(k_largest)):
            pred += (table2.loc[k_largest.index[i], i]-bruker_mean[k_largest.index[i]])
        predictions.append(bruker_mean.loc[7000] + (1/k)*pred)
        filmid.append(i)
        #film['FilmID'].iloc[i]

index10 = np.argsort(predictions)[-10:]
filmid10 = []
for i in index10:
    filmid10.append(filmid[i])

recommended = []
for i in filmid10:
    recommended.append(film.loc[film['FilmID']==i]['Tittel'].iloc[0])
    
print(recommended)

['Make Them Die Slowly (Cannibal Ferox) (1980)', 'Battle of the Sexes, The (1959)', 'Criminal Lovers (Les Amants Criminels) (1999)', 'Wedding Bell Blues (1996)', 'Buck and the Preacher (1972)', 'Two Bits (1995)', 'Telling You (1998)', 'Institute Benjamenta, or This Dream People Call Human Life (1995)', 'Map of the World, A (1999)', 'Autumn in New York (2000)']
