<a href="https://colab.research.google.com/github/albanda/CE888/blob/master/lab4-recommender/rec_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC

np.set_printoptions(precision=3)

Now load the data

In [2]:
user_ratings_df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab4-recommender/user_ratings.csv")
user_features_df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab4-recommender/user_features.csv")
item_features_df = pd.read_csv("https://raw.githubusercontent.com/albanda/CE888/master/lab4-recommender/item_features.csv")

In [3]:
user_features_df["key"] = 0
user_features_df["user_id"] = range(user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df, left_index=True, on="key")
merged_df.drop(labels='key', axis=1, inplace=True)  # drop the "key" column


In [4]:
merged_df[["item_id", "user_id"]]

Unnamed: 0,item_id,user_id
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [21]:
merged_df["rating"] = list(map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples()))

train = merged_df.dropna()
test = merged_df[merged_df.isnull().any(axis=1)]

In [19]:
print(train.head())

   Sex   Over60  user_id  Critic0   Critic1  item_id  rating
0  1.0      0.0        0      0.3       0.9        0     8.0
1  1.0      0.0        0      0.9       0.3        1     2.0
3  1.0      0.0        0      0.2       0.1        3     5.0
4  1.0      0.0        0      0.7       0.8        4     4.0
0  0.0      1.0        1      0.3       0.9        0     3.0


In [20]:
print(test.head())

   Sex   Over60  user_id  Critic0   Critic1  item_id  rating
2  1.0      0.0        0      0.6       0.4        2     NaN
2  0.0      1.0        1      0.6       0.4        2     NaN
1  0.0      0.0        2      0.9       0.3        1     NaN
0  1.0      0.0        3      0.3       0.9        0     NaN
1  1.0      0.0        3      0.9       0.3        1     NaN


In [22]:
X_tr, y_tr = train.iloc[:, :-1].values, train.iloc[:, -1].values  # separate into X and y

clf = SVC()
clf.fit(X_tr, y_tr)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
X_test = test.iloc[:, :-1].values
predictions = clf.predict(X_test)

In [None]:
n_latent_factors = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_factors))
latent_item_features = np.random.random((user_ratings.shape[1], n_latent_factors))

user_features = user_features_df.values
item_features = item_features_df.values

user_features = np.concatenate([np.ones(shape=(user_features.shape[0],1)), user_features], axis=1)
item_features = np.concatenate([np.ones(shape=(item_features.shape[0],1)), item_features], axis=1)

user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1]))
item_features_weights = np.random.random((user_ratings.shape[1], item_features.shape[1]))

print(user_features)

[[1. 1. 0. 0. 0.]
 [1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 2.]
 [1. 1. 0. 0. 3.]
 [1. 0. 1. 0. 4.]
 [1. 0. 0. 0. 5.]
 [1. 0. 0. 0. 6.]
 [1. 1. 0. 0. 7.]
 [1. 0. 1. 0. 8.]
 [1. 1. 0. 0. 9.]]


In [None]:
def predict_rating(user_id, item_id):
    """
    Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print(user_preference.dot(item_preference), user_score, item_score)
    return user_preference.dot(item_preference) + user_score + item_score


def train(user_id, item_id, rating, alpha=0.001, 
          latent_feature_weight_decay=0.1,
          user_weight_decay=0.01, item_weight_decay=0.0001):
    #print(item_id)
    prediction_rating = predict_rating(user_id, item_id)
    err = prediction_rating - rating
    #print(err)
    user_pref_values = latent_user_preferences[user_id]
    latent_user_preferences[user_id] -= alpha * err * (latent_item_features[item_id] + latent_feature_weight_decay * latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * (user_pref_values + latent_feature_weight_decay * latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err * (user_features[user_id] + user_weight_decay * user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * (item_features_weights[item_id] + item_weight_decay * item_features_weights[item_id])
    
    return err


def sgd(iterations):
    """ 
    Iterate over all users and all items and train for 
    a certain number of iterations
    """
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = user_ratings[user_id, item_id]
                if not np.isnan(rating):
                    err = train(user_id, item_id, rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print(mse)


In [None]:
for _ in range(10): 
    sgd(30000)  # Note decreasing values with iterations

0.2924306742808656
0.28453077564694945
0.2817163772237676
0.28029527427222917
0.27944187497457373
0.2788770146949821
0.27848036768199524
0.27819148738609634
0.27797666950397126
0.2778156670168853


In [None]:
predictions = np.zeros(shape=(latent_user_preferences.shape[0], latent_item_features.shape[0]))
print(user_features_weights)
print(item_features_weights)
for user_id in range(latent_user_preferences.shape[0]):
    for item_id in range(latent_item_features.shape[0]):
        predictions[user_id, item_id] =  predict_rating(user_id, item_id)
  

[[-1.496 -1.307  0.444  0.539  0.628]
 [ 2.674  0.509  2.741  0.647  2.021]
 [ 0.339  0.126  0.938  0.005  0.84 ]
 [ 0.122  0.986  0.498  0.121  0.702]
 [ 0.46   0.093  0.017  0.817 -0.141]
 [ 0.895  0.12   0.722  0.793  1.09 ]
 [-0.156  0.074  0.539  0.733 -0.207]
 [ 0.833  0.652  0.242  0.973 -0.035]
 [ 0.925  0.431  0.601  0.134  0.754]
 [ 0.031  0.852  0.121  0.676 -0.289]]
[[6.667e-01 1.579e+00 1.659e+00 6.338e-01 4.093e-01]
 [8.534e-03 3.520e-03 4.218e-03 5.424e-03 2.978e-03]
 [6.160e-01 3.625e+00 2.471e-01 3.220e+00 2.452e+00]
 [1.017e-01 8.029e-01 6.620e-01 1.816e-01 6.818e-01]
 [1.703e+00 1.240e+00 1.356e+00 1.803e+00 1.462e+00]]


In [None]:
values = [zip(user_ratings[i], predictions[i]) for i in range(predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns

In [None]:
comparison_data


Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.989009214774393)","(2.0, 2.0298502407257972)","(nan, -21.07706957234261)","(5.0, 4.987066693419507)","(4.0, 3.9961884466574507)"
1,"(3.0, 2.8999445144728613)","(2.0, 2.5611088026228175)","(nan, 65.59528301522094)","(7.0, 6.61683521881328)","(7.0, 6.924038094649975)"
2,"(9.0, 9.045140131418352)","(nan, 4.359561735359623)","(7.0, 7.00313600116011)","(8.0, 7.949536858691001)","(5.0, 5.0007776886254)"
3,"(nan, 8.967631597452794)","(nan, 4.930348965028055)","(7.0, 7.0000060404902005)","(8.0, 7.9995708172424695)","(9.0, 8.999353129545058)"
4,"(nan, 3.807001082202804)","(1.0, 0.570007765561784)","(8.0, 7.98976007424398)","(3.0, 3.3696153784184646)","(7.0, 7.053859914069573)"
5,"(2.0, 2.001749622460377)","(3.0, 2.995069364751593)","(5.0, 5.000022201138316)","(nan, 2.2937471837542627)","(nan, 32.34688589300806)"
6,"(4.0, 4.307705591973359)","(2.0, 0.43730846358782094)","(nan, -3.6998449407284824)","(2.0, 2.999215875707626)","(7.0, 7.205439717829883)"
7,"(7.0, 6.762140454422343)","(1.0, 2.907086263403007)","(2.0, 2.081245659730257)","(7.0, 5.661495055235152)","(9.0, 8.759411762048767)"
8,"(3.0, 3.1013073890122174)","(3.0, 2.518825593521948)","(nan, 74.2135973583635)","(7.0, 7.29050165818998)","(3.0, 3.061401919598006)"
9,"(4.0, 3.8908403463985706)","(nan, -0.14773235754631464)","(5.0, 4.983946474710881)","(3.0, 3.1110872431062866)","(3.0, 2.9966828940180683)"
