# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [0]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [0]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")

In [0]:
user_features_df["key"] = 0
user_features_df["user_id"] = range(user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df, left_index=True, on="key")
merged_df.drop(labels='key', axis=1, inplace=True)  # drop the "key" column


In [4]:
merged_df[["item_id", "user_id"]]

Unnamed: 0,item_id,user_id
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [0]:
merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()
test = merged_df[merged_df.isnull().any(axis=1)]


In [6]:
n_latent_factors = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_factors))
latent_item_features = np.random.random((user_ratings.shape[1], n_latent_factors))

user_features = user_features_df.values
item_features = item_features_df.values

user_features = np.concatenate([np.ones(shape=(user_features.shape[0],1)), user_features], axis=1)
item_features = np.concatenate([np.ones(shape=(item_features.shape[0],1)), item_features], axis=1)

user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1]))
item_features_weights = np.random.random((user_ratings.shape[1], item_features.shape[1]))

print(user_features)

[[1. 1. 0. 0. 0.]
 [1. 0. 1. 0. 1.]
 [1. 0. 0. 0. 2.]
 [1. 1. 0. 0. 3.]
 [1. 0. 1. 0. 4.]
 [1. 0. 0. 0. 5.]
 [1. 0. 0. 0. 6.]
 [1. 1. 0. 0. 7.]
 [1. 0. 1. 0. 8.]
 [1. 1. 0. 0. 9.]]


In [0]:
def predict_rating(user_id, item_id):
    """
    Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print(user_preference.dot(item_preference), user_score, item_score)
    return user_preference.dot(item_preference) + user_score + item_score


def train(user_id, item_id, rating, alpha=0.001, 
          latent_feature_weight_decay=0.1,
          user_weight_decay=0.01, item_weight_decay=0.0001):
    #print(item_id)
    prediction_rating = predict_rating(user_id, item_id)
    err = prediction_rating - rating
    #print(err)
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * (latent_item_features[item_id] + latent_feature_weight_decay * latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * (user_pref_values + latent_feature_weight_decay * latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err * (user_features[user_id] + user_weight_decay * user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * (item_features_weights[item_id] + item_weight_decay * item_features_weights[item_id])
    
    return err


def sgd(iterations=30000):
    """ 
    Iterate over all users and all items and train for 
    a certain number of iterations
    """
    for iteration in range(iterations):
        error = []
        for user_id in range(latent_user_preferences.shape[0]):
            for item_id in range(latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if not np.isnan(rating):
                    err = train(user_id, item_id, rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print(mse)


In [8]:
for _ in range(10): 
    sgd()  # Note decreasing values with increasing iterations

0.2973442126648768
0.28602837586704016
0.2824716048686834
0.2807987319996694
0.27983567365215345
0.2792177093982289
0.27879568808427513
0.27849730965825586
0.2782833179057015
0.2781306400861111


In [9]:
predictions = np.zeros(shape=(latent_user_preferences.shape[0], latent_item_features.shape[0]))
print(user_features_weights)
print(item_features_weights)
for user_id in range(latent_user_preferences.shape[0]):
    for item_id in range(latent_item_features.shape[0]):
        predictions[user_id, item_id] =  predict_rating(user_id, item_id)
  

[[-1.720e+00 -1.185e+00  9.456e-01  6.286e-02  2.923e-01]
 [ 2.463e+00  1.683e-01  2.343e+00  1.001e+00  2.174e+00]
 [ 3.889e-01  5.195e-01  5.144e-01  6.454e-01  6.691e-01]
 [ 8.333e-01  7.998e-01  4.536e-01  9.199e-01  4.793e-01]
 [ 3.331e-03  2.324e-01 -1.003e-03  4.989e-01 -3.817e-02]
 [ 3.175e-01  3.052e-01  7.086e-01  2.599e-02  1.390e+00]
 [ 3.387e-01  4.357e-02  4.631e-01  8.108e-01 -2.902e-01]
 [ 3.518e-01  4.616e-01  4.465e-01  5.279e-01  5.265e-02]
 [ 1.062e+00  3.047e-01  1.763e-01  7.451e-01  7.070e-01]
 [ 6.368e-01  7.241e-01  4.557e-01  9.564e-01 -3.640e-01]]
[[2.109e+00 1.060e+00 3.477e-01 1.677e+00 2.302e+00]
 [1.769e-04 4.528e-05 2.647e-04 1.231e-04 1.229e-03]
 [2.972e+00 3.006e+00 2.035e+00 2.379e+00 2.871e+00]
 [1.753e+00 1.776e+00 1.281e+00 1.142e+00 1.482e-01]
 [6.708e-01 9.994e-01 3.097e+00 2.010e+00 9.817e-01]]


In [0]:
values = [zip(user_ratings[i], predictions[i]) for i in range(predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns

In [11]:
comparison_data


Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.989383309248961)","(2.0, 2.029506099927064)","(nan, -20.726217917686053)","(5.0, 4.986841662428718)","(4.0, 3.996292010743696)"
1,"(3.0, 2.900485154304468)","(2.0, 2.5597892743598885)","(nan, 64.69551485408711)","(7.0, 6.617535783984954)","(7.0, 6.924227461501811)"
2,"(9.0, 9.045443148475462)","(nan, 4.359782197174101)","(7.0, 7.003192190704558)","(8.0, 7.950086474143937)","(5.0, 5.001122536352218)"
3,"(nan, 8.966709927925518)","(nan, 4.929515489545647)","(7.0, 6.99999770626815)","(8.0, 8.000184266115024)","(9.0, 8.99965445027381)"
4,"(nan, 3.7994575069936447)","(1.0, 0.567794282651453)","(8.0, 7.989499944757836)","(3.0, 3.371037392783573)","(7.0, 7.0542398625809355)"
5,"(2.0, 2.001838849852293)","(3.0, 2.993946039927256)","(5.0, 4.999990675007442)","(nan, 2.3075513416474127)","(nan, 32.32565324019022)"
6,"(4.0, 4.308177275864381)","(2.0, 0.4359183030972078)","(nan, -3.6514472184116364)","(2.0, 2.999516365409395)","(7.0, 7.2057007460134255)"
7,"(7.0, 6.763408971764298)","(1.0, 2.906138425941872)","(2.0, 2.0822344993612436)","(7.0, 5.662053933059235)","(9.0, 8.759135007983314)"
8,"(3.0, 3.101485832100447)","(3.0, 2.518066196789666)","(nan, 73.21570623710834)","(7.0, 7.29084418988701)","(3.0, 3.0616652777800786)"
9,"(4.0, 3.888694701410135)","(nan, -0.14761759550276474)","(5.0, 4.9834809342277016)","(3.0, 3.113246252794392)","(3.0, 2.9965962990082193)"
