In [2]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

In [3]:
df_raw = pd.read_csv("archive/RAW_interactions.csv")

In [4]:
df_recipes = pd.read_csv("archive/RAW_recipes.csv")

df_interaction_train = pd.read_csv("archive/interactions_train.csv")
df_interaction_test= pd.read_csv("archive/interactions_validation.csv")

In [5]:
df_raw[["user_id","recipe_id","rating"]].count()

user_id      1132367
recipe_id    1132367
rating       1132367
dtype: int64

In [6]:
df_raw.drop_duplicates(subset=["user_id","recipe_id","rating"],keep="last")[["user_id","recipe_id","rating"]].count()

user_id      1132367
recipe_id    1132367
rating       1132367
dtype: int64

**Observation** 

Our raw user-food interaction matrix does not have duplicates and hence we can proceed with 3 columns : user_id , recipe_id and rating

In [7]:
df_raw=df_raw[["user_id","recipe_id","rating"]]

In [8]:
df_raw.user_id.max() - df_raw.user_id.min()

2002371173

## Converting the user_ids and recipe_ids into contiguous value series

In [9]:
user_id_set=set(df_raw.user_id.values)
user2idx={}
i=0
for user in user_id_set:
    user2idx[user]=i
    i+=1

In [10]:
recipe_id_set=set(df_raw.recipe_id.values)
recipe2idx={}
i=0
for recipe in recipe_id_set:
    recipe2idx[recipe]=i
    i+=1

In [11]:
df_raw['user_idx'] = df_raw.apply(lambda row: user2idx[row.user_id], axis=1)
df_raw['recipe_idx'] = df_raw.apply(lambda row: recipe2idx[row.recipe_id], axis=1)

In [12]:
df_raw.drop(["user_id","recipe_id"],axis=1,inplace=True)

In [13]:
N = df_raw.user_idx.max() + 1 # number of users
M = df_raw.recipe_idx.max() + 1 # number of recipes

In [14]:
N,M

(226570, 231637)

In [15]:
user_ids_count = Counter(df_raw.user_idx)
recipe_ids_count = Counter(df_raw.recipe_idx)

In [16]:
df_raw.head()

Unnamed: 0,rating,user_idx,recipe_idx
0,4,17890,23485
1,5,116700,23485
2,4,3970,25577
3,5,61790,49480
4,5,27974,49480


In [17]:
user_ids_keep = [i for (i,j) in user_ids_count.most_common()[::-1] if j>=10]
recipe_ids_keep = [i for (i,j) in recipe_ids_count.most_common()[::-1] if j>=10]

In [19]:
len(user_ids_keep)

12486

In [20]:
len(recipe_ids_keep)

21399

In [18]:
df_small = df_raw[(df_raw.user_idx.isin(user_ids_keep)) & df_raw.recipe_idx.isin(recipe_ids_keep)].reset_index(drop=True).copy()

In [22]:
df_small.head()

Unnamed: 0,rating,user_idx,recipe_idx
0,5,27684,46168
1,5,87869,46168
2,5,49778,46168
3,4,197797,46168
4,4,108252,46168


In [19]:
user_ids=set(df_small.user_idx.values)
recipe_ids=set(df_small.recipe_idx.values)

In [20]:
# need to remake user ids and movie ids since they are no longer sequential
new_user_id_map = {}
i = 0
for old in user_ids:
  new_user_id_map[old] = i
  i += 1
#print("i:", i)

new_recipe_id_map = {}
j = 0
for old in recipe_ids:
  new_recipe_id_map[old] = j
  j += 1
#print("j:", j)

In [21]:
df_small.loc[:, 'user_id'] = df_small.apply(lambda row: new_user_id_map[row.user_idx], axis=1)
df_small.loc[:, 'recipe_id'] = df_small.apply(lambda row: new_recipe_id_map[row.recipe_idx], axis=1)

In [22]:
df_small.drop(["user_idx","recipe_idx"],axis=1,inplace=True)

In [23]:
df_small

Unnamed: 0,rating,user_id,recipe_id
0,5,10594,10525
1,5,8624,10525
2,5,6601,10525
3,4,507,10525
4,4,3904,10525
...,...,...,...
401052,5,12289,10855
401053,5,1642,10855
401054,5,1004,10855
401055,5,11474,10855


In [24]:
print("max user id:", df_small.user_id.max())
print("max recipe id:", df_small.recipe_id.max())

print("small dataframe size:", len(df_small))

max user id: 12464
max recipe id: 21307
small dataframe size: 401057


In [25]:
N,M = df_small.user_id.max() + 1,df_small.recipe_id.max()+1

In [26]:
df=shuffle(df_small)

In [27]:
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [28]:
# a dictionary to tell us which users have rated which recipes
user2recipe = {}
# a dicationary to tell us which recipes have been rated by which users
recipe2user = {}
# a dictionary to look up ratings
userrecipe2rating = {}
print("Calling: update_user2movie_and_movie2user")
count = 0

Calling: update_user2movie_and_movie2user


In [29]:
df_small.head()

Unnamed: 0,rating,user_id,recipe_id
0,5,10594,10525
1,5,8624,10525
2,5,6601,10525
3,4,507,10525
4,4,3904,10525


In [30]:
def update_user2recipe_and_recipe2user(row):
    
    global count
    count += 1
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/cutoff))

    i = int(row.user_id)
    j = int(row.recipe_id)
    if i not in user2recipe:
        user2recipe[i] = [j]
    else:
        user2recipe[i].append(j)

    if j not in recipe2user:
        recipe2user[j] = [i]
    else:
        recipe2user[j].append(i)

    userrecipe2rating[(i,j)] = row.rating

In [31]:
df_train.apply(update_user2recipe_and_recipe2user, axis=1)

processed: 0.312
processed: 0.623
processed: 0.935


132737    None
6968      None
286472    None
237182    None
240154    None
          ... 
194967    None
3713      None
116959    None
289622    None
98138     None
Length: 320845, dtype: object

In [32]:
userrecipe2rating_test = {}
print("Calling: update_userrecipe2rating_test")
count = 0

Calling: update_userrecipe2rating_test


In [33]:
def update_userrecipe2rating_test(row):
    
    global count
    count += 1
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/len(df_test)))

    i = int(row.user_id)
    j = int(row.recipe_id)
    userrecipe2rating_test[(i,j)] = row.rating

In [34]:
df_test.apply(update_userrecipe2rating_test, axis=1)

391619    None
97793     None
378067    None
318673    None
208280    None
          ... 
66705     None
263383    None
197676    None
193524    None
31686     None
Length: 80212, dtype: object

In [35]:
n1 = np.max(list(user2recipe.keys())) + 1
n2 = np.max([u for (u, m), r in userrecipe2rating_test.items()])
# the test set may contain recipes the train set doesn't have data on
m1 = np.max(list(recipe2user.keys()))
m2 = np.max([m for (u, m), r in userrecipe2rating_test.items()])
M = max(m1, m2) + 1
N = max(n1, n2) + 1
print("N:", N, "M:", M)

N: 12466 M: 21308


# User - User Collaborative Filtering 

In [36]:
K = 25 # number of neighbors we'd like to consider
limit = 5 # number of common recipes users must have in common in order to consider
neighbors = {} # store neighbors in this list
averages = {} # each user's average rating for later use
deviations = {} # each user's deviation for later use
SIGMA_CONST = 1e-6

In [39]:
for j1,i in enumerate(list(set(df_train.user_id.values))):
    
    recipes_i = user2recipe[i]
    recipes_i_set = set(recipes_i)

    # calculate avg and deviation
    ratings_i = { recipe:userrecipe2rating[(i, recipe)] for recipe in recipes_i }
    avg_i = np.mean(list(ratings_i.values()))
    dev_i = { recipe:(rating - avg_i) for recipe, rating in ratings_i.items() }
    dev_i_values = np.array(list(dev_i.values()))
    sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))

    # save these for later use
    #averages.append((i,avg_i))
    #deviations.append((i,dev_i))
    
    averages[i]=avg_i
    deviations[i]=dev_i
    
    sl = SortedList()
    
    for i1,j in enumerate(list(set(df_train.user_id.values))):
        if j!=i:
            recipes_j = user2recipe[j]
            recipes_j_set = set(recipes_j)
            common_recipes = (recipes_i_set & recipes_j_set)
            if(len(common_recipes)>limit):
                
                # calculate avg and deviation
                ratings_j = { recipe:userrecipe2rating[(j, recipe)] for recipe in recipes_j }
                avg_j = np.mean(list(ratings_j.values()))
                dev_j = { recipe:(rating - avg_j) for recipe, rating in ratings_j.items() }
                dev_j_values = np.array(list(dev_j.values()))
                sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))
                
                # calculate correlation coefficient
                numerator = sum(dev_i[m]*dev_j[m] for m in common_recipes)
                denominator = ((sigma_i+SIGMA_CONST) * (sigma_j+SIGMA_CONST))
                #print((i,j))
                #print(numerator)
                #print(denominator)
                w_ij = numerator / (denominator)
                # insert into sorted list and truncate
                # negate absolute weight, because list is sorted ascending and we get all neighbors with the highest correlation
                # maximum value (1) is "closest"
                sl.add((-np.abs(w_ij), j))
                #print((-w_ij, j))
                # Putting an upper cap on the number of neighbors
                if len(sl)>K:
                    del sl[-1]
    if i%100==0:                
        print((i,j1,sl))
    neighbors[i]=sl

(0, 0, SortedList([]))
(100, 100, SortedList([]))
(200, 200, SortedList([]))
(300, 300, SortedList([]))
(400, 400, SortedList([]))
(500, 499, SortedList([(-0.05349261147650021, 4419), (-0.037159486543857034, 3117), (-0.034664997736231225, 8850), (-0.028707052404000052, 8932), (-0.0277515736189838, 4223), (-0.025807169607174173, 5194), (-0.02245176298178745, 1748), (-0.021546547190093933, 10946), (-0.019215734228768575, 10350), (-0.01837134685216954, 11397), (-0.018183021020847737, 7466), (-0.01810575211254439, 1596), (-0.016783684624145177, 593), (-0.016593498386726597, 5466), (-0.015638836824134358, 1931), (-0.015484381061828525, 8392), (-0.014681365940732395, 4329), (-0.01384492961295538, 11002), (-0.013489450024711655, 10082), (-0.012544517597654676, 1781), (-0.012535299934027166, 3139), (-0.011778337476416463, 4011), (-0.010531579792976921, 782), (-0.010497505959794541, 6128), (-0.010155144040560256, 11115)]))
(600, 599, SortedList([(-0.01389353912500327, 1004)]))
(700, 699, Sorted

(8100, 8091, SortedList([(-0.0049525026639567074, 1004)]))
(8200, 8191, SortedList([(-0.8414021379548168, 2766), (-0.8249621951740308, 10677), (-0.010801326463142674, 2826), (-0.00889977048248447, 11648), (-0.008697487380352283, 1004), (-0.007402904384535408, 1384), (-0.00552958299818585, 6844), (-0.004771171810653319, 2898), (-0.004442435235940469, 6657), (-0.00344302299509751, 2415), (-0.003293969591908867, 11532), (-0.0032651763033945532, 4033), (-0.0029787323298697704, 6994), (-0.002750157405574406, 11153), (-0.002271055227360244, 6868), (-0.0011806039842569087, 7764), (-0.0007820243821844117, 4129), (-8.876529162662338e-05, 4011)]))
(8300, 8291, SortedList([(-0.0032966705934380624, 12174), (-0.0018362327706553192, 1004), (-0.0005763576315813361, 11811)]))
(8400, 8391, SortedList([(-0.02053005959294371, 1004)]))
(8500, 8491, SortedList([]))
(8600, 8591, SortedList([]))
(8700, 8691, SortedList([]))
(8800, 8791, SortedList([(-0.019170665241323735, 7764), (-0.0038576937029620364, 1004

In [41]:
def predict(i, m):
    # calculate the weighted sum of deviations
    numerator = 0
    denominator = 0
    neighbors_get=neighbors.get(i,0)
    
    if neighbors_get!=0:
        for neg_w, j in neighbors.get(i):
            # remember, the weight is stored as its negative
            # so the negative of the negative weight is the positive weight
            try:
                numerator += -neg_w * deviations[j][m]
                denominator += abs(neg_w)
            except KeyError:
              # neighbor may not have rated the same rating
              # don't want to do dictionary lookup twice
              # so just throw exception
              pass

        if denominator == 0:
            prediction = averages.get(i)
        else:
            prediction = numerator / denominator + averages.get(i)
    else:
        prediction = averages.get(i,-1)
    if prediction==-1:
        #print("New User Identified")
        pass
    else:
        prediction = min(5, prediction)
        prediction = max(0.5, prediction) # min rating is 0.5
    return prediction

In [42]:
train_predictions = []
train_targets = []
for (i, m), target in userrecipe2rating.items():
    
    # calculate the prediction for this recipe
    prediction = predict(i, m)

    # save the prediction and target
    train_predictions.append(prediction)
    train_targets.append(target)

In [43]:
test_predictions = []
test_targets = []
# same thing for test set
for (i, m), target in userrecipe2rating_test.items():
    
    # calculate the prediction for this recipe
    prediction_test = predict(i, m)

    # save the prediction and target
    test_predictions.append(prediction_test)
    test_targets.append(target)

In [44]:
# calculate accuracy
def mse(p, t):
    
    p = np.array(p)
    t = np.array(t)
    return np.mean((p - t)**2)

def rmse(p, t):
    
    p = np.array(p)
    t = np.array(t)
    return np.sqrt(np.mean((p - t)**2))

print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))

print('train rmse:', rmse(train_predictions, train_targets))
print('test rmse:', rmse(test_predictions, test_targets))

train mse: 0.8652148447990182
test mse: 0.8870792994234658
train rmse: 0.9301692559953905
test rmse: 0.9418488729214819
