In [1]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

In [2]:
df_raw = pd.read_csv("archive/RAW_interactions.csv")

In [3]:
df_recipes = pd.read_csv("archive/RAW_recipes.csv")

df_interaction_train = pd.read_csv("archive/interactions_train.csv")
df_interaction_test= pd.read_csv("archive/interactions_validation.csv")

In [4]:
df_raw[["user_id","recipe_id","rating"]].count()

user_id      1132367
recipe_id    1132367
rating       1132367
dtype: int64

In [5]:
df_raw.drop_duplicates(subset=["user_id","recipe_id","rating"],keep="last")[["user_id","recipe_id","rating"]].count()

user_id      1132367
recipe_id    1132367
rating       1132367
dtype: int64

**Observation** 

Our raw user-food interaction matrix does not have duplicates and hence we can proceed with 3 columns : user_id , recipe_id and rating

In [6]:
df_raw=df_raw[["user_id","recipe_id","rating"]]

In [7]:
df_raw.user_id.max() - df_raw.user_id.min()

2002371173

## Converting the user_ids and recipe_ids into contiguous value series

In [8]:
user_id_set=set(df_raw.user_id.values)
user2idx={}
i=0
for user in user_id_set:
    user2idx[user]=i
    i+=1

In [9]:
recipe_id_set=set(df_raw.recipe_id.values)
recipe2idx={}
i=0
for recipe in recipe_id_set:
    recipe2idx[recipe]=i
    i+=1

In [10]:
df_raw['user_idx'] = df_raw.apply(lambda row: user2idx[row.user_id], axis=1)
df_raw['recipe_idx'] = df_raw.apply(lambda row: recipe2idx[row.recipe_id], axis=1)

In [11]:
df_raw.drop(["user_id","recipe_id"],axis=1,inplace=True)

In [12]:
N = df_raw.user_idx.max() + 1 # number of users
M = df_raw.recipe_idx.max() + 1 # number of recipes

In [13]:
N,M

(226570, 231637)

In [14]:
user_ids_count = Counter(df_raw.user_idx)
recipe_ids_count = Counter(df_raw.recipe_idx)

In [15]:
df_raw.head()

Unnamed: 0,rating,user_idx,recipe_idx
0,4,17890,23485
1,5,116700,23485
2,4,3970,25577
3,5,61790,49480
4,5,27974,49480


In [16]:
user_ids_keep = [i for (i,j) in user_ids_count.most_common()[::-1] if j>=10]
recipe_ids_keep = [i for (i,j) in recipe_ids_count.most_common()[::-1] if j>=10]

In [17]:
len(user_ids_keep)

12486

In [18]:
len(recipe_ids_keep)

21399

In [19]:
df_small = df_raw[(df_raw.user_idx.isin(user_ids_keep)) & df_raw.recipe_idx.isin(recipe_ids_keep)].reset_index(drop=True).copy()

In [20]:
df_small.head()

Unnamed: 0,rating,user_idx,recipe_idx
0,5,27684,46168
1,5,87869,46168
2,5,49778,46168
3,4,197797,46168
4,4,108252,46168


In [21]:
user_ids=set(df_small.user_idx.values)
recipe_ids=set(df_small.recipe_idx.values)

In [22]:
# need to remake user ids and movie ids since they are no longer sequential
new_user_id_map = {}
i = 0
for old in user_ids:
  new_user_id_map[old] = i
  i += 1
#print("i:", i)

new_recipe_id_map = {}
j = 0
for old in recipe_ids:
  new_recipe_id_map[old] = j
  j += 1
#print("j:", j)

In [23]:
df_small.loc[:, 'user_id'] = df_small.apply(lambda row: new_user_id_map[row.user_idx], axis=1)
df_small.loc[:, 'recipe_id'] = df_small.apply(lambda row: new_recipe_id_map[row.recipe_idx], axis=1)

In [24]:
df_small.drop(["user_idx","recipe_idx"],axis=1,inplace=True)

In [25]:
df_small

Unnamed: 0,rating,user_id,recipe_id
0,5,10594,10525
1,5,8624,10525
2,5,6601,10525
3,4,507,10525
4,4,3904,10525
...,...,...,...
401052,5,12289,10855
401053,5,1642,10855
401054,5,1004,10855
401055,5,11474,10855


In [26]:
print("max user id:", df_small.user_id.max())
print("max recipe id:", df_small.recipe_id.max())

print("small dataframe size:", len(df_small))

max user id: 12464
max recipe id: 21307
small dataframe size: 401057


In [27]:
N,M = df_small.user_id.max() + 1,df_small.recipe_id.max()+1

In [28]:
df=shuffle(df_small)

In [29]:
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [30]:
# a dictionary to tell us which users have rated which recipes
user2recipe = {}
# a dicationary to tell us which recipes have been rated by which users
recipe2user = {}
# a dictionary to look up ratings
userrecipe2rating = {}
print("Calling: update_user2movie_and_movie2user")
count = 0

Calling: update_user2movie_and_movie2user


In [31]:
df_small.head()

Unnamed: 0,rating,user_id,recipe_id
0,5,10594,10525
1,5,8624,10525
2,5,6601,10525
3,4,507,10525
4,4,3904,10525


In [32]:
def update_user2recipe_and_recipe2user(row):
    
    global count
    count += 1
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/cutoff))

    i = int(row.user_id)
    j = int(row.recipe_id)
    if i not in user2recipe:
        user2recipe[i] = [j]
    else:
        user2recipe[i].append(j)

    if j not in recipe2user:
        recipe2user[j] = [i]
    else:
        recipe2user[j].append(i)

    userrecipe2rating[(i,j)] = row.rating

In [33]:
df_train.apply(update_user2recipe_and_recipe2user, axis=1)

processed: 0.312
processed: 0.623
processed: 0.935


108531    None
345339    None
301954    None
31706     None
318355    None
          ... 
222184    None
171789    None
301704    None
240918    None
127120    None
Length: 320845, dtype: object

In [34]:
userrecipe2rating_test = {}
print("Calling: update_userrecipe2rating_test")
count = 0

Calling: update_userrecipe2rating_test


In [35]:
def update_userrecipe2rating_test(row):
    
    global count
    count += 1
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/len(df_test)))

    i = int(row.user_id)
    j = int(row.recipe_id)
    userrecipe2rating_test[(i,j)] = row.rating

In [36]:
df_test.apply(update_userrecipe2rating_test, axis=1)

333675    None
375418    None
143241    None
58583     None
6765      None
          ... 
241085    None
87177     None
251194    None
107754    None
57066     None
Length: 80212, dtype: object

In [37]:
n1 = np.max(list(user2recipe.keys())) + 1
n2 = np.max([u for (u, m), r in userrecipe2rating_test.items()])
# the test set may contain recipes the train set doesn't have data on
m1 = np.max(list(recipe2user.keys()))
m2 = np.max([m for (u, m), r in userrecipe2rating_test.items()])
M = max(m1, m2) + 1
N = max(n1, n2) + 1
print("N:", N, "M:", M)

N: 12466 M: 21308


# User - User Collaborative Filtering 

In [38]:
K = 25 # number of neighbors we'd like to consider
limit = 5 # number of common recipes users must have in common in order to consider
neighbors = {} # store neighbors in this list
averages = {} # each user's average rating for later use
deviations = {} # each user's deviation for later use
SIGMA_CONST = 1e-6

In [39]:
for j1,i in enumerate(list(set(df_train.user_id.values))):
    
    recipes_i = user2recipe[i]
    recipes_i_set = set(recipes_i)

    # calculate avg and deviation
    ratings_i = { recipe:userrecipe2rating[(i, recipe)] for recipe in recipes_i }
    avg_i = np.mean(list(ratings_i.values()))
    dev_i = { recipe:(rating - avg_i) for recipe, rating in ratings_i.items() }
    dev_i_values = np.array(list(dev_i.values()))
    sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))

    # save these for later use
    #averages.append((i,avg_i))
    #deviations.append((i,dev_i))
    
    averages[i]=avg_i
    deviations[i]=dev_i
    
    sl = SortedList()
    
    for i1,j in enumerate(list(set(df_train.user_id.values))):
        if j!=i:
            recipes_j = user2recipe[j]
            recipes_j_set = set(recipes_j)
            common_recipes = (recipes_i_set & recipes_j_set)
            if(len(common_recipes)>limit):
                
                # calculate avg and deviation
                ratings_j = { recipe:userrecipe2rating[(j, recipe)] for recipe in recipes_j }
                avg_j = np.mean(list(ratings_j.values()))
                dev_j = { recipe:(rating - avg_j) for recipe, rating in ratings_j.items() }
                dev_j_values = np.array(list(dev_j.values()))
                sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))
                
                # calculate correlation coefficient
                numerator = sum(dev_i[m]*dev_j[m] for m in common_recipes)
                denominator = ((sigma_i+SIGMA_CONST) * (sigma_j+SIGMA_CONST))
                #print((i,j))
                #print(numerator)
                #print(denominator)
                w_ij = numerator / (denominator)
                # insert into sorted list and truncate
                # negate absolute weight, because list is sorted ascending and we get all neighbors with the highest correlation
                # maximum value (1) is "closest"
                sl.add((-np.abs(w_ij), j))
                #print((-w_ij, j))
                # Putting an upper cap on the number of neighbors
                if len(sl)>K:
                    del sl[-1]
    if i%100==0:                
        print((i,j1,sl))
    neighbors[i]=sl

(0, 0, SortedList([]))
(100, 99, SortedList([(-0.00986868160820586, 1004)]))
(200, 199, SortedList([(-0.02294975058823153, 3516)]))
(300, 298, SortedList([]))
(400, 398, SortedList([]))
(500, 498, SortedList([(-0.032324112413773906, 6320), (-0.03212604544421148, 640), (-0.0254963497613206, 7608), (-0.024609804906838093, 10920), (-0.023193567805582252, 6821), (-0.022904033722201605, 4419), (-0.021984442351598455, 9759), (-0.016973528592779386, 4329), (-0.016529702533562708, 1931), (-0.016384241666107403, 11490), (-0.015684108299337616, 1516), (-0.015150096408056117, 8464), (-0.014846742310364505, 3771), (-0.013450780017529066, 3139), (-0.01327785776582788, 10183), (-0.013152810430937256, 9391), (-0.012219163310106862, 8392), (-0.011806158971313582, 6706), (-0.011782098302568768, 11811), (-0.01147495341302986, 10354), (-0.011418060408410346, 12131), (-0.01090401541906544, 11397), (-0.010270605202534453, 1004), (-0.010119668700297681, 3403), (-0.009193239356064527, 6994)]))
(600, 598, Sor

(8300, 8290, SortedList([(-0.014036377978731834, 3516), (-0.006599591244236586, 11811), (-0.0007969136631050556, 1004), (-0.00019053914201699114, 11153)]))
(8400, 8390, SortedList([(-0.024437402251941635, 1004), (-0.0010677811929844844, 5574)]))
(8500, 8490, SortedList([(-0.005446845087738212, 1004)]))
(8600, 8590, SortedList([]))
(8700, 8690, SortedList([]))
(8800, 8790, SortedList([(-0.010007381038510467, 4324), (-0.004321123814398378, 1004), (-0.0008557494534682006, 7411)]))
(8900, 8889, SortedList([(-0.00455474758919533, 1004), (-0.0024086718055395997, 11811), (-0.00156170862845503, 6809), (-5.464924146294584e-05, 2415)]))
(9000, 8989, SortedList([(-0.053591267840222535, 1440), (-0.02229662220957037, 4621), (-0.01715883806103126, 8442), (-0.015465003233935036, 2415), (-0.015016613271127875, 1004), (-0.014599163533540637, 9759), (-0.011796930386389563, 11331), (-0.011677896159390778, 5945), (-0.008893975465005524, 4419), (-0.008801482618662834, 11811), (-0.007433510272119857, 4129),

In [40]:
def predict(i, m):
    # calculate the weighted sum of deviations
    numerator = 0
    denominator = 0
    neighbors_get=neighbors.get(i,0)
    
    if neighbors_get!=0:
        for neg_w, j in neighbors.get(i):
            # remember, the weight is stored as its negative
            # so the negative of the negative weight is the positive weight
            try:
                numerator += -neg_w * deviations[j][m]
                denominator += abs(neg_w)
            except KeyError:
              # neighbor may not have rated the same rating
              # don't want to do dictionary lookup twice
              # so just throw exception
              pass

        if denominator == 0:
            prediction = averages.get(i)
        else:
            prediction = numerator / denominator + averages.get(i)
    else:
        prediction = averages.get(i,-1)
    if prediction==-1:
        #print("New User Identified")
        pass
    else:
        prediction = min(5, prediction)
        prediction = max(0.5, prediction) # min rating is 0.5
    return prediction

In [41]:
train_predictions = []
train_targets = []
for (i, m), target in userrecipe2rating.items():
    
    # calculate the prediction for this recipe
    prediction = predict(i, m)

    # save the prediction and target
    train_predictions.append(prediction)
    train_targets.append(target)

In [42]:
test_predictions = []
test_targets = []
# same thing for test set
for (i, m), target in userrecipe2rating_test.items():
    
    # calculate the prediction for this recipe
    prediction_test = predict(i, m)

    # save the prediction and target
    test_predictions.append(prediction_test)
    test_targets.append(target)

In [43]:
# calculate accuracy
def mse(p, t):
    
    p = np.array(p)
    t = np.array(t)
    return np.mean((p - t)**2)

def rmse(p, t):
    
    p = np.array(p)
    t = np.array(t)
    return np.sqrt(np.mean((p - t)**2))

print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))

print('train rmse:', rmse(train_predictions, train_targets))
print('test rmse:', rmse(test_predictions, test_targets))

train mse: 0.8615178024084629
test mse: 0.9062015866125724
train rmse: 0.9281798330110728
test rmse: 0.9519462099365554


In [44]:
df_train.to_csv("df_collaborative_train.csv")
df_test.to_csv("df_collaborative_train.csv")