# Imports

In [2]:
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise import accuracy
import heapq
import pickle 
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import AlgoBase
from create_similarity_vectors import create_top_k_similar_vectors
from sentence_transformers import util
import torch
import sampling

from surprise.prediction_algorithms.predictions import PredictionImpossible
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from sklearn.model_selection import train_test_split as train_test_split
from surprise.model_selection import train_test_split as train_test_split_sup

# Constants

In [3]:
MATRIX_ING_50 = "G:/Recipes/Matrix/ING_50/matrix_top25k_ing_50.obj"
POS_TO_RECIPE_ID_MATRIX = "G:/Recipes/Matrix/ING_50/pos_to_recipe_id.obj"
RECIPE_ID_TO_POS_MATRIX = "G:/Recipes/Matrix/ING_50/recipe_id_to_pos.obj"

RATINGS_BASE = "../Data/base/ratings_base.parquet"

WORD2VEC_ING_50_VECTORS = "G:/Recipes/Vectors/ingredients_vectors_50_dict.obj"

# Load data

In [33]:
with open("G:/Recipes/Matrix/ING_50/matrix_top25k_ing_50.obj", "rb") as input_file:
    matrix_ing_50 = pickle.load(input_file)

In [39]:
with open(POS_TO_RECIPE_ID_MATRIX, "rb") as input_file:
    pos_to_recipe_id = pickle.load(input_file)

In [40]:
with open(RECIPE_ID_TO_POS_MATRIX, "rb") as input_file:
    recipe_id_to_pos = pickle.load(input_file)

In [9]:
with open(WORD2VEC_ING_50_VECTORS, "rb") as input_file:
    word2vec_ing_vectors = pickle.load(input_file)

In [4]:
ratings_base = pd.read_parquet(RATINGS_BASE)

In [5]:
ratings_sample = sampling.get_ratings_with_min_number_list(ratings_base, [20,10])

In [6]:
del(ratings_base)

## Create dataset

In [7]:
def create_train_test_dataframe(ratings_df, test_size, random_state):
    x_train, x_test, y_train, y_test = train_test_split(
        ratings_df[["AuthorId", "RecipeId"]], 
        ratings_df[["Rating"]], 
        test_size=test_size, 
        random_state=random_state, 
        stratify=ratings_df["AuthorId"])
    
    trainset = x_train.merge(y_train, left_index=True, right_index=True)
    testset = x_test.merge(y_test, left_index=True, right_index=True)

    return trainset, testset

In [8]:
def train_test_surprise_format(trainset_df, testset_df):
    trainset_surprise = Dataset.load_from_df(trainset_df[["AuthorId", "RecipeId", "Rating"]], Reader(rating_scale=(0, 5)))
    trainset_surprise = trainset_surprise.build_full_trainset()
    
    testset_surprise = list(testset_df.to_records())
    testset_surprise = [(x[1], x[2], x[3]) for x in testset_surprise]
    
    return trainset_surprise, testset_surprise

In [9]:
user_item_ratings_dataset = Dataset.load_from_df(ratings_sample[["AuthorId", "RecipeId", "Rating"]], Reader(rating_scale=(0, 5)))

In [10]:
train_df, test_df = create_train_test_dataframe(ratings_sample, 0.2, 13)
trainset, testset = train_test_surprise_format(train_df, test_df)

# KNN Algo

In [11]:
class KnnMatrixAglorithm(AlgoBase):
    def __init__(self, matrix, recipe_id_to_pos, vectors, k=40, verbose=True):
        self.k = k
        self.verbose = verbose
        self.matrix = matrix
        self.recipe_id_to_pos = recipe_id_to_pos
        self.vectors = vectors
    
    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.items_not_in_matrix = {}
        for item in trainset.all_items():
            raw_item = trainset.to_raw_iid(item)
            self.items_not_in_matrix[raw_item] = {}
        return self
    
    def estimate(self, u, i):
        if not (self.trainset.knows_item(i)):
#             if(self.verbose):
#                 print('raise for ', u, ' ', i)
            raise PredictionImpossible('Item is unknown')
        if not (self.trainset.knows_user(u)):
            raise PredictionImpossible('User is unknown')
        
        item_recipe_id = self.trainset.to_raw_iid(i)
        is_in_matrix = True
        neighbours = []
        calculated = 0
        from_matrix = 0
        from_dict = 0
        for rating in self.trainset.ur[u]:
            recipe_id = self.trainset.to_raw_iid(rating[0])
            
            try:
                item_pos = self.recipe_id_to_pos[item_recipe_id]
            except:
#                 print(f"{item_recipe_id} error")
                is_in_matrix = False
            
            try:
                recipe_pos = self.recipe_id_to_pos[recipe_id]
            except:
#                 print(f"{recipe_id} error")
                is_in_matrix = False
                
            if is_in_matrix:
                sim = self.matrix[item_pos, recipe_pos]
                from_matrix += 1
            else:
#                 print(item_recipe_id, " ", recipe_id)
                try:
                    if item_recipe_id > recipe_id:
                        sim = self.items_not_in_matrix[item_recipe_id][recipe_id]
                    else:
                        sim = self.items_not_in_matrix[recipe_id][item_recipe_id]
                    from_dict += 1
                except:
                    tensor1 = torch.tensor(self.vectors[item_recipe_id], dtype=torch.float)
                    tensor2 = torch.tensor(self.vectors[recipe_id], dtype=torch.float)

                    sim = util.pytorch_cos_sim(tensor1, tensor2)[0][0].item()
                    
                    if item_recipe_id > recipe_id:
                        self.items_not_in_matrix[item_recipe_id][recipe_id] = sim
                    else:
                        self.items_not_in_matrix[recipe_id][item_recipe_id] = sim
                    calculated += 1   
#                     print(f'Calculating similarity for {recipe_id} and {item_recipe_id}')
                
            neighbours.append((sim, rating[1]))
        k_neighbours = heapq.nlargest(self.k, neighbours, key=lambda t: t[0])
        
        sim_total = 0
        weighted_sum = 0
        for sim_score, rating in k_neighbours:
            if sim_score > 0:
                sim_total += sim_score
                weighted_sum += sim_score * rating
                
        if sim_total == 0:
            raise PredictionImpossible('No neighbours')
            
        predicted_rating= weighted_sum / sim_total
        
#         print(f"Calculated {calculated}, from matrix: {from_matrix}, from dict: {from_dict}")
        return predicted_rating

In [56]:
knnMatrixAlgorithm = KnnMatrixAglorithm(matrix = matrix_ing_50, recipe_id_to_pos=recipe_id_to_pos, 
                                        vectors=word2vec_ing_vectors)
knnMatrixAlgorithm.fit(trainset)

## Predictions ING CAT 50

In [53]:
predictions = knnMatrixAlgorithm.test(testset)

In [21]:
accuracy.rmse(predictions)

RMSE: 0.8631


0.8631389410134441

## Predictions ING 50

In [52]:
predictions = knnMatrixAlgorithm.test(testset)

In [45]:
accuracy.rmse(predictions)

RMSE: 0.8639


0.8638989963596698

## Predictions ING CAT 100

In [48]:
del(matrix_ing_50)
del(recipe_id_to_pos)
del(pos_to_recipe_id)

In [12]:
with open("C:/Users/Użytkownik/Recipes/Data/word_embeddings_files/matrix/ING_CAT_100_VECTORS/matrix_25k_most_popular.obj", "rb") as input_file:
    matrix_ing_100 = pickle.load(input_file)

with open("C:/Users/Użytkownik/Recipes/Data/word_embeddings_files/matrix/ING_CAT_100_VECTORS/pos_to_idmatrix_25k_most_popular.obj", "rb") as input_file:
    pos_to_recipe_id = pickle.load(input_file)

with open("C:/Users/Użytkownik/Recipes/Data/word_embeddings_files/matrix/ING_CAT_100_VECTORS/id_to_pos_matrix_25k_most_popular.obj", "rb") as input_file:
    recipe_id_to_pos = pickle.load(input_file)

with open("C:/Users/Użytkownik/Recipes/Data/word_embeddings_files/vectors/word2vec_ing_cat_vectors.obj", "rb") as input_file:
    word2vec_ing_100_vectors = pickle.load(input_file)

In [13]:
knnMatrixAlgorithm = KnnMatrixAglorithm(matrix = matrix_ing_100, recipe_id_to_pos=recipe_id_to_pos, 
                                        vectors=word2vec_ing_100_vectors)
knnMatrixAlgorithm.fit(trainset)

<__main__.KnnMatrixAglorithm at 0x1ea24d92e80>

In [16]:
%%time
predictions = knnMatrixAlgorithm.test(testset)

Wall time: 42min 28s


In [17]:
accuracy.rmse(predictions)

RMSE: 0.8646


0.8645553249084379

## ING 100

In [20]:
del(matrix_ing_100)
del(recipe_id_to_pos)
del(pos_to_recipe_id)

with open("C:/Users/Użytkownik/Recipes/Data/word_embeddings_files/matrix/ING_100_VECTORS/matrix_25k_most_popular.obj", "rb") as input_file:
    matrix_ing_100 = pickle.load(input_file)

with open("C:/Users/Użytkownik/Recipes/Data/word_embeddings_files/matrix/ING_100_VECTORS/id_to_pos_matrix_25k_most_popular.obj", "rb") as input_file:
    recipe_id_to_pos = pickle.load(input_file)

with open("C:/Users/Użytkownik/Recipes/Data/word_embeddings_files/vectors/word2vec_ingredients_vectors.obj", "rb") as input_file:
    word2vec_ing_100_vectors = pickle.load(input_file)



UsageError: Line magic function `%%time` not found.


In [21]:
%%time
knnMatrixAlgorithm = KnnMatrixAglorithm(matrix = matrix_ing_100, recipe_id_to_pos=recipe_id_to_pos, 
                                        vectors=word2vec_ing_100_vectors)
knnMatrixAlgorithm.fit(trainset)


predictions = knnMatrixAlgorithm.test(testset)

accuracy.rmse(predictions)

RMSE: 0.8644
Wall time: 47min 20s


0.8643714553957503

In [41]:
all_items = []
for item in trainset.all_items():
    item_raw = trainset.to_raw_iid(item)
    all_items.append(item_raw)

In [42]:
len(set(all_items) & set(recipe_id_to_pos.keys()))

25719