In [1]:

import numpy as np
import pandas as pd

from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from typing import List, Dict, Tuple
import random
import pickle
import pprint
import spacy

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy


product_data = {
    "data_context": "books",
    "product_filepath": "data/products_books_v1_10_10.csv",
    "transactions_filepath": "data/transactions_books_v1_10_10.csv",
    "features": ["product_title", "product_image", "product_soup", "product_images"],
    "version": "1.0",
    "unique_name": "_books_v1_10_10",
}

print("looking at", "../" + product_data["product_filepath"])

productdf =  pd.read_csv("../" + product_data["product_filepath"])
transactiondf = pd.read_csv("../" + product_data["transactions_filepath"])


print(len(transactiondf))
productdf.head()
transactiondf.head()


looking at ../data/products_books_v1_10_10.csv
381082


Unnamed: 0,id,user_id,product_id,rate
0,eaba468d-6226-4d3d-84b2-23796812a7bc,276847,446364193,0
1,eaba468d-6226-4d3d-84b2-23796812a7bc,276847,3379015180,0
2,eaba468d-6226-4d3d-84b2-23796812a7bc,276847,3404148576,8
3,eaba468d-6226-4d3d-84b2-23796812a7bc,276847,3423071516,10
4,eaba468d-6226-4d3d-84b2-23796812a7bc,276847,3442413508,10


In [2]:

class RecommendationAbstract():
    strategy_name: str = "REQUIRES IMPLEMENTATION"
    version: str = "REQUIRES IMPLEMENTATION"
    details: str = "REQUIRES IMPLEMENTATION"
    link: str = "REQUIRES IMPLEMENTATION"
    supports_single_recommendation: bool = "REQUIRES IMPLEMENTATION"
    supports_past_recommendation: bool = "REQUIRES IMPLEMENTATION"

    def __init__(self, products, product_data):
        self.products = products
        self.product_data = product_data
        self.model = None
        # populate id_to_products
        self.id_to_products = {}
        for product in self.products.to_dict(orient='records'):
            self.id_to_products[product['id']] = product

    def loadModel(self, model_code):
        """
        Load the model
        """
        self.model = model_code

    def train(self, verbose=False, transactions_train=None, users_train=None):
        """
        Train the model
        """
        # ... do training
        # self.model = trained_model
        
    def get_random_recommendation(self, n=1):
        """
        Get random recommendations
        """
        # Select n random rows from the DataFrame
        random_rows = self.products.sample(n)
        # Convert the selected rows to a list of dictionaries
        random_recommendations = random_rows.to_dict(orient='records')
        return random_recommendations



    def saveModel(self, model_code):
        """
        Save the model
        """
        # ... saves the model

    def id_to_productDetail(self, product_id: str) -> Dict[str, str]:
        """
        Return product details based on product id.
        """
        return self.id_to_products.get(product_id)

    def ids_to_products(self, ids: List[str]) -> List[Dict[str, str]]:
        """
        Return product details for a list of product ids.
        """
        return [self.id_to_productDetail(id) for id in ids]

    def like(self, keyword: str) -> List[str]:
        """
        Return a list of products that contain the given keyword in their title.
        """
        return [product for product in self.products if keyword in product['product_title']]

    def recommend_from_single(self, product_id: str, n=5) -> List[str]:
        """
        Return recommendations based on a single product.
        """
        target_name = self.id_to_productDetail(product_id)['product_title']
        keywords = target_name.split(" ")
        recommendations = []
        for keyword in keywords:
            recommendations.extend(self.like(keyword))
        
        random.shuffle(recommendations)
        return recommendations[:n]

    def recommend_from_past(self, user_transactions, n=10) -> List[str]:
        """
        Return recommendations based on past user transactions.
        """
        rec = []
        for transaction in user_transactions:
            rec.extend(self.recommend_from_single(transaction['product_id']))
        random.shuffle(rec)
        return rec[:n]

In [4]:
# Implementation of the class using cosine Similarity.

class CosineSimilarityRecommender(RecommendationAbstract):
    strategy_name: str = "Cosine Similarity"
    slug_name: str = "cosine_similarity"
    version: str = "v1"
    details: str = "REQUIRES IMPLEMENTATION"
    link: str = "REQUIRES IMPLEMENTATION"
    supports_single_recommendation: bool = True
    supports_past_recommendation: bool = True
    
    def __init__(self, products, product_data):
        super().__init__(products, product_data)
        self.products = products
        self.pt = []
        self.sim_score = None
        
    def train(self, transactions, auto_save=True):
        self.pt = transactions.pivot_table(index="product_id", columns="user_id", values="rate")
        self.pt.fillna(0, inplace=True)
        self.sim_score = cosine_similarity(self.pt)
        if auto_save:
            self.save()
        
        
    def get_filename(self):
        return "models/" + self.slug_name + self.product_data["unique_name"] + ".pik"
    
    def save(self):
        # Store self.pt
        filename = self.get_filename()
        file_simscr = open(filename, 'wb')
        pickle.dump(self.sim_score, file_simscr)
        file_simscr.close()
        
    def load(self):
        filename = self.get_filename()
        file_simscr = open(filename, 'rb')
        self.sim_score = pickle.load(file_simscr)
        file_simscr.close()
        

    def recommend_from_single(self, product_id, n=5) -> List[Tuple[dict, float]]:
        # Find the index of the product_id in the DataFrame
        index = np.where(self.products['id'] == product_id)[0][0]
        
        # Get similarity scores for the product at the found index
        similar_products = sorted(enumerate(self.sim_score[index]), key=lambda x: x[1], reverse=True)[1:n+1]
        
        # Retrieve the similar products using their indices and return them
        recommendations_list = []
        for similar_product in similar_products:
            product_index, score = similar_product
            product_dict = self.products.iloc[product_index].to_dict()
            recommendations_list.append((product_dict, score))
        
        return recommendations_list


    def recommend_from_past(self, transactions, n=10):
        rec: List[tuple[dict, float]] = []
        for transaction in transactions:
            rec.extend(self.recommend_from_single(transaction))
        
        # Sort by the confidence (second parameter of tuple)
        sorted_rec: List[tuple[dict, float]] = sorted(rec, key=lambda x: x[1], reverse=True)
        return sorted_rec[:n]
    
    

In [5]:
# Test

cosineRecommender = CosineSimilarityRecommender(productdf, product_data)
# Train.
# cosineRecommender.train(transactiondf, auto_save=True)
cosineRecommender.load()

In [6]:
# Use JSON parse.
# cosineRecommender.get_random_recommendation()
randomProduct = cosineRecommender.get_random_recommendation()[0]
pprint.pprint(randomProduct)


print('======== RECOMENDATIONS SINGLE CASE =========== ')
cosineRecommender.recommend_from_single(randomProduct['id'])


{'id': '0786869054',
 'product_image': 'http://images.amazon.com/images/P/0786869054.01.MZZZZZZZ.jpg',
 'product_price': nan,
 'product_soup': 'The Sunday Wife: A Novel Cassandra King Hyperion',
 'product_tags': nan,
 'product_title': 'The Sunday Wife: A Novel'}


[({'id': '0395683297',
   'product_title': 'Silent Spring',
   'product_image': 'http://images.amazon.com/images/P/0395683297.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Silent Spring Rachel Carson Mariner Books',
   'product_tags': nan},
  0.39823291210564526),
 ({'id': '0425175413',
   'product_title': 'The White House Connection',
   'product_image': 'http://images.amazon.com/images/P/0425175413.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'The White House Connection Jack Higgins Berkley Publishing Group',
   'product_tags': nan},
  0.38631440705543546),
 ({'id': '0743428188',
   'product_title': 'The Twentieth Wife: A Novel',
   'product_image': 'http://images.amazon.com/images/P/0743428188.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'The Twentieth Wife: A Novel Indu Sundaresan Washington Square Press',
   'product_tags': nan},
  0.3584070796460177),
 ({'id': '0826308791',
   'product_title': 'The Education of Little Tree (A Zia

In [7]:

# ... Repetition.
print("=============  RECOMENDATIONS RECOMMENDATIONS  ============")
tansactions = ['0590353403', '0439139597']

rec_id = cosineRecommender.recommend_from_past(tansactions)
pprint.pprint(rec_id)


[({'id': '0440225922',
   'product_image': 'http://images.amazon.com/images/P/0440225922.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'The Poyson Garden: An Elizabethan I Mystery (Elizabeth I '
                   'Mysteries (Paperback)) Karen Harper Dell Publishing '
                   'Company',
   'product_tags': nan,
   'product_title': 'The Poyson Garden: An Elizabethan I Mystery (Elizabeth I '
                    'Mysteries (Paperback))'},
  0.6555859867761229),
 ({'id': '1853260193',
   'product_image': 'http://images.amazon.com/images/P/1853260193.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Vanity Fair (Wordsworth Collection) William Makepeace '
                   'Thackeray NTC/Contemporary Publishing Company',
   'product_tags': nan,
   'product_title': 'Vanity Fair (Wordsworth Collection)'},
  0.5928693837112383),
 ({'id': '0553574132',
   'product_image': 'http://images.amazon.com/images/P/0553574132.01.MZZZZZZZ.jpg',
   'product_price': n

In [8]:


class WordVecBodyRecommender(RecommendationAbstract):
    
    strategy_name: str = "WordVec"
    slug_name: str = "wordvec"
    version: str = "v1"
    details: str = "REQUIRES IMPLEMENTATION"
    link: str = "REQUIRES IMPLEMENTATION"
    supports_single_recommendation: bool = True
    supports_past_recommendation: bool = True
    
    def __init__(self, products, product_data):
        """
        Initialize the recommender with a pre-trained Word2Vec model and a dataframe of books.
        """
        super().__init__(products, product_data)
        self.products_df = products
        self.model = None
        print('id_to_products length', len(self.id_to_products))
        self.train()


    def train(self, auto_save=True):
        """
        Train the Word2Vec model on the book titles.
        """
        sentences = [title.lower().split() for title in self.products_df['product_soup']]
        self.model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
        if auto_save:
            self.save()
            

    def recommend_books_updated(self, input_text, top_n=5):
        """
        Return recommendations based on input text.
        """
        input_text = input_text.lower().split()
        vector = self.model.wv[input_text].mean(axis=0)
        # Compute cosine similarity between the input vector and all product vectors
        similarities = cosine_similarity([vector], self.model.wv.vectors)
        # Get indices of top similar products
        top_indices = similarities.argsort()[0][-top_n:]
        recommended_products = []
        for index in reversed(top_indices):  # Reversed to get top similarities first
            product_title = self.products_df.iloc[index]['product_title']
            confidence = similarities[0][index]
            recommended_products.append((self.id_to_productDetail(self.products_df.iloc[index]['id']), confidence))
        return recommended_products

        
    def recommend_from_single(self, product_id, n=5) -> List[tuple[dict, float]]:
        """
        Return recommendations based on a single product.
        """
        # Get the product_soup for the given product_id
        product_soup = self.products.loc[self.products['id'] == product_id, 'product_soup'].values[0]
        # Use the recommend_books_updated function to recommend books based on the product_soup
        recommendations = self.recommend_books_updated(product_soup, top_n=n)
        return recommendations

    def recommend_from_past(self, transactions, n=10):
        """
        Return recommendations based on past transactions.
        """
        # Concatenate product_soup from past transactions
        past_text = ' '.join(self.products.loc[self.products['id'].isin(transactions), 'product_soup'])
        # Use the self.recommend_books_updated function to recommend books based on past transactions
        recommendations = self.recommend_books_updated(past_text, top_n=n)
        return recommendations
        
        
    def get_filename(self):
        return  "models/" + self.slug_name + self.product_data["unique_name"] + ".model"

    def save(self):
        """
        Save the computed book vectors to a file.
        """
        filename = self.get_filename()
        filemodel = open(filename, 'wb')
        pickle.dump(self.model, filemodel)
        filemodel.close()

    def load(self):
        """
        Load the book vectors from a file.
        """
        
        filename = self.get_filename()
        filemodel = open(filename, 'rb')
        self.model = pickle.load(filemodel)
        filemodel.close()

In [9]:
# Implementationa nd test.

wordvecRecommender = WordVecBodyRecommender(productdf, product_data)
# Train.
# wordvecRecommender.train()
wordvecRecommender.load()


id_to_products length 13011


In [10]:

# Get random and recommend.

randomProduct = wordvecRecommender.get_random_recommendation()[0]
pprint.pprint(randomProduct)
wordvecRecommender.recommend_from_single(randomProduct['id'])

{'id': '0060391553',
 'product_image': 'http://images.amazon.com/images/P/0060391553.01.MZZZZZZZ.jpg',
 'product_price': nan,
 'product_soup': 'Enter Whining Fran Drescher Harpercollins',
 'product_tags': nan,
 'product_title': 'Enter Whining'}


[({'id': '0553584383',
   'product_title': 'Dead Aim',
   'product_image': 'http://images.amazon.com/images/P/0553584383.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Dead Aim IRIS JOHANSEN Bantam Books',
   'product_tags': nan},
  0.99974954),
 ({'id': '0060925000',
   'product_title': 'A Suitable Boy : Novel, A',
   'product_image': 'http://images.amazon.com/images/P/0060925000.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'A Suitable Boy : Novel, A Vikram Seth Perennial',
   'product_tags': nan},
  0.9983109),
 ({'id': '193072229X',
   'product_title': "MoveOn's 50 Ways to Love Your Country: How to Find Your Political Voice and Become a Catalyst for Change",
   'product_image': 'http://images.amazon.com/images/P/193072229X.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': "MoveOn's 50 Ways to Love Your Country: How to Find Your Political Voice and Become a Catalyst for Change Moveon Inner Ocean Publishing",
   'product_tags': nan},
  0.99

[({'id': '0385490992',
   'product_image': 'http://images.amazon.com/images/P/0385490992.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'The Street Lawyer John Grisham Doubleday Books',
   'product_tags': nan,
   'product_title': 'The Street Lawyer'},
  0.9987477),
 ({'id': '0345313097',
   'product_image': 'http://images.amazon.com/images/P/0345313097.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Crewel Lye Piers Anthony Ballantine Books',
   'product_tags': nan,
   'product_title': 'Crewel Lye'},
  0.9985696),
 ({'id': '0385418493',
   'product_image': 'http://images.amazon.com/images/P/0385418493.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'How the Irish Saved Civilization: The Untold Story of '
                   "Ireland's Heroic Role from the Fall of Rome to the Rise of "
                   'Medieval Europe (Hinges of History) Thomas Cahill Anchor',
   'product_tags': nan,
   'product_title': 'How the Irish Saved Civilization: The

In [12]:

class TitleWordVecTitleyRecommender(RecommendationAbstract):
    
    strategy_name: str = "TitleWordVec"
    slug_name: str = "title_word_vec"
    version: str = "v1"
    details: str = "REQUIRES IMPLEMENTATION"
    link: str = "REQUIRES IMPLEMENTATION"
    supports_single_recommendation: bool = True
    supports_past_recommendation: bool = True
    
    def __init__(self, products, product_data):
        """
        Initialize the recommender with a pre-trained Word2Vec model and a dataframe of books.
        """
        super().__init__(products, product_data)
        self.products_df = products
        self.model = None
        self.train()

    def train(self, auto_save=False):
        """
        Train the Word2Vec model on the book titles.
        """
        sentences = [title.lower().split() for title in self.products_df['product_title']]
        self.model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
        if auto_save:
            self.save()

    def recommend_from_single(self, product_id, n=5):
        """
        Return recommendations based on a single product.
        """
        # Get the product_title for the given product_id
        product_title = self.products_df.loc[self.products_df['id'] == product_id, 'product_title'].values[0]
        # Use the recommend_books_updated function to recommend books based on the product_title
        recommendations = self.recommend_books_updated(product_title, top_n=n)
        return recommendations
    
    def recommend_from_past(self, transactions, n=10):
        """
        Return recommendations based on past transactions.
        """
        # Concatenate product_titles from past transactions
        past_titles = self.products_df.loc[self.products_df['id'].isin(transactions), 'product_title']
        # Use the self.recommend_books_updated function to recommend books based on past transactions
        recommendations = self.recommend_books_updated(' '.join(past_titles), top_n=n)
        return recommendations

        

    def recommend_books_updated(self, input_text, top_n=5):
        """
        Return recommendations based on input text.
        """
        input_text = input_text.lower().split()
        vector = self.model.wv[input_text].mean(axis=0)
        # Compute cosine similarity between the input vector and all product vectors
        similarities = cosine_similarity([vector], self.model.wv.vectors)
        # Get indices of top similar products
        top_indices = similarities.argsort()[0][-top_n:]
        recommended_products = []
        for index in reversed(top_indices):  # Reversed to get top similarities first
            product_title = self.products_df.iloc[index]['product_title']
            confidence = similarities[0][index]
            recommended_products.append((self.id_to_productDetail(self.products_df.iloc[index]['id']), confidence))
        return recommended_products

    def save(self):
        """
        Save the computed book vectors to a file.
        """
        filename = self.get_filename()
        with open(filename, 'wb') as filemodel:
            pickle.dump(self.model, filemodel)

    def load(self):
        """
        Load the book vectors from a file.
        """
        filename = self.get_filename()
        with open(filename, 'rb') as filemodel:
            self.model = pickle.load(filemodel)

    def get_filename(self):
        """
        Get the filename for saving/loading the model.
        """
        return "models/" + self.slug_name + self.product_data["unique_name"] + ".model"

In [13]:
# Get random and recommend.
wordvecRecommender = TitleWordVecTitleyRecommender(productdf, product_data)
# Train.
# wordvecRecommender.train( auto_save=True)
wordvecRecommender.load()

In [14]:


randomProduct = wordvecRecommender.get_random_recommendation()[0]
pprint.pprint(randomProduct)
wordvecRecommender.recommend_from_single(randomProduct['id'])


# ... Repetition.
print("=============  RECOMENDATIONS  ============")
rec_id = wordvecRecommender.recommend_from_past(tansactions)
pprint.pprint(rec_id)


{'id': '0373872062',
 'product_image': 'http://images.amazon.com/images/P/0373872062.01.MZZZZZZZ.jpg',
 'product_price': nan,
 'product_soup': 'Loving Hearts Gail Gaymer Martin Steeple Hill',
 'product_tags': nan,
 'product_title': 'Loving Hearts'}
[({'id': '0452264464',
   'product_image': 'http://images.amazon.com/images/P/0452264464.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Beloved (Plume Contemporary Fiction) Toni Morrison Plume',
   'product_tags': nan,
   'product_title': 'Beloved (Plume Contemporary Fiction)'},
  0.99993634),
 ({'id': '0002005018',
   'product_image': 'http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Clara Callan Richard Bruce Wright HarperFlamingo Canada',
   'product_tags': nan,
   'product_title': 'Clara Callan'},
  0.99988186),
 ({'id': '0345417623',
   'product_image': 'http://images.amazon.com/images/P/0345417623.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Ti

In [15]:

# ... Repetition.
print("=============  RECOMENDATIONS RECOMMENDATIONS  ============")
tansactions = ['0590353403', '0439139597']

"""
Harry Potter and the Sorcerer's Stone (Book 1)
"Harry Potter and the Goblet of Fire (Book 4)"
"""

rec_id = wordvecRecommender.recommend_from_past(tansactions)
pprint.pprint(rec_id)

[({'id': '0452264464',
   'product_image': 'http://images.amazon.com/images/P/0452264464.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Beloved (Plume Contemporary Fiction) Toni Morrison Plume',
   'product_tags': nan,
   'product_title': 'Beloved (Plume Contemporary Fiction)'},
  0.99993634),
 ({'id': '0002005018',
   'product_image': 'http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Clara Callan Richard Bruce Wright HarperFlamingo Canada',
   'product_tags': nan,
   'product_title': 'Clara Callan'},
  0.99988186),
 ({'id': '0345417623',
   'product_image': 'http://images.amazon.com/images/P/0345417623.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Timeline MICHAEL CRICHTON Ballantine Books',
   'product_tags': nan,
   'product_title': 'Timeline'},
  0.99985254),
 ({'id': '0425182908',
   'product_image': 'http://images.amazon.com/images/P/0425182908.01.MZZZZZZZ.jpg',
   'product_price': nan,
  

In [105]:

class TitleWordVecTitleyRecommenderV2(RecommendationAbstract):
    """
    Key Changes:
    - Using nlp to search first nouns>verbs>adjectives
    - Past Transactions search instead of the aggregated titles, makes individual search with prioritization with eah title
    """
    strategy_name: str = "TitleWordVec"
    slug_name: str = "title_word_vec"
    version: str = "v2"
    details: str = "REQUIRES IMPLEMENTATION"
    link: str = "REQUIRES IMPLEMENTATION"
    supports_single_recommendation: bool = True
    supports_past_recommendation: bool = True
    
    def __init__(self, products, product_data, useKeyword=True):
        """
        Initialize the recommender with a pre-trained Word2Vec model and a dataframe of books.
        """
        super().__init__(products, product_data)
        self.products_df = products
        self.model = None
        self.train()
        self.nlp = spacy.load("en_core_web_sm")
        self.useKeyword = useKeyword # Otherwise uses the list of keywords concatenated found.

    def train(self, auto_save=False):
        """
        Train the Word2Vec model on the book titles.
        """
        # Preprocess text data
        self.products_df['processed_soup'] = self.products_df['product_title'].str.lower().str.translate(str.maketrans('', '', string.punctuation))

        # Prepare self.products_df for Word2Vec model
        sentences = [row.split() for row in self.products_df['processed_soup'].dropna()]
        self.model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
        if auto_save:
            self.save()


    def getMostSignificantKeyword(self, text, default_to_text = True, k=3) -> tuple[str, str]:
        """
        Extracts keywords from the provided text, prioritizing nouns, then verbs, then adjectives.
        Continues until the combined length of keywords is at least 3 characters.
        
        This algorithm is designed to work for short range titles.
        - default_to_
        
        Returns a tuple of (most_relevant_keyword, an_concatenated_list_of_keywords)
        """
        doc = self.nlp(text)
        priorityKey = ""
        top_keywords = []
        fullsearch = ""

        # Define the order of part of speech tags to search based on their relevance
        pos_priority = ["NOUN", "VERB", "ADJ", "PROPN", "ADV", "PRON", "ADP", "CCONJ", "SCONJ", "DET", "AUX", "NUM",
                        "PART", "INTJ", "SYM", "PUNCT", "X"]

        # Iterate over each part of speech in priority order
        for pos in pos_priority:
            keywords = [token.text for token in doc if token.pos_ == pos]
            for key in keywords:
                fullsearch += f" {key}"  # Append all found keywords to fullsearch
                if len(key) > len(priorityKey):
                    priorityKey = key  # Update priorityKey if a longer keyword is found
                if len(top_keywords) < k:
                    top_keywords.append(key)  # Add the keyword to top_keywords if there are less than k
                if len(priorityKey) >= k and len(top_keywords) == k:
                    break  # Break both loops if condition is met
            if len(priorityKey) >= k and len(top_keywords) == k:
                break  # Break outer loop if condition is met
        # If there are more than k words, and not enough top words, to fulfill k. Add as many words as possible to get to k.
        if len(top_keywords) < k:
            set_keywords = set(top_keywords)
            for word in text.split():
                if word not in set_keywords:
                    set_keywords.add(word)
                if len(top_keywords) == k:
                    break
            set_keywords = list(set_keywords)
        
        # print(priorityKey, ' '.join(top_keywords), fullsearch)
        return (' '.join(top_keywords), fullsearch)
        

    def recommend_from_single(self, product_id, n=5, verbose=True, greedy_attempt=3) -> List[tuple[str, float]]:
        """
        Return recommendations based on a single product.
        """
        # Get the product_title for the given product_id
        product_title = self.products_df.loc[self.products_df['id'] == product_id, 'product_title'].values[0]
        # Use the recommend_books_updated function to recommend books based on the product_title
        recommendations = []
        search_term = ""
        seen = set(product_title)
        for k in range(greedy_attempt):
            keyword, keywords_concat = self.getMostSignificantKeyword(product_title, k=k+1)
            if(self.useKeyword):
                search_term = keyword
            else:
                search_term = keywords_concat
            if verbose:
                print(f"Searching for '{search_term}' from '{product_title}'")
                # print(recommendations)
        
            rec = self.recommend_books_updated(search_term, top_n=n)
            for rec_item, confidence_rate in rec:
                # print(rec_item)
                if rec_item['product_title'] not in seen:
                    seen.add(rec_item['product_title'])
                    recommendations.append((rec_item, confidence_rate))
                else:
                    continue
            if len(recommendations) >= greedy_attempt:
                break
        return recommendations
    
    def recommend_from_past(self, transactions, n=10):
        """
        Return recommendations based on past transactions.
        Does the following:
        
        @param transactions: List[id] = List of transactions
        
        - Per each transaction uses recommend_from_single, to find relevant books. around 5 recommendations.
        - ensures that the recommendations are unique.
        - Sorts by confidence.
        - limits to n.
        """
    
        for transaction in transactions:
            rec: List[tuple[dict, int]] = self.recommend_from_single(transaction)
            rec.extend(rec) 
        # Because there could be repeated rec[i]['product_title'] we need to remove duplicates.
        seen_titles = set()
        unique_rec = []
        for rec_item, confidence_rate in rec:
            if rec_item['product_title'] not in seen_titles:
                seen_titles.add(rec_item['product_title'])
                unique_rec.append((rec_item, confidence_rate))
                    
        
        # Sort by confidence second parameter
        unique_rec.sort(key=lambda x: x[1])
        return unique_rec[:n]
        

    def recommend_books_updated(self, input_text, top_n=5):
        """
        Return recommendations based on input text.
        """
        input_text = input_text.lower().translate(str.maketrans('', '', string.punctuation)).split()
        vector = self.model.wv[input_text].mean(axis=0)
        similar_vectors = self.model.wv.similar_by_vector(vector, topn=top_n + 10)  # Retrieve more results to filter unique titles
        recommended_titles = []
        for book_vector in similar_vectors:
            similar_title = self.products_df.loc[self.products_df['processed_soup'].apply(lambda x: any(word in x for word in input_text)), 'processed_soup'].unique()
            for title in similar_title:
                if title not in recommended_titles and len(recommended_titles) < top_n:
                    product = self.products_df.loc[self.products_df['processed_soup'] == title].iloc[0].to_dict()
                    recommended_titles.append((product, book_vector[1]))
        return recommended_titles

    def save(self):
        """
        Save the computed book vectors to a file.
        """
        filename = self.get_filename()
        with open(filename, 'wb') as filemodel:
            pickle.dump(self.model, filemodel)

    def load(self):
        """
        Load the book vectors from a file.
        """
        filename = self.get_filename()
        with open(filename, 'rb') as filemodel:
            self.model = pickle.load(filemodel)

    def get_filename(self):
        """
        Get the filename for saving/loading the model.
        """
        return "models/" + self.slug_name + self.product_data["unique_name"] + ".model"
    

In [106]:
# file_path = '../data/products_books_v1_10_10.csv'
# data = pd.read_csv(file_path)
# print(len(data))
# engineRec = TitleWordVecTitleyRecommenderV2(data, product_data)
engineRec = TitleWordVecTitleyRecommenderV2(productdf, product_data)
engineRec.train()

In [107]:


randomProduct = engineRec.get_random_recommendation()[0]
pprint.pprint(randomProduct)

print('======== RECOMENDATIONS SINGLE CASE =========== ')
engineRec.recommend_from_single(randomProduct['id'])


{'id': '0553565370',
 'processed_soup': 'scandal in fair haven',
 'product_image': 'http://images.amazon.com/images/P/0553565370.01.MZZZZZZZ.jpg',
 'product_price': nan,
 'product_soup': 'Scandal in Fair Haven Carolyn G. Hart Bantam Books',
 'product_tags': nan,
 'product_title': 'Scandal in Fair Haven'}
Searching for 'Scandal' from 'Scandal in Fair Haven'


[({'id': '0060505885',
   'product_title': 'The Scandalous Summer of Sissy LeBlanc : A Novel',
   'product_image': 'http://images.amazon.com/images/P/0060505885.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'The Scandalous Summer of Sissy LeBlanc : A Novel Loraine Despres Perennial',
   'product_tags': nan,
   'processed_soup': 'the scandalous summer of sissy leblanc  a novel'},
  0.9999999403953552),
 ({'id': '038078615X',
   'product_title': "Married at Midnight an Anthology: The Determined Bride/A Kiss After Midnight/Scandal's Bride/Beyond the Kiss",
   'product_image': 'http://images.amazon.com/images/P/038078615X.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': "Married at Midnight an Anthology: The Determined Bride/A Kiss After Midnight/Scandal's Bride/Beyond the Kiss Kathleen E. Woodiwiss Avon",
   'product_tags': nan,
   'processed_soup': 'married at midnight an anthology the determined bridea kiss after midnightscandals bridebeyond the kiss'},
  0.

In [108]:

# ... Repetition.
print("=============  RECOMENDATIONS RECOMMENDATIONS  ============")
tansactions = ['0590353403', '0439139597']

"""
Harry Potter and the Sorcerer's Stone (Book 1)
"Harry Potter and the Goblet of Fire (Book 4)"
"""

rec_id = engineRec.recommend_from_past(tansactions)
pprint.pprint(rec_id)


Searching for 'Harry' from 'Harry Potter and the Sorcerer's Stone (Book 1)'
Searching for 'Harry' from 'Harry Potter and the Goblet of Fire (Book 4)'
[({'id': '0767908473',
   'processed_soup': 'the sorcerers companion a guide to the magical world of '
                     'harry potter',
   'product_image': 'http://images.amazon.com/images/P/0767908473.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': "The Sorcerer's Companion: A Guide to the Magical World of "
                   'Harry Potter ALLAN ZOLA KRONZEK Broadway',
   'product_tags': nan,
   'product_title': "The Sorcerer's Companion: A Guide to the Magical World of "
                    'Harry Potter'},
  0.9999998807907104),
 ({'id': '059035342X',
   'processed_soup': 'harry potter and the sorcerers stone harry potter '
                     'paperback',
   'product_image': 'http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': "Harry Potter and the Sorcerer's S

In [74]:
engineRec.recommend_books_updated("Harry")

[]

In [139]:
# Implementation of the class using cosine Similarity.
from surprise import KNNBasic

class BasicKNNRecommender(RecommendationAbstract):
    strategy_name: str = "Basic KNN"
    slug_name: str = "basic_knn"
    version: str = "v1"
    details: str = "REQUIRES IMPLEMENTATION"
    link: str = "REQUIRES IMPLEMENTATION"
    supports_single_recommendation: bool = True
    supports_past_recommendation: bool = True
    
    def __init__(self, products: pd.DataFrame, product_data: dict):
        super().__init__(products, product_data)
        self.products = products
        self.model = None
        
        # Get the product ids and store them.
        self.product_ids = self.products['id'].unique()
        
    def train(self, transactions, auto_save=True):
        
        sim_options = {"name": "pearson_baseline", "user_based": False}
        model = KNNBasic(sim_options=sim_options)
        
        reader = Reader(rating_scale=(1, 5))
        
        data = Dataset.load_from_df(transactions[['user_id', 'product_id', 'rate']], reader)
        
        model.fit(data.build_full_trainset())
        self.model = model
        # self.accuracy = accuracy.rmse(model.test(data.build_full_trainset().build_testset()), verbose=True)
        
        if auto_save:
            self.save()
        
        
    def get_filename(self):
        return "models/" + self.slug_name + self.product_data["unique_name"] + ".pik"
    
    def save(self):
        # Store self.pt
        filename = self.get_filename()
        model_file = open(filename, 'wb')
        pickle.dump(self.model, model_file)
        model_file.close()
        
    def load(self):
        filename = self.get_filename()
        model_file = open(filename, 'rb')
        self.model = pickle.load(model_file)
        model_file.close()
        

    def recommend_from_single(self, product_id: str, n=5) -> List[Tuple[dict, float]]:
        """
        
        # Retrieve inner ids of the nearest neighbors of Toy Story.
        toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
        """
        recommendation_list: List[tuple[dict, float]] = []
        
        product_inner_id = self.model.trainset.to_inner_iid(product_id)
        
        neighbors = self.model.get_neighbors(product_inner_id, k=n)
        
        for neighbor in neighbors:
            product_serie = self.products.iloc[neighbor]
            product = product_serie.to_dict()
            recommendation_list.append((product, 1.0))
        
        return recommendation_list[:n]


    def recommend_from_past(self, transactions: List[str], n=10):
        """
        Calls for each transaction the recommend_from_single method.
        Gives Priority if seen multiple recommendations.
        Shuffle and returns :n
        """
        recs = set()
        recs_seen_times = {}
        products_dictionary = {}
        
        for transaction in transactions:
            recs = self.recommend_from_single(transaction)
            for rec_id, confidence in recs:
                
                if rec_id in recs:
                    recs_seen_times[rec_id['id']] += 1
                else:
                    products_dictionary[rec_id['id']] = rec_id
                    recs_seen_times[rec_id['id']] = 1
        
        for rec_id in recs_seen_times:
            recs.append((products_dictionary[rec_id], recs_seen_times[rec_id]))
            
        recs = list(recs)
        # sort
        
        recs.sort(key=lambda x: x[1], reverse=True)
        return recs
    

In [140]:

engineRec = BasicKNNRecommender(productdf, product_data)
# engineRec.train(transactions=transactiondf, auto_save=True)
engineRec.load()


In [141]:


randomProduct = engineRec.get_random_recommendation()[0]

pprint.pprint(randomProduct)

  

print('======== RECOMENDATIONS SINGLE CASE =========== ')

engineRec.recommend_from_single(randomProduct['id'])

{'count': 19,
 'id': '0380818337',
 'product_id': '0380818337',
 'product_image': 'http://images.amazon.com/images/P/0380818337.01.MZZZZZZZ.jpg',
 'product_price': nan,
 'product_soup': 'The Lady Is Tempted Cathy Maxwell Avon',
 'product_tags': nan,
 'product_title': 'The Lady Is Tempted'}


[({'product_id': '0307010856',
   'count': 10,
   'id': '0307010856',
   'product_title': 'The Monster at the End of This Book',
   'product_image': 'http://images.amazon.com/images/P/0307010856.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'The Monster at the End of This Book JON STONE Golden Books',
   'product_tags': nan},
  1.0),
 ({'product_id': '0553582364',
   'count': 50,
   'id': '0553582364',
   'product_title': 'A Traitor to Memory',
   'product_image': 'http://images.amazon.com/images/P/0553582364.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'A Traitor to Memory Elizabeth George Bantam Books',
   'product_tags': nan},
  1.0),
 ({'product_id': '0425114236',
   'count': 51,
   'id': '0425114236',
   'product_title': 'Accidental Tourist',
   'product_image': 'http://images.amazon.com/images/P/0425114236.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Accidental Tourist Anne Tyler Penguin Putnam~mass',
   'product_tags': nan},
  1

In [142]:
print("=============  RECOMENDATIONS RECOMMENDATIONS  ============")

tansactions = ['0590353403', '0439139597']


rec_id = engineRec.recommend_from_past(tansactions)

pprint.pprint(rec_id)

[({'count': 40,
   'id': '0786884142',
   'product_id': '0786884142',
   'product_image': 'http://images.amazon.com/images/P/0786884142.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'A Monk Swimming : A Memoir Malachy McCourt Hyperion',
   'product_tags': nan,
   'product_title': 'A Monk Swimming : A Memoir'},
  1.0),
 ({'count': 266,
   'id': '0446606812',
   'product_id': '0446606812',
   'product_image': 'http://images.amazon.com/images/P/0446606812.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'Message in a Bottle Nicholas Sparks Warner Vision',
   'product_tags': nan,
   'product_title': 'Message in a Bottle'},
  1.0),
 ({'count': 58,
   'id': '0061091790',
   'product_id': '0061091790',
   'product_image': 'http://images.amazon.com/images/P/0061091790.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'After All These Years Susan Isaacs HarperTorch',
   'product_tags': nan,
   'product_title': 'After All These Years'},
  1.0),
 ({'count'

In [18]:

from surprise import KNNWithMeans

class KNNWithMeansRecommender(RecommendationAbstract):
    strategy_name: str = "KNN With Means"
    slug_name: str = "knn_with_means"
    version: str = "v1"
    details: str = "REQUIRES IMPLEMENTATION"
    link: str = "REQUIRES IMPLEMENTATION"
    supports_single_recommendation: bool = True
    supports_past_recommendation: bool = True
    
    def __init__(self, products: pd.DataFrame, product_data: dict):
        super().__init__(products, product_data)
        self.products = products
        self.model = None
        
        # Get the product ids and store them.
        self.product_ids = self.products['id'].unique()
        self.all_transactions_df = None
        
    def train(self, transactions, auto_save=True, dont_save_self_state=False) :
        
        sim_options = {"name": "pearson_baseline", "user_based": False}
        model = KNNWithMeans(sim_options=sim_options)
        
        reader = Reader(rating_scale=(1, 5))
        
        data = Dataset.load_from_df(transactions[['user_id', 'product_id', 'rate']], reader)
        
        model.fit(data.build_full_trainset())
        
        if dont_save_self_state:
            return model
        
        self.model = model
        self.all_transactions_df = transactions
        # self.accuracy = accuracy.rmse(model.test(data.build_full_trainset().build_testset()), verbose=True)
        
        if auto_save:
            self.save()
            
        return model
        
        
    def get_filename(self):
        return "models/" + self.slug_name + self.product_data["unique_name"] + ".pik"
    
    def save(self):
        # Store self.pt
        filename = self.get_filename()
        model_file = open(filename, 'wb')
        pickle.dump(self.model, model_file)
        model_file.close()
        
    def load(self):
        filename = self.get_filename()
        model_file = open(filename, 'rb')
        self.model = pickle.load(model_file)
        model_file.close()
        

    def recommend_from_single(self, product_id: str, n=5) -> List[Tuple[dict, float]]:
        """
        
        # Retrieve inner ids of the nearest neighbors of Toy Story.
        toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
        """
        recommendation_list: List[tuple[dict, float]] = []
        
        product_inner_id = self.model.trainset.to_inner_iid(product_id)
        
        neighbors = self.model.get_neighbors(product_inner_id, k=n)
        
        for neighbor in neighbors:
            product_serie = self.products.iloc[neighbor]
            product = product_serie.to_dict()
            recommendation_list.append((product, 1.0))
        
        return recommendation_list[:n]

    def collaborativestore_predict_population(self, transactions: List[str], n=5):
        """
        Adds the transactions to the use history
        'user_id', 'product_id', 'rate'
        """
        # Add transactions to the self.transactions_df as a new user
        transaction_rows = []
        random_user_id = "user" + str(random.randint(0, 1000000))
        for transaction in transactions:
            transaction_rows.append({'user_id': 'user_id', 'product_id': transaction, 'rate': 5})
        
        # Convert to a DataFrame
        new_transactions_df = pd.DataFrame(transaction_rows)

        # Append using concat
        all_transactions_df: pd.Dataframe = pd.concat([self.all_transactions_df, new_transactions_df], ignore_index=True)
        
        model = self.train(all_transactions_df, dont_save_self_state=True)
        
        return self.predict_recommendations(random_user_id, transactions, model, n)
    
    def predict_recommendations(self, user_id: str, transactions: List[str], model, n=5):
        books_to_predict = [book_id for book_id in self.product_ids if book_id not in transactions]
        predictions = []
        
        for book_id in books_to_predict:
            pred = model.predict(user_id, book_id)
            predictions.append((book_id, pred.est))
        
        pred_products = []
        # sort predictions
        predictions.sort(key=lambda x: x[1], reverse=True)
        for book_id, confidence in predictions[:n]:
            product = self.id_to_products[book_id]
            pred_products.append(product)
            
        return pred_products
        

    def recommend_from_past(self, transactions: List[str], n=10):
        """
        Calls for each transaction the recommend_from_single method.
        Gives Priority if seen multiple recommendations.
        Shuffle and returns :n
        """
        recs = set()
        recs_seen_times = {}
        products_dictionary = {}
        
        return self.collaborativestore_predict_population(
            transactions, n=n
        )
        
        for transaction in transactions:
            recs = self.recommend_from_single(transaction)
            for rec_id, confidence in recs:
                
                if rec_id in recs:
                    recs_seen_times[rec_id['id']] += 1
                else:
                    products_dictionary[rec_id['id']] = rec_id
                    recs_seen_times[rec_id['id']] = 1
        
        for rec_id in recs_seen_times:
            recs.append((products_dictionary[rec_id], recs_seen_times[rec_id]))
            
        recs = list(recs)
        # sort
        
        recs.sort(key=lambda x: x[1], reverse=True)
        return recs
    

In [19]:
  

engineRec = KNNWithMeansRecommender(productdf, product_data)
engineRec.train(transactions=transactiondf, auto_save=True)
# engineRec.load()
  
  

randomProduct = engineRec.get_random_recommendation()[0]
pprint.pprint(randomProduct)

print('======== RECOMENDATIONS SINGLE CASE =========== ')
rec = engineRec.recommend_from_single(randomProduct['id'])
pprint.pprint(rec)


# print("=============  RECOMENDATIONS RECOMMENDATIONS  ============")
# tansactions = ['0590353403', '0439139597']
# recomendations = engineRec.recommend_from_past(tansactions)
# pprint.pprint(recomendations)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
{'count': 12,
 'id': '0451456726',
 'product_id': '0451456726',
 'product_image': 'http://images.amazon.com/images/P/0451456726.01.MZZZZZZZ.jpg',
 'product_price': nan,
 'product_soup': 'Heir to the Shadows (The Black Jewels Trilogy, Book 2) Anne '
                 'Bishop Roc',
 'product_tags': nan,
 'product_title': 'Heir to the Shadows (The Black Jewels Trilogy, Book 2)'}
[({'count': 16,
   'id': '0312194390',
   'product_id': '0312194390',
   'product_image': 'http://images.amazon.com/images/P/0312194390.01.MZZZZZZZ.jpg',
   'product_price': nan,
   'product_soup': 'The Autobiography of Henry VIII: With Notes by His Fool, '
                   "Will Somers : A Novel Margaret George St. Martin's Press",
   'product_tags': nan,
   'product_title': 'The Autobiography of Henry VIII: With Notes by His Fool, '
                    'Will Somers : A Novel'},
  1.0),
 ({'count'