In [1]:
import numpy as np
import pandas as pd
import tqdm as notebook_tqdm

from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.scenarios import WeakGeneralization, Timed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#1:  Data collection
# Load the full transaction log from disk and report how many rows it has.
transactions_path = '../../00 - Data/transactions/transactions_train.csv'
transactions = pd.read_csv(transactions_path)
print(f"Original data has size of : {len(transactions)}")

# `sample` is a FRACTION of the rows (what DataFrame.sample(frac=...) expects),
# so 0.0005 means 0.05 % of the data, not 0.0005 %.
sample = 0.0005
# A fixed random_state makes the drawn sample reproducible across runs.
transactions_sample = transactions.sample(frac=sample, random_state=40)
# NOTE(review): the preprocessing cell calls proc.process(transactions), i.e. it
# uses the full frame and not this sample — confirm which one is intended.
# The fraction is converted to a percentage so the printed unit is correct.
print(f"Created a sample of {sample * 100:g} % with {len(transactions_sample)} records")

Original data has size of : 31788324
Created a sample of 0.0005 % with 15894 records


In [3]:
#2: Data preprocessing
# Turn the transaction log into a user x item interaction matrix, e.g.
#
#        item1    item2   item3
#usr1      x                x
#usr2       x       x

# Column names of the transactions frame, named once instead of repeating literals.
ITEM_COL = 'article_id'
USER_COL = 'customer_id'
TIMESTAMP_COL = 't_dat'
# Minimum number of interactions required by both filters below.
MIN_INTERACTIONS = 2

proc = DataFramePreprocessor(item_ix=ITEM_COL, user_ix=USER_COL, timestamp_ix=TIMESTAMP_COL)
# MinUsersPerItem: keep only items bought by at least MIN_INTERACTIONS users.
proc.add_filter(MinUsersPerItem(MIN_INTERACTIONS, item_ix=ITEM_COL, user_ix=USER_COL))
# MinItemsPerUser: keep only users who bought at least MIN_INTERACTIONS items.
# NOTE(review): filters presumably run in the order they were added, so this one
# sees the output of the item filter above — confirm against the RecPack docs.
proc.add_filter(MinItemsPerUser(MIN_INTERACTIONS, item_ix=ITEM_COL, user_ix=USER_COL))

# The full `transactions` frame is processed here (not `transactions_sample`).
interaction_matrix = proc.process(transactions)

100%|██████████| 31588145/31588145 [00:49<00:00, 643028.37it/s]
100%|██████████| 31588145/31588145 [00:41<00:00, 763627.86it/s] 


In [4]:
#3 : Create scenario
# WeakGeneralization with frac_data_in=0.75: divide the matrix into train/test (75-25).
# validation=True additionally requests a validation split, which is
# presumably what the hyperparameter optimisation in the pipeline uses — confirm.
# seed=40 (same value as the sampling random_state) makes the random split
# reproducible; without a seed every run produces a different split.
# Alternative kept for reference: a time-based split via Timed().
scenario = WeakGeneralization(0.75, validation=True, seed=40)
scenario.split(interaction_matrix)

1207280it [06:47, 2961.34it/s]
1207280it [08:30, 2364.12it/s]


: 

In [None]:
#4 : Create the builder object
# Import only the name this cell needs; a wildcard import would silently add
# every public name of the module (e.g. its own imports) to the notebook namespace.
from PipelineBuilder_modified import PipelineBuilder


builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

# Algorithms to evaluate.
# 'Popularity' is the baseline algorithm (just recommends popular items); currently disabled.
# builder.add_algorithm('Popularity')
# ItemKNN with a hyperparameter grid over neighbourhood size and similarity measure.
# NOTE(review): presumably every combination in the grid is evaluated — confirm
# against PipelineBuilder_modified.
builder.add_algorithm('ItemKNN', grid={
    'K': [100, 200, 500],
    'similarity': ['cosine', 'conditional_probability'],
})
# Metric used to optimise the algorithm parameters.
# NDCGK = Normalized Discounted Cumulative Gain at K.
builder.set_optimisation_metric('NDCGK', K=10)

# Metrics used for the evaluation, each at several cut-offs K.
builder.add_metric('NDCGK', K=[10, 20, 50])
builder.add_metric('CoverageK', K=[10, 20])

In [None]:
#5 : Create and run the pipeline
# Build the pipeline from the builder configured in the previous cell.
pipeline = builder.build()
# NOTE(review): `run2` is not defined in this notebook — presumably a custom method
# from PipelineBuilder_modified. Later cells read `.shape`, `.indptr`, `.indices`
# and `.getrow()` on its result, i.e. they treat it as a SciPy CSR matrix — confirm.
csr = pipeline.run2()

In [None]:
class UserRecommendations:
    """In-memory store of recommendations, grouped per user.

    Attributes:
        user_data: dict mapping a user id to the list of
            ``(item_id, recommendation_value)`` tuples recorded for that
            user, in the order they were added.
    """

    def __init__(self):
        self.user_data = dict()

    def add_rec(self, user_id, item_id, recommendation_value):
        """Record one recommended item and its score for ``user_id``."""
        # setdefault creates the per-user list on first use.
        recs_for_user = self.user_data.setdefault(user_id, [])
        recs_for_user.append((item_id, recommendation_value))

    def get_rec_user(self, user_id):
        """Return the ``(item_id, value)`` tuples recorded for ``user_id``.

        For an unknown user a fresh empty list is returned and the store is
        left untouched.
        """
        if user_id in self.user_data:
            return self.user_data[user_id]
        return []


# Collector instance; the recommendation-extraction cell adds entries to it.
user_rec = UserRecommendations()

In [None]:
# For every user that received at least one recommendation, store the
# top-scored item in `user_rec`.
# `csr` is the matrix returned by pipeline.run2(); it is read here through the
# CSR arrays: row r owns the stored entries indptr[r]:indptr[r + 1].
indptr = csr.indptr

# Rows whose indptr slice is non-empty, in ascending row order so that the
# loop (and its printed output) is deterministic.
rows_with_recs = np.flatnonzero(np.diff(indptr)).tolist()
user_ids = set(rows_with_recs)

# NOTE(review): re-running this cell appends the same entries to `user_rec` again.
for user in rows_with_recs:
    print("User : " + str(user))
    # Stored scores and their column indices for this user, sliced directly
    # from the CSR arrays (avoids building a temporary matrix per user).
    start, stop = indptr[user], indptr[user + 1]
    scores = csr.data[start:stop]
    items = csr.indices[start:stop]
    # Position of the highest stored score within this user's entries.
    best = np.argmax(scores)
    # Take the score at that same position so value and item always belong
    # together (a sparse row max() also considers implicit zeros, which can
    # disagree with argmax over the stored values).
    rec_value = scores[best]
    print("Max recommendation value : " + str(rec_value))
    # Column index of the recommended item.
    # NOTE(review): presumably an internal item index, not the original article_id — confirm.
    article_id_rec = items[best]
    print("Recommended article id : " + str(article_id_rec))

    user_rec.add_rec(user, article_id_rec, rec_value)

In [None]:
# Build lookup dicts from the preprocessor's id-mapping tables: the key is the
# value in the interaction matrix's index column (ITEM_IX / USER_IX), the value
# is the corresponding entry of the original id column (proc.item_ix / proc.user_ix).
item_index_col = interaction_matrix.ITEM_IX
user_index_col = interaction_matrix.USER_IX

items_by_index = proc.item_id_mapping.set_index(item_index_col)
users_by_index = proc.user_id_mapping.set_index(user_index_col)

item_id_mapping = items_by_index[proc.item_ix].to_dict()
user_id_mapping = users_by_index[proc.user_ix].to_dict()

# Last expression of the cell: display the user mapping.
user_id_mapping


In [None]:
#6 : Get results

# Only expression in the cell, so the notebook displays whatever
# pipeline.get_metrics() returns.
pipeline.get_metrics()
# Disabled alternative: inspect the optimisation results instead.
# pipeline.optimisation_results

# Disabled: saving the results.
#pipeline.saveResults()