This notebook runs and gets the predictions back for the WeakGeneralization scenario

In [1]:
import tqdm as notebook_tqdm
import pandas as pd

from recpack.scenarios import WeakGeneralization, Timed
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#1:  Data collection
transactions_path = '../../00 - Data/transactions/transactions_train.csv'
transactions = pd.read_csv(transactions_path)
print("Original data has size of : " + str(len(transactions)))

sample = 0.005
transactions_sample = transactions.sample(frac=sample, random_state=40)
print("Created a sample of " + str(sample) + " % with " + str(len(transactions_sample)) + " records")

Original data has size of : 31788324
Created a sample of 0.005 % with 158942 records


In [None]:
#1:  Data collection
transactions_path = '../../00 - Data/transactions/transactions_train.csv'
transactions = pd.read_csv(transactions_path)
print("Original data has size of : " + str(len(transactions)))

sample = 0.005
transactions_sample = transactions.sample(frac=sample, random_state=40)
print("Created a sample of " + str(sample) + " % with " + str(len(transactions_sample)) + " records")

Original data has size of : 31788324
Created a sample of 0.005 % with 158942 records


In [3]:
#2: Data preprocessing

#        item1    item2   item3
#usr1      x                x
#usr2       x       x
proc = DataFramePreprocessor(item_ix='article_id', user_ix='customer_id', timestamp_ix='t_dat')
# #every user has at least 2 items bought
proc.add_filter(MinUsersPerItem(2, item_ix='article_id', user_ix='customer_id'))
# #every item is bought at least twice
proc.add_filter(MinItemsPerUser(2, item_ix='article_id', user_ix='customer_id'))

interaction_matrix = proc.process(transactions_sample)

100%|██████████| 38842/38842 [00:00<00:00, 663113.93it/s]
100%|██████████| 38842/38842 [00:00<00:00, 649506.86it/s]


In [4]:
#3 : Create scenario
#divide matrix into test-train (75-25)
scenario = WeakGeneralization(0.75, validation=True)
# scenario = Timed()
scenario.split(interaction_matrix)

17229it [00:04, 4262.78it/s]
17229it [00:04, 3758.37it/s]
  warn(f"{name} resulting from {type(self).__name__} is unusually small.")


In [5]:
#4 : Create the builder object
from PipelineBuilder_modified import *


builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

#adds algorithms to use later on. Baseline algorithim, just recommends popular stuff
# builder.add_algorithm('Popularity') 
builder.add_algorithm('ItemKNN', grid={
    'K': [100, 200, 500],
    'similarity': ['cosine', 'conditional_probability'],
})
#Set the metric for optimisation of parameters in algorithms. What is NDCGK ??
builder.set_optimisation_metric('NDCGK', K=10)

#adds metric for evaluation
#NDCGK = Normalized Discounted Cumulative Gain at K
builder.add_metric('NDCGK', K=[10, 20, 50])
builder.add_metric('CoverageK', K=[10, 20])



In [6]:
#5 : Create and run the pipeline
pipeline = builder.build()
csr = pipeline.run2()

  self._set_arrayXarray(i, j, x)


2023-11-26 18:25:22,154 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.454s


  self._set_arrayXarray(i, j, x)


2023-11-26 18:25:22,616 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.447s


  self._set_arrayXarray(i, j, x)


2023-11-26 18:25:22,884 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.255s


  self._set_arrayXarray(i, j, x)


2023-11-26 18:25:23,335 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.438s


  self._set_arrayXarray(i, j, x)


2023-11-26 18:25:23,602 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.253s


  self._set_arrayXarray(i, j, x)


2023-11-26 18:25:24,051 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.437s


  self._set_arrayXarray(i, j, x)


2023-11-26 18:25:24,317 - base - recpack - INFO - Fitting ItemKNN complete - Took 0.25s


  0%|          | 0/1 [00:02<?, ?it/s]


In [7]:
class UserRecommendations:
    def __init__(self):
        self.user_data = {}

    def add_rec(self, user_id, item_id, recommendation_value):
        if user_id not in self.user_data:
            self.user_data[user_id] = []
        self.user_data[user_id].append((item_id, recommendation_value))

    def get_rec_user(self, user_id):
        return self.user_data.get(user_id, [])

# Example usage:
user_rec = UserRecommendations()

In [8]:
#get the list of every user who has been recomended smth
user_ids = set()
for row in range(csr.shape[0]):
    if csr.indptr[row] != csr.indptr[row + 1]:
        user_ids.add(row)

for user in user_ids:
    print("User : " + str(user))
    #info sobre las recomendaciones de un usuario
    client_row = csr.getrow(user)
    # print(client_row)
    #indice del item con maxima recomendacion 
    rec_value_index = np.argmax(client_row.data)
    # print(rec_value_index)
    #valor asociado a dicha recomendacion
    rec_value = client_row.max()
    print("Max recommendation value : " + str(rec_value))
    #id del articulo recomendado
    article_id_rec= client_row.indices[rec_value_index]
    print("Recommended article id : " + str(article_id_rec))

    user_rec.add_rec(user,article_id_rec,rec_value)

User : 4096
Max recommendation value : 0.5773502691896258
Recommended article id : 5363
User : 2049
Max recommendation value : 0.5773502691896258
Recommended article id : 1738
User : 6144
Max recommendation value : 0.4082482904638631
Recommended article id : 15200
User : 4
Max recommendation value : 0.5773502691896258
Recommended article id : 625
User : 8196
Max recommendation value : 0.3162277660168379
Recommended article id : 10311
User : 10248
Max recommendation value : 0.4999999999999999
Recommended article id : 9920
User : 12296
Max recommendation value : 0.5773502691896258
Recommended article id : 6894
User : 4106
Max recommendation value : 0.408248290463863
Recommended article id : 8311
User : 8211
Max recommendation value : 0.7071067811865475
Recommended article id : 13971
User : 10261
Max recommendation value : 0.35355339059327373
Recommended article id : 16404
User : 8214
Max recommendation value : 0.408248290463863
Recommended article id : 17388
User : 4119
Max recommendatio

In [9]:
# Assuming you have the `proc` DataFramePreprocessor instance and `interaction_matrix` containing processed data
# Get the item and user ID mappings from the DataFramePreprocessor instance
item_id_mapping = proc.item_id_mapping.set_index(interaction_matrix.ITEM_IX)[proc.item_ix].to_dict()
user_id_mapping = proc.user_id_mapping.set_index(interaction_matrix.USER_IX)[proc.user_ix].to_dict()

user_id_mapping


{0: 'ae89ba4c28f12a6b274290ee20e864635d1461d9f9d0c7a3d2283617587c6833',
 1: 'b1ea403d77c3e20a7a6b16b27f896d56b787543ea484b0ba8ff99917b77dbc1e',
 2: '7882ecf18a2107faf3ccb3e8ce3d66b7d26fea481313a19d3ec357282b94572b',
 3: 'eef40cd2628e7218a25cd06630cb028935e1220250c135b8acadfc2e0e0e72f3',
 4: 'f3f3b83a093df7d7f3c15797fd429efc11eaa6e3c75c6c34ee27c881a073afba',
 5: '306b4fa2850fe19f5133c26711071f22db0960382d5c967453edeb28d3c07eb1',
 6: 'f1c5e2c64b637f74c75b2e96c3e0aeec6a99065d88a5ed86cfd7eeab868ae214',
 7: '786f9cd9f5e12e80ea182de7b3cc3d39330e9d444306da657e9054ff05966ba2',
 8: 'bab365e0971c7bd56ac52a0316fc1085f52f90989dc543e050e9b03794377819',
 9: '1391a62b15ff029eb6ac769e6cfe61ac3425121a5bd9823eccdc89d7c9b1bffb',
 10: '6cc3b00f9d953198517184460e1f25e4b1a17a8cb72fcef4f9f56dc52f5845cf',
 11: '6abb94abae751677baec90f1391419118666275b64c6edf5946f8adf7cfe91e1',
 12: '8c72d62a0988097bc71ca4dcdc595b8af4bbcbafa0ddee8e533018f6a09ceda7',
 13: 'add8c4087fe42389a40b6cc144628fd3deee616a9e7b56c4ea11b17

In [10]:
#6 : Get results

pipeline.get_metrics()
# pipeline.optimisation_results

#pipeline.saveResults()

Unnamed: 0,CoverageK_10,CoverageK_20,NDCGK_10,NDCGK_20,NDCGK_50
"ItemKNN(K=100,normalize_X=False,normalize_sim=False,pop_discount=None,similarity=cosine)",0.26882,0.321882,0.000548,0.000892,0.000874
