First testing of reading CSR matrix back into decoded id's

In [10]:
import tqdm as notebook_tqdm
import pandas as pd

from recpack.scenarios import WeakGeneralization, Timed
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem

In [17]:
#1:  Data collection
transactions_path = '../../00 - Data/transactions/transactions_train.csv'
transactions = pd.read_csv(transactions_path)
print("Original data has size of : " + str(len(transactions)))

sample = 0.005
transactions_sample = transactions.sample(frac=sample, random_state=41)
print("Created a sample of " + str(sample) + " % with " + str(len(transactions_sample)) + " records")

Original data has size of : 31788324
Created a sample of 0.005 % with 158942 records


In [18]:
#2: Data preprocessing

#       item1   item2   item3
#usr1      x                x
#usr2       x       x
proc = DataFramePreprocessor(item_ix='article_id', user_ix='customer_id', timestamp_ix='t_dat')
# #every user has at least 2 items bought
proc.add_filter(MinUsersPerItem(2, item_ix='article_id', user_ix='customer_id'))
# #every item is bought at least twice
proc.add_filter(MinItemsPerUser(2, item_ix='article_id', user_ix='customer_id'))

interaction_matrix = proc.process(transactions_sample)

100%|██████████| 38771/38771 [00:00<00:00, 666532.88it/s]
100%|██████████| 38771/38771 [00:00<00:00, 630480.68it/s]


In [19]:
#3 : Create scenario
#divide matrix into test-train (75-25)
scenario = WeakGeneralization(0.75, validation=True)
# scenario = Timed()
scenario.split(interaction_matrix)

17168it [00:05, 3431.67it/s]
17168it [00:05, 3387.81it/s]
  warn(f"{name} resulting from {type(self).__name__} is unusually small.")


In [20]:
#4 : Create the builder object
from PipelineBuilder_modified import *


builder = PipelineBuilder()
builder.set_data_from_scenario(scenario)

#adds algorithms to use later on. Baseline algorithim, just recommends popular stuff
builder.add_algorithm('Popularity') 
#Set the metric for optimisation of parameters in algorithms. What is NDCGK ??
builder.set_optimisation_metric('NDCGK', K=10)

#adds metric for evaluation
#NDCGK = Normalized Discounted Cumulative Gain at K
builder.add_metric('NDCGK', K=[10, 20, 50])
builder.add_metric('CoverageK', K=[10, 20])

In [21]:
#5 : Create and run the pipeline
pipeline = builder.build()
csr = pipeline.run2()

  0%|          | 0/1 [00:00<?, ?it/s]

2023-11-21 12:31:04,298 - base - recpack - INFO - Fitting Popularity complete - Took 0.0214s


  0%|          | 0/1 [00:00<?, ?it/s]


CSR.
CSR.indices = items
CSR.data = values

In [61]:
user_ids = set()
for row in range(csr.shape[0]):
    if csr.indptr[row] != csr.indptr[row + 1]:
        user_ids.add(row)

In [62]:
user_ids

{5,
 8,
 48,
 50,
 53,
 56,
 57,
 100,
 110,
 133,
 134,
 135,
 146,
 157,
 170,
 171,
 186,
 187,
 211,
 213,
 234,
 237,
 255,
 260,
 276,
 297,
 301,
 308,
 327,
 339,
 341,
 346,
 372,
 379,
 387,
 399,
 408,
 419,
 420,
 442,
 460,
 461,
 485,
 502,
 513,
 516,
 518,
 526,
 532,
 536,
 544,
 546,
 562,
 578,
 603,
 622,
 657,
 662,
 682,
 707,
 726,
 729,
 739,
 745,
 749,
 762,
 766,
 786,
 793,
 799,
 806,
 815,
 827,
 842,
 847,
 853,
 880,
 899,
 901,
 908,
 937,
 939,
 947,
 949,
 950,
 951,
 976,
 984,
 988,
 990,
 1002,
 1006,
 1010,
 1011,
 1015,
 1020,
 1024,
 1086,
 1092,
 1102,
 1110,
 1121,
 1131,
 1137,
 1139,
 1143,
 1148,
 1196,
 1221,
 1224,
 1246,
 1248,
 1256,
 1258,
 1278,
 1279,
 1286,
 1294,
 1310,
 1325,
 1335,
 1337,
 1421,
 1422,
 1436,
 1442,
 1443,
 1446,
 1448,
 1472,
 1474,
 1477,
 1481,
 1489,
 1507,
 1522,
 1549,
 1557,
 1565,
 1583,
 1584,
 1589,
 1591,
 1609,
 1616,
 1617,
 1623,
 1644,
 1654,
 1660,
 1664,
 1679,
 1690,
 1715,
 1721,
 1746,
 1748,


In [60]:
print(csr.indptr)

[     0      0      0 ... 159829 159829 159829]


In [38]:
print(csr)

  (5, 17)	0.171875
  (5, 53)	0.34375
  (5, 63)	0.1875
  (5, 94)	0.234375
  (5, 104)	0.15625
  (5, 106)	0.1875
  (5, 109)	0.34375
  (5, 115)	0.203125
  (5, 123)	0.5625
  (5, 132)	0.15625
  (5, 142)	0.296875
  (5, 147)	0.171875
  (5, 167)	0.1875
  (5, 188)	0.34375
  (5, 208)	0.234375
  (5, 220)	0.171875
  (5, 229)	0.15625
  (5, 270)	0.1875
  (5, 316)	0.15625
  (5, 331)	0.171875
  (5, 348)	0.296875
  (5, 354)	0.3125
  (5, 358)	0.234375
  (5, 363)	0.1875
  (5, 367)	0.171875
  :	:
  (16914, 4345)	0.15625
  (16914, 4455)	0.171875
  (16914, 4471)	0.1875
  (16914, 4532)	0.28125
  (16914, 4585)	0.15625
  (16914, 4708)	0.203125
  (16914, 4743)	0.15625
  (16914, 4767)	0.28125
  (16914, 4921)	0.28125
  (16914, 4990)	0.171875
  (16914, 5221)	0.171875
  (16914, 5300)	0.203125
  (16914, 5396)	0.25
  (16914, 5496)	0.21875
  (16914, 5549)	0.15625
  (16914, 5583)	0.15625
  (16914, 5751)	0.171875
  (16914, 6064)	0.171875
  (16914, 6104)	0.1875
  (16914, 6260)	0.21875
  (16914, 6328)	0.15625
  (16914, 640

In [None]:
# print(csr.indices)
# print(csr.getrow(5))
# print(csr.data)

In [71]:
# for user in user_ids:
user = 5
#info sobre las recomendaciones de un usuario
client_row = csr.getrow(user)
print(client_row)
#indice del item con maxima recomendacion 
rec_value_index = np.argmax(client_row.data)
print(rec_value_index)
#valor asociado a dicha recomendacion
rec_value = client_row.max()
print(rec_value)
#id del articulo recomendado
article_id_rec= client_row.indices[rec_value_index]
print(article_id_rec)

  (0, 17)	0.171875
  (0, 53)	0.34375
  (0, 63)	0.1875
  (0, 94)	0.234375
  (0, 104)	0.15625
  (0, 106)	0.1875
  (0, 109)	0.34375
  (0, 115)	0.203125
  (0, 123)	0.5625
  (0, 132)	0.15625
  (0, 142)	0.296875
  (0, 147)	0.171875
  (0, 167)	0.1875
  (0, 188)	0.34375
  (0, 208)	0.234375
  (0, 220)	0.171875
  (0, 229)	0.15625
  (0, 270)	0.1875
  (0, 316)	0.15625
  (0, 331)	0.171875
  (0, 348)	0.296875
  (0, 354)	0.3125
  (0, 358)	0.234375
  (0, 363)	0.1875
  (0, 367)	0.171875
  :	:
  (0, 4345)	0.15625
  (0, 4455)	0.171875
  (0, 4471)	0.1875
  (0, 4532)	0.28125
  (0, 4585)	0.15625
  (0, 4708)	0.203125
  (0, 4743)	0.15625
  (0, 4767)	0.28125
  (0, 4921)	0.28125
  (0, 4990)	0.171875
  (0, 5221)	0.171875
  (0, 5300)	0.203125
  (0, 5396)	0.25
  (0, 5496)	0.21875
  (0, 5549)	0.15625
  (0, 5583)	0.15625
  (0, 5751)	0.171875
  (0, 6064)	0.171875
  (0, 6104)	0.1875
  (0, 6260)	0.21875
  (0, 6328)	0.15625
  (0, 6407)	0.1875
  (0, 7389)	0.171875
  (0, 7562)	0.296875
  (0, 8898)	0.203125
71
1.0
1219


In [79]:
for user in user_ids:
    print("User : " + str(user))
    #info sobre las recomendaciones de un usuario
    client_row = csr.getrow(user)
    # print(client_row)
    #indice del item con maxima recomendacion 
    rec_value_index = np.argmax(client_row.data)
    # print(rec_value_index)
    #valor asociado a dicha recomendacion
    rec_value = client_row.max()
    print("Max recommendation value : " + str(rec_value))
    #id del articulo recomendado
    article_id_rec= client_row.indices[rec_value_index]
    print("Recommended article id : " + str(article_id_rec))

User : 5
Max recommendation value : 1.0
Recommended article id : 1219
User : 8
Max recommendation value : 1.0
Recommended article id : 1219
User : 4110
Max recommendation value : 1.0
Recommended article id : 1219
User : 10260
Max recommendation value : 1.0
Recommended article id : 1219
User : 4128
Max recommendation value : 1.0
Recommended article id : 1219
User : 8225
Max recommendation value : 1.0
Recommended article id : 1219
User : 6178
Max recommendation value : 1.0
Recommended article id : 1219
User : 12320
Max recommendation value : 1.0
Recommended article id : 1219
User : 14373
Max recommendation value : 1.0
Recommended article id : 1219
User : 2086
Max recommendation value : 1.0
Recommended article id : 1219
User : 4134
Max recommendation value : 1.0
Recommended article id : 1219
User : 14376
Max recommendation value : 1.0
Recommended article id : 1219
User : 2089
Max recommendation value : 1.0
Recommended article id : 1219
User : 4137
Max recommendation value : 1.0
Recommende

In [None]:
num_unique_users = csr.shape[0]  # Number of rows represents users
print("Number of unique users:", num_unique_users)

In [None]:
for user_id in range(num_unique_users):
    user_row = csr_matrix[user_id]  # Get the user's interaction row
    
    
    # Extract top recommendation for the user
    for idx in sorted_indices:
        # Check if the user hasn't interacted with the item (value in the user row is zero)
        if user_row[idx] == 0:
            top_recommendation = idx  # This is the top recommendation for the user
            break  # Break after finding the first recommendation

    # Process or store the top recommendation for the current user
    process_recommendation(user_id, top_recommendation)  # Process or store recommendation


In [None]:
print("Data:", csr.data)       # Print the data array
print("Indices:", csr.indices) # Print the indices array
print("Indptr:", csr.indptr)  # Print the index pointer array

In [None]:
# Finding the item with the highest probability for each user
num_users = csr.shape[0]
for user_idx in range(num_users):
    user_data = csr.getrow(user_idx)
    max_prob_idx = user_data.argmax()
    max_prob_value = user_data.max()
    print(f"For user {user_idx + 1}, the item with the highest probability is item{max_prob_idx} with probability {max_prob_value}.")

In [None]:
# Assuming you have the `proc` DataFramePreprocessor instance and `interaction_matrix` containing processed data

# Get the item and user ID mappings from the DataFramePreprocessor instance
item_id_mapping = proc.item_id_mapping.set_index(interaction_matrix.ITEM_IX)[proc.item_ix].to_dict()
user_id_mapping = proc.user_id_mapping.set_index(interaction_matrix.USER_IX)[proc.user_ix].to_dict()



# Reverse the item and user mappings
reversed_item_mapping = {v: k for k, v in item_id_mapping.items()}
reversed_user_mapping = {v: k for k, v in user_id_mapping.items()}

# Get the original row and column indices
original_rows = np.array([reversed_user_mapping.get(row, None) for row, _ in csr_matrix_data])
original_cols = np.array([reversed_item_mapping.get(col, None) for _, col in csr_matrix_data])

# Filter out None values (entries without original IDs)
valid_indices = np.where((original_rows is not None) & (original_cols is not None))[0]

original_rows = original_rows[valid_indices]
original_cols = original_cols[valid_indices]
values = np.array([val for _, _, val in csr_matrix_data])[valid_indices]

# Find indices of original IDs in the arrays
user_indices = np.searchsorted(original_user_ids, original_rows)
item_indices = np.searchsorted(original_item_ids, original_cols)

# Reconstruct the CSR matrix
num_users = len(original_user_ids)
num_items = len(original_item_ids)
interaction_matrix = csr_matrix((values, (user_indices, item_indices)), shape=(num_users, num_items))

In [None]:

# Assuming you have the `proc` DataFramePreprocessor instance and `interaction_matrix` containing processed data

# Get the item and user ID mappings from the DataFramePreprocessor instance
item_id_mapping = proc.item_id_mapping.set_index(interaction_matrix.ITEM_IX)[proc.item_ix].to_dict()
user_id_mapping = proc.user_id_mapping.set_index(interaction_matrix.USER_IX)[proc.user_ix].to_dict()

user_id_mapping


In [None]:
# Reverse the user_id_mapping dictionary to get original values
reverse_user_id_mapping = {v: k for k, v in user_id_mapping.items()}
reverse_item_id_mapping = {v: k for k, v in item_id_mapping.items()}

In [None]:
# Reconstruct the full matrix
full_matrix = np.zeros(csr.shape)

# Iterate through the CSR matrix to populate the full matrix
for user_idx in range(csr.shape[0]):
    user_id = user_id_mapping.get(user_idx)
    if user_id is None:
        continue
    
    for item_idx, value in zip(csr.indices[csr.indptr[user_idx]:csr.indptr[user_idx + 1]],
                               csr.data[csr.indptr[user_idx]:csr.indptr[user_idx + 1]]):
        item_id = item_id_mapping.get(item_idx)
        if item_id is None:
            continue
        
        full_matrix[user_idx, item_idx] = value

# Display the reconstructed matrix
print(full_matrix)

In [None]:

# Initialize an empty list to store the new data for the CSR matrix
new_data = []
new_row_indices = []
new_col_indices = []

for row in range(csr.shape[0]):
    for col in csr.indices[csr.indptr[row]:csr.indptr[row+1]]:
        new_row_indices.append(row)
        new_col_indices.append(col)
        user_id = user_id_mapping[col]
        item_id = item_id_mapping[row]
        new_data.append(csr[row, col])

# Creating a new CSR matrix using the mapped data
new_csr = csr_matrix((new_data, (new_row_indices, new_col_indices)), shape=csr.shape)

print(new_csr)

In [None]:
# Find top recommended item for each user
user_count, item_count = csr.shape
row_ind = np.arange(user_count)
max_indices = csr.argmax(axis=1)
max_values = csr[row_ind, max_indices]

print("Top recommended items for each user:")
for user, item in enumerate(max_indices):
    print(f"User {user}: Item {item} with score {max_values[user]}")

In [None]:
# Convert (user_id, item_id) tuples to (user_idx, item_idx) using the mappings
mapped_data = { (user_id_mapping[user_idx], item_id_mapping[item_idx]): value for (user_idx, item_idx), value in csr.items()}

# Create a DataFrame from the mapped data
df = pd.DataFrame(np.nan, index=list(item_id_mapping.values()), columns=list(user_id_mapping.values()))

# Fill the DataFrame with mapped values
for (user_id, item_id), value in mapped_data.items():
    df.loc[item_id, user_id] = value

print(df)

In [None]:
csr_matrix_with_mapped_values = csr.copy()
# Get the row and column indices as well as data from the CSR matrix
row_indices, col_indices = csr_matrix_with_mapped_values.nonzero()
data = csr_matrix_with_mapped_values.data

# Replace the row indices with original values using reverse_user_id_mapping
original_row_indices = np.array([user_id_mapping[i] for i in row_indices])

# Reconstruct the CSR matrix with original values
reconstructed_csr_matrix = csr_matrix(
    (data, (original_row_indices, col_indices)),
    shape=(len(user_id_mapping), csr_matrix_with_mapped_values.shape[1])
)

print(reconstructed_csr_matrix)

In [None]:

# Get the row and column indices as well as data from the CSR matrix
row_indices, col_indices = csr.nonzero()
data = csr.data

# Replace the row indices with original values
original_row_indices = np.array([reverse_user_id_mapping[i] for i in row_indices])

# Reconstruct the CSR matrix with original values
reconstructed_csr_matrix = csr_matrix(
    (data, (original_row_indices, col_indices)),
    shape=(len(user_id_mapping), csr.shape[1])
)

print(reconstructed_csr_matrix)

In [None]:
# Assuming you have the `proc` DataFramePreprocessor instance and `interaction_matrix` containing processed data

# Get the item and user ID mappings from the DataFramePreprocessor instance
item_id_mapping = proc.item_id_mapping.set_index(interaction_matrix.ITEM_IX)[proc.item_ix].to_dict()
user_id_mapping = proc.user_id_mapping.set_index(interaction_matrix.USER_IX)[proc.user_ix].to_dict()

# Assuming you have your CSR matrix `csr` and the ID mappings

# Revert item IDs in the CSR matrix
reverted_item_ids = [item_id_mapping.get(internal_id, internal_id) for internal_id in csr.indices]

# Revert user IDs in the CSR matrix
reverted_user_ids = [user_id_mapping.get(internal_id, internal_id) for internal_id in csr.indptr]

# Now `reverted_item_ids` contains the item IDs in their original form
# And `reverted_user_ids` contains the user IDs in their original form

# Update item and user IDs in the original interaction matrix
csr[InteractionMatrix.ITEM_IX] = reverted_item_ids
csr[InteractionMatrix.USER_IX] = reverted_user_ids


In [None]:
# from scipy.sparse import csr_matrix
# dense_array = csr.toarray()
# print(dense_array)
# dense_matrix = csr.todense()
# print(dense_matrix)


# from recpack.postprocessing import postprocessors
# post = Postprocessor()
# csr_processed = post.process(csr)

In [None]:
#6 : Get results

pipeline.get_metrics()
# pipeline.optimisation_results

#pipeline.saveResults()