In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as sps

from scipy.sparse import *

In [4]:
import time

In [5]:
from sklearn.metrics import roc_auc_score

# BPR

In [6]:
# Location of the training interactions (CSV whose first row is a header).
urm_path = '../../content/data_train.csv'

# Column dtypes keyed by position: two integer ID columns, then a float value.
urm_column_dtypes = {0: int, 1: int, 2: float}

urm_all_df = pd.read_csv(
    urm_path,
    sep=",",
    header=0,
    dtype=urm_column_dtypes,
    engine='python',
)

# Replace the header names found in the file with the names used in this notebook.
urm_all_df.columns = ["UserID", "ItemID", "Interaction"]

In [7]:
urm_all_df.head(10)

Unnamed: 0,UserID,ItemID,Interaction
0,1,7,1.0
1,1,15,1.0
2,1,16,1.0
3,1,133,1.0
4,1,161,1.0
5,1,187,1.0
6,1,205,1.0
7,1,222,1.0
8,1,237,1.0
9,1,354,1.0


In [8]:
# One row of the dataframe corresponds to one stored interaction.
print("The number of interactions is {}".format(len(urm_all_df.index)))

The number of interactions is 478730


In [9]:
# Distinct original identifiers present in the interaction table.
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

n_users = userID_unique.shape[0]
n_items = itemID_unique.shape[0]
n_interactions = urm_all_df.shape[0]

# Derived statistics, computed once and then reported.
interactions_per_user = n_interactions/n_users
interactions_per_item = n_interactions/n_items
sparsity_percent = (1-float(n_interactions)/(n_items*n_users))*100

print("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print("Average interactions per user {:.2f}".format(interactions_per_user))
print("Average interactions per item {:.2f}\n".format(interactions_per_item))

print("Sparsity {:.2f} %".format(sparsity_percent))

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


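Not every user (or item) ID appears in the URM: the maximum IDs printed above are larger than the number of distinct users and items. We therefore map the original IDs to contiguous 0-based indices: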

In [10]:
# Users: pair every distinct original UserID with a 0-based index.
mapped_id, original_id = pd.factorize(pd.unique(urm_all_df["UserID"]))
user_original_ID_to_index = pd.Series(data=mapped_id, index=original_id)

# Items: same construction for the ItemID column.
mapped_id, original_id = pd.factorize(pd.unique(urm_all_df["ItemID"]))
item_original_ID_to_index = pd.Series(data=mapped_id, index=original_id)

We now replace the IDs in the dataframe and we are ready to use the data

In [11]:
# Swap the original IDs for their 0-based indices.
# The ID columns are overwritten, so this cell must run exactly once per data load.
urm_all_df = urm_all_df.assign(
    UserID=urm_all_df["UserID"].map(user_original_ID_to_index),
    ItemID=urm_all_df["ItemID"].map(item_original_ID_to_index),
)

In [12]:
urm_all_df.head(n=10)

Unnamed: 0,UserID,ItemID,Interaction
0,0,0,1.0
1,0,1,1.0
2,0,2,1.0
3,0,3,1.0
4,0,4,1.0
5,0,5,1.0
6,0,6,1.0
7,0,7,1.0
8,0,8,1.0
9,0,9,1.0


In [13]:
# Coordinate triplets: value, row (user index), column (item index).
interaction_values = urm_all_df["Interaction"].values
user_indices = urm_all_df["UserID"].values
item_indices = urm_all_df["ItemID"].values

urm_all = sps.coo_matrix((interaction_values, (user_indices, item_indices)))

urm_all

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [14]:
# tocsr() returns a new matrix instead of converting in place, so the result
# must be bound back to the name for later cells to receive a CSR matrix.
urm_all = urm_all.tocsr()

urm_all

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [15]:
def precision(recommended_items, relevant_items):
    """
    Fraction of the recommended items that are relevant.

    Parameters:
    - recommended_items (array-like): Recommended item indices, assumed free of duplicates.
    - relevant_items (array-like): Ground-truth item indices, assumed free of duplicates.

    Returns:
    - float: number of hits / number of recommended items, or 0.0 when nothing was recommended.
    """

    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    # An empty recommendation list would otherwise produce 0/0 (nan plus a warning).
    if is_relevant.size == 0:
        return 0.0

    precision_score = np.sum(is_relevant, dtype=np.float32) / is_relevant.size

    return precision_score

def recall(recommended_items, relevant_items):
    """
    Fraction of the relevant items that were recommended.

    Parameters:
    - recommended_items (array-like): Recommended item indices, assumed free of duplicates.
    - relevant_items (array-like): Ground-truth item indices, assumed free of duplicates.

    Returns:
    - float: number of hits / number of relevant items, or 0.0 when there are no relevant items.
    """

    # Accept plain lists as well as arrays; the size is needed for the denominator.
    relevant_items = np.asarray(relevant_items)

    # Without relevant items the ratio is undefined; report 0.0 instead of dividing by zero.
    if relevant_items.size == 0:
        return 0.0

    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def AP(recommended_items, relevant_items):
    """
    Average precision of a ranked recommendation list.

    Parameters:
    - recommended_items (array-like): Recommended item indices in rank order, assumed free of duplicates.
    - relevant_items (array-like): Ground-truth item indices, assumed free of duplicates.

    Returns:
    - float: sum of precision@k over the ranks k that hold a relevant item, divided by
      min(number of relevant items, number of recommended items); 0.0 if either input is empty.
    """

    relevant_items = np.asarray(relevant_items)

    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    # Either input being empty makes the denominator zero.
    if is_relevant.size == 0 or relevant_items.size == 0:
        return 0.0

    # Cumulative sum: precision at 1, at 2, at 3 ... kept only at ranks that are hits
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    ap_score = np.sum(p_at_k) / min(relevant_items.shape[0], is_relevant.shape[0])

    return ap_score

In [16]:
def evaluate_algorithm(URM_test, recommender_object, at=5):
    """
    Evaluate a recommender against held-out interactions and report Precision, Recall and MAP.

    Parameters:
    - URM_test (sparse matrix): Test User-Item matrix; converted to CSR because rows are read
      through `indices` / `indptr`.
    - recommender_object: Object exposing `recommend(user_id, at=...)`.
    - at (int): Cutoff forwarded to `recommend`.

    Returns:
    - (precision, recall, MAP) tuple, each averaged over the users that have at least one
      test item. All three are 0.0 when no user has test items.
    """

    # Row slicing below relies on the CSR layout; this is a cheap no-op for CSR input.
    URM_test = sps.csr_matrix(URM_test)

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0

    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        # Users without test interactions cannot be scored and are skipped.
        if len(relevant_items) > 0:

            recommended_items = recommender_object.recommend(user_id, at=at)
            num_eval += 1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_AP += AP(recommended_items, relevant_items)

    # Guard the averages: with no evaluable user the sums are all zero and so are the means.
    if num_eval > 0:
        cumulative_precision /= num_eval
        cumulative_recall /= num_eval
        MAP = cumulative_AP / num_eval
    else:
        MAP = 0.0

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP))

    return cumulative_precision, cumulative_recall, MAP

---

# Splitting Train and Validation set

In [17]:
def train_test_split_urm(urm, train_percentage=0.8, seed=None):
    """
    Splits the User-Item Interaction Matrix (URM) into training and testing sets.

    Each non-zero interaction is assigned at random to exactly one of the two matrices.

    Parameters:
    - urm (sparse matrix): The User-Item Interaction Matrix (any scipy.sparse format).
    - train_percentage (float): The fraction of interactions to include in the training set.
    - seed (int): Seed for reproducibility. When given, it seeds numpy's global random state.

    Returns:
    - train_urm (csr_matrix): Training User-Item Interaction Matrix.
    - test_urm (csr_matrix): Testing User-Item Interaction Matrix.

    Raises:
    - ValueError: If train_percentage is not strictly between 0 and 1.
    """

    if not (0 < train_percentage < 1):
        raise ValueError("train_percentage must be between 0 and 1 exclusive")

    # Set seed for reproducibility (global state, so draws made after this call are affected too)
    if seed is not None:
        np.random.seed(seed)

    # Take rows, columns and values from one COO view so the three arrays are aligned.
    # Mixing urm.nonzero() with urm.data breaks as soon as an explicit zero is stored,
    # because nonzero() skips it while urm.data does not.
    urm_coo = sps.coo_matrix(urm)
    is_interaction = urm_coo.data != 0
    rows = urm_coo.row[is_interaction]
    cols = urm_coo.col[is_interaction]
    values = urm_coo.data[is_interaction]

    num_interactions = len(values)

    # Randomly shuffle the interaction positions
    shuffled_indices = np.arange(num_interactions)
    np.random.shuffle(shuffled_indices)

    # The first part of the shuffled positions goes to the training set
    num_train_interactions = int(train_percentage * num_interactions)

    # Boolean masks (rather than the shuffled positions themselves) keep the
    # original ordering of the entries inside each output matrix.
    train_mask = np.zeros(num_interactions, dtype=bool)
    train_mask[shuffled_indices[:num_train_interactions]] = True
    test_mask = ~train_mask

    train_urm = sps.coo_matrix((values[train_mask], (rows[train_mask], cols[train_mask])),
                               shape=urm_coo.shape).tocsr()
    test_urm = sps.coo_matrix((values[test_mask], (rows[test_mask], cols[test_mask])),
                              shape=urm_coo.shape).tocsr()

    return train_urm, test_urm


Split the dataset into train and validation sets. We use the standard 0.8-0.2 split. The split ratio is a hyperparameter, so we should tune it.

In [18]:
# Assuming urm is your User-Item Interaction Matrix in csr_matrix format
# Hold out 20% of the interactions for validation; the fixed seed makes the split repeatable.
train_urm, test_urm = train_test_split_urm(urm=urm_all, train_percentage=0.8, seed=42)

-------------------------------------------------------------------------------

# BPR implementation

## Step 1: We create the dense latent factor matrices

In [19]:
# Matrix dimensions: rows are users, columns are items.
n_users = train_urm.shape[0]
n_items = train_urm.shape[1]

In [20]:
# Dimensionality of the latent space shared by users and items.
num_factors = 20

# One row of latent factors per user and per item, drawn uniformly from [0, 1).
user_factors = np.random.random_sample(size=(n_users, num_factors))
item_factors = np.random.random_sample(size=(n_items, num_factors))

## Step 2: We sample a triplet

#### Create a mask of positive interactions.

In [21]:
# Work on a copy so the training matrix itself is left untouched.
URM_mask = train_urm.copy()

# Zero out every stored value below 1.0, then drop those entries from the sparse structure.
below_threshold = URM_mask.data < 1.0
URM_mask.data[below_threshold] = 0
URM_mask.eliminate_zeros()

URM_mask

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 382984 stored elements in Compressed Sparse Row format>

## Create a function to get a triplet (user_id, pos_item_id, neg_item_id)

In [22]:
def sample_triplet():
    """
    Draw one training triplet from the notebook-level URM_mask.

    Reads the globals n_users, n_items and URM_mask.

    Returns:
    - (user_id, pos_item_id, neg_item_id): a user with at least one stored item,
      one of that user's stored items, and an item index not stored for that user.
    """

    # Keep drawing users until one with a non-empty profile comes up.
    while True:
        user_id = np.random.choice(n_users)
        row_start, row_end = URM_mask.indptr[user_id], URM_mask.indptr[user_id+1]
        user_seen_items = URM_mask.indices[row_start:row_end]

        if len(user_seen_items) > 0:
            break

    pos_item_id = np.random.choice(user_seen_items)

    # Rejection sampling: it's faster to just try again than to build a mapping of the non-seen items
    neg_item_id = np.random.randint(0, n_items)
    while neg_item_id in user_seen_items:
        neg_item_id = np.random.randint(0, n_items)

    return user_id, pos_item_id, neg_item_id

## Create the function to train the model

In [23]:
# Step size; passed to train_one_epoch as its learning_rate argument.
learning_rate = 1e-4
# Weight of the regularization term; read as a global inside train_one_epoch.
regularization = 1e-4

In [24]:
def train_one_epoch(user_factors, item_factors, learning_rate):
    """
    Run one epoch of stochastic gradient updates on sampled triplets.

    One epoch draws n_users triplets through sample_triplet(). The globals n_users and
    regularization are read from the notebook namespace.

    Parameters:
    - user_factors (ndarray): User latent factors, one row per user; updated in place.
    - item_factors (ndarray): Item latent factors, one row per item; updated in place.
    - learning_rate (float): Step size of each update.

    Returns:
    - user_factors, item_factors: the same (updated) arrays that were passed in.
    - samples_per_second (float): Throughput at the last progress report (0.0 if no sample was drawn).
    """

    start_time = time.time()
    # Defined up front so the return statement is valid even when the loop body never runs.
    samples_per_second = 0.0

    for sample_num in range(n_users):

        # Sample triplet
        user_id, pos_item_id, neg_item_id = sample_triplet()

        # Snapshot the three factor rows. Basic indexing returns views, so without the
        # copies the in-place user update below would leak into the two item updates.
        W_u = user_factors[user_id, :].copy()
        H_i = item_factors[pos_item_id, :].copy()
        H_j = item_factors[neg_item_id, :].copy()

        # Prediction
        x_ui = np.dot(W_u, H_i)
        x_uj = np.dot(W_u, H_j)

        # Gradient
        x_uij = x_ui - x_uj

        # Equals 1 / (1 + exp(x_uij)); written through logaddexp so that a large
        # x_uij cannot overflow exp().
        sigmoid_item = np.exp(-np.logaddexp(0.0, x_uij))

        user_factors[user_id, :] += learning_rate * (sigmoid_item * (H_i - H_j) - regularization * W_u)
        item_factors[pos_item_id, :] += learning_rate * (sigmoid_item * (W_u) - regularization * H_i)
        item_factors[neg_item_id, :] += learning_rate * (sigmoid_item * (-W_u) - regularization * H_j)

        # Print some stats
        if (sample_num + 1) % 50000 == 0 or (sample_num + 1) == n_users:
            elapsed_time = time.time() - start_time
            # A zero reading is possible with a coarse clock; avoid dividing by it.
            samples_per_second = (sample_num + 1) / elapsed_time if elapsed_time > 0 else float("inf")
            print("Iteration {} in {:.2f} seconds. Samples per second {:.2f}".format(sample_num+1, elapsed_time, samples_per_second))

    return user_factors, item_factors, samples_per_second

---

# Actual training

In [25]:
# Fixed evaluation set: lists of (user_id, positive_item) and (user_id, negative_item) tuples
num_samples = 1000  # Adjust the number of samples based on your needs

sampled_triplets = [sample_triplet() for _ in range(num_samples)]

# Label 1 for positive samples, label 0 for negative samples
positive_samples = [(sampled_user, sampled_pos) for sampled_user, sampled_pos, _ in sampled_triplets]
negative_samples = [(sampled_user, sampled_neg) for sampled_user, _, sampled_neg in sampled_triplets]

In [26]:
for n_epoch in range(100):
    user_factors, item_factors, samples_per_second = train_one_epoch(user_factors, item_factors, learning_rate)

    # Score the fixed evaluation pairs after every epoch: positives first, then negatives.
    evaluation_pairs = positive_samples + negative_samples
    true_labels = [1] * len(positive_samples) + [0] * len(negative_samples)
    predicted_scores = [np.dot(user_factors[user], item_factors[item]) for user, item in evaluation_pairs]

    auc_score = roc_auc_score(true_labels, predicted_scores)
    print("AUC Score:", auc_score)

Iteration 12638 in 1.57 seconds. Samples per second 8033.67
AUC Score: 0.5096809999999999
Iteration 12638 in 1.36 seconds. Samples per second 9291.93
AUC Score: 0.510188
Iteration 12638 in 1.50 seconds. Samples per second 8413.58
AUC Score: 0.5107039999999999
Iteration 12638 in 1.75 seconds. Samples per second 7212.99
AUC Score: 0.511192
Iteration 12638 in 1.51 seconds. Samples per second 8352.31
AUC Score: 0.511715
Iteration 12638 in 1.59 seconds. Samples per second 7932.85
AUC Score: 0.512192
Iteration 12638 in 1.55 seconds. Samples per second 8147.74
AUC Score: 0.5126759999999999
Iteration 12638 in 1.56 seconds. Samples per second 8126.71
AUC Score: 0.513193
Iteration 12638 in 1.58 seconds. Samples per second 8013.25
AUC Score: 0.513685
Iteration 12638 in 1.54 seconds. Samples per second 8227.33
AUC Score: 0.514162
Iteration 12638 in 1.62 seconds. Samples per second 7795.89
AUC Score: 0.514643
Iteration 12638 in 1.60 seconds. Samples per second 7898.21
AUC Score: 0.515123
Iteration 

---

# AUC Score
We use the AUC score to know how we're doing

In [27]:
# Labels and model scores for the fixed evaluation pairs: positives first, then negatives.
evaluation_pairs = positive_samples + negative_samples

true_labels = [1] * len(positive_samples) + [0] * len(negative_samples)
predicted_scores = [np.dot(user_factors[user], item_factors[item]) for user, item in evaluation_pairs]

In [28]:
# ROC AUC of the predicted scores against the 1/0 labels of the sampled pairs.
auc_score = roc_auc_score(true_labels, predicted_scores)
print("AUC Score:", auc_score)

AUC Score: 0.551023


# Predictions

In [29]:
# The learned factors give the score of any (user, item) pair as the dot product
# of the user's row and the item's row. Both indices are mapped (0-based) indices.
user_id = 0
item_id = 1
prediction = np.dot(user_factors[user_id], item_factors[item_id])
print(f"Prediction for user {user_id} and item {item_id}: {prediction}")

Prediction for user 0 and item 1: 5.45743876616588


**Now let's read the data we want to predict for and see if there're overlaps with UserIDs in train data**

In [30]:
# Users for which recommendations must be produced (CSV whose first row is a header).
urm_pred_path = '../../content/data_target_users_test.csv'

urm_pred_df = pd.read_csv(filepath_or_buffer=urm_pred_path,
                          sep=",",
                          header=0,
                          dtype={0:int},
                          engine='python')

urm_pred_df.columns = ["UserID"]

# The bare len(...) expression that used to sit here was evaluated and discarded
# (it was not the last line of the cell), so it has been removed.
print('Unique user id to predict:', urm_pred_df['UserID'].nunique())

Unique user id to predict: 10882


We need to convert it to the mapping we've already done for urm_all_df. But we may then have duplicates ids. So we need another method. We may want to convert to the mapped id only when we make a prediction. If the user is not in the mapping then we return the top recommender result ("[517, 189, 44, 0, 284, 808, 285, 1, 557, 1266]")

In [36]:
def predict_for_user(user_id, user_factors, item_factors, user_original_ID_to_index,
                     item_index_mapping=None, n_recommendations=10, default_recommendations=None):
    """
    Recommend items for one user identified by the original (unmapped) user ID.

    Parameters:
    - user_id: Original user ID, as found in the target-users file.
    - user_factors (ndarray): User latent factors, one row per mapped user index.
    - item_factors (ndarray): Item latent factors, one row per mapped item index.
    - user_original_ID_to_index (pd.Series): Mapped user index, indexed by original user ID.
    - item_index_mapping (pd.Series, optional): Mapped item index, indexed by original item ID,
      with position i holding the item mapped to index i. Defaults to the notebook-level
      item_original_ID_to_index.
    - n_recommendations (int): Number of items to return.
    - default_recommendations (sequence, optional): Items returned for users missing from
      user_original_ID_to_index. Defaults to the hard-coded list below.

    Returns:
    - ndarray of original item IDs, best first, for known users; a list for unknown users.
    """
    # Check if the user is in the mapping (`in` on a Series tests its index, i.e. the original IDs)
    if user_id in user_original_ID_to_index:
        if item_index_mapping is None:
            # Backward-compatible fallback to the mapping defined at notebook level.
            item_index_mapping = item_original_ID_to_index

        mapped_user_id = user_original_ID_to_index[user_id]
        item_for_user = np.dot(user_factors[mapped_user_id, :], item_factors.T)
        top_item_indices = np.argsort(item_for_user)[::-1][:n_recommendations]
        # Position in the mapping's index equals the mapped item index, so this
        # converts mapped indices back to original item IDs.
        prediction = item_index_mapping.index[top_item_indices].values

    else:
        # Use default recommendations for users not in the mapping.
        # NOTE(review): the original author's note says this list still has to be converted
        # between mapped and original item IDs - confirm which ID space these values are in.
        if default_recommendations is None:
            default_recommendations = [517, 189, 44, 0, 284, 808, 285, 1, 557, 1266]
        prediction = list(default_recommendations)[:n_recommendations]

    return prediction

In [37]:
# One recommendation list per target user, in the order of the target-users file.
predictions = [
    predict_for_user(target_user, user_factors, item_factors, user_original_ID_to_index)
    for target_user in urm_pred_df['UserID']
]

In [39]:
urm_pred_df

Unnamed: 0,UserID
0,1
1,2
2,3
3,4
4,5
...,...
10877,13020
10878,13021
10879,13022
10880,13023


The predictions list should have the same length as the list of users to predict for.

In [40]:
len(predictions)

10882

In [103]:
# Submission format: the recommended items of each user as one space-separated string.
predictions_as_strings = [' '.join(str(item) for item in arr) for arr in predictions]

In [105]:
# Build the submission frame in one shot. Filling an empty frame cell-by-cell with .at
# grows it one row at a time, which is slow and relies on urm_pred_df having a default index.
assert len(predictions_as_strings) == len(urm_pred_df), "one prediction string is needed per target user"

pred_df = pd.DataFrame({
    'user_id': urm_pred_df['UserID'].values,
    'item_list': predictions_as_strings,
})

In [106]:
pred_df.loc[54]

user_id                                       60
item_list    517 189 44 0 284 808 285 1 557 1266
Name: 54, dtype: object

In [None]:
# Write the submission file without the index column.
# NOTE(review): this path is absolute, unlike the relative '../../content/...' paths
# used for the input files - confirm it is intended.
pred_df.to_csv('/content/predCF_Max.csv',index=False)