## Data Preparation

In [3]:
import os
import random
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from tqdm.auto import tqdm

# Seed Python's `random` and NumPy so that sampling and splits repeat across runs.
seed = 42
random.seed(seed)
np.random.seed(seed)
# Absolute base directories for input data, saved models and logs.
data_dir = '/home/sagemaker-user/Data/'
model_dir = '/home/sagemaker-user/Models/'
log_dir = '/home/sagemaker-user/Logs/'

# Load the prepared train/test interaction tables.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
train_df = pd.read_pickle(data_dir+'clean_data/train_set.pkl')
test_df = pd.read_pickle(data_dir+'clean_data/test_set.pkl')

In [6]:
# Item text table: keep only the product key and its text, one row per distinct (parent_asin, text) pair.
# NOTE(review): duplicates are dropped on the pair, not on 'parent_asin' alone. If one product can carry
# several different texts, a later join on 'parent_asin' would multiply rows; confirm the key is unique.
filtered_data = pd.read_pickle(data_dir+'clean_data/filtered_data.pkl')
filtered_data=filtered_data[['parent_asin','text']].drop_duplicates()

In [7]:
# Preview the de-duplicated (parent_asin, text) table.
filtered_data.head()

Unnamed: 0,parent_asin,text
0,B0002C7FHC,dog whole lot say basic bark control collar ef...
1,B00UFKQKLS,hamilton nylon comfort dog harness come vibran...
2,B01I6X61OQ,high quality adjustable collar quickrelease bu...
3,B09C8C44PQ,petnation portacrate portable easy carry take ...
4,B001150X9G,poochiebells leader dog potty training communi...


In [8]:
# Import necessary libraries
import pandas as pd
from multiprocessing import Pool, cpu_count  # For parallel processing
from tqdm import tqdm  # For progress bars
import numpy as np  # For array splitting
import gc  # For garbage collection to manage memory

# Select the columns to merge from the filtered data
# Narrow the item table to the join key plus the text payload that will be attached
filtered_subset = filtered_data.loc[:, ['parent_asin', 'text']]

# Free unreachable objects before the memory-heavy merge starts
gc.collect()

# Show the schema of both splits ahead of the join
print("Before merging:")
print("train_df columns:", list(train_df.columns))
print("test_df columns:", list(test_df.columns))

# Define the batch merge function
def merge_batch(args):
    """
    Attach the item text to one batch of interaction rows.

    The batch is left-joined with the module-level `filtered_subset` on
    'parent_asin', so every batch row is kept and rows without a matching
    item get NaN in the joined columns.

    Parameters:
        args (tuple): (batch DataFrame, batch position). Only the DataFrame
            is used; the position is accepted so callers can pass
            enumerate-style pairs.

    Returns:
        DataFrame: The batch with the columns of `filtered_subset` joined in.
    """
    chunk, _chunk_position = args
    # how='left' keeps every interaction row even when the item has no text
    enriched = pd.merge(chunk, filtered_subset, how='left', on=['parent_asin'])
    # Reclaim memory right away; this may run inside a worker process
    gc.collect()
    return enriched

# Get the number of CPU cores
# Get the number of CPU cores
num_cores = cpu_count()
# Use half of the cores: each worker receives its own copy of a batch, so memory grows with the worker count
num_processes = max(1, num_cores // 2)
# Two batches per worker keeps the pool busy while bounding the size of a single batch
num_batches = num_processes * 2


def enrich_in_batches(frame, n_processes, n_batches):
    """
    Run `merge_batch` over `frame` in row batches using a worker pool.

    Parameters:
        frame (DataFrame): Interaction rows containing a 'parent_asin' column.
        n_processes (int): Number of worker processes in the pool.
        n_batches (int): Number of row batches `frame` is cut into.

    Returns:
        DataFrame: All merged batches concatenated in their original order,
        with a fresh 0..n-1 index.
    """
    # Split by row position rather than calling np.array_split on the DataFrame itself:
    # splitting a DataFrame directly relies on the deprecated DataFrame.swapaxes and
    # emits a FutureWarning. The batch sizes are the same either way.
    row_positions = np.array_split(np.arange(len(frame)), n_batches)
    tasks = [(frame.iloc[rows], idx) for idx, rows in enumerate(row_positions)]

    merged_batches = []
    with Pool(processes=n_processes) as pool:
        # imap yields results in submission order, so row order is preserved
        for result in tqdm(pool.imap(merge_batch, tasks), total=n_batches):
            merged_batches.append(result)
            # Perform garbage collection after processing each batch
            gc.collect()

    return pd.concat(merged_batches, ignore_index=True)


# Process train_df in batches
print("Processing train_df in batches...")
train_df_enriched = enrich_in_batches(train_df, num_processes, num_batches)
# The un-enriched frame is no longer needed; drop it to free memory
del train_df
gc.collect()

# Process test_df in batches
print("Processing test_df in batches...")
test_df_enriched = enrich_in_batches(test_df, num_processes, num_batches)
del test_df
gc.collect()

# Delete unnecessary variables and perform garbage collection
del filtered_subset
gc.collect()

# Display train_df and test_df structure after merging
print("\nAfter merging:")
print("train_df_enriched columns:", train_df_enriched.columns.tolist())
print("test_df_enriched columns:", test_df_enriched.columns.tolist())

# Display a sample of the enriched train_df
print("\nSample of enriched train_df:")
print(train_df_enriched.head())

# Display a sample of the enriched test_df
print("\nSample of enriched test_df:")
print(test_df_enriched.head())


Before merging:
train_df columns: ['user_id', 'parent_asin', 'label', 'rating', 'timestamp']
test_df columns: ['user_id', 'parent_asin', 'label', 'rating', 'timestamp']
Processing train_df in batches...


  return bound(*args, **kwds)
100%|██████████| 8/8 [00:07<00:00,  1.09it/s]


Processing test_df in batches...


  return bound(*args, **kwds)
100%|██████████| 8/8 [00:20<00:00,  2.53s/it]



After merging:
train_df_enriched columns: ['user_id', 'parent_asin', 'label', 'rating', 'timestamp', 'text']
test_df_enriched columns: ['user_id', 'parent_asin', 'label', 'rating', 'timestamp', 'text']

Sample of enriched train_df:
                        user_id parent_asin  label  rating  \
0  AE22236AFRRSMQIKGG7TPTB75QEA  B0002C7FHC      1       5   
1  AE22236AFRRSMQIKGG7TPTB75QEA  B00UFKQKLS      1       5   
2  AE22236AFRRSMQIKGG7TPTB75QEA  B01I6X61OQ      1       5   
3  AE22236AFRRSMQIKGG7TPTB75QEA  B0713WBZM7      0       0   
4  AE22236AFRRSMQIKGG7TPTB75QEA  B0BVM3J8GW      0       0   

            timestamp                                               text  
0 2009-09-19 19:42:10  dog whole lot say basic bark control collar ef...  
1 2014-03-07 15:31:31  hamilton nylon comfort dog harness come vibran...  
2 2014-03-07 17:06:29  high quality adjustable collar quickrelease bu...  
3 2009-09-19 19:42:10                                                NaN  
4 2014-03-07 15:31:

In [9]:
# Sanity-check the join: rows whose parent_asin has no entry in the text table show an empty 'text'.
train_df_enriched.head()

Unnamed: 0,user_id,parent_asin,label,rating,timestamp,text
0,AE22236AFRRSMQIKGG7TPTB75QEA,B0002C7FHC,1,5,2009-09-19 19:42:10,dog whole lot say basic bark control collar ef...
1,AE22236AFRRSMQIKGG7TPTB75QEA,B00UFKQKLS,1,5,2014-03-07 15:31:31,hamilton nylon comfort dog harness come vibran...
2,AE22236AFRRSMQIKGG7TPTB75QEA,B01I6X61OQ,1,5,2014-03-07 17:06:29,high quality adjustable collar quickrelease bu...
3,AE22236AFRRSMQIKGG7TPTB75QEA,B0713WBZM7,0,0,2009-09-19 19:42:10,
4,AE22236AFRRSMQIKGG7TPTB75QEA,B0BVM3J8GW,0,0,2014-03-07 15:31:31,


In [10]:
# Drop rows with missing values from the enriched train DataFrame
# Keep only fully populated rows of the enriched train split
# (a NaN in any column, not just 'text', removes the row)
train_df_enriched_cleaned = train_df_enriched.dropna(axis=0, how='any')

# Same filter for the enriched test split
test_df_enriched_cleaned = test_df_enriched.dropna(axis=0, how='any')

# Report how many rows survive in each split
print("Number of rows in train_df_enriched after dropping missing values:", train_df_enriched_cleaned.shape[0])
print("Number of rows in test_df_enriched after dropping missing values:", test_df_enriched_cleaned.shape[0])

Number of rows in train_df_enriched after dropping missing values: 938741
Number of rows in test_df_enriched after dropping missing values: 2657241


In [11]:
# Confirm the rows without text are gone (gaps in the index mark dropped rows).
train_df_enriched_cleaned.head()

Unnamed: 0,user_id,parent_asin,label,rating,timestamp,text
0,AE22236AFRRSMQIKGG7TPTB75QEA,B0002C7FHC,1,5,2009-09-19 19:42:10,dog whole lot say basic bark control collar ef...
1,AE22236AFRRSMQIKGG7TPTB75QEA,B00UFKQKLS,1,5,2014-03-07 15:31:31,hamilton nylon comfort dog harness come vibran...
2,AE22236AFRRSMQIKGG7TPTB75QEA,B01I6X61OQ,1,5,2014-03-07 17:06:29,high quality adjustable collar quickrelease bu...
5,AE22236AFRRSMQIKGG7TPTB75QEA,B0085JN2JY,0,0,2014-03-07 17:06:29,discover delicious way help adult cat achieve ...
6,AE222MW56PH6JXPIB6XSAMCBTLNQ,B00GIHFUNG,1,3,2012-12-07 02:15:25,inch wide standard black leather collar made l...


## Creating Sequences & Vocabulary for Text Column

In [14]:
import pandas as pd
import numpy as np
import random
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report
from tqdm import tqdm

In [12]:
# Reload enriched cleaned data
# Work on independent copies so the columns added below do not modify the cleaned frames
train_data = train_df_enriched_cleaned.copy()
test_data = test_df_enriched_cleaned.copy()

In [15]:
# encoding user_id and parent_asin
# Learn one integer id per user and per item over the union of both splits,
# so every id that appears in the test split also has an index
user_encoder = LabelEncoder().fit(pd.concat([train_data['user_id'], test_data['user_id']]))
item_encoder = LabelEncoder().fit(pd.concat([train_data['parent_asin'], test_data['parent_asin']]))

# Encoded user ids for both splits
train_data['user_id_idx'] = user_encoder.transform(train_data['user_id'])
test_data['user_id_idx'] = user_encoder.transform(test_data['user_id'])

# Encoded item ids for both splits
train_data['item_id_idx'] = item_encoder.transform(train_data['parent_asin'])
test_data['item_id_idx'] = item_encoder.transform(test_data['parent_asin'])


In [16]:
from collections import Counter

# build vocabulary
def build_vocab(sentences, max_vocab_size=1000000):
    """
    Build a word <-> index vocabulary from whitespace-tokenised sentences.

    Index 0 is always '<PAD>' and index 1 is always '<UNK>'; the remaining
    slots go to the most frequent corpus tokens (ties keep first-seen order).

    Parameters:
        sentences (iterable of str): Texts to count tokens from.
        max_vocab_size (int): Upper bound on the vocabulary size, including
            the two reserved tokens.

    Returns:
        tuple: (word_to_idx dict, idx_to_word list), consistent with each
        other and free of duplicate entries.
    """
    special_tokens = ['<PAD>', '<UNK>']

    word_count = Counter()
    for sent in sentences:
        word_count.update(sent.split())

    # A corpus token spelled exactly like a reserved token would otherwise be
    # listed a second time and overwrite index 0/1 in word_to_idx.
    for token in special_tokens:
        word_count.pop(token, None)

    # Leave room for <PAD> and <UNK>
    budget = max(max_vocab_size - len(special_tokens), 0)
    idx_to_word = special_tokens + [word for word, _ in word_count.most_common(budget)]
    word_to_idx = {word: idx for idx, word in enumerate(idx_to_word)}
    return word_to_idx, idx_to_word

# Build the vocabulary over the text of both splits (up to build_vocab's max_vocab_size cap),
# so tokens that only occur in the test split also receive their own index.
all_sentences = pd.concat([train_data['text'], test_data['text']])
word_to_idx, idx_to_word = build_vocab(all_sentences)

# Size includes the reserved <PAD> and <UNK> entries.
vocab_size = len(word_to_idx)
print(f"size of vacab：{vocab_size}")


size of vacab：111152


In [17]:
MAX_SEQUENCE_LENGTH = 256  # every text is padded or cut to exactly this many token ids

def text_to_sequence(text, word_to_idx, max_len):
    """
    Encode a whitespace-tokenised text as a fixed-length list of token ids.

    Tokens missing from `word_to_idx` map to the '<UNK>' id. Short texts are
    right-padded with the '<PAD>' id; long texts are cut to `max_len`.
    """
    encoded = []
    for token in text.split():
        encoded.append(word_to_idx.get(token, word_to_idx['<UNK>']))

    shortfall = max_len - len(encoded)
    if shortfall > 0:
        # Too short: fill the tail with padding ids
        return encoded + [word_to_idx['<PAD>']] * shortfall
    # Long enough: keep the leading max_len ids
    return encoded[:max_len]

# Encode every text in both splits as a fixed-length id sequence; the vocabulary and
# target length are forwarded to text_to_sequence as extra positional arguments
train_data['text_seq'] = train_data['text'].apply(text_to_sequence, args=(word_to_idx, MAX_SEQUENCE_LENGTH))
test_data['text_seq'] = test_data['text'].apply(text_to_sequence, args=(word_to_idx, MAX_SEQUENCE_LENGTH))


In [19]:
# Preliminary 90/10 train/validation split, inspected only by the class-balance check in the next cell.
# Both names are reassigned by the split in the "Train-Test Split" section, after overlapping pairs are removed.
train_dataset, val_dataset = train_test_split(train_data, test_size=0.1, random_state=seed)

In [20]:
# Class balance of the preliminary validation split.
val_dataset.label.value_counts()

label
1    60553
0    33322
Name: count, dtype: int64

## Train-Test Split

In [22]:
# create user-item pairs for training
# Distinct (user index, item index) pairs present in the training split
train_pairs = set(zip(train_data['user_id_idx'].tolist(), train_data['item_id_idx'].tolist()))

# Distinct (user index, item index) pairs present in the testing split
test_pairs = set(zip(test_data['user_id_idx'].tolist(), test_data['item_id_idx'].tolist()))

In [None]:
# find the overlapped user-item pairs
# Pairs that occur in both splits; these would leak test interactions into training
overlap_pairs = train_pairs & test_pairs
print(f"number of overlapped pairs in training and testing：{len(overlap_pairs)}")

In [27]:
# drop overlap from testing set
# Remove from the testing set every row whose (user, item) pair also occurs in training
if len(overlap_pairs) > 0:
    # Build the membership mask column-wise on plain tuples. DataFrame.apply(axis=1)
    # constructs a Series per row, which is very slow on a frame of this size.
    in_overlap = np.fromiter(
        (pair in overlap_pairs for pair in zip(test_data['user_id_idx'], test_data['item_id_idx'])),
        dtype=bool,
        count=len(test_data),
    )
    test_data_no_overlap = test_data[~in_overlap]
    print(f"length of test_data after dedup：{len(test_data_no_overlap)}")
else:
    # Nothing to remove; downstream cells use the same name either way
    test_data_no_overlap = test_data

length of test_data after dedup：2646215


In [28]:
# drop overlap from training set before split validation set
# Remove the overlapping (user, item) pairs from the training set as well, before the validation split.
# The mask is built column-wise on plain tuples; DataFrame.apply(axis=1) constructs a Series per row
# and is very slow on a frame of this size.
in_overlap = np.fromiter(
    (pair in overlap_pairs for pair in zip(train_data['user_id_idx'], train_data['item_id_idx'])),
    dtype=bool,
    count=len(train_data),
)
train_data_no_overlap = train_data[~in_overlap]
print(f"length of train_data after dedup：{len(train_data_no_overlap)}")

length of train_data after dedup：928436


In [30]:
# Hold out 10% of the overlap-free training rows for validation (seeded for repeatability).
# NOTE(review): the split is row-wise, so a (user, item) pair that occurs more than once can land on
# both sides; the following cells detect such pairs and remove them from the validation side.
train_dataset, val_dataset = train_test_split(train_data_no_overlap, test_size=0.1, random_state=seed)


In [31]:
# Recompute the pair sets on the final train/validation split and count shared pairs
train_pairs = set(zip(train_dataset['user_id_idx'].tolist(), train_dataset['item_id_idx'].tolist()))
val_pairs = set(zip(val_dataset['user_id_idx'].tolist(), val_dataset['item_id_idx'].tolist()))
overlap_train_val = train_pairs & val_pairs
print(f"number of overlapped pairs in training and validation：{len(overlap_train_val)}")

number of overlapped pairs in training and validation：1240


In [32]:
# Drop validation rows whose (user, item) pair is also present in the training split
if len(overlap_train_val) > 0:
    # Column-wise membership mask; avoids the per-row Series that DataFrame.apply(axis=1) builds
    seen_in_train = np.fromiter(
        (pair in train_pairs for pair in zip(val_dataset['user_id_idx'], val_dataset['item_id_idx'])),
        dtype=bool,
        count=len(val_dataset),
    )
    val_dataset = val_dataset[~seen_in_train]
    # The frame being measured is the validation set, not train_data
    print(f"length of val_dataset after dedup：{len(val_dataset)}")

length of train_data after dedup：91581


In [21]:
class NCFDataset(Dataset):
    """
    Map-style dataset over four parallel, index-aligned sequences.

    Each sample is a dict with the raw user id and item id entries, the token
    id sequence as a long tensor, and the label as a float32 tensor.
    """

    def __init__(self, user_ids, item_ids, text_seqs, labels):
        # Inputs are stored as given; tensors are only built per sample
        self.user_ids, self.item_ids = user_ids, item_ids
        self.text_seqs, self.labels = text_seqs, labels

    def __len__(self):
        # The user id sequence defines the dataset length
        return len(self.user_ids)

    def __getitem__(self, idx):
        sample = {'user_id': self.user_ids[idx], 'item_id': self.item_ids[idx]}
        sample['text_seq'] = torch.tensor(self.text_seqs[idx], dtype=torch.long)
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample


In [34]:
# Mini-batch size shared by the train, validation and test DataLoaders
BATCH_SIZE = 256

In [35]:
# train
def _to_ncf_dataset(frame):
    """Wrap the id, text-sequence and label columns of `frame` in an NCFDataset."""
    return NCFDataset(
        user_ids=frame['user_id_idx'].values,
        item_ids=frame['item_id_idx'].values,
        text_seqs=frame['text_seq'].values,
        labels=frame['label'].values,
    )


# train: reshuffled every epoch
train_dataset_obj = _to_ncf_dataset(train_dataset)
train_loader = DataLoader(train_dataset_obj, batch_size=BATCH_SIZE, shuffle=True)

# validation: fixed order
val_dataset_obj = _to_ncf_dataset(val_dataset)
val_loader = DataLoader(val_dataset_obj, batch_size=BATCH_SIZE, shuffle=False)

# test: fixed order, built from the overlap-free test rows
test_dataset_obj = _to_ncf_dataset(test_data_no_overlap)
test_loader = DataLoader(test_dataset_obj, batch_size=BATCH_SIZE, shuffle=False)

## Model Definition

In [36]:
class NCFModel(nn.Module):
    """
    Neural collaborative filtering model with an item-text branch.

    User and item ids are embedded, the item text is embedded and summarised by
    a 1-D convolution followed by global max pooling, and the three vectors are
    concatenated and passed through a two-layer MLP with a sigmoid output.

    Parameters:
        num_users (int): Size of the user embedding table.
        num_items (int): Size of the item embedding table.
        vocab_size (int): Size of the token embedding table.
        embedding_dim (int): Width of the user and item embeddings.
        text_embedding_dim (int): Width of the token embeddings.
        padding_idx (int): Token id whose embedding is kept at zero. Defaults
            to 0, the index build_vocab assigns to '<PAD>'; passing it in
            keeps the class independent of the notebook-global vocabulary.
    """

    def __init__(self, num_users, num_items, vocab_size, embedding_dim=32, text_embedding_dim=32, padding_idx=0):
        super().__init__()
        # user and item embedding
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # text embedding
        self.text_embedding = nn.Embedding(vocab_size, text_embedding_dim, padding_idx=padding_idx)
        # kernel_size=3 means input sequences need at least 3 tokens
        self.conv1d = nn.Conv1d(in_channels=text_embedding_dim, out_channels=64, kernel_size=3)
        self.pooling = nn.AdaptiveMaxPool1d(1)

        # fully connected layers; note the input width: user + item embeddings plus the 64 conv channels
        self.fc1 = nn.Linear(embedding_dim * 2 + 64, 128)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(0.5)
        self.output = nn.Linear(64, 1)

    def forward(self, user_ids, item_ids, text_seqs):
        """
        Score a batch of (user, item, item text) triples.

        Parameters:
            user_ids: Long tensor of shape (batch,).
            item_ids: Long tensor of shape (batch,).
            text_seqs: Long tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch,) with sigmoid outputs in [0, 1].
        """
        # user and item embedding
        user_embed = self.user_embedding(user_ids)
        item_embed = self.item_embedding(item_ids)

        # text embedding and convolution; Conv1d expects (batch, channels, seq_len)
        text_embed = self.text_embedding(text_seqs).permute(0, 2, 1)
        text_conv = self.conv1d(text_embed)
        text_pool = self.pooling(text_conv).squeeze(2)

        # concatenation
        concat = torch.cat([user_embed, item_embed, text_pool], dim=1)

        # fully connected layers
        x = self.dropout1(torch.relu(self.fc1(concat)))
        x = self.dropout2(torch.relu(self.fc2(x)))
        x = torch.sigmoid(self.output(x))
        # Squeeze only the trailing unit dimension: a bare squeeze() would also
        # collapse a batch of one to a 0-d tensor and break the loss and .extend()
        return x.squeeze(-1)

In [37]:
# Prefer the GPU when one is visible to torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `seed` has only been applied to `random` and NumPy so far. Seed torch too, so that
# weight initialisation, dropout and DataLoader shuffling repeat across runs.
torch.manual_seed(seed)

# Embedding table sizes come from the fitted encoders and the vocabulary
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
vocab_size = len(word_to_idx)

model = NCFModel(num_users, num_items, vocab_size).to(device)

# The model already applies a sigmoid, so plain binary cross-entropy on probabilities is used
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training Loop

In [42]:
EPOCHS = 20
best_recall = 0.0  # Initialize best recall as 0
best_model_path = './Results/NCF/best_model.pth'  # Define the model saving path
# torch.save does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)

# NOTE(review): the checkpoint is selected on recall at a 0.5 threshold. In the logged run the
# validation AUC peaks at epoch 2 while recall keeps rising until epoch 7, so the saved model is
# not the best-ranking one; confirm recall is the intended selection metric.

# training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    # Count batches explicitly: tqdm only updates `pbar.n` when the bar is refreshed,
    # so it can lag behind the real batch count and distort the running average
    for step, batch in enumerate(pbar, start=1):
        user_ids = batch['user_id'].to(device)
        item_ids = batch['item_id'].to(device)
        text_seqs = batch['text_seq'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        # reshape(-1) keeps the output 1-D even if the final batch holds a single sample
        outputs = model(user_ids, item_ids, text_seqs).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        pbar.set_postfix({'Loss': total_loss / step})

    # validation loop
    model.eval()
    val_labels = []
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            user_ids = batch['user_id'].to(device)
            item_ids = batch['item_id'].to(device)
            text_seqs = batch['text_seq'].to(device)
            labels = batch['label'].to(device)

            outputs = model(user_ids, item_ids, text_seqs).reshape(-1)
            val_labels.extend(labels.cpu().numpy())
            val_preds.extend(outputs.cpu().numpy())

    # Threshold the probabilities at 0.5 for the classification metrics; AUC uses the raw scores
    val_preds_binary = [1 if x > 0.5 else 0 for x in val_preds]
    precision = precision_score(val_labels, val_preds_binary)
    recall = recall_score(val_labels, val_preds_binary)
    f1 = f1_score(val_labels, val_preds_binary)
    auc = roc_auc_score(val_labels, val_preds)

    print(f"Validation Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")

    # Save the best model based on recall
    if recall > best_recall:
        best_recall = recall
        torch.save(model.state_dict(), best_model_path)
        print(f"Found a better model, saved the current model. Best Recall: {best_recall:.4f}")

print("Training completed!")
print(f"Best model Recall: {best_recall:.4f}")


Epoch 1/20: 100%|██████████| 3265/3265 [01:07<00:00, 48.26it/s, Loss=0.464]


Validation Precision: 0.7975, Recall: 0.8448, F1: 0.8205, AUC: 0.8371
Found a better model, saved the current model. Best Recall: 0.8448


Epoch 2/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.36it/s, Loss=0.45] 


Validation Precision: 0.8268, Recall: 0.8084, F1: 0.8175, AUC: 0.8391


Epoch 3/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.53it/s, Loss=0.436]


Validation Precision: 0.8152, Recall: 0.8268, F1: 0.8210, AUC: 0.8387


Epoch 4/20: 100%|██████████| 3265/3265 [01:18<00:00, 41.73it/s, Loss=0.422]


Validation Precision: 0.7870, Recall: 0.8664, F1: 0.8248, AUC: 0.8364
Found a better model, saved the current model. Best Recall: 0.8664


Epoch 5/20: 100%|██████████| 3265/3265 [01:09<00:00, 46.74it/s, Loss=0.407]


Validation Precision: 0.7767, Recall: 0.8785, F1: 0.8245, AUC: 0.8321
Found a better model, saved the current model. Best Recall: 0.8785


Epoch 6/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.53it/s, Loss=0.393]


Validation Precision: 0.7701, Recall: 0.8861, F1: 0.8240, AUC: 0.8283
Found a better model, saved the current model. Best Recall: 0.8861


Epoch 7/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.58it/s, Loss=0.377]


Validation Precision: 0.7662, Recall: 0.8924, F1: 0.8245, AUC: 0.8231
Found a better model, saved the current model. Best Recall: 0.8924


Epoch 8/20: 100%|██████████| 3265/3265 [01:17<00:00, 42.09it/s, Loss=0.363]


Validation Precision: 0.7712, Recall: 0.8878, F1: 0.8254, AUC: 0.8172


Epoch 9/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.60it/s, Loss=0.35] 


Validation Precision: 0.7739, Recall: 0.8492, F1: 0.8098, AUC: 0.8090


Epoch 10/20: 100%|██████████| 3265/3265 [01:09<00:00, 46.66it/s, Loss=0.338]


Validation Precision: 0.7956, Recall: 0.7781, F1: 0.7868, AUC: 0.8024


Epoch 11/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.61it/s, Loss=0.326]


Validation Precision: 0.7905, Recall: 0.8010, F1: 0.7957, AUC: 0.8025


Epoch 12/20: 100%|██████████| 3265/3265 [01:17<00:00, 41.89it/s, Loss=0.314]


Validation Precision: 0.7898, Recall: 0.7691, F1: 0.7793, AUC: 0.7916


Epoch 13/20: 100%|██████████| 3265/3265 [01:09<00:00, 46.86it/s, Loss=0.304]


Validation Precision: 0.7924, Recall: 0.7638, F1: 0.7778, AUC: 0.7896


Epoch 14/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.42it/s, Loss=0.295]


Validation Precision: 0.7968, Recall: 0.7344, F1: 0.7643, AUC: 0.7836


Epoch 15/20: 100%|██████████| 3265/3265 [01:10<00:00, 46.59it/s, Loss=0.285]


Validation Precision: 0.7899, Recall: 0.7427, F1: 0.7656, AUC: 0.7739


Epoch 16/20: 100%|██████████| 3265/3265 [01:18<00:00, 41.74it/s, Loss=0.277]


Validation Precision: 0.7840, Recall: 0.7526, F1: 0.7680, AUC: 0.7703


Epoch 17/20: 100%|██████████| 3265/3265 [01:09<00:00, 46.84it/s, Loss=0.269]


Validation Precision: 0.7817, Recall: 0.7464, F1: 0.7637, AUC: 0.7645


Epoch 18/20: 100%|██████████| 3265/3265 [01:09<00:00, 46.75it/s, Loss=0.261]


Validation Precision: 0.7808, Recall: 0.7331, F1: 0.7562, AUC: 0.7498


Epoch 19/20: 100%|██████████| 3265/3265 [01:09<00:00, 46.70it/s, Loss=0.253]


Validation Precision: 0.7866, Recall: 0.7284, F1: 0.7564, AUC: 0.7542


Epoch 20/20: 100%|██████████| 3265/3265 [01:18<00:00, 41.85it/s, Loss=0.247]


Validation Precision: 0.7863, Recall: 0.7334, F1: 0.7589, AUC: 0.7525
Training completed!
Best model Recall: 0.8924


## Model Evaluation

In [43]:
# load the best NCF model
# Load the best checkpoint. map_location lets a checkpoint saved on GPU load on a CPU-only kernel.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

# predict and validate on testing data
test_labels = []
test_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        user_ids = batch['user_id'].to(device)
        item_ids = batch['item_id'].to(device)
        text_seqs = batch['text_seq'].to(device)
        labels = batch['label'].to(device)

        # reshape(-1) keeps the output 1-D even if the final batch holds a single sample
        outputs = model(user_ids, item_ids, text_seqs).reshape(-1)

        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(outputs.cpu().numpy())

# calculate eval metrics: classification metrics at a 0.5 threshold, AUC on the raw scores
test_preds_binary = [1 if x > 0.5 else 0 for x in test_preds]
precision = precision_score(test_labels, test_preds_binary)
recall = recall_score(test_labels, test_preds_binary)
f1 = f1_score(test_labels, test_preds_binary)
auc = roc_auc_score(test_labels, test_preds)

print(f"Test Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")

Testing: 100%|██████████| 10337/10337 [01:57<00:00, 87.79it/s]


Test Precision: 0.7588, Recall: 0.8715, F1: 0.8113, AUC: 0.8146


In [44]:
# load the best NCF model
# Load the best checkpoint. map_location lets a checkpoint saved on GPU load on a CPU-only kernel.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

# define top K threshold
K_values = [5, 10, 15]

# per-row buffers for the test predictions
test_user_ids = []
test_item_ids = []
test_labels = []
test_preds = []

# collect predictions on testing data
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        user_ids = batch['user_id'].to(device)
        item_ids = batch['item_id'].to(device)
        text_seqs = batch['text_seq'].to(device)
        labels = batch['label'].to(device)

        # reshape(-1) keeps the output 1-D even if the final batch holds a single sample
        outputs = model(user_ids, item_ids, text_seqs).reshape(-1)

        test_labels.extend(labels.cpu().numpy())
        test_preds.extend(outputs.cpu().numpy())
        test_user_ids.extend(user_ids.cpu().numpy())
        test_item_ids.extend(item_ids.cpu().numpy())

# save testing results to df
test_results_df = pd.DataFrame({
    'user_id': test_user_ids,
    'item_id': test_item_ids,
    'label': test_labels,
    'pred_score': test_preds
})

# test by user groups
grouped = test_results_df.groupby('user_id')

# save Precision@K and Recall@K for each k value to dict
precision_at_K = {k: [] for k in K_values}
recall_at_K = {k: [] for k in K_values}

# rank each user's test rows by predicted score and score the top K
for user_id, group in grouped:
    user_df = group.sort_values('pred_score', ascending=False)
    total_relevant = user_df['label'].sum()
    # skip users without relevant items to avoid dividing by 0 in Recall@K
    if total_relevant == 0:
        continue
    for K in K_values:
        # get top K items (fewer than K if the user has fewer test rows)
        top_K = user_df.head(K)
        # calculate number of relevant items in top K items
        num_relevant_in_top_K = top_K['label'].sum()
        # Precision@K divides by K even when fewer than K rows exist for the user,
        # so users with short candidate lists pull the average down
        precision = num_relevant_in_top_K / K
        recall = num_relevant_in_top_K / total_relevant
        # save results
        precision_at_K[K].append(precision)
        recall_at_K[K].append(recall)

# calculate average Precision@K and Recall@K for all users
for K in K_values:
    # Every user may have been skipped above; report that instead of dividing by zero
    if not precision_at_K[K]:
        print(f"No users with relevant items; Precision@{K} and Recall@{K} are undefined.")
        continue
    avg_precision = sum(precision_at_K[K]) / len(precision_at_K[K])
    avg_recall = sum(recall_at_K[K]) / len(recall_at_K[K])
    print(f"Average Precision@{K}: {avg_precision:.4f}")
    print(f"Average Recall@{K}: {avg_recall:.4f}")


Testing: 100%|██████████| 10337/10337 [02:06<00:00, 81.98it/s]


Average Precision@5: 0.8119
Average Recall@5: 0.7621
Average Precision@10: 0.5498
Average Recall@10: 0.9616
Average Precision@15: 0.3885
Average Recall@15: 0.9979


## Generate Recommendations

In [45]:
# Select a specific user for generating recommendations
# Select a specific user for generating recommendations
specific_user_id = 'AE22236AFRRSMQIKGG7TPTB75QEA'  # Replace with the user ID for which you want recommendations
user_idx = user_encoder.transform([specific_user_id])[0]  # Encode the user ID to its corresponding index

# Get indices of all items
all_item_indices = np.arange(len(item_encoder.classes_))

# Retrieve items the user has rows for in the training and test datasets
# NOTE(review): both frames had the train/test overlapping (user, item) pairs removed, so items
# from those pairs are treated as unseen here and can be recommended; confirm this is intended.
user_train_items = train_data_no_overlap[train_data_no_overlap['user_id_idx'] == user_idx]['item_id_idx'].tolist()
user_test_items = test_data_no_overlap[test_data_no_overlap['user_id_idx'] == user_idx]['item_id_idx'].tolist()

user_interacted_items = set(user_train_items + user_test_items)  # Combine train and test interactions

# Identify items the user has not interacted with
user_unseen_items = np.setdiff1d(all_item_indices, list(user_interacted_items))  # Items not seen by the user

print(f"User {specific_user_id} has interacted with {len(user_interacted_items)} items.")
print(f"User {specific_user_id} has {len(user_unseen_items)} unseen items.")

# Candidate table: one row per unseen item index
unseen_items_df = pd.DataFrame({'item_id_idx': user_unseen_items})

# Retrieve item text information
item_texts = filtered_data[['parent_asin', 'text']].drop_duplicates()  # Keep unique item text data
# LabelEncoder.transform raises on labels it was not fitted on, so keep only known items
item_texts = item_texts[item_texts['parent_asin'].isin(item_encoder.classes_)].copy()
item_texts['item_id_idx'] = item_encoder.transform(item_texts['parent_asin'])  # Encode item IDs

# Merge text data with unseen items
unseen_items_df = unseen_items_df.merge(item_texts[['item_id_idx', 'text']], on='item_id_idx', how='left')

# Drop items without text
unseen_items_df = unseen_items_df.dropna(subset=['text'])

# Convert text to sequences
unseen_items_df['text_seq'] = unseen_items_df['text'].apply(lambda x: text_to_sequence(x, word_to_idx, MAX_SEQUENCE_LENGTH))

# Model inputs, aligned row by row with the filtered unseen items
item_indices = unseen_items_df['item_id_idx'].values
user_indices = np.full(len(item_indices), user_idx, dtype=np.int64)  # Same user repeated for every candidate
text_seqs = np.stack(unseen_items_df['text_seq'].values)  # Stack sequences into a NumPy array

# Convert to tensors for model input
user_tensor = torch.tensor(user_indices, dtype=torch.long).to(device)
item_tensor = torch.tensor(item_indices, dtype=torch.long).to(device)
text_tensor = torch.tensor(text_seqs, dtype=torch.long).to(device)

# Generate recommendations in batches if the data is too large
batch_size = 1024
num_batches = (len(user_indices) + batch_size - 1) // batch_size  # Calculate the number of batches

all_scores = []  # Store scores for all unseen items

# Make sure dropout is disabled regardless of what ran before this cell
model.eval()

# Disable gradient computation for inference
with torch.no_grad():
    for i in tqdm(range(num_batches), desc='Generating recommendations'):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(user_indices))
        batch_user = user_tensor[start_idx:end_idx]
        batch_item = item_tensor[start_idx:end_idx]
        batch_text = text_tensor[start_idx:end_idx]

        # Get scores from the model; reshape(-1) keeps a single-row batch 1-D so extend() works
        scores = model(batch_user, batch_item, batch_text).reshape(-1)
        all_scores.extend(scores.cpu().numpy())  # Append scores to the list

# Add scores to the unseen items DataFrame
unseen_items_df['score'] = all_scores

# Sort items by score in descending order
unseen_items_df = unseen_items_df.sort_values('score', ascending=False)

# Get the top N recommendations
N = 20
# Take an explicit copy: adding a column to the head() slice triggers SettingWithCopyWarning
top_N_items = unseen_items_df.head(N).copy()

# Map item indices back to original item IDs
top_N_items['parent_asin'] = item_encoder.inverse_transform(top_N_items['item_id_idx'])

# Display the top N recommendations
print(f"Top {N} recommendations for user {specific_user_id}:")
for idx, row in top_N_items.iterrows():
    print(f"Item ID: {row['parent_asin']}, Predicted Score: {row['score']:.4f}")

User AE22236AFRRSMQIKGG7TPTB75QEA has interacted with 18 items.
User AE22236AFRRSMQIKGG7TPTB75QEA has 48311 unseen items.


Generating recommendations: 100%|██████████| 48/48 [00:00<00:00, 171.38it/s]

Top 20 recommendations for user AE22236AFRRSMQIKGG7TPTB75QEA:
Item ID: B081PM9QD5, Predicted Score: 1.0000
Item ID: B0C5K9Z2W4, Predicted Score: 1.0000
Item ID: B07615C5Y9, Predicted Score: 1.0000
Item ID: B013WNS1PW, Predicted Score: 1.0000
Item ID: B0C8NNWS6W, Predicted Score: 0.9999
Item ID: B00BUFTSV6, Predicted Score: 0.9999
Item ID: B003FLM018, Predicted Score: 0.9999
Item ID: B07CL37VB8, Predicted Score: 0.9999
Item ID: B0053HLAA4, Predicted Score: 0.9999
Item ID: B07NKXRG8J, Predicted Score: 0.9998
Item ID: B0BFK266J3, Predicted Score: 0.9998
Item ID: B0BJ16KKML, Predicted Score: 0.9998
Item ID: B00YYIHGXS, Predicted Score: 0.9998
Item ID: B07BMB1BZD, Predicted Score: 0.9998
Item ID: B0BCPLFZD1, Predicted Score: 0.9998
Item ID: B0BFYZY2DZ, Predicted Score: 0.9997
Item ID: B0BNK5Y3R8, Predicted Score: 0.9996
Item ID: B09D451T23, Predicted Score: 0.9996
Item ID: B09HSRY37F, Predicted Score: 0.9995
Item ID: B0016HNU12, Predicted Score: 0.9994



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_N_items['parent_asin'] = item_encoder.inverse_transform(top_N_items['item_id_idx'])
