In [None]:
#import dependencies
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import random
import pickle
import random
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import pairwise_model
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [None]:
# Pick the compute device: CUDA when torch reports it is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Report the choice so the cell output shows which device the run uses.
print("Using GPU" if device.type == 'cuda' else "Using CPU")

In [None]:
# Load the pickled post/reward dataframe.
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
with open("../data/telegram_df.pkl", 'rb') as df_file:
    telegram_df = pickle.load(df_file)

In [None]:
# Rename the columns in place to the names the rest of the notebook reads
# ('reward' is used for relevance labels). Assigning two names requires the
# loaded frame to have exactly two columns.
telegram_df.columns = ['text', 'reward']
# Bare expression: renders the dataframe as the cell output.
telegram_df

In [None]:
# Load the pickled SBERT embeddings.
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
with open("../data/sbert_embeddings.pkl", "rb") as emb_file:
    telegram_embs = pickle.load(emb_file)
# Convert each embedding to a tensor, stack them row-wise into a single tensor
# for easy indexed access, and move the result to the selected device.
telegram_tensors = torch.vstack([torch.tensor(emb) for emb in telegram_embs]).to(device)
# Size of dimension 1: the number of features fed to the neural network.
tensor_dims = telegram_tensors.shape[1]

In [None]:
# Split row positions (not the tensors themselves) into train / validation / test.
X = list(range(len(telegram_tensors)))
# First carve off 20% as a hold-out set, then halve it into validation and test.
# Both splits use a fixed random_state so the partition is reproducible.
X_train, X_holdout = train_test_split(X, test_size=0.2, random_state=42)
X_val, X_test = train_test_split(X_holdout, test_size=0.5, random_state=42)


In [None]:
# formula for calculating ndcg
def ndcg(relevance_scores, k):
    """Compute NDCG@k for relevance scores given in ranked order.

    The ideal ordering is derived from the same scores sorted in descending
    order, so the result measures how close the supplied ranking is to the
    best possible ranking of these particular items.

    Args:
        relevance_scores: Iterable of numeric relevance values, position 0
            being the top-ranked item. Values that are not ``>= 0``
            (negatives and NaN) are treated as 0.
        k: Number of top positions to evaluate. Values larger than the number
            of scores are truncated; values ``<= 0`` yield 0.0.

    Returns:
        float: NDCG in ``[0, 1]``. Returns 0.0 for empty input, non-positive
        ``k``, or when every gain is zero (ideal DCG of 0).

    Raises:
        ValueError: If the computed value falls outside ``[0, 1]`` by more
            than floating-point rounding error.
    """
    # Clip element by element so any iterable of scalars is accepted.
    gains = np.asarray(
        [score if score >= 0 else 0 for score in relevance_scores],
        dtype=float,
    )
    # A negative k would slice from the end and break broadcasting below.
    if gains.size == 0 or k <= 0:
        return 0.0
    k = min(k, gains.size)

    # Position discounts log2(rank + 1) for ranks 1..k, shared by DCG and IDCG.
    discounts = np.log2(np.arange(2, k + 2))
    dcg = np.sum(gains[:k] / discounts)
    ideal_gains = np.sort(gains)[::-1]
    idcg = np.sum(ideal_gains[:k] / discounts)
    if idcg == 0:
        return 0.0

    score = dcg / idcg
    # error checks; allow a tiny overshoot caused by floating-point rounding
    if score > 1:
        if score > 1 + 1e-9:
            raise ValueError("NDCG cannot be greater than 1")
        score = 1.0
    if score < 0:
        raise ValueError("NDCG cannot be less than 0")
    return float(score)

In [None]:
# keep track of the best loss and ndcg to determine best model
best_model = None
best_ndcg = 0
best_loss = np.inf
train_sizes = [5, 10, 15, 20]
learning_rates = [1e-4, 1e-3, 1e-2, 1e-1, 1]
# save data for analysis and visualizations, keyed by "<lr>_<train_size>";
# each value is (ndcgs, losses, cumulative number of posts used)
model_training_data = {}

# gather hyperparameter data: one fresh agent per (learning rate, batch size) pair
for lr in learning_rates:
    for train_size in tqdm(train_sizes):
        agent = pairwise_model.RankingAgent(tensor_dims, learning_rate = lr, train_size = train_size)
        # Number of unordered post pairs in one batch, used to normalise the loss.
        # NOTE(review): presumably agent.loss sums over all pairs - confirm in pairwise_model.
        pairs_per_batch = train_size * (train_size - 1) / 2
        num_posts_used = 0
        posts_used = []
        model_ndcgs = []
        model_losses = []

        while num_posts_used<1000:
            # hand one random training batch to the agent, then run an update
            train_indices = random.sample(X_train, train_size)
            for idx in train_indices:
                embedding = telegram_tensors[idx]
                relevance_score = telegram_df['reward'][idx]
                agent.save_post_ranking(relevance_score, embedding)
            agent.train()
            iteration_loss = agent.loss/pairs_per_batch
            model_losses.append(iteration_loss)

            # score the current model: rank 50 random validation posts, take NDCG@10
            val_indices = random.sample(X_val, 50)
            post_embeddings = telegram_tensors[val_indices]
            ranked_indices = agent.rank_posts(post_embeddings)
            # rank_posts returns positions within the batch; map back to dataset indices
            ranked_post_indices = [val_indices[idx] for idx in ranked_indices]
            model_rewards = [telegram_df['reward'][idx] for idx in ranked_post_indices]
            model_ndcg = ndcg(model_rewards, 10)
            model_ndcgs.append(model_ndcg)

            # a checkpoint is kept only when it improves BOTH the NDCG and the loss
            if model_ndcg > best_ndcg and iteration_loss < best_loss:
                best_ndcg = model_ndcg
                best_loss = iteration_loss
                # state_dict() returns references to the live parameters, so later
                # agent.train() calls would silently overwrite the stored checkpoint.
                # Clone every tensor to freeze a real snapshot.
                # (assumes agent.model is a torch.nn.Module whose state holds tensors)
                best_model = {
                    name: tensor.detach().clone()
                    for name, tensor in agent.model.state_dict().items()
                }
            num_posts_used += train_size
            posts_used.append(num_posts_used)
            # reset the loss accumulator for the next batch
            agent.loss = 0
        # 'Best NDCG' is the best value over all configurations tried so far
        print('Training Size:',train_size,'Learning Rate:',lr,'Average NDCG:',np.mean(model_ndcgs),'Best NDCG:',best_ndcg)
        model_training_data[str(lr) + '_' + str(train_size)] = (model_ndcgs, model_losses, posts_used)




In [None]:
# optionally save the data
#with open('../data/pairwise_model_data.pkl', 'wb') as file:
 #   pickle.dump(model_training_data, file)
#with open('../data/best_pairwise_model', 'wb') as f:
#        pickle.dump(best_model, f)

In [None]:
# open the data if already generated
#with open('../data/pairwise_model_data.pkl', 'rb') as file:
#    model_training_data = pickle.load(file)

In [None]:
# generate random data for comparison
train_sizes = [5, 10, 15, 20]
learning_rates = [1e-4, 1e-3, 1e-2, 1e-1, 1]
random_ndcgs = []
random_preds = []
random_true = []

# Random-ranking baseline: draw 50 validation posts, order them at random,
# and score that ordering with NDCG@10. Repeated 1000 times.
for _ in range(1000):
    candidates = random.sample(X_val, 50)
    # sampling the full length without replacement gives a random permutation
    shuffled = random.sample(candidates, len(candidates))
    random_ndcgs.append(ndcg([telegram_df['reward'][i] for i in shuffled], 10))

# Random binary-classification baseline: a coin-flip prediction paired with the
# binarised reward (positive -> 1, otherwise 0) of a randomly drawn post.
for _ in range(1000):
    random_preds.append(random.choice([0,1]))
    random_reward = random.choice(telegram_df['reward'])
    random_true.append(int(random_reward > 0))

fig, axes = plt.subplots(3, 2, figsize=(20, 15))
axes = axes.flatten()

# visualize ndcg distributions with kernel density estimation:
# one panel per learning rate, one curve per training size plus the random baseline
for idx, lr in enumerate(learning_rates):
    ax = axes[idx]
    sns.kdeplot(random_ndcgs, color='blue', label='Random Strategy', ax=ax)
    for train_size in train_sizes:
        # element 0 of each stored tuple is the list of NDCG scores
        model_ndcgs = model_training_data[f"{lr}_{train_size}"][0]
        sns.kdeplot(model_ndcgs, label=f'Training Size: {train_size}', ax=ax)
    ax.set_title(f'KDE of NDCG Scores at Learning Rate={lr}', fontsize=14)
    ax.set_xlabel('NDCG Score', fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.legend(fontsize=10)

# The 3x2 grid has more panels than learning rates; hide the leftover axes so
# the figure does not show an empty frame.
for unused_ax in axes[len(learning_rates):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(20, 20))
axes = axes.flatten()

# visualize model losses over time:
# one panel per learning rate, one line per training size
for idx, lr in enumerate(learning_rates):
    ax = axes[idx]

    for train_size in train_sizes:
        # stored tuple layout: (ndcgs, losses, cumulative posts used)
        run_data = model_training_data[f"{lr}_{train_size}"]
        model_losses = run_data[1]
        num_posts = run_data[2]
        ax.plot(num_posts, model_losses, label=f'Train Size {train_size}')

    ax.set_title(f'Model Losses by Number of Posts Sampled\nLearning Rate={lr}', fontsize=14)
    ax.set_xlabel('Number of Posts Used', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    ax.legend(fontsize=10)

# The 3x2 grid has more panels than learning rates; hide the leftover axes so
# the saved image does not contain an empty frame.
for unused_ax in axes[len(learning_rates):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('model_losses_by_posts.png', bbox_inches='tight')
plt.show()

In [None]:
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
axes = axes.flatten()

# visualize ndcgs over time:
# one panel per learning rate, one line per training size, with the mean
# NDCG of the random baseline drawn as a dashed reference line
avg_random_ndcg = np.mean(random_ndcgs)
for idx, lr in enumerate(learning_rates):
    ax = axes[idx]
    for train_size in train_sizes:
        # stored tuple layout: (ndcgs, losses, cumulative posts used)
        run_data = model_training_data[f"{lr}_{train_size}"]
        model_ndcgs = run_data[0]
        num_posts = run_data[2]
        ax.plot(num_posts, model_ndcgs, label=f'Train Size {train_size}')

    ax.axhline(y=avg_random_ndcg, color='red', linestyle='--', label='Average Random NDCG')
    ax.set_title(f'Model NDCGs by Number of Posts Sampled\nLearning Rate={lr}', fontsize=14)
    ax.set_xlabel('Number of Posts Used', fontsize=12)
    ax.set_ylabel('NDCG', fontsize=12)
    ax.legend(fontsize=10)

# The 3x2 grid has more panels than learning rates; hide the leftover axes so
# the saved image does not contain an empty frame.
for unused_ax in axes[len(learning_rates):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig('model_ndcgs_by_posts.png', bbox_inches='tight')
plt.show()

In [None]:
# Build an agent initialised from the best checkpoint found during the sweep.
agent = pairwise_model.RankingAgent(
    tensor_dims,
    learning_rate = 0.001,
    train_size = 15,
    pretrained_model=best_model,
)

# Simulate offline batch processing of a data stream: 100 batches of 50 random
# validation posts, each ranked by the agent.
ranked_lists = []
for _ in tqdm(range(100)):
    batch_indices = random.sample(X_val, 50)
    batch_order = agent.rank_posts(telegram_tensors[batch_indices])
    # rank_posts returns positions within the batch; map back to dataset indices
    ranked_lists.append([batch_indices[pos] for pos in batch_order])



In [None]:
# generate binary ranked lists with cutoffs
cutoffs = [i*5 for i in range(11)]
model_recalls = []
model_precisions = []
model_f1s = []

# Ground truth does not depend on the cutoff, so build it once:
# a post counts as relevant (1) when its reward is positive.
true_labels = [
    1 if telegram_df['reward'][post_idx] > 0 else 0
    for ranked_list in ranked_lists
    for post_idx in ranked_list
]

for cutoff in cutoffs:
    # Everything ranked above the cutoff position is predicted relevant.
    # Iteration order matches true_labels so the two lists stay aligned.
    model_preds = [
        1 if position < cutoff else 0
        for ranked_list in ranked_lists
        for position in range(len(ranked_list))
    ]
    # Metrics are pooled over all ranked lists for this cutoff.
    # zero_division=0 keeps the score at 0 (without a warning) when nothing is
    # predicted positive, which is the case at cutoff 0.
    model_recalls.append(recall_score(true_labels, model_preds, zero_division=0))
    model_precisions.append(precision_score(true_labels, model_preds, zero_division=0))
    model_f1s.append(f1_score(true_labels, model_preds, zero_division=0))

In [None]:
# Visualize the precision-recall trade-off: one point per cutoff.
fig, ax = plt.subplots()
ax.scatter(model_recalls, model_precisions, label='Precision-Recall Curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
plt.show()

In [None]:
# Visualize the F1 score at each cutoff.
fig, ax = plt.subplots()
ax.plot(cutoffs, model_f1s)
# Ticks and dashed guide lines every 5 positions from 0 to 50.
tick_positions = [step * 5 for step in range(11)]
ax.set_xticks(tick_positions)
for tick in tick_positions:
    ax.axvline(x=tick, color='black', linestyle='--')
ax.set_xlabel('Cutoff Index')
ax.set_ylabel('F1 Score')
ax.set_title('F1 score by Index Cutoff')
plt.show()