In [None]:
# import dependencies
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import random
import pickle
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Report which backend the following cells will run on.
print("Using GPU" if device.type == 'cuda' else "Using CPU")

In [None]:
# Load the pickled Telegram data.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('../data/telegram_df', mode='rb') as file:
    telegram_df = pickle.load(file)

In [None]:
# Build the per-action reward table (1 = recommend, 0 = don't recommend):
#   column 1 -> the value loaded in the second column of the pickle,
#   column 0 -> a fixed -10 / +10 derived from the sign of column 1.
# NOTE(review): action 0 gets -10 when the score is negative, which looks
# inverted for a "don't recommend" reward -- confirm this is intended.
# NOTE: this cell mutates telegram_df in place; re-running it after column 0
# exists raises a length mismatch on the `.columns` assignment.
def _not_recommend_reward(score):
    """Reward for action 0: -10 when `score` is negative, otherwise 10."""
    return -10 if score < 0 else 10

telegram_df.columns = ['text', 1]
telegram_df[0] = telegram_df[1].apply(_not_recommend_reward)
telegram_df

In [None]:
# Load the pickled SBERT embeddings, stack them row-wise into a single tensor
# and move that tensor to the selected device.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("../data/sbert_embeddings.pkl", "rb") as f:
    telegram_embs = pickle.load(f)
telegram_tensors = torch.vstack([torch.tensor(emb) for emb in telegram_embs]).to(device)
# Size of dimension 1 of the stacked tensor (the per-post embedding width).
tensor_dims = telegram_tensors.shape[1]

In [None]:
# Split the row indices into train (80%), validation (10%) and test (10%)
# using a fixed seed: 20% is held out first and then halved.
X = list(range(len(telegram_tensors)))
X_train, X_holdout = train_test_split(X, test_size=0.2, random_state=42)
X_val, X_test = train_test_split(X_holdout, test_size=0.5, random_state=42)


In [None]:
# formula for ndcg
def ndcg(relevance_scores, k):
    """Normalised discounted cumulative gain of a ranked list, cut off at `k`.

    Values that are not >= 0 are replaced by 0 before scoring. The item at
    0-based position ``i`` is discounted by ``log2(i + 2)``.

    Args:
        relevance_scores: iterable of relevance values in ranked order.
        k: cut-off rank; lowered to the list length when the list is shorter.

    Returns:
        DCG@k divided by the DCG@k of the same values sorted in descending
        order. Returns 0 when the list is empty or the ideal DCG is 0.
    """
    gains = [value if value >= 0 else 0 for value in relevance_scores]
    if not gains:
        return 0

    cutoff = min(k, len(gains))
    discounts = np.log2(np.arange(2, cutoff + 2))

    dcg = np.sum(gains[:cutoff] / discounts)
    ideal_gains = sorted(gains, reverse=True)
    idcg = np.sum(ideal_gains[:cutoff] / discounts)
    if idcg == 0:
        return 0

    score = dcg / idcg
    # A ratio above 1 should not happen; dump the inputs for debugging if it does.
    if score > 1:
        print('NDCG ERROR')
        print('relevance scores:', gains)
        print('dcg:', dcg)
        print('idcg:', idcg)
    return score

In [None]:
# make class for bandit model
class QNetwork(nn.Module):
    """Small MLP that maps a state vector to two outputs, one per action.

    Layers: Linear(input_dim -> input_dim // 2) -> ReLU -> Dropout -> Linear(-> 2).
    """

    def __init__(self, input_dim, dropout_prob=0.3):
        """Create the layers.

        Args:
            input_dim: size of the last dimension of the input state.
            dropout_prob: dropout probability applied after the ReLU.
        """
        super().__init__()
        hidden_dim = input_dim // 2
        self.lin_1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_prob)
        self.lin_2 = nn.Linear(hidden_dim, 2)

    def forward(self, state):
        """Return the network output for `state`; the last dimension has size 2."""
        hidden = self.dropout(self.relu(self.lin_1(state)))
        return self.lin_2(hidden)

class RankingAgent:
    """Epsilon-greedy bandit agent that scores posts with a `QNetwork`.

    Action 1 means "recommend" and action 0 means "don't recommend". The
    exploration rate `epsilon` is multiplied by `epsilon_decay` (0.995) on
    every `choose_action` call and never drops below `epsilon_min` (0.01).
    """

    def __init__(self, input_dim, learning_rate=0.001, epsilon=1, pretrained_model=None):
        """Build the network, optimizer and loss.

        Args:
            input_dim: width of a single state (embedding) vector.
            learning_rate: Adam learning rate.
            epsilon: initial exploration probability.
            pretrained_model: optional state_dict loaded into the network.
        """
        self.input_dim = input_dim
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.model = QNetwork(input_dim).to(device)
        if pretrained_model:
            self.model.load_state_dict(pretrained_model)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.loss_fn = nn.MSELoss()
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

    def _q_values(self, states):
        """Inference-only forward pass: dropout disabled, no autograd graph.

        The module's previous train/eval mode is restored afterwards so that
        `update` keeps training with dropout.
        """
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                return self.model(states.to(device))
        finally:
            self.model.train(was_training)

    def choose_action(self, current_state):
        """Pick an action for one state with an epsilon-greedy policy.

        With probability `epsilon` a random action is drawn; otherwise the
        action with the highest Q-value is returned. `epsilon` is decayed
        after every call.

        Returns:
            int: 0 or 1.
        """
        if torch.rand(1).item() > self.epsilon:
            # Exploit: greedy action from a noise-free forward pass.
            action = torch.argmax(self._q_values(current_state)).item()
        else:
            # Explore.
            action = random.choice([0, 1])
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        return action

    def eval(self, selected_post_embs, indices):
        """Rank a batch of posts and keep those the model would recommend.

        Args:
            selected_post_embs: tensor with one embedding per row.
            indices: post ids, one per row of `selected_post_embs` and in the
                same order. Rows and ids are paired with `zip`, so extra
                entries on either side are silently ignored.

        Returns:
            list: ids whose greedy action is 1, ordered by their highest
            Q-value, largest first.
        """
        q_scores = self._q_values(selected_post_embs)
        actions = torch.argmax(q_scores, dim=1).cpu().numpy()
        highest_q_scores = torch.max(q_scores, dim=1).values.cpu().numpy()
        ranked = sorted(zip(indices, highest_q_scores, actions), key=lambda x: x[1], reverse=True)
        return [post_id for post_id, _, action in ranked if action == 1]

    def update(self, current_state, action, reward):
        """Take one gradient step moving Q(state, action) toward `reward`.

        Args:
            current_state: state (embedding) tensor the action was taken in.
            action: index of the action that was taken (0 or 1).
            reward: scalar reward observed for that action.
        """
        current_state = current_state.to(device)
        # Regress the Q-value of the action that was actually taken. Using the
        # maximum over actions would push the wrong output toward the reward
        # whenever the taken action is not the current arg-max (e.g. when
        # exploring).
        q_value = self.model(current_state)[..., action]
        target_q_value = torch.tensor(reward, dtype=torch.float32).to(device)
        loss = self.loss_fn(q_value, target_q_value)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [None]:
# Random-policy baseline used as a comparison for the bandit model.
NUM_STEPS = 1000
SAMPLE_SIZE = 50
random_rewards, random_ndcgs = [], []
random_preds, random_true = [], []
random_cumulative_reward = 0

# Cumulative reward curve: a random validation post and a random action per step.
for _ in range(NUM_STEPS):
    random_idx = random.choice(X_val)
    random_action = random.choice([0, 1])
    reward = telegram_df[random_action][random_idx]
    random_preds.append(random_action)
    # The "true" label is 1 when the reward of the chosen action is positive.
    random_true.append(int(reward > 0))
    random_cumulative_reward += reward
    random_rewards.append(random_cumulative_reward)

# NDCG@10 of 10 posts drawn at random (in draw order) from all indices in X.
random_ndcgs.extend(
    ndcg(telegram_df[1][random.sample(X, 10)], 10) for _ in range(NUM_STEPS)
)

random_data = (random_rewards, random_ndcgs)
# optionally save the random baseline
#with open('../data/random_bandit_data.pkl', 'wb') as file:
 #   pickle.dump(random_data, file)


In [None]:
# Train the bandit model over a grid of learning rates and initial epsilons.
#
# For every (learning_rate, epsilon) pair a fresh agent runs NUM_STEPS steps.
# Each step:
#   1. acts on one random training post and learns from the reward,
#   2. acts on one random validation post (reward recorded, no learning),
#   3. measures NDCG@TOP_N_POSTS on SAMPLE_SIZE random training posts and on
#      SAMPLE_SIZE random validation posts.
# Once exploration has decayed to its floor, the mean of the last NDCG_WINDOW
# NDCG values drives checkpointing: the best training checkpoint is restored
# (with epsilon reset to 0.2) after PATIENCE non-improving checks, and the
# best validation checkpoint across the whole grid is kept in `best_model`.
TOP_N_POSTS = 10
NUM_STEPS = 1000
SAMPLE_SIZE = 50
NDCG_WINDOW = 10  # number of most recent NDCG values averaged for model selection
PATIENCE = 10     # non-improving checks tolerated before rolling back


def _snapshot_weights(model):
    """Return a detached copy of `model`'s parameters.

    `state_dict()` hands back references to the live tensors, so without
    cloning a stored "best" checkpoint would keep changing during training.
    """
    return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}


def _sampled_ndcg(agent, candidate_pool):
    """NDCG@TOP_N_POSTS of the agent's ranking of SAMPLE_SIZE random posts from `candidate_pool`."""
    sampled = random.sample(candidate_pool, SAMPLE_SIZE)
    # Pass the sampled ids themselves so each embedding row is paired with its own post.
    ordered_posts = agent.eval(telegram_tensors[sampled], sampled)
    return ndcg(telegram_df[1][ordered_posts], TOP_N_POSTS)


best_model = None
best_val_ndcg = 0
learning_rates = [0.001, 0.01, 0.02, 0.05, 0.1]
epsilons = [0, 0.2, 0.4, 0.6, 0.8, 1]
model_training_data = {}
for learning_rate in learning_rates:
    for epsilon in epsilons:
        preds_train, true_train = [], []
        preds_val, true_val = [], []
        train_ndcgs, val_ndcgs = [], []
        train_rewards, val_rewards = [], []

        train_reward_cumulative = 0
        val_reward_cumulative = 0
        best_train_ndcg = 0
        best_train_model = None
        bad_counter = 0
        agent = RankingAgent(input_dim=telegram_tensors.size(1), learning_rate=learning_rate, epsilon=epsilon)
        for _ in tqdm(range(NUM_STEPS)):
            # --- training step: act, observe the reward, learn ---
            random_idx = random.choice(X_train)
            state = telegram_tensors[random_idx]
            action = agent.choose_action(state)
            reward = telegram_df[action][random_idx]
            preds_train.append(action)
            true_train.append(1 if reward > 0 else 0)
            train_reward_cumulative += reward
            agent.update(state, action, reward)
            train_rewards.append(train_reward_cumulative)

            # --- validation step: act and record only ---
            random_idx = random.choice(X_val)
            state = telegram_tensors[random_idx]
            action = agent.choose_action(state)
            # Look up the validation reward before labelling, so the label
            # does not reuse the training reward from above.
            reward = telegram_df[action][random_idx]
            preds_val.append(action)
            true_val.append(1 if reward > 0 else 0)
            val_reward_cumulative += reward
            val_rewards.append(val_reward_cumulative)

            # --- ranking quality on fresh samples ---
            train_ndcg = _sampled_ndcg(agent, X_train)
            train_ndcgs.append(train_ndcg)
            val_ndcg = _sampled_ndcg(agent, X_val)
            val_ndcgs.append(val_ndcg)

            # Epsilon is floored at epsilon_min, so compare with <= (a strict
            # "< 0.01" check can never be satisfied).
            if agent.epsilon <= agent.epsilon_min:
                recent_train_ndcg = np.mean(train_ndcgs[-NDCG_WINDOW:])
                if recent_train_ndcg > best_train_ndcg:
                    best_train_ndcg = recent_train_ndcg
                    best_train_model = _snapshot_weights(agent.model)
                    bad_counter = 0
                else:
                    bad_counter += 1
                    if bad_counter == PATIENCE:
                        # Roll back to the best training checkpoint (if any)
                        # and explore again for a while.
                        if best_train_model is not None:
                            agent.model.load_state_dict(best_train_model)
                        agent.epsilon = 0.2
                        bad_counter = 0
                recent_val_ndcg = np.mean(val_ndcgs[-NDCG_WINDOW:])
                if recent_val_ndcg > best_val_ndcg:
                    best_val_ndcg = recent_val_ndcg
                    best_model = _snapshot_weights(agent.model)
        key = str(epsilon) + '_' + str(learning_rate)
        model_training_data[key] = (train_ndcgs, val_ndcgs, train_rewards, val_rewards, preds_train, true_train, preds_val, true_val)







In [None]:
# optionally save data
#with open('../data/bandit_model_data.pkl', 'wb') as file:
  #  pickle.dump(model_training_data, file)
#with open('../data/bandit_best_model', 'wb') as f:
 #       pickle.dump(best_model, f)

In [None]:
# reload saved training data after a kernel timeout or restart (uncomment to use)
#with open('../data/bandit_model_data.pkl', 'rb') as file:
 #   model_training_data = pickle.load(file)

In [None]:
# Plot the validation cumulative reward of every learning rate, one panel per
# initial epsilon, with the random-policy reward curve as a baseline.
data = model_training_data
learning_rates = [0.001, 0.01, 0.02, 0.05, 0.1]
epsilons = [0, 0.2, 0.4, 0.6, 0.8, 1]
random_rewards = random_data[0]
random_ndcgs = random_data[1]

# 3 x 2 grid: one panel for each of the six epsilons.
num_rows = 3
num_cols = 2

fig, axes = plt.subplots(num_rows, num_cols, figsize=(10*num_cols, 12))
axes = axes.flatten()
for idx, epsilon in enumerate(epsilons):
    # Build the per-panel frame once (index 3 of each run tuple holds the
    # validation cumulative rewards); columns are the learning rates.
    val_reward = np.array([data[str(epsilon) + '_' + str(lr)][3] for lr in learning_rates])
    val_df = pd.DataFrame(data=val_reward.T, columns=learning_rates)

    for learning_rate in learning_rates:
        sns.lineplot(data=val_df[learning_rate], ax=axes[idx],  dashes=False, label=f"Learning Rate {learning_rate}")
    # The baseline must have one value per step, like the model curves.
    val_df['Random'] = random_rewards
    sns.lineplot(data=val_df['Random'], ax=axes[idx], dashes=False, label="Random Recommendation")

    axes[idx].set_title(f"Validation Cumulative Rewards at Epsilon = {epsilon}")
    axes[idx].set_xlabel("Step")
    axes[idx].set_ylabel("Cumulative Reward")
    axes[idx].legend(title="Learning Rates")

plt.tight_layout()
plt.show()

In [None]:
# One KDE panel per initial epsilon: the training-NDCG distribution of every
# learning rate, overlaid with the random-strategy NDCG distribution.
# The figure is also written to 'train_ndcgs.png'.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(6*num_cols, 12))
axes = axes.flatten()

for idx, epsilon in enumerate(epsilons):
    ax = axes[idx]
    # Index 0 of each run tuple holds the per-step training NDCGs.
    train_ndcgs = np.array([data[f"{epsilon}_{lr}"][0] for lr in learning_rates])
    ndcg_df = pd.DataFrame(data=train_ndcgs.T, columns=learning_rates)

    for learning_rate in learning_rates:
        sns.kdeplot(ndcg_df[learning_rate], label=f"Learning Rate {learning_rate}", ax=ax)
    sns.kdeplot(random_ndcgs, label="Random Strategy", ax=ax)

    ax.set_title(f"Distribution of NDCGs at Epsilon = {epsilon}")
    ax.set_xlabel("NDCG Value")
    ax.set_ylabel("Density")
    ax.legend(title="Learning Rates")

plt.tight_layout()
plt.savefig('train_ndcgs.png')
plt.show()

In [None]:
# Confusion matrix of the training-step actions against their reward-derived
# labels for the run with epsilon = 0 and learning rate = 0.01.
run_key = str(0) + '_' + str(0.01)
preds_train = data[run_key][4]
true_train = data[run_key][5]
# Pin both classes explicitly: deriving the display labels from `true_train`
# alone breaks when a class shows up only in the predictions, because the
# matrix is built from the union of both label sets.
cm = confusion_matrix(true_train, preds_train, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap='Blues')

plt.title("Epsilon = 0, Learning Rate = 0.01")
plt.show()

In [None]:
# Confusion matrix of the random baseline's actions against their
# reward-derived labels.
# The classes are pinned to [0, 1] instead of being read from the model run's
# `true_train`, so this cell depends only on the random-baseline lists and the
# tick labels always match the matrix shape.
cm = confusion_matrix(random_true, random_preds, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap='Blues')

plt.title("Random Strategy")
plt.show()

In [None]:
# Mean validation NDCG (index 1 of the run tuple) of the epsilon = 0,
# learning rate = 0.01 run, printed next to the random-strategy mean.
model_val_ndcgs = data[str(0) + '_' + str(0.01)][1]
print('Model Average NDCG:', np.mean(model_val_ndcgs))
print('Random Strategy Average NDCG:', np.mean(random_ndcgs))