In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split

In [3]:
with open('cls_emb.pkl', 'rb') as file:
    embeddings = pickle.load(file)

In [12]:
tensor_dims = embeddings[0].size(1)

In [25]:
class QNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super(QNetwork, self).__init__()
        
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

class QLearningAgent:
    def __init__(self, input_dim, output_dim, learning_rate=0.001, gamma=0.99, epsilon=0.1, alpha=0.5, beta=0.5, bellman_alpha=0.5):
        self.input_dim = input_dim
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.bellman_alpha = bellman_alpha
        self.epsilon = epsilon
        self.model = QNetwork(input_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.loss_fn = nn.MSELoss()
        self.alpha = alpha
        self.beta = beta
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.current_q = None
        self.next_q = None
        self.rewards = []
        self.precisions = []
        self.dcgs = []
        
    def precision(self, ordered_posts, ground_truth_labels, k):
        top_k_system_output = ordered_posts[:k]
        top_k_ground_truth = ground_truth_labels[:k]
        relevant_items = sum([1 for relevance in top_k_ground_truth if relevance == 1])
        precision_at_k = relevant_items / k
        return precision_at_k

    def dcg(self, ground_truth_labels, ordered_posts):
        relevance = {item: idx + 1 for idx, item in enumerate(ground_truth_labels)}
        dcg = 0
        for idx, item in enumerate(ordered_posts):
            rel = relevance.get(item, 0)  
            dcg += rel / np.log2(idx + 2)  
        return dcg
        
    def reward_function(self, ordered_posts, ground_truth_labels, k, alpha, beta):
        ideal_dcg = sum(1/np.log2(i+2) for i in range(k))
        real_dcg = self.dcg(ground_truth_labels, ordered_posts)
        normalized_dcg = real_dcg/ideal_dcg
        self.dcgs.append(normalized_dcg)
        prec = self.precision(ordered_posts, ground_truth_labels, k)
        self.precisions.append(prec)
        return alpha*prec + beta*normalized_dcg
        
    def choose_action(self, selected_post_embs, indices, n_posts=5):
        self.next_q = self.model(selected_post_embs).detach().numpy()
        if np.random.rand() > self.epsilon:  
            sorted_posts = sorted(zip(indices, self.next_q), key=lambda x: x[1], reverse=True)
            selected_posts = [post[0] for post in sorted_posts[:n_posts]]
        else:  
            selected_posts = random.sample(indices, n_posts)

        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        return selected_posts

    def update(self, ordered_posts, ground_truth_labels, n_posts):
        reward = self.reward_function(ordered_posts, ground_truth_labels, n_posts, self.alpha, self.beta)
        self.rewards.append(reward)
        if self.current_q is None:
            self.current_q = torch.tensor([reward] * n_posts, dtype=torch.float32, requires_grad=True)
        else:
            self.current_q = self.current_q.detach()  
        max_next_q = np.max(self.next_q)
        max_next_q_tensor = torch.tensor([max_next_q]*n_posts, dtype=torch.float32) 
        target_q = reward + self.gamma*max_next_q
        target_q_tensor = torch.tensor([target_q] * n_posts, dtype=torch.float32, requires_grad=True)
        loss = self.loss_fn(self.current_q, target_q_tensor)
        
        self.optimizer.zero_grad()  
        loss.backward() 
        self.optimizer.step()  
        self.current_q = self.current_q + self.bellman_alpha*(reward + self.gamma*max_next_q_tensor - self.current_q)
            
            

In [6]:
with open('filtered_df', "rb") as file:
    filtered_df = pickle.load(file)

In [7]:
filtered_df

Unnamed: 0,rankings,sent_indices
0,"[2995, 1951, 7894, 9713, 746, 7886, 1153, 2585...","[2995, 2585, 8890, 7894, 7886, 1951, 4068, 971..."
1,"[469, 6669, 6828, 7705, 5342, 10163, 2105, 3643]","[10163, 3643, 6669, 2105, 469, 6828, 699, 7705..."
2,"[5253, 3600, 2378]","[2378, 1184, 3689, 8775, 3600, 5123, 8637, 525..."
3,"[2698, 8695, 8069, 7538, 4064, 8335, 7149, 901...","[8334, 7149, 8695, 4064, 8335, 9017, 8069, 770..."
4,"[456, 8547, 4422, 10600, 1637, 3493, 3259, 260...","[1760, 3259, 8547, 8545, 4422, 1637, 3493, 456..."
...,...,...
4989,"[3047, 9891, 2048, 4817, 1169, 2241, 7030, 441...","[2048, 4416, 6162, 4817, 9891, 1169, 2241, 703..."
4992,"[9056, 4627, 8628, 842, 9892, 7896, 5623, 3676...","[9056, 842, 9892, 392, 3676, 6124, 7896, 4627,..."
4993,"[7666, 4511, 2112, 8251, 4160, 9969]","[9969, 2112, 3536, 4111, 2544, 908, 4511, 8251..."
4997,"[4793, 2097, 2824, 8027, 1715]","[4793, 4568, 2674, 2824, 8027, 10231, 3726, 20..."


In [11]:
X = filtered_df['sent_indices']
y = filtered_df['rankings']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [29]:
rl_agent = QLearningAgent(tensor_dims, 5)
#train the model
for selected_indices, ground_truth_labels in zip(X_train, y_train):
    if len(selected_indices) >0 and len(ground_truth_labels) > 0:
        selected_embs = torch.vstack([embeddings[idx] for idx in selected_indices])
        model_rankings = rl_agent.choose_action(selected_embs, selected_indices)
        rl_agent.update(model_rankings, ground_truth_labels, 5)
print(np.mean(rl_agent.rewards))

1.81234953636289
