In [None]:


class PracticeEnv:
    """Practice environment whose student response is sampled from a KT model.

    The state is the interaction history: a dict of three parallel lists under
    the keys ``'q'``, ``'target'`` and ``'pid'``.

    Args:
        params: Object exposing ``n_pid``. The value is stored as
            ``num_questions`` and used as the offset in
            ``qa = q + target * num_questions``.
        kt_model: Callable invoked as ``kt_model(q, qa, target, pid)`` with
            ``torch.long`` tensors built from the history.
            NOTE(review): assumed to return a single probability of a correct
            answer (a float or a one-element tensor) -- confirm against the
            actual model.
        device: Device on which the tensors handed to ``kt_model`` are created.
    """

    def __init__(self, params, kt_model,device):
        self.num_questions = params.n_pid
        self.model = kt_model
        self.device = device
        self.reset()

    def _snapshot(self):
        """Return a copy of the history, so states handed out earlier never change."""
        return {key: list(seq) for key, seq in self.past_interactions.items()}

    def reset(self):
        """Clear the interaction history and return the (empty) initial state."""
        self.past_interactions = {'q':[],'target':[],'pid':[]} # Initialize past interactions
        return self._snapshot()

    def step(self, action):
        """Sample the outcome of asking question ``action`` and record it.

        Args:
            action: Id of the question chosen by the agent.

        Returns:
            Tuple ``(state, reward, correct)``: a copy of the updated history,
            the reward (1 for a correct answer, 0 otherwise) and the sampled
            outcome as a ``bool``.

        Raises:
            Whatever ``float()`` raises when the model output is not a single
            value (for example a multi-element tensor).
        """
        history = self.past_interactions
        # Look the lists up by key instead of relying on dict ordering.
        q = torch.tensor(history['q'], dtype=torch.long, device=self.device)
        target = torch.tensor(history['target'], dtype=torch.long, device=self.device)
        pid = torch.tensor(history['pid'], dtype=torch.long, device=self.device)
        qa = q + target * self.num_questions

        # NOTE(review): only the past interactions are passed to the model, so
        # the prediction does not depend on `action`, and on the first step the
        # tensors are empty. Confirm how the candidate question should be
        # presented to the model.
        with torch.no_grad():  # pure inference; no graph is needed here
            correct_prob = self.model(q, qa, target, pid)

        # Convert to a plain float first: comparing a NumPy scalar with a
        # (possibly non-CPU) tensor is not reliable.
        correct = bool(np.random.rand() < float(correct_prob))
        reward = 1 if correct else 0

        # Update past interactions with the result of the action.
        # NOTE(review): the action is stored under both 'q' and 'pid' because
        # this environment does not distinguish the two ids -- confirm.
        history['q'].append(action)
        history['target'].append(reward)
        history['pid'].append(action)

        return self._snapshot(), reward, correct

In [None]:


class PolicyNetwork(nn.Module):
    """Feed-forward policy: Linear -> ReLU -> Linear -> softmax.

    Note: a later cell of this notebook defines another ``PolicyNetwork``;
    once that cell runs, the name refers to the later class.

    Args:
        input_dim: Size of the last dimension of the input.
        output_dim: Number of actions; the output sums to 1 over this axis.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Hidden width is fixed at 128 units.
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=-1)


In [None]:
from types import SimpleNamespace

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class PolicyNetwork(nn.Module):
    """Recurrent policy: an LSTM followed by a linear layer and a softmax.

    Args:
        input_dim: Feature size of every time step.
        output_dim: Number of actions.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_dim).
        self.lstm = nn.LSTM(input_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, output_dim)

    def forward(self, x):
        """Return action probabilities computed from the last time step.

        The three-index slice below requires a batched, 3-D input.
        """
        sequence_out, _ = self.lstm(x)
        final_step = sequence_out[:, -1, :]  # LSTM output at the last position
        logits = self.fc(final_step)
        return F.softmax(logits, dim=-1)

class ValueNetwork(nn.Module):
    """Recurrent state-value estimator: an LSTM followed by a linear head.

    Args:
        input_dim: Feature size of every time step.
    """

    def __init__(self, input_dim):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_dim).
        self.lstm = nn.LSTM(input_dim, 128, batch_first=True)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        """Return one value per sequence, shape (batch, 1), from the last time step."""
        sequence_out, _ = self.lstm(x)
        final_step = sequence_out[:, -1, :]  # LSTM output at the last position
        return self.fc(final_step)

In [None]:
import torch.optim as optim
from torch.distributions import Categorical

def compute_advantages(rewards, values, gamma=0.99):
    """Compute Monte-Carlo advantages ``A_t = G_t - V(s_t)`` for one episode.

    ``G_t = r_t + gamma * G_{t+1}`` is the discounted return from step ``t``.
    The return is accumulated on its own and the baseline is subtracted once
    per step. Carrying the baseline-corrected value backwards instead would
    subtract later baselines from earlier steps as well.

    Args:
        rewards: Sequence of per-step rewards.
        values: Per-step baseline values: a tensor with one element per step
            (for example shape ``(N,)`` or ``(N, 1)``) or a sequence of
            numbers / one-element tensors.
        gamma: Discount factor.

    Returns:
        1-D ``float32`` tensor with one advantage per step. If the two inputs
        differ in length, the steps are paired from the end of the episode and
        the extra leading entries are ignored.
    """
    # Flatten the baselines to plain floats so that a column vector coming
    # straight out of a value network is handled as well.
    if isinstance(values, torch.Tensor):
        baselines = values.detach().reshape(-1).tolist()
    else:
        baselines = [float(v) for v in values]

    advantages = []
    discounted_return = 0.0
    for r, v in zip(reversed(rewards), reversed(baselines)):
        discounted_return = float(r) + gamma * discounted_return
        advantages.append(discounted_return - v)
    advantages.reverse()  # built back-to-front; restore chronological order
    return torch.tensor(advantages, dtype=torch.float32)

def surrogate_loss(policy_net, states, actions, advantages):
    """Policy-gradient loss: negative mean of ``log pi(a|s) * advantage``.

    Args:
        policy_net: Callable mapping ``states`` to action probabilities.
        states: Input passed unchanged to ``policy_net``.
        actions: Indices of the actions that were taken.
        advantages: Weights multiplied with the log-probabilities.

    Returns:
        Scalar tensor holding the loss.
    """
    policy = Categorical(policy_net(states))
    weighted_log_probs = policy.log_prob(actions) * advantages
    return -weighted_log_probs.mean()

def train_policy(policy_net, value_net, optimizer_policy, optimizer_value, states, actions, rewards):
    """Run one optimisation step for the value network, then one for the policy.

    Args:
        policy_net: Network mapping ``states`` to action probabilities.
        value_net: Network mapping ``states`` to value estimates.
        optimizer_policy: Optimizer stepping the policy parameters.
        optimizer_value: Optimizer stepping the value parameters.
        states: Batch of states fed to both networks.
        actions: Actions taken, passed on to ``surrogate_loss``.
        rewards: Per-step rewards; also used as the regression targets of the
            value network.
    """
    # Baseline estimates, detached so the policy update cannot reach the critic.
    # They are taken before the critic is stepped below.
    baseline = value_net(states).detach()
    advantages = compute_advantages(rewards, baseline)

    # Critic step: regress the predicted values onto the observed rewards.
    predicted = value_net(states).squeeze()
    reward_targets = torch.tensor(rewards).float()
    value_loss = F.mse_loss(predicted, reward_targets)
    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()

    # Actor step: minimise the advantage-weighted surrogate loss.
    policy_loss = surrogate_loss(policy_net, states, actions, advantages)
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

In [None]:
def train(env, policy_net, value_net, num_episodes=1000, gamma=0.99, encode_state=None):
    """Roll out episodes in ``env`` and update both networks after each one.

    An episode ends as soon as ``env.step`` reports a correct answer.

    Args:
        env: Environment providing ``reset() -> state`` and
            ``step(action) -> (next_state, reward, correct)``.
        policy_net: Network mapping a state tensor to action probabilities.
        value_net: Network handed to ``train_policy`` together with the batch.
        num_episodes: Number of episodes to run.
        gamma: Currently unused; ``train_policy`` takes no discount argument,
            so the default of ``compute_advantages`` applies.
        encode_state: Optional callable turning an environment state into
            something ``torch.tensor`` accepts (nested numbers or a tensor).
            With ``None`` the state is converted as-is.
            NOTE(review): ``PracticeEnv`` returns a dict, which cannot be
            converted directly, so an encoder is needed for it.
    """
    optimizer_policy = optim.Adam(policy_net.parameters(), lr=1e-3)
    optimizer_value = optim.Adam(value_net.parameters(), lr=1e-3)

    for episode in range(num_episodes):
        states, actions, rewards = [], [], []
        state = env.reset()

        done = False
        while not done:
            features = state if encode_state is None else encode_state(state)
            if isinstance(features, torch.Tensor):
                state_tensor = features.detach().clone().float()
            else:
                state_tensor = torch.tensor(features, dtype=torch.float32)

            # A (seq_len, features) state gets a leading batch dimension,
            # because the recurrent networks index their output as
            # x[:, -1, :] and therefore need 3-D input.
            if state_tensor.dim() == 2:
                policy_input = state_tensor.unsqueeze(0)
            else:
                policy_input = state_tensor

            # The rollout only samples an action; the loss recomputes the
            # probabilities later, so no graph is needed here.
            with torch.no_grad():
                action_probs = policy_net(policy_input)
            action_id = Categorical(action_probs).sample().item()

            next_state, reward, correct = env.step(action_id)

            # Store the tensor that was built at this step rather than the raw
            # state: the dtype matches what the policy saw, and an environment
            # that mutates its state in place cannot change earlier entries.
            states.append(state_tensor)
            actions.append(action_id)
            rewards.append(reward)

            state = next_state
            done = bool(correct)

        # Zero-pad the per-step states to a common length along their first axis.
        states_tensor = torch.nn.utils.rnn.pad_sequence(states, batch_first=True)
        actions_tensor = torch.tensor(actions, dtype=torch.long)
        train_policy(policy_net, value_net, optimizer_policy, optimizer_value, states_tensor, actions_tensor, rewards)

        if episode % 10 == 0:
            print(f"Episode {episode}, total reward: {sum(rewards)}")

# Initialize environment and networks
num_questions = 10
device = torch.device("cpu")  # must be the device kt_model lives on
kt_model = KTModel()  # Assuming KTModel is already defined and loaded
# PracticeEnv takes (params, kt_model, device) and reads `params.n_pid`,
# so wrap the question count in an object exposing that attribute.
env = PracticeEnv(SimpleNamespace(n_pid=num_questions), kt_model, device)
policy_net = PolicyNetwork(num_questions, num_questions)
value_net = ValueNetwork(num_questions)

# Train the model
# NOTE(review): PracticeEnv states are dicts of id lists, while the networks
# consume float tensors of feature size `num_questions`; a state-to-tensor
# encoding still has to be supplied before this call can run -- TODO.
train(env, policy_net, value_net, num_episodes=1000)


NameError: name 'KTModel' is not defined