# Train

In [1]:
import pandas as pd

# Read the al_cpl embedding table (rows keyed by the 'Unnamed: 0' column) as
# a nested dict of the form {row_key: {column: value}}.
df = pd.read_csv('embeddings/al_cpl_embeddings_mistral.csv', index_col='Unnamed: 0').to_dict(orient='index')

# Collapse each row's column->value mapping into a plain list of its values.
topic_embeddings = {topic: list(row.values()) for topic, row in df.items()}

In [2]:
from base_classification import load_preq_map, load_test_set

# Both loaders come from the project-local `base_classification` module.
# Later cells test membership in / index into `preq_map` and iterate
# `test_set`, reading elements [0] and [1] of each item.
preq_map, test_set = load_preq_map(), load_test_set()

In [3]:
# Parallel lists, one entry per item of `test_set`:
#   a_s / b_s   - the two topics of the pair
#   cpl_y_list  - 1.0 when the B topic is listed under the A topic in `preq_map`, else 0.0
a_s, b_s, cpl_y_list = [], [], []

for pair in test_set:
    # Note the order: element 0 is the B topic, element 1 is the A topic.
    tpc_b = pair[0]
    tpc_a = pair[1]
    a_s.append(tpc_a)
    b_s.append(tpc_b)

    # A topic missing from `preq_map` counts as a negative example.
    is_listed = tpc_a in preq_map and tpc_b in preq_map[tpc_a]
    cpl_y_list.append(float(is_listed))

In [4]:
from collections import Counter

# Tally how often each label value occurs in the CPL labels.
value_counts = Counter(cpl_y_list)
total_count = sum(value_counts.values())

# Share of samples carrying each label value (shown as the cell's output).
{label: value_counts[label] / total_count for label in value_counts}

{0.0: 0.6932148874253331, 1.0: 0.3067851125746669}

In [5]:
# Look up the embedding of both topics of every CPL pair, keeping the A-side
# and B-side lists aligned by position.
cpl_a_embeds, cpl_b_embeds = [], []
for topic_a, topic_b in zip(a_s, b_s):
    cpl_a_embeds.append(topic_embeddings[topic_a])
    cpl_b_embeds.append(topic_embeddings[topic_b])

In [6]:
import pandas as pd

# Read the drive embedding table (rows keyed by the 'Unnamed: 0' column) as
# a nested dict of the form {row_key: {column: value}}.
drive_df = pd.read_csv('embeddings/drive_embeddings_mistral.csv', index_col='Unnamed: 0').to_dict(orient='index')

# Collapse each row's column->value mapping into a plain list of its values.
drive_topic_embeddings = {topic: list(row.values()) for topic, row in drive_df.items()}

In [7]:
# Keep only the three columns needed to build the drive examples.
drive_preds = pd.read_csv('drive_data/train_set.csv')[['pre requisite', 'concept', 'label']]

# The 'pre requisite' column feeds the A side and 'concept' the B side.
prerequisite_names = drive_preds['pre requisite'].values
concept_names = drive_preds['concept'].values
drive_a_embeds = [drive_topic_embeddings[name] for name in prerequisite_names]
drive_b_embeds = [drive_topic_embeddings[name] for name in concept_names]

# Labels as a plain list of Python floats.
drive_labels = drive_preds['label'].astype(float)
drive_y_list = drive_labels.values.tolist()

In [8]:
import pandas as pd

# Read the mooc embedding table (rows keyed by the 'Unnamed: 0' column) as
# a nested dict of the form {row_key: {column: value}}.
mooc_df = pd.read_csv('embeddings/mooc_embeddings_mistral.csv', index_col='Unnamed: 0').to_dict(orient='index')

# Collapse each row's column->value mapping into a plain list of its values.
mooc_topic_embeddings = {topic: list(row.values()) for topic, row in mooc_df.items()}

In [16]:
# Load every row, then keep only those whose 'dataset' is moocML or moocDSA.
mooc_preds = pd.read_csv('mooc_data/united_data.csv')
in_selected_dataset = mooc_preds['dataset'].isin(['moocML', 'moocDSA'])
mooc_preds = mooc_preds[in_selected_dataset]

# 'conceptA' feeds the A side and 'conceptB' the B side of each pair.
concept_a_names = mooc_preds['conceptA'].values
concept_b_names = mooc_preds['conceptB'].values
mooc_a_embeds = [mooc_topic_embeddings[name] for name in concept_a_names]
mooc_b_embeds = [mooc_topic_embeddings[name] for name in concept_b_names]

# Labels as a plain list of Python floats.
mooc_labels = mooc_preds['isPrerequisite'].astype(float)
mooc_y_list = mooc_labels.values.tolist()

In [17]:
# Stack the three datasets in a fixed order (cpl, drive, mooc) so that the
# A-side, B-side and label lists stay aligned by position.
a_embeds = [*cpl_a_embeds, *drive_a_embeds, *mooc_a_embeds]
b_embeds = [*cpl_b_embeds, *drive_b_embeds, *mooc_b_embeds]
true_y = [*cpl_y_list, *drive_y_list, *mooc_y_list]

In [18]:
from collections import Counter

# Tally how often each label value occurs in the combined labels.
value_counts = Counter(true_y)
total_count = sum(value_counts.values())

# Share of samples carrying each label value (shown as the cell's output).
{label: value_counts[label] / total_count for label in value_counts}

{0.0: 0.7131015179244268, 1.0: 0.2868984820755733}

In [19]:
from sklearn.model_selection import train_test_split

# One feature vector per pair: the A embedding followed by the B embedding
# (list concatenation, not element-wise addition).
x = [emb_a + emb_b for emb_a, emb_b in zip(a_embeds, b_embeds)]
y = true_y

# Hold out 20% of the pairs; the fixed seed keeps the split repeatable.
split = train_test_split(x, y, test_size=0.2, random_state=78)
x_train_list, x_test_list, y_train_list, y_test_list = split

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BinaryClassifier(nn.Module):
    """Feed-forward binary classifier: input -> 512 -> 256 -> 1, sigmoid output.

    Args:
        input_size: Number of features in each input row.
        dropout_p: Probability of *zeroing* each hidden activation while the
            module is in training mode (the ``p`` argument of ``nn.Dropout``).
            Defaults to 0.8, the value this notebook has always used. Note
            that this drops 80% of the activations (it is not a keep
            probability); pass e.g. ``dropout_p=0.2`` for lighter
            regularisation.
    """

    def __init__(self, input_size, dropout_p=0.8):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 512)    # Fully connected layer 1
        self.dropout1 = nn.Dropout(dropout_p)    # Zeroes activations with probability dropout_p
        self.fc2 = nn.Linear(512, 256)           # Fully connected layer 2
        self.dropout2 = nn.Dropout(dropout_p)    # Zeroes activations with probability dropout_p
        self.fc3 = nn.Linear(256, 1)             # Output layer (single unit)

    def forward(self, x):
        """Return sigmoid outputs in [0, 1]; the last dimension has size 1."""
        x = F.relu(self.fc1(x))         # First hidden layer + ReLU
        x = self.dropout1(x)            # Dropout (identity in eval mode)
        x = F.relu(self.fc2(x))         # Second hidden layer + ReLU
        x = self.dropout2(x)            # Dropout (identity in eval mode)
        x = torch.sigmoid(self.fc3(x))  # Squash the single output to a probability
        return x

In [21]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [22]:
# Feature width is read from the first training row.
first_row = x_train_list[0]
input_size = len(first_row)  # 2048 for mistral

# Build the classifier, move it to the selected device and show its layers.
model = BinaryClassifier(input_size)
model = model.to(device)
print(model)

BinaryClassifier(
  (fc1): Linear(in_features=2048, out_features=512, bias=True)
  (dropout1): Dropout(p=0.8, inplace=False)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (dropout2): Dropout(p=0.8, inplace=False)
  (fc3): Linear(in_features=256, out_features=1, bias=True)
)


In [23]:
# Training split as tensors on the target device. Labels get an extra
# dimension at position 1 so they line up with the model's single output unit.
x_train = torch.tensor(x_train_list).to(device)
y_train = torch.tensor(y_train_list).to(device).unsqueeze(1)

# Held-out split, prepared the same way.
x_test = torch.tensor(x_test_list).to(device)
y_test = torch.tensor(y_test_list).to(device).unsqueeze(1)

In [24]:
# CPL pairs only: A embedding followed by B embedding, labels with an extra
# dimension at position 1.
cpl_features = [emb_a + emb_b for emb_a, emb_b in zip(cpl_a_embeds, cpl_b_embeds)]
x_cpl = torch.tensor(cpl_features).to(device)
y_cpl = torch.tensor(cpl_y_list).to(device).unsqueeze(1)

# Drive pairs only, prepared the same way.
drive_features = [emb_a + emb_b for emb_a, emb_b in zip(drive_a_embeds, drive_b_embeds)]
x_drive = torch.tensor(drive_features).to(device)
y_drive = torch.tensor(drive_y_list).to(device).unsqueeze(1)

In [25]:
from sklearn.metrics import accuracy_score, f1_score

# Train on the combined train split, evaluate on the held-out split.
# The model is sized from the tensor that is actually trained on.
input_size = x_train.shape[1]  # 2048 for mistral
model = BinaryClassifier(input_size).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# The labels never change, so copy them to NumPy once instead of on every
# metric call inside the loop.
y_train_np = y_train.cpu().numpy()
y_test_np = y_test.cpu().numpy()

# Training loop (full batch: one optimizer step per epoch)
num_epochs = 500
for epoch in range(num_epochs):
    model.train()  # Set the model to train mode
    optimizer.zero_grad()  # Zero the gradients

    # Forward pass
    outputs = model(x_train)

    # Compute loss
    loss = criterion(outputs, y_train)

    # Backpropagation
    loss.backward()
    optimizer.step()

    model.eval()  # Set the model to evaluation mode (disables dropout)
    with torch.no_grad():
        # Train metrics reuse the training-mode forward pass above, i.e. they
        # are computed with dropout active and before this epoch's update.
        train_preds = (outputs >= 0.5).float()  # Convert probabilities to binary predictions
        train_preds_np = train_preds.cpu().numpy()
        train_acc = accuracy_score(y_train_np, train_preds_np)
        # zero_division=0 keeps F1 at 0.0 (without a warning per call) when
        # the model predicts no positives at all.
        train_f1 = f1_score(y_train_np, train_preds_np, zero_division=0)

        test_outputs = model(x_test)
        test_preds = (test_outputs >= 0.5).float()  # Convert probabilities to binary predictions
        test_preds_np = test_preds.cpu().numpy()
        test_acc = accuracy_score(y_test_np, test_preds_np)
        test_f1 = f1_score(y_test_np, test_preds_np, zero_division=0)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Train Accuracy: {train_acc:.4f}, Train F1: {train_f1:.4f}, Test Accuracy: {test_acc:.4f}, Test F1: {test_f1:.4f}')

  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/500], Loss: 0.6909, Train Accuracy: 0.5709, Train F1: 0.3132, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [2/500], Loss: 0.6726, Train Accuracy: 0.7120, Train F1: 0.0083, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [3/500], Loss: 0.6515, Train Accuracy: 0.7126, Train F1: 0.0009, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [4/500], Loss: 0.6285, Train Accuracy: 0.7124, Train F1: 0.0000, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [5/500], Loss: 0.6102, Train Accuracy: 0.7124, Train F1: 0.0000, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [6/500], Loss: 0.6085, Train Accuracy: 0.7124, Train F1: 0.0000, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [7/500], Loss: 0.6217, Train Accuracy: 0.7124, Train F1: 0.0000, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [8/500], Loss: 0.6234, Train Accuracy: 0.7124, Train F1: 0.0000, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [9/500], Loss: 0.6120, Train Accuracy: 0.7124, Train F1: 0.0000, Test Accuracy: 0.7158, Test F1: 0.0000
Epoch [10/

In [16]:
from sklearn.metrics import accuracy_score, f1_score

# Train on the drive pairs, evaluate on the CPL pairs.
# The model is sized from the tensor that is actually trained on, rather than
# from the unrelated combined split.
input_size = x_drive.shape[1]  # 2048 for mistral
model = BinaryClassifier(input_size).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# The labels never change, so copy them to NumPy once instead of on every
# metric call inside the loop.
y_drive_np = y_drive.cpu().numpy()
y_cpl_np = y_cpl.cpu().numpy()

# Training loop (full batch: one optimizer step per epoch)
num_epochs = 500
for epoch in range(num_epochs):
    model.train()  # Set the model to train mode
    optimizer.zero_grad()  # Zero the gradients

    # Forward pass
    outputs = model(x_drive)

    # Compute loss
    loss = criterion(outputs, y_drive)

    # Backpropagation
    loss.backward()
    optimizer.step()

    model.eval()  # Set the model to evaluation mode (disables dropout)
    with torch.no_grad():
        # Train metrics reuse the training-mode forward pass above, i.e. they
        # are computed with dropout active and before this epoch's update.
        train_preds = (outputs >= 0.5).float()  # Convert probabilities to binary predictions
        train_preds_np = train_preds.cpu().numpy()
        train_acc = accuracy_score(y_drive_np, train_preds_np)
        # zero_division=0 keeps F1 at 0.0 (without a warning per call) when
        # the model predicts no positives at all.
        train_f1 = f1_score(y_drive_np, train_preds_np, zero_division=0)

        test_outputs = model(x_cpl)
        test_preds = (test_outputs >= 0.5).float()  # Convert probabilities to binary predictions
        test_preds_np = test_preds.cpu().numpy()
        test_acc = accuracy_score(y_cpl_np, test_preds_np)
        test_f1 = f1_score(y_cpl_np, test_preds_np, zero_division=0)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Train Accuracy: {train_acc:.4f}, Train F1: {train_f1:.4f}, Test Accuracy: {test_acc:.4f}, Test F1: {test_f1:.4f}')

  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/500], Loss: 0.6876, Train Accuracy: 0.6010, Train F1: 0.0176, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [2/500], Loss: 0.6821, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [3/500], Loss: 0.6744, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [4/500], Loss: 0.6717, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [5/500], Loss: 0.6736, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [6/500], Loss: 0.6679, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [7/500], Loss: 0.6694, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [8/500], Loss: 0.6665, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [9/500], Loss: 0.6672, Train Accuracy: 0.6021, Train F1: 0.0000, Test Accuracy: 0.6932, Test F1: 0.0000
Epoch [10/

In [17]:
from sklearn.metrics import accuracy_score, f1_score

# Train on the CPL pairs, evaluate on the drive pairs.
# The model is sized from the tensor that is actually trained on, rather than
# from the unrelated combined split.
input_size = x_cpl.shape[1]  # 2048 for mistral
model = BinaryClassifier(input_size).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# The labels never change, so copy them to NumPy once instead of on every
# metric call inside the loop.
y_cpl_np = y_cpl.cpu().numpy()
y_drive_np = y_drive.cpu().numpy()

# Training loop (full batch: one optimizer step per epoch)
num_epochs = 500
for epoch in range(num_epochs):
    model.train()  # Set the model to train mode
    optimizer.zero_grad()  # Zero the gradients

    # Forward pass
    outputs = model(x_cpl)

    # Compute loss
    loss = criterion(outputs, y_cpl)

    # Backpropagation
    loss.backward()
    optimizer.step()

    model.eval()  # Set the model to evaluation mode (disables dropout)
    with torch.no_grad():
        # Train metrics reuse the training-mode forward pass above, i.e. they
        # are computed with dropout active and before this epoch's update.
        train_preds = (outputs >= 0.5).float()  # Convert probabilities to binary predictions
        train_preds_np = train_preds.cpu().numpy()
        train_acc = accuracy_score(y_cpl_np, train_preds_np)
        # zero_division=0 keeps F1 at 0.0 (without a warning per call) when
        # the model predicts no positives at all.
        train_f1 = f1_score(y_cpl_np, train_preds_np, zero_division=0)

        test_outputs = model(x_drive)
        test_preds = (test_outputs >= 0.5).float()  # Convert probabilities to binary predictions
        test_preds_np = test_preds.cpu().numpy()
        test_acc = accuracy_score(y_drive_np, test_preds_np)
        test_f1 = f1_score(y_drive_np, test_preds_np, zero_division=0)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Train Accuracy: {train_acc:.4f}, Train F1: {train_f1:.4f}, Test Accuracy: {test_acc:.4f}, Test F1: {test_f1:.4f}')

Epoch [1/500], Loss: 0.7014, Train Accuracy: 0.3307, Train F1: 0.4643, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [2/500], Loss: 0.6817, Train Accuracy: 0.6600, Train F1: 0.1673, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [3/500], Loss: 0.6599, Train Accuracy: 0.6928, Train F1: 0.0128, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [4/500], Loss: 0.6390, Train Accuracy: 0.6932, Train F1: 0.0000, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [5/500], Loss: 0.6200, Train Accuracy: 0.6932, Train F1: 0.0000, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [6/500], Loss: 0.6307, Train Accuracy: 0.6932, Train F1: 0.0000, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [7/500], Loss: 0.6350, Train Accuracy: 0.6932, Train F1: 0.0000, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [8/500], Loss: 0.6325, Train Accuracy: 0.6932, Train F1: 0.0000, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [9/500], Loss: 0.6183, Train Accuracy: 0.6932, Train F1: 0.0000, Test Accuracy: 0.6021, Test F1: 0.0000
Epoch [10/