In [1]:
import sys

import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, accuracy_score

from ncf.ncf import NCF
from ncf.dataset import Dataset as NCFDataset

import torch
import torch.nn as nn
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)]
Pandas version: 2.2.1
Tensorflow version: 2.16.1


In [2]:
SEED = 42

PATH = f'C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/generated_data/'
EMBEDDINGS_PATH = 'C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/embeddings'

DATA_PATH =  PATH + 'united_data_embeddings.csv'
ENCODED_PATH = PATH + 'united_data_encoded_embeddings.csv'
ENCODED_SPLIT_PATH = PATH + 'akd_united_split_set'

train_file = PATH + f'akd_train_{SEED}.csv'
test_file = PATH + f'akd_test_{SEED}.csv'

# Train/Test Split

In [3]:
df = pd.read_csv(ENCODED_PATH)

df.head()

Unnamed: 0.1,Unnamed: 0,conceptA,conceptB,isPrerequisite,dataset,file,conceptA_ind,conceptB_ind
0,0,1076,1562,1,moocML,mooc,128,160
1,1,1347,855,1,moocML,mooc,160,417
2,2,1347,516,0,moocML,mooc,160,101
3,3,1347,1274,0,moocML,mooc,160,301
4,4,1347,443,0,moocML,mooc,160,120


In [4]:
df['dataset'].value_counts()

dataset
moocML     6712
al_cpl     6529
drive      2797
moocDSA    2540
Name: count, dtype: int64

In [5]:
concepts = list(set(list(df['conceptA']) + list(df['conceptB'])))

In [6]:
train, test = train_test_split(df, test_size=0.2, random_state=SEED, stratify=df['dataset'])

In [7]:
df.loc[train.index, '_split_set'] = 'train'
df.loc[test.index, '_split_set'] = 'test'

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,conceptA,conceptB,isPrerequisite,dataset,file,conceptA_ind,conceptB_ind,_split_set
0,0,1076,1562,1,moocML,mooc,128,160,train
1,1,1347,855,1,moocML,mooc,160,417,train
2,2,1347,516,0,moocML,mooc,160,101,test
3,3,1347,1274,0,moocML,mooc,160,301,train
4,4,1347,443,0,moocML,mooc,160,120,train


In [9]:
train.shape, test.shape

((14862, 8), (3716, 8))

In [10]:
train_conceptA = list(set(list(train['conceptA'])))
train_conceptB = list(set(list(train['conceptB'])))
test_conceptA = list(set(list(test['conceptA'])))
test_conceptB = list(set(list(test['conceptB'])))

In [11]:
train['dataset'].value_counts() / test['dataset'].value_counts()

dataset
moocML     3.997766
al_cpl     3.999234
drive      4.003578
moocDSA    4.000000
Name: count, dtype: float64

In [12]:
a = [x for x in test_conceptA if x not in train_conceptA]
b = [x for x in test_conceptB if x not in train_conceptB]

In [13]:
maskA = test['conceptA'].isin(a)
maskB = test['conceptB'].isin(b)

In [14]:
train = pd.concat([train, test[maskA | maskB]], axis=0)
test.drop(index=test[maskA | maskB].index, inplace=True)

In [15]:
train.shape, test.shape

((14958, 8), (3620, 8))

In [16]:
train.sort_values(by=['conceptA']).to_csv(train_file, index=False)
test.sort_values(by=['conceptA']).to_csv(test_file, index=False)

df.to_csv(ENCODED_SPLIT_PATH, index=False)

# Generate Train Set For Tensor

In [3]:
# Load the train, test set
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

# Load the embedding dataframes
embedding_dfs = {
    'al_cpl': pd.read_csv(EMBEDDINGS_PATH + r'\al_cpl_embeddings_mistral.csv'),
    'drive': pd.read_csv(EMBEDDINGS_PATH + r'\drive_embeddings_mistral.csv'),
    'mooc': pd.read_csv(EMBEDDINGS_PATH + r'\mooc_embeddings_mistral.csv')
}

# Function to get embedding for a concept
def get_embedding(file, concept_ind):
    embedding = embedding_dfs[file].loc[concept_ind].iloc[1:].values
    return embedding

# Function to process a dataset
def process_dataset(df):
    x_list = []
    y_list = []

    for _, row in df.iterrows():
        file = row['file']
        
        # Get embeddings for conceptA and conceptB
        embedding_a = get_embedding(file, row['conceptA_ind'])
        embedding_b = get_embedding(file, row['conceptB_ind'])
        
        # Combine the embeddings
        combined_features = np.concatenate([embedding_a, embedding_b])
        
        x_list.append(combined_features)
        y_list.append(row['isPrerequisite'])

    return x_list, y_list

# Process train and test sets
x_train_list, y_train_list = process_dataset(train)
x_test_list, y_test_list = process_dataset(test)

# Convert to PyTorch tensors
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x_train = torch.tensor(x_train_list, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train_list, dtype=torch.long).to(device)

x_test = torch.tensor(x_test_list, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test_list, dtype=torch.long).to(device)

print(f"x_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: torch.Size([14958, 2048])
y_train shape: torch.Size([14958])
x_test shape: torch.Size([3620, 2048])
y_test shape: torch.Size([3620])


# Content-Based Learning

In [4]:
class BinaryClassifier(nn.Module):
    def __init__(self, input_size):
        super(BinaryClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)  # Fully connected layer 1
        self.dropout1 = nn.Dropout(0.8)         # Dropout layer with 20% probability
        self.fc2 = nn.Linear(512, 256)          # Fully connected layer 2
        self.dropout2 = nn.Dropout(0.8)         # Dropout layer with 20% probability
        self.fc3 = nn.Linear(256, 1)            # Output layer
        
    def forward(self, x):
        x = F.relu(self.fc1(x))       # Apply ReLU activation to the first fully connected layer
        x = self.dropout1(x)          # Apply dropout to the output of the first layer
        x = F.relu(self.fc2(x))       # Apply ReLU activation to the second fully connected layer
        x = self.dropout2(x)          # Apply dropout to the output of the second layer
        x = torch.sigmoid(self.fc3(x))  # Apply sigmoid activation to the output layer for binary classification
        return x

In [5]:
# Assuming x_train is your input tensor
input_size = x_train.shape[1]  # This should be 2048 (1024 * 2) based on your previous code

# Initialize the model
model = BinaryClassifier(input_size)

# Move the model to the same device as your data
model = model.to(device)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop (simplified)
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    outputs = model(x_train)
    loss = criterion(outputs, y_train.float().unsqueeze(1))
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    with torch.no_grad():
        model.eval()  # Set the model to evaluation mode
        train_preds = (outputs >= 0.5).float()  # Convert probabilities to binary predictions
        train_acc = accuracy_score(y_train.cpu().numpy(), train_preds.cpu().numpy())
        train_f1 = f1_score(y_train.cpu().numpy(), train_preds.cpu().numpy())

    with torch.no_grad():
        test_outputs = model(x_test)
        test_preds = (test_outputs >= 0.5).float()  # Convert probabilities to binary predictions
        test_acc = accuracy_score(y_test.cpu().numpy(), test_preds.cpu().numpy())
        test_f1 = f1_score(y_test.cpu().numpy(), test_preds.cpu().numpy())

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Train Accuracy: {train_acc:.4f}, Train F1: {train_f1:.4f}, Test Accuracy: {test_acc:.4f}, Test F1: {test_f1:.4f}')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/500], Loss: 0.6952, Train Accuracy: 0.4372, Train F1: 0.3894, Test Accuracy: 0.7152, Test F1: 0.0000
Epoch [1/500], Loss: 0.6952
Epoch [2/500], Loss: 0.6740, Train Accuracy: 0.7055, Train F1: 0.0226, Test Accuracy: 0.7152, Test F1: 0.0000
Epoch [2/500], Loss: 0.6740
Epoch [3/500], Loss: 0.6488, Train Accuracy: 0.7122, Train F1: 0.0000, Test Accuracy: 0.7152, Test F1: 0.0000
Epoch [3/500], Loss: 0.6488
Epoch [4/500], Loss: 0.6223, Train Accuracy: 0.7126, Train F1: 0.0000, Test Accuracy: 0.7152, Test F1: 0.0000
Epoch [4/500], Loss: 0.6223
Epoch [5/500], Loss: 0.6087, Train Accuracy: 0.7126, Train F1: 0.0000, Test Accuracy: 0.7152, Test F1: 0.0000
Epoch [5/500], Loss: 0.6087
Epoch [6/500], Loss: 0.6093, Train Accuracy: 0.7126, Train F1: 0.0000, Test Accuracy: 0.7152, Test F1: 0.0000
Epoch [6/500], Loss: 0.6093
Epoch [7/500], Loss: 0.6256, Train Accuracy: 0.7126, Train F1: 0.0000, Test Accuracy: 0.7152, Test F1: 0.0000
Epoch [7/500], Loss: 0.6256
Epoch [8/500], Loss: 0.6224, Train

# Graph-Based Learning

In [3]:
data = NCFDataset(train_file=train_file, seed=SEED, col_user='conceptA', col_item='conceptB')

INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/generated_data/akd_train_42.csv ...


In [4]:
data.n_users, data.n_items

(1384, 1600)

In [5]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=16,
    layer_sizes=[8,4],
    n_epochs=20,
    batch_size=256,
    learning_rate=0.001,
    verbose=20, 
    seed=SEED
)

In [6]:
%%time

model.fit(data)

INFO:ncf.ncf:Epoch 20 [4.52s]: train_loss = 0.035806 


CPU times: total: 1min 4s
Wall time: 1min 6s


In [7]:
THRESHOLD = 0.29

In [8]:
df = pd.read_csv(ENCODED_SPLIT_PATH)

In [9]:
df['dataset'].value_counts()

dataset
moocML     6712
al_cpl     6529
drive      2797
moocDSA    2540
Name: count, dtype: int64

In [10]:
predictions = [[row.conceptA, row.conceptB, model.predict(row.conceptA, row.conceptB)]
               for (_, row) in df.iterrows()]

predictions = pd.DataFrame(predictions, columns=['conceptA', 'conceptB', 'isPrerequisite_pred'])

predictions['isPrerequisite'] = df['isPrerequisite']
predictions['dataset'] = df['dataset']
predictions['_split_set'] = df['_split_set']
sorted_predictions = predictions.sort_values(by='isPrerequisite_pred', ascending=False)
sorted_predictions['pred'] = (sorted_predictions['isPrerequisite_pred'] >= THRESHOLD).astype(int)

In [11]:
sorted_predictions['isPrerequisite'].mean(), df['isPrerequisite'].mean(), sorted_predictions['isPrerequisite_pred'].mean()

(0.2868984820755733, 0.2868984820755733, 0.23067496558838732)

In [12]:
sorted_predictions['dataset'].value_counts()

dataset
moocML     6712
al_cpl     6529
drive      2797
moocDSA    2540
Name: count, dtype: int64

In [13]:
df_moocML = sorted_predictions[sorted_predictions['dataset'] == 'moocML']
df_moocDSA = sorted_predictions[sorted_predictions['dataset'] == 'moocDSA']
df_drive = sorted_predictions[sorted_predictions['dataset'] == 'drive']
df_alcpl = sorted_predictions[sorted_predictions['dataset'] == 'al_cpl']

In [14]:
df_test = sorted_predictions[sorted_predictions['_split_set'] == 'test']

In [15]:
print(classification_report(df_test['isPrerequisite'], df_test['pred']))

              precision    recall  f1-score   support

           0       0.87      0.96      0.92      2652
           1       0.88      0.65      0.75      1064

    accuracy                           0.87      3716
   macro avg       0.88      0.81      0.83      3716
weighted avg       0.87      0.87      0.87      3716



In [16]:
print(precision_score(df_test['isPrerequisite'], df_test['pred']), recall_score(df_test['isPrerequisite'], df_test['pred']), f1_score(df_test['isPrerequisite'], df_test['pred']))

0.879948914431673 0.6475563909774437 0.7460747157552788
