In [1]:
import numpy as np
import pandas as pd

from config import train_file, test_file, PATH, SEED, ENCODED_SPLIT_PATH
from utils import *

from ncf.ncf import NCF
from ncf.dataset import Dataset as NCFDataset

from sklearn.metrics import recall_score, precision_score, f1_score, classification_report




In [2]:
def generate_pseudo_data(model, train_path):
    """Score unseen (conceptA, conceptB) pairs with ``model`` and build pseudo data.

    For every distinct ``conceptA`` in the CSV at ``train_path``, every
    ``conceptB`` value of that file that never occurs together with it is
    scored through ``model.predict``. The scored pairs, a per-``conceptA``
    table (summed ``isPrerequisite`` plus a sampled ``k`` column) and the
    loaded frame are then passed to ``unite_pos_neg``.

    Parameters
    ----------
    model
        Object exposing ``predict(users, items, is_list=True)`` that returns
        one score per (user, item) pair.
    train_path
        Anything ``pandas.read_csv`` accepts. The file must contain the
        columns ``conceptA``, ``conceptB`` and ``isPrerequisite``.

    Returns
    -------
    Whatever ``unite_pos_neg`` returns (presumably a DataFrame - TODO confirm).

    Notes
    -----
    Reseeds NumPy's global RNG with ``SEED`` right before ``k`` is drawn.
    """
    train = pd.read_csv(train_path)

    # Distinct items in order of first appearance. Hoisted out of the loop so
    # the per-user filter runs over the unique items instead of every row;
    # the resulting candidate order is the same as filtering the full column.
    all_items = pd.Series(train["conceptB"].unique())

    users, items, preds = [], [], []

    for user in train["conceptA"].unique():
        seen_items = train.loc[train["conceptA"] == user, "conceptB"].unique()
        candidates = list(all_items[~all_items.isin(seen_items)].to_numpy())

        # Nothing left to score for this user: do not call the model with
        # empty inputs.
        if not candidates:
            continue

        user_batch = [user] * len(candidates)
        users.extend(user_batch)
        items.extend(candidates)
        preds.extend(list(model.predict(user_batch, candidates, is_list=True)))

    pseudo_predictions = pd.DataFrame(
        data={"conceptA": users, "conceptB": items, "isPrerequisite_pred": preds}
    )
    # Per user, highest predicted score first.
    pseudo_predictions = pseudo_predictions.sort_values(
        by=["conceptA", "isPrerequisite_pred"], ascending=[True, False]
    )

    # Sum only the label column; summing every column first would also
    # aggregate the non-numeric ones for no benefit.
    vs = train.groupby("conceptA")["isPrerequisite"].sum().reset_index()

    # presumably biased_coin_toss draws from NumPy's global RNG, hence the
    # reseed immediately before it - TODO confirm against utils.
    np.random.seed(SEED)
    vs["k"] = vs["isPrerequisite"].apply(biased_coin_toss)

    return unite_pos_neg(pseudo_predictions, vs, train)

In [3]:
# NOTE(review): absolute, machine-specific path. `PATH` is imported from config
# in the first cell - consider deriving this location from it (TODO confirm
# what PATH points at) so the notebook runs on other machines.
train_path = 'C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/generated_data/akd_united_pseudo_data_1_0.csv'

In [4]:
# Load the CSV at `train_path` into a DataFrame.
train = pd.read_csv(train_path)

In [5]:
# Mean of the `isPrerequisite` column (the share of positive pairs if the
# column is a 0/1 label - TODO confirm).
train['isPrerequisite'].mean()

0.3137737174982431

In [6]:
# Build the NCF dataset from `train_file` (from config), with conceptA as the
# user column and conceptB as the item column.
# NOTE(review): this is a different file from `train_path`, which is what
# generate_pseudo_data later reads - confirm both files share the same concept ids.
data = NCFDataset(train_file=train_file, seed=SEED, col_user='conceptA', col_item='conceptB')

INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/generated_data/akd_train_42.csv ...


In [7]:
# Instantiate the "NeuMF" variant of NCF, sized to the indexed dataset.
model = NCF(
    # problem size, taken from the dataset built above
    n_users=data.n_users,
    n_items=data.n_items,
    # architecture
    model_type="NeuMF",
    n_factors=16,
    layer_sizes=[8, 4],  # previously 16, 8, 4
    # optimisation
    learning_rate=0.001,
    batch_size=256,
    n_epochs=20,
    # logging / reproducibility
    verbose=20,
    seed=SEED,
)















In [8]:
%%time

# Train the model on `data`; the cell magic on the first line reports the
# CPU and wall time of the whole cell.
model.fit(data)

INFO:ncf.ncf:Epoch 20 [2.58s]: train_loss = 0.035806 


CPU times: total: 1min 1s
Wall time: 57.5 s


In [9]:
# Score cut-off used to turn a predicted score into a 0/1 label.
# NOTE(review): 0.31 is close to the mean of `isPrerequisite` displayed earlier
# (0.3137) - presumably chosen to match it; confirm.
threshold = 0.31

df = pd.read_csv(ENCODED_SPLIT_PATH)

# Score all (conceptA, conceptB) pairs in one batched call, the same
# `is_list=True` form generate_pseudo_data uses, instead of one
# model.predict call per row through iterrows().
# The guard keeps an empty split from reaching the model with empty inputs.
pair_scores = (
    model.predict(df['conceptA'].tolist(), df['conceptB'].tolist(), is_list=True)
    if len(df) else []
)

# Same columns, in the same order, as before. Scores are stored as float64,
# matching the Python floats the per-row calls produced.
predictions = pd.DataFrame({
    'conceptA': df['conceptA'],
    'conceptB': df['conceptB'],
    'isPrerequisite_pred': np.asarray(pair_scores, dtype=float),
    'isPrerequisite': df['isPrerequisite'],
    'dataset': df['dataset'],
    '_split_set': df['_split_set'],
})

# Highest score first, then binarise against the threshold.
sorted_predictions = predictions.sort_values(by='isPrerequisite_pred', ascending=False)
sorted_predictions['pred'] = (sorted_predictions['isPrerequisite_pred'] >= threshold).astype(int)

In [10]:
# Keep only the rows whose `_split_set` is 'test' and report per-class metrics.
df_test = sorted_predictions.loc[lambda frame: frame['_split_set'] == 'test']
print(
    classification_report(
        y_true=df_test['isPrerequisite'],
        y_pred=df_test['pred'],
    )
)

              precision    recall  f1-score   support

           0       0.87      0.97      0.92      2652
           1       0.89      0.64      0.74      1064

    accuracy                           0.87      3716
   macro avg       0.88      0.80      0.83      3716
weighted avg       0.87      0.87      0.87      3716



In [11]:
# Run generate_pseudo_data with the trained model on the CSV at `train_path`.
pseudo_data = generate_pseudo_data(model, train_path)

In [12]:
# Display the generated frame (the rendered output is truncated by pandas).
pseudo_data

Unnamed: 0,conceptA,conceptB,isPrerequisite,dataset,file,conceptA_ind,conceptB_ind,fileB
0,0.0,1512.0,1,pseudo,al_cpl,386,490,al_cpl
7,1.0,1542.0,1,pseudo,drive,254,299,al_cpl
23,2.0,429.0,1,pseudo,mooc,256,16,mooc
35,3.0,277.0,1,pseudo,mooc,272,100,mooc
131,6.0,624.0,1,pseudo,al_cpl,72,391,al_cpl
...,...,...,...,...,...,...,...,...
17008,1370.0,220.0,0,pseudo,drive,105,528,drive
17008,1370.0,1266.0,0,pseudo,drive,163,567,drive
17015,1371.0,1013.0,0,pseudo,al_cpl,114,319,mooc
17038,1376.0,1306.0,0,pseudo,drive,294,698,drive
