In [1]:
import sys
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

import itertools
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

from ncf.ncf import NCF
from ncf.dataset import Dataset as NCFDataset

from config import *

import warnings
warnings.filterwarnings('ignore')

# Record the interpreter and library versions this notebook was run with,
# so the saved results can be matched to an environment.
for template, version in (
    ("System version: {}", sys.version),
    ("Pandas version: {}", pd.__version__),
    ("Tensorflow version: {}", tf.__version__),
):
    print(template.format(version))

System version: 3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)]
Pandas version: 2.2.1
Tensorflow version: 2.16.1


# Cross Validation

In [2]:
# Number of folds; used both by the splitter and in the split file names.
n_splits = 5

# Training portion of the train/test split; the folds are cut from this frame.
train_csv_path = DATA_PATH + TRAIN_TEST_FOLDER + 'train.csv'
train = pd.read_csv(train_csv_path)

## Write Splits In Files

In [3]:
# Display the folder that the cross-validation split files are written to.
DATA_PATH + CROSS_VALIDATION_FOLDER

'C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation/'

In [4]:
from sklearn.model_selection import StratifiedKFold

# Write `n_splits` stratified train/test folds of `train` to CSV files.
#
# For each fold i two files are produced in DATA_PATH + CROSS_VALIDATION_FOLDER:
#   cross_validation_train_{i}_split_{n_splits}.csv
#   cross_validation_test_{i}_split_{n_splits}.csv
#
# Test rows whose conceptA (or conceptB) value never occurs in the same column of
# the training fold are moved into the training fold, so every concept in a test
# file also appears in the matching train file.
# NOTE(review): presumably needed because the NCF model can only score concepts
# it was fitted on -- confirm. As a side effect the folds are no longer exactly
# stratified or equally sized.

# Shuffle with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

X = train[['conceptA', 'conceptB']]
y = train['isPrerequisite']

# The folder constant already ends with '/', so file names are appended without
# a leading slash (the old "/cross_validation..." produced "cross_validation//...").
cv_folder = DATA_PATH + CROSS_VALIDATION_FOLDER

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    train_cv = train.iloc[train_index]
    test_cv = train.iloc[test_index]
    train_file = cv_folder + f"cross_validation_train_{i}_split_{n_splits}.csv"
    test_file = cv_folder + f"cross_validation_test_{i}_split_{n_splits}.csv"

    # Test rows whose concept is absent from the same column of the training fold.
    unseen_a = ~test_cv['conceptA'].isin(train_cv['conceptA'])
    unseen_b = ~test_cv['conceptB'].isin(train_cv['conceptB'])
    unseen = unseen_a | unseen_b

    # Move those rows from the test fold to the training fold. Filtering with the
    # mask builds new frames instead of dropping in place on a slice of `train`.
    train_cv = pd.concat([train_cv, test_cv[unseen]], axis=0)
    test_cv = test_cv[~unseen]

    train_cv.sort_values(by=['conceptA']).to_csv(train_file, index=False)
    test_cv.sort_values(by=['conceptA']).to_csv(test_file, index=False)

## Cross Validate

In [29]:
# Hyper-parameter grid for the cross-validation below. Each value is a list of
# candidates; the next cell expands the lists into every combination.
#
# Wider grids that were tried (restore a list to search it again):
#   n_factors     : [12, 16, 20]
#   layer_sizes   : [[4], [8, 4], [16, 8, 4]]
#   n_epochs      : [10, 20, 30]
#   batch_size    : [128, 256, 512]
#   learning_rate : [0.001, 0.01, 0.1]  or  [0.001, 0.0001, 0.00001]
#
# NOTE: the cross-validation log saved in this notebook lists learning rates
# 0.001, 0.0001 and 1e-05, i.e. it was produced with the second learning_rate
# grid above, not with the single value currently set here.
hyper_params = {
    'n_factors' : [16],
    'layer_sizes' : [[8, 4]],
    'n_epochs' : [20],
    'batch_size' : [256],
    'learning_rate' : [0.001]
}


In [26]:
# Expand the grid into every hyper-parameter combination.
# Names and candidate lists are both taken from `hyper_params`, in its own key
# order, so adding or removing a hyper-parameter there needs no change here
# (the previous version indexed the tuples by hard-coded position).
param_names = list(hyper_params)
combinations = list(itertools.product(*(hyper_params[name] for name in param_names)))

# One dict per combination, keyed by hyper-parameter name.
formatted_combinations = [dict(zip(param_names, combo)) for combo in combinations]

In [27]:
%%time

# Grid search over `formatted_combinations`.
# For every combination a NeuMF model is trained on each of the `n_splits`
# training folds and scored with F1 on the matching test fold; the combination
# with the highest mean F1 is kept in `best_combo` / `best_score`.
# `cv_results` keeps the mean F1 of every combination for later inspection.

# The folder constant already ends with '/', so file names are appended without
# a leading slash (avoids the "cross_validation//..." paths seen in the log).
cv_folder = DATA_PATH + CROSS_VALIDATION_FOLDER

best_combo = {}
best_score = 0
cv_results = []
for combo in formatted_combinations:
    print(combo)
    split_scores = []
    for i in range(n_splits):
        train_file = cv_folder + f"cross_validation_train_{i}_split_{n_splits}.csv"
        test_file = cv_folder + f"cross_validation_test_{i}_split_{n_splits}.csv"
        train_cv = NCFDataset(train_file=train_file, seed=SEED, col_user='conceptA', col_item='conceptB')

        model_cv = NCF(
            n_users=train_cv.n_users,
            n_items=train_cv.n_items,
            model_type="NeuMF",
            n_factors=combo['n_factors'],
            layer_sizes=combo['layer_sizes'],
            n_epochs=combo['n_epochs'],
            batch_size=combo['batch_size'],
            learning_rate=combo['learning_rate'],
            # Same value as n_epochs; the saved log shows one line (the last epoch) per fit.
            verbose=combo['n_epochs'],
            seed=SEED
        )

        model_cv.fit(train_cv)

        test_cv = pd.read_csv(test_file)
        # Score each test pair. Iterating the two columns directly avoids
        # DataFrame.iterrows(), which builds a Series for every row.
        predictions = [[concept_a, concept_b, model_cv.predict(concept_a, concept_b)]
                       for concept_a, concept_b in zip(test_cv['conceptA'], test_cv['conceptB'])]

        predictions = pd.DataFrame(predictions, columns=['conceptA', 'conceptB', 'isPrerequisite_pred'])
        # A pair is predicted positive when its score reaches the configured threshold.
        predictions['pred'] = (predictions['isPrerequisite_pred'] >= NCF_THRESHOLD).astype(int)
        score_split = f1_score(test_cv['isPrerequisite'], predictions['pred'])
        print(f'f1 - split {i}', score_split)
        split_scores.append(score_split)
    score = sum(split_scores) / n_splits
    print('split score', score)
    cv_results.append({**combo, 'mean_f1': score})
    # `not best_combo` makes sure a combination is selected even if every mean F1 is 0.
    if not best_combo or score > best_score:
        best_score = score
        best_combo = combo

INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_0_split_5.csv ...


{'n_factors': 16, 'layer_sizes': [8, 4], 'n_epochs': 20, 'batch_size': 256, 'learning_rate': 0.001}


INFO:ncf.ncf:Epoch 20 [2.54s]: train_loss = 0.042704 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_1_split_5.csv ...


f1 - split 0 0.7087307410124725


INFO:ncf.ncf:Epoch 20 [2.70s]: train_loss = 0.043457 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_2_split_5.csv ...


f1 - split 1 0.7296511627906976


INFO:ncf.ncf:Epoch 20 [2.68s]: train_loss = 0.041791 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_3_split_5.csv ...


f1 - split 2 0.7053571428571429


INFO:ncf.ncf:Epoch 20 [2.61s]: train_loss = 0.043992 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_4_split_5.csv ...


f1 - split 3 0.6977087952697709


INFO:ncf.ncf:Epoch 20 [2.59s]: train_loss = 0.040066 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_0_split_5.csv ...


f1 - split 4 0.7226277372262774
split score 0.7128151158312723
{'n_factors': 16, 'layer_sizes': [8, 4], 'n_epochs': 20, 'batch_size': 256, 'learning_rate': 0.0001}


INFO:ncf.ncf:Epoch 20 [2.54s]: train_loss = 0.170068 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_1_split_5.csv ...


f1 - split 0 0.25481798715203424


INFO:ncf.ncf:Epoch 20 [2.46s]: train_loss = 0.171096 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_2_split_5.csv ...


f1 - split 1 0.23669923995656894


INFO:ncf.ncf:Epoch 20 [2.93s]: train_loss = 0.171202 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_3_split_5.csv ...


f1 - split 2 0.24973089343379978


INFO:ncf.ncf:Epoch 20 [2.50s]: train_loss = 0.170975 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_4_split_5.csv ...


f1 - split 3 0.23102310231023102


INFO:ncf.ncf:Epoch 20 [2.53s]: train_loss = 0.171596 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_0_split_5.csv ...


f1 - split 4 0.22742474916387959
split score 0.23993919440330275
{'n_factors': 16, 'layer_sizes': [8, 4], 'n_epochs': 20, 'batch_size': 256, 'learning_rate': 1e-05}


INFO:ncf.ncf:Epoch 20 [2.58s]: train_loss = 0.658026 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_1_split_5.csv ...


f1 - split 0 0.4426812585499316


INFO:ncf.ncf:Epoch 20 [2.51s]: train_loss = 0.658020 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_2_split_5.csv ...


f1 - split 1 0.4419496166484118


INFO:ncf.ncf:Epoch 20 [2.56s]: train_loss = 0.658042 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_3_split_5.csv ...


f1 - split 2 0.4460987041632203


INFO:ncf.ncf:Epoch 20 [2.44s]: train_loss = 0.658216 
INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/cross_validation//cross_validation_train_4_split_5.csv ...


f1 - split 3 0.4362050163576881


INFO:ncf.ncf:Epoch 20 [2.60s]: train_loss = 0.658065 


f1 - split 4 0.4376899696048632
split score 0.440924913064823
CPU times: total: 14min 40s
Wall time: 13min 11s


In [28]:
# Best hyper-parameter combination and its mean F1 over the folds.
best_combo, best_score

({'n_factors': 16,
  'layer_sizes': [8, 4],
  'n_epochs': 20,
  'batch_size': 256,
  'learning_rate': 0.001},
 0.7128151158312723)

# Final Model Train

In [30]:
# Display the folder that holds train.csv and test.csv.
DATA_PATH + TRAIN_TEST_FOLDER 

'C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/train_test/'

In [32]:
# Paths of the train/test CSV files used for the final model.
train_test_dir = DATA_PATH + TRAIN_TEST_FOLDER
train_file = train_test_dir + 'train.csv'
test_file = train_test_dir + 'test.csv'

In [33]:
# Build the NCF dataset from the full training file; conceptA is passed as the
# user column and conceptB as the item column.
data = NCFDataset(
    train_file=train_file,
    col_user='conceptA',
    col_item='conceptB',
    seed=SEED,
)

INFO:ncf.dataset:Indexing C:/Users/Luka/Documents/University/bachelor-project-prerequisite-learning/data/train_test/train.csv ...


In [34]:
# User and item counts reported by the dataset; they are passed to the NCF model below.
data.n_users, data.n_items

(1384, 1600)

In [35]:
# Hyper-parameters of the final model. They mirror the best combination reported
# by the cross-validation above and must be updated by hand if that result changes.
final_params = {
    'n_factors': 16,
    'layer_sizes': [8, 4],
    'n_epochs': 20,
    'batch_size': 256,
    'learning_rate': 0.001,
}

model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    # Same value as n_epochs, as in the cross-validation cell.
    verbose=final_params['n_epochs'],
    seed=SEED,
    **final_params,
)

In [36]:
%%time

# Train the final model on the full training dataset.
model.fit(data)

INFO:ncf.ncf:Epoch 20 [4.42s]: train_loss = 0.036004 


CPU times: total: 59 s
Wall time: 58.8 s


## 3.4 Prediction and Evaluation

### 3.4.1 Prediction

Now that the model is fitted, we call `model.predict` on every (`conceptA`, `conceptB`) pair of the combined dataset to obtain a prerequisite score, and collect the scores in a dataframe. A pair is labelled as a prerequisite when its score is at least `NCF_THRESHOLD`:

In [37]:
# Combined dataset to score. The cells below read its conceptA, conceptB,
# isPrerequisite, dataset and _split_set columns.
df = pd.read_csv(DATA_PATH + UNITED_DATA_FOLDER + 'united_data_encoded_embeddings_split_set.csv')

In [39]:
# Score every (conceptA, conceptB) pair of `df` with the final model.
# Iterating the two columns directly avoids DataFrame.iterrows(), which builds a
# Series for every row.
predictions = [[concept_a, concept_b, model.predict(concept_a, concept_b)]
               for concept_a, concept_b in zip(df['conceptA'], df['conceptB'])]

predictions = pd.DataFrame(predictions, columns=['conceptA', 'conceptB', 'isPrerequisite_pred'])

# Copy the label and bookkeeping columns by position: `predictions` has one row
# per row of `df`, in the same order, so this stays correct even if `df` does not
# carry a default 0..n-1 index.
predictions['isPrerequisite'] = df['isPrerequisite'].to_numpy()
predictions['dataset'] = df['dataset'].to_numpy()
predictions['_split_set'] = df['_split_set'].to_numpy()

# Highest scores first; a pair is predicted positive when its score reaches the threshold.
sorted_predictions = predictions.sort_values(by='isPrerequisite_pred', ascending=False)
sorted_predictions['pred'] = (sorted_predictions['isPrerequisite_pred'] >= NCF_THRESHOLD).astype(int)

In [40]:
# Sanity check: the share of positive labels must be the same in the predictions
# frame and in `df` (the labels were copied over); the third value is the mean
# predicted score.
sorted_predictions['isPrerequisite'].mean(), df['isPrerequisite'].mean(), sorted_predictions['isPrerequisite_pred'].mean()

(0.2855126743744233, 0.2855126743744233, 0.2324678171464357)

In [41]:
# Number of scored pairs per source dataset.
sorted_predictions['dataset'].value_counts()

dataset
moocML     6712
al_cpl     6375
drive      2797
moocDSA    2539
Name: count, dtype: int64

In [42]:
# One frame of predictions per source dataset.
# NOTE(review): these frames are not referenced again in the visible cells.
source_dataset = sorted_predictions['dataset']
df_moocML = sorted_predictions.loc[source_dataset == 'moocML']
df_moocDSA = sorted_predictions.loc[source_dataset == 'moocDSA']
df_drive = sorted_predictions.loc[source_dataset == 'drive']
df_alcpl = sorted_predictions.loc[source_dataset == 'al_cpl']

In [43]:
# Keep only the rows marked as belonging to the held-out test split.
is_test_row = sorted_predictions['_split_set'] == 'test'
df_test = sorted_predictions.loc[is_test_row]

# Check Test Set

In [44]:
# Per-class precision / recall / F1 on the held-out test rows.
test_report = classification_report(df_test['isPrerequisite'], df_test['pred'])
print(test_report)

              precision    recall  f1-score   support

           0       0.87      0.96      0.91      2635
           1       0.87      0.65      0.74      1050

    accuracy                           0.87      3685
   macro avg       0.87      0.80      0.83      3685
weighted avg       0.87      0.87      0.87      3685



In [45]:
# Precision, recall and F1 of the positive class on the test rows, at full precision.
test_labels = df_test['isPrerequisite']
test_preds = df_test['pred']
print(
    precision_score(test_labels, test_preds),
    recall_score(test_labels, test_preds),
    f1_score(test_labels, test_preds),
)

0.8686224489795918 0.6485714285714286 0.742639040348964


### References
1. Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu & Tat-Seng Chua, Neural Collaborative Filtering, 2017, https://arxiv.org/abs/1708.05031

2. Official NCF implementation [Keras with Theano]: https://github.com/hexiangnan/neural_collaborative_filtering

3. Another NCF implementation [PyTorch]: https://github.com/LaceyChen17/neural-collaborative-filtering