In [None]:
from sys import argv
from numpy import array
from ml_models import Classifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
def data_preprocessor(fname):
    """Read a rule file and encode every rule as a binary feature vector.

    Each non-blank line of ``fname`` must have the form ``UA;OA;P``:
    ``UA`` and ``OA`` are comma-separated integer indices and ``P`` is an
    integer label (presumably user attributes, object attributes and a
    permission flag -- confirm against the data specification).  The indices
    in ``UA`` switch on positions of a 16-wide indicator vector, those in
    ``OA`` positions of a 20-wide one; the two vectors are concatenated into
    a single 36-element row.

    Parameters
    ----------
    fname : str
        Path of the text file to read.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y)`` where ``X`` holds one 36-element 0/1 row per rule and
        ``Y`` holds the matching integer labels.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly three ``;`` fields
        or contains a token that is not an integer.
    IndexError
        If an index does not fit into its indicator vector.
    """
    uvs, ovs = 16, 20  # widths of the UA and OA indicator vectors

    with open(fname, 'r') as f:
        # Blank lines (most commonly the empty remainder after a trailing
        # newline) are dropped: they cannot be unpacked into three fields
        # and would otherwise abort the whole load.
        rules = [line.strip() for line in f if line.strip()]

    X, Y = [], []
    for rule in rules:
        (UA, OA, P) = rule.split(';')
        temp_u, temp_o = [0] * uvs, [0] * ovs
        for i in UA.split(','):
            temp_u[int(i)] = 1
        for i in OA.split(','):
            temp_o[int(i)] = 1
        X.append(temp_u + temp_o)
        Y.append(int(P))
    return (array(X), array(Y))

In [None]:
def split_fold(nfolds, X, Y):
    """Yield progressively larger prefixes of ``(X, Y)``.

    Despite the name, the pieces are not disjoint folds: each yielded pair
    is ``(X[:end], Y[:end])`` with ``end`` growing by
    ``len(X) // (nfolds - 1)`` samples per step until it reaches ``len(X)``.
    The final pair therefore always covers the complete input.  When
    ``len(X)`` is not a multiple of ``nfolds - 1`` one extra, shorter step
    is produced for the remainder.

    Parameters
    ----------
    nfolds : int
        Controls the step size; the data is advanced in ``nfolds - 1``
        steps (plus a possible remainder step).  Must be at least 2.
    X, Y : sliceable sequences
        Samples and labels; only ``len(X)`` and prefix slicing are used.

    Yields
    ------
    tuple
        ``(X[:end], Y[:end])`` for each successive ``end``.

    Raises
    ------
    ValueError
        If ``nfolds`` is smaller than 2 (raised on first iteration).
    """
    steps = nfolds - 1
    if steps < 1:
        raise ValueError("nfolds must be at least 2, got %r" % (nfolds,))

    total = len(X)
    # Clamp to 1: with fewer samples than steps the integer division gives
    # 0, the cursor would never advance and the generator would never end.
    step = max(1, total // steps)

    end = 0
    while end < total:
        end = min(total, end + step)
        yield (X[:end], Y[:end])

In [None]:
# Identifiers of the classifiers to benchmark, in evaluation order.
models = [
    "SVM",
    "DT",
    "RF",
    "xgboost",
    "gradboost",
]
# Encoded feature matrix and label vector loaded from the rule file.
(X, Y) = data_preprocessor(fname="final_data.txt")

In [None]:
# Benchmark every entry of `models` with shuffled k-fold cross-validation.
# Within each outer fold the classifier is refit on growing prefixes of the
# training split (see split_fold) and scored on the held-out split each time.
N_SPLITS = 4   # number of outer cross-validation folds
N_PIECES = 4   # `nfolds` argument handed to split_fold
CV_SEED = 42   # fixed shuffle seed for KFold

for model in models:
    print(f"[INFO] Training model: {model}")
    # A fixed random_state makes reruns reproducible and gives every model
    # exactly the same train/test splits, so their scores are comparable.
    kfold_gen = KFold(n_splits=N_SPLITS, shuffle=True, random_state=CV_SEED).split(X, Y)
    # NOTE(review): a single Classifier instance is refit many times below;
    # this presumes fit() retrains from scratch -- confirm in ml_models.
    clf = Classifier(ctype=model)
    current_model_acc = []
    current_model_pre = []
    current_model_rec = []
    avg_acc, avg_pre, avg_rec, fold = 0, 0, 0, 0

    for (fold, (train, test)) in enumerate(kfold_gen, 1):
        for (piece, (x, y)) in enumerate(split_fold(N_PIECES, X[train], Y[train]), 1):
            clf.fit(x, y)
            ypred = clf.predict(X[test])
            acc = accuracy_score(Y[test], ypred)
            pre = precision_score(Y[test], ypred)
            rec = recall_score(Y[test], ypred)
            print("[INFO] Training on fold-%d/%d | accuracy: %.3f | precision: %.3f | recall %.3f | train set size: %d |" % (piece, fold, acc, pre, rec, x.shape[0]))

        # split_fold's last piece is the complete training split, so the
        # metrics still bound here are the ones for the fully trained model;
        # only those are recorded for this fold.
        current_model_acc.append(acc)
        current_model_pre.append(pre)
        current_model_rec.append(rec)

    # `fold` holds the number of completed outer folds after the loop.
    avg_acc = sum(current_model_acc) / fold
    avg_rec = sum(current_model_rec) / fold
    avg_pre = sum(current_model_pre) / fold
    print("\n[INFO] Average accuracy, precision, recall over all %d-fold: %.3f, %.3f, %.3f\n\n" % (fold, avg_acc, avg_pre, avg_rec))
    print(f"+{'-'*102}+\n")