In [1]:

import sys
import pandas as pd


sys.path.append("..")
from sklearn.model_selection import train_test_split

import numpy as np
from nonconformist.base import ClassifierAdapter
from nonconformist.cp import IcpClassifier
from nonconformist.nc import ClassifierNc, NcFactory

from ex4.ada_boost import BoostedSeqTree
from ex5.util import parse_dataset

DATASET_PATH = "../datasets/pioneer.txt"  ## 160 lines
# DATASET_PATH = "../datasets/auslan2.txt"  ## 200 lines
#DATASET_PATH = "../datasets/context.txt"  ## 240 lines
# DATASET_PATH = "../datasets/aslbu.txt"  #### 424 lines
# DATASET_PATH = "../datasets/skating.txt"  ## 530 lines
#DATASET_PATH = "../datasets/reuters.txt"  # 1010 lines
# DATASET_PATH = "../datasets/webkb.txt"  ### 3667 lines
# DATASET_PATH = "../datasets/news.txt"  #### 4976 lines
# DATASET_PATH = "../datasets/unix.txt"  #### 5472 lines
ITERATIONS = 10

SIGNIFICANCE = 0.05

In [2]:

class MyClassifierAdapter(ClassifierAdapter):
    def __init__(self, model: BoostedSeqTree):
        super().__init__(model, None)

    def fit(self, x, y):
        self.model.fit(x, y, ITERATIONS)

    def predict(self, x):
        return np.array(
            [
                np.array(
                    [self.model.predict_prob(x, 1), self.model.predict_prob(x, -1)]
                )
                for x in x
            ]
        )


In [3]:
def train_test_split(dataset, ratio = 0.8):
    # consider each class separately
    classes = dataset[:, 1].unique()
    X_train = []
    X_test = []
    Y_train = []
    Y_test = []
    for c in classes:
        data = dataset[dataset[:, 1] == c]
        x_train, x_test = np.split(data, [int(ratio * len(data))])
        X_train.append(x_train)
        X_test.append(x_test)
        Y_train.append(np.full(len(x_train), c))
        Y_test.append(np.full(len(x_test), c))
    
    X_train = np.concatenate(X_train)
    X_test = np.concatenate(X_test)
    Y_train = np.concatenate(Y_train)
    Y_test = np.concatenate(Y_test)
    return X_train, X_test, Y_train, Y_test



In [4]:
def fit(X_train, Y_train):
    PHI = {}
    bsts = {}
    classes = X_train["y"].unique()

    for c in classes:
        print("class:", c)
        df_train = X_train.copy()
        df_train["y"] = X_train["y"].apply(lambda x: 1 if x == c else 0)

        # split the dataset into train and calibration sets
        df_t, df_c = train_test_split(df_train)

        # make sure that in the calibration set there are both positive and negative examples
        if df_c["y"].nunique() < 2 or df_t["y"].nunique() < 2:
            raise ValueError("Failed to split correctly")

        X = np.array([np.array([x for x in s]) for s in df_t["s"]])
        Y = np.array(df_t["y"])

        # train the model
        print(f"Training model for class {c}")
        bst = BoostedSeqTree()
        model = MyClassifierAdapter(bst)
        nc = ClassifierNc(model)
        icp = IcpClassifier(nc)

        X_c = np.array([np.array([x for x in s]) for s in df_c["s"]])
        Y_c = np.array(df_c["y"])

        print("Fitting Classifier")
        icp.fit(X, Y)

        print(f"Calibrating model for class {c}")
        icp.calibrate(X_c, Y_c)
        bsts[c] = bst
        PHI[c] = icp

    return PHI, bsts

In [5]:
dataset = parse_dataset(DATASET_PATH)
classes = dataset[:, 1].unique()
print(f"Classes: {classes}")

# divide training and test sets
X_test, X_train, Y_test, Y_train = train_test_split(dataset)

PHI, bsts = fit(X_train, Y_train)

Classes: ['1' '2' '3' '4' '5']
class: 1
Training model for class 1
Fitting Classifier
Calibrating model for class 1
class: 2
Training model for class 2
Fitting Classifier
Calibrating model for class 2
class: 3
Training model for class 3
Fitting Classifier
Calibrating model for class 3
class: 4
Training model for class 4
Fitting Classifier
Calibrating model for class 4
class: 5
Training model for class 5
Fitting Classifier
Calibrating model for class 5


In [6]:
# save to pickle the models
import pickle

with open(f"models_{DATASET_PATH.split('/')[-1]}.pkl", "wb") as f:
    pickle.dump((PHI, bsts), f)

f.close()

AttributeError: Can't pickle local object 'BaseIcp.__init__.<locals>.<lambda>'

In [None]:
def predict(x):
    predictions = []
    for c, phi in PHI.items():
        print(f"Predicting for class {c}")
        p = phi.predict(x, significance=SIGNIFICANCE)
        predictions.append(p)
        print(f"Conformal prediction: {p}")

        print(bsts[c])
        cp = bsts[c].predict(x[0])
        print(f"Class prediction: {cp}")

        ppp = bsts[c].predict_prob(x[0], 1)
        print(f"Probability prediction positive: {ppp}")

        ppn = bsts[c].predict_prob(x[0], -1)
        print(f"Probability prediction negative: {ppn}")

    return np.array(predictions)

In [None]:
test_data = np.array([np.array([x for x in s]) for s in X_test])
print(test_data.shape)

for idx, i in enumerate(test_data):
    print("###########################################")
    print("Predicting", i)
    predict(np.array([i]))
    print("real:", Y_test[idx])
    print("###########################################")