In [23]:

import sys
import pandas as pd


sys.path.append("..")

import numpy as np
from nonconformist.base import ClassifierAdapter
from nonconformist.cp import IcpClassifier
from nonconformist.nc import ClassifierNc, NcFactory

from ex4.ada_boost import BoostedSeqTree
from ex5.util import parse_dataset

DATASET_PATH = "../datasets/pioneer.txt"  ## 160 lines
# DATASET_PATH = "../datasets/auslan2.txt"  ## 200 lines
# DATASET_PATH = "../datasets/context.txt"  ## 240 lines
# DATASET_PATH = "../datasets/aslbu.txt"  #### 424 lines
# DATASET_PATH = "../datasets/skating.txt"  ## 530 lines
# DATASET_PATH = "../datasets/reuters.txt"  # 1010 lines
# DATASET_PATH = "../datasets/webkb.txt"  ### 3667 lines
# DATASET_PATH = "../datasets/news.txt"  #### 4976 lines
# DATASET_PATH = "../datasets/unix.txt"  #### 5472 lines
ITERATIONS = 10

SIGNIFICANCE = 0.05

In [24]:

class MyClassifierAdapter(ClassifierAdapter):
    def __init__(self, model: BoostedSeqTree):
        super().__init__(model, None)

    def fit(self, x, y):
        self.model.fit(x, y, ITERATIONS)

    def predict(self, x):
        return np.array(
            [
                np.array(
                    [self.model.predict_prob(x, 1), self.model.predict_prob(x, -1)]
                )
                for x in x
            ]
        )


In [25]:
def train_test_split(df, ratio = 0.8):
    # consider each class separately
    classes = df["y"].unique()
    train = []
    test = []
    for c in classes:
        class_df = df[df["y"] == c]
        class_train = class_df.sample(frac=ratio)
        class_test = class_df.drop(class_train.index
        )
        train.append(class_train)
        test.append(class_test)

    return pd.concat(train), pd.concat(test)


In [26]:
def fit(df):
    PHI = []
    classes = df["y"].unique()

    for c in classes:
        print("class:", c)
        df_train = df.copy()
        df_train["y"] = df["y"].apply(lambda x: 1 if x == c else 0)

        # split the dataset into train and calibration sets
        df_t, df_c = train_test_split(df_train)

        # make sure that in the calibration set there are both positive and negative examples
        while df_c["y"].nunique() < 2 or df_t["y"].nunique() < 2:
            raise ValueError("Failed to split correctly")

        X = np.array([np.array([x for x in s]) for s in df_t["s"]])
        Y = np.array(df_t["y"])

        # train the model
        print(f"Training model for class {c}")
        bst = BoostedSeqTree()
        model = MyClassifierAdapter(bst)
        nc = ClassifierNc(model)
        icp = IcpClassifier(nc)

        X_c = np.array([np.array([x for x in s]) for s in df_c["s"]])
        Y_c = np.array(df_c["y"])

        print("Fitting Classifier")
        icp.fit(X, Y)

        print(f"Calibrating model for class {c}")
        icp.calibrate(X_c, Y_c)
        PHI.append(icp)

    return PHI

In [27]:
PHI = []
df = parse_dataset(DATASET_PATH)
classes = df["y"].unique()

# divide training and test sets
df_train, df_test = train_test_split(df)
X_test = df_test["s"].to_numpy()
Y_test = df_test["y"].to_numpy()

PHI = fit(df_train)

class: 1
Training model for class 1
Fitting Classifier


In [None]:
def predict(PHI, x):

    predictions = []
    for phi in PHI:
        p = phi.predict(x, significance=SIGNIFICANCE)
        predictions.append(p)

    return np.array(predictions)

In [None]:
test_data = np.array([np.array([x for x in s]) for s in X_test])

r = predict(PHI, test_data[0])
print(r)

[]
