In [49]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
from src.ensembles import Ensemble

In [3]:
def encode_column(column: np.ndarray):
    """Map the labels in ``column`` to integer codes.

    A fresh ``LabelEncoder`` is fitted on ``column`` and applied to it right
    away. The encoder is discarded afterwards, so the label -> code mapping
    cannot be reused or inverted by the caller.
    """
    return LabelEncoder().fit_transform(column)


def find_intervals(x_train, group_vector):
    """Compute equal-width bin edges used to discretize each feature column.

    For column ``i`` the observed range ``[min, max]`` is divided into
    ``group_vector[i]`` sections of equal width, and the
    ``group_vector[i] - 1`` interior boundaries are returned. The result for
    a column can be passed directly as ``bins`` to ``np.digitize``.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Data whose per-column range defines the sections. NaN entries are
        ignored when determining the range.
    group_vector : sequence of int
        Number of sections per feature; every entry must be at least 1.

    Returns
    -------
    numpy.ndarray
        A 2-D float array with one row of edges per entry of
        ``group_vector`` when all entries are equal; otherwise a 1-D object
        array whose ``i``-th element is the 1-D edge array of feature ``i``.
        In both cases ``result[i]`` gives the edges of feature ``i``.

    Raises
    ------
    ValueError
        If ``x_train`` is not 2-D or a section count is smaller than 1.
    IndexError
        If ``x_train`` has more columns than ``group_vector`` has entries.
    """
    groups = list(group_vector)
    if any(k < 1 for k in groups):
        raise ValueError("every entry of group_vector must be at least 1")

    data = np.asarray(x_train, dtype=float)
    if data.ndim != 2:
        raise ValueError("x_train must be a 2-D array of shape (n_samples, n_features)")

    # Entries of group_vector without a matching column keep all-zero edges.
    edges = [np.zeros(k - 1) for k in groups]

    for i, features in enumerate(data.T):
        sections = groups[i]
        # nanmin/nanmax: a NaN must not decide the range (the builtin
        # min/max give a position-dependent result when NaN is present).
        low = np.nanmin(features)
        high = np.nanmax(features)
        section_size = (high - low) / sections
        edges[i] = low + section_size * np.arange(1, sections)

    # Equal section counts pack into a regular 2-D array. Differing counts
    # are ragged and need an explicit object array, because np.array() on
    # ragged input raises on recent NumPy versions.
    if len({len(e) for e in edges}) <= 1:
        return np.array(edges)

    ragged = np.empty(len(edges), dtype=object)
    for i, e in enumerate(edges):
        ragged[i] = e
    return ragged

In [4]:
# Seed passed as `random_state` to the train/test splits and to the seeded
# estimators in this notebook.
random_state = 3

# Breast Cancer

In [12]:
# Load the scikit-learn breast cancer dataset: feature matrix and class labels.
breast_cancer_data = load_breast_cancer()

x = breast_cancer_data.data
y = breast_cancer_data.target

# Hold out 10% of the samples for evaluation; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=random_state
)

In [13]:
# Baseline: one Gaussian naive Bayes model fitted on the training split.
clf = GaussianNB().fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)

# Same base learner wrapped in the project's Ensemble
# (second argument 10 -- presumably the number of members; confirm in src.ensembles).
clf_ens = Ensemble(GaussianNB, 10, random_state=random_state)
clf_ens.fit(x_train, y_train)
score = clf_ens.score(x_test, y_test)
print(score)

0.9298245614035088
0.9122807017543859


In [14]:
def discretize(features, bin_edges):
    """Replace every column of ``features`` by its bin index.

    ``bin_edges[i]`` holds the edges for column ``i`` (as produced by
    ``find_intervals``). The result has the same shape as ``features``.
    """
    return np.array(
        [np.digitize(column, bins=bin_edges[i]) for i, column in enumerate(features.T)]
    ).T


# One entry per feature: every column is cut into 4 equal-width sections,
# so the discrete codes produced by np.digitize are 0..3.
bins_per_feature = [4] * x_train.shape[1]

# Edges come from the training split only and are reused for the test split.
intervals = find_intervals(x_train, bins_per_feature)
x_train_discrete = discretize(x_train, intervals)
x_test_discrete = discretize(x_test, intervals)

# Baseline. It receives the same min_categories as the ensemble below so both
# models are compared on the same category space, even if some bin happens to
# be empty in the training split.
clf = CategoricalNB(min_categories=bins_per_feature)
clf.fit(x_train_discrete, y_train)
score = clf.score(x_test_discrete, y_test)
print(score)

clf_ens = Ensemble(
    CategoricalNB, 10, random_state=random_state, min_categories=bins_per_feature
)
clf_ens.fit(x_train_discrete, y_train)
score = clf_ens.score(x_test_discrete, y_test)
print(score)


0.9298245614035088
0.9298245614035088


# Flights

In [40]:
# Read both CSV files and merge them; the data is re-split further down.
train = pd.read_csv("flights/train.csv", delimiter=",")
test = pd.read_csv("flights/test.csv", delimiter=",")
# ignore_index: the two files would otherwise contribute duplicated row labels.
dataset = pd.concat([train, test], ignore_index=True)
# Drop unnamed columns (typically a saved index) and the "id" column.
dataset = dataset.loc[:, ~dataset.columns.str.contains('^Unnamed')].drop("id", axis=1)
for column in ["Gender", "Customer Type", "Type of Travel", "Class", "satisfaction"]:
    dataset[column] = encode_column(dataset[column])


x = dataset.drop("satisfaction", axis=1)
y = dataset["satisfaction"]

# Only 1% of the rows are used for training; the remaining 99% are the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.99, random_state=random_state
)

# Numeric columns that are discretized into 4 equal-width sections each.
continuous_columns = ["Age", "Flight Distance", "Departure Delay in Minutes", "Arrival Delay in Minutes"]
intervals = find_intervals(np.array(x_train[continuous_columns]), [4] * len(continuous_columns))

# Assign into explicit copies: the frames returned by train_test_split are
# selections of `x`, and writing columns into them can trigger pandas'
# SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()
for i, column in enumerate(continuous_columns):
    # Edges come from the training split and are reused for the test split.
    x_train[column] = np.digitize(x_train[column], bins=intervals[i])
    x_test[column] = np.digitize(x_test[column], bins=intervals[i])

x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [52]:

def report_binary_metrics(y_true, y_pred):
    """Print accuracy, confusion matrix and AUC; return the ROC arrays.

    NOTE(review): ``y_pred`` holds hard class labels, so the ROC curve has a
    single operating point and the printed AUC equals the balanced accuracy.
    Consider passing scores (e.g. from ``predict_proba``) if a ranking-based
    AUC is wanted.
    """
    print(sum(y_pred == y_true) / len(y_true))
    print(confusion_matrix(y_true, y_pred))
    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
    print(f"AUC: {auc(fpr, tpr)}")
    return fpr, tpr, thresholds


# Minimum number of categories per feature column, shared by both models.
flights_min_categories = [2, 2, 4, 2, 3, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 4, 4]

# Baseline: a single categorical naive Bayes model.
clf = CategoricalNB(min_categories=flights_min_categories)
clf.fit(x_train, y_train)
prediction = clf.predict(x_test)
fpr, tpr, thresholds = report_binary_metrics(y_test, prediction)


# Same base learner inside the project's Ensemble (second argument 100).
clf_ens = Ensemble(
    CategoricalNB, 100, random_state=random_state, min_categories=flights_min_categories
)
clf_ens.fit(x_train, y_train)
prediction = clf_ens.predict(x_test)
fpr, tpr, thresholds = report_binary_metrics(y_test, prediction)


0.8883514022180399
[[66269  6448]
 [ 7908 47957]]
AUC: 0.884885969669753
0.8556407584265294
[[67951  4766]
 [13796 42069]]
AUC: 0.8437528837554409


In [29]:

# Baseline: a single SVC with scikit-learn's default settings.
clf = SVC().fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)


# Same base learner inside the project's Ensemble (second argument 10).
clf_ens = Ensemble(SVC, 10, random_state=random_state)
clf_ens.fit(x_train, y_train)
score = clf_ens.score(x_test, y_test)
print(score)

0.9021169370518424
0.8174550092547946


# Human activity

In [34]:
# Read the files as shipped (features first, then labels) ...
x_parts = (
    np.loadtxt("human_activity/X_train.txt", dtype=float),
    np.loadtxt("human_activity/X_test.txt", dtype=float),
)
y_parts = (
    np.loadtxt("human_activity/y_train.txt", dtype=int),
    np.loadtxt("human_activity/y_test.txt", dtype=int),
)

# ... merge them into one data set ...
x = np.vstack(x_parts)
y = np.append(y_parts[0], y_parts[1])

# ... and re-split it: 1% of the rows for training, 99% for testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.99, random_state=random_state
)

In [39]:

# Baseline: a single decision tree. It is seeded like the other estimators in
# this notebook; an unseeded tree can pick different splits between runs, so
# the printed score would not be reproducible.
clf = DecisionTreeClassifier(random_state=random_state)
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)


# Same base learner inside the project's Ensemble (second argument 100,
# max_attributes=300 -- see src.ensembles for their exact meaning).
clf_ens = Ensemble(
    DecisionTreeClassifier, 100, max_attributes=300, random_state=random_state
)
clf_ens.fit(x_train, y_train)
score = clf_ens.score(x_test, y_test)
print(score)

0.803471609296852
0.8538785917426694


In [27]:
# Reference: scikit-learn's random forest with 10 trees.
clf = RandomForestClassifier(n_estimators=10, random_state=random_state)
clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)

# Project Ensemble of decision trees (second argument 10) for comparison.
clf_ens = Ensemble(DecisionTreeClassifier, 10, random_state=random_state)
clf_ens.fit(x_train, y_train)
score = clf_ens.score(x_test, y_test)
print(score)

0.8988802171700034
0.8693586698337292
