# ZUM - Projekt

Plan badań
1. Modele zespołowe dla każdego z algorytmów (SVC, DecisionTreeClassifier,  CategoricalNB, GaussianNB)
2. Dla każdego z modeli zespołowych testy dla 5, 10, 50 i 100 modeli w zespole.
3. Analogiczne testy dla Bagging Classifier
4. Analogiczne testy dla Random Forest Classifier (liczba drzew)
5. Testy na algorytmach konwencjonalnych
6. Wszystko wyżej dla CV = 10
7. Wszystko wyżej powtórzone dla dwóch zbiorów danych.

**Dodatkowo:** różne opcje podziału atrybutów na modele w modelach zespołowych.

W sumie: $(4 \cdot 4 + 4 \cdot 4 + 4 + 4) \cdot 2 = 80$

Import potrzebnych narzędzi.

In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from IPython.display import display, HTML

Import przygotowanej klasy Ensemble.

In [2]:
from src.ensembles import Ensemble

Import funkcji pomocniczych do testów

In [3]:
from src.test_utils import *

## Human activity

In [4]:
# Shared seed; passed as `random_state` to most of the test_clf / test_ensemble calls below.
random_state = 3

In [5]:
# One minimum-category count (4) per feature column. The list is used below both
# by find_intervals (discretization) and as the min_categories of CategoricalNB.
# NOTE(review): 561 is presumably the number of columns in X_train.txt — confirm.
min_categories = [4] * 561

Załadowanie danych

In [6]:
def _digitize_columns(features, bin_edges):
    """Bin every column of `features` with np.digitize, using bin_edges[i] for column i."""
    binned_columns = [
        np.digitize(column, bins=bin_edges[i])
        for i, column in enumerate(features.T)
    ]
    # The list holds one binned array per column, so transpose back to rows-by-columns.
    return np.array(binned_columns).T


# Raw train/test split as shipped in the human_activity directory.
x_train = np.loadtxt("human_activity/X_train.txt", dtype=float)
x_test = np.loadtxt("human_activity/X_test.txt", dtype=float)
y_train = np.loadtxt("human_activity/y_train.txt", dtype=int)
y_test = np.loadtxt("human_activity/y_test.txt", dtype=int)

# Full data set (train rows followed by test rows).
x = np.vstack([x_train, x_test])
y = np.append(y_train, y_test)

# Discretized copies of the features: the intervals are derived from the training
# data only and then applied to both the training and the test split.
intervals = find_intervals(x_train, min_categories)
x_train_discrete = _digitize_columns(x_train, intervals)
x_test_discrete = _digitize_columns(x_test, intervals)
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

### Testy na pojedynczych modelach

SVC

In [9]:
# Baseline: a single SVC on the continuous features, seeded with the shared random_state.
svc_results = test_clf(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=SVC,
    random_state=random_state,
)

In [10]:
# Render the SVC results table as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(svc_results.to_html(escape=False)))

Unnamed: 0,classifier:,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,SVC,0.95,0.998,0.99,0.952,0.949,"[[488, 5, 3, 0, 0, 0],  [20, 451, 0, 0, 0, 0],  [10, 26, 384, 0, 0, 0],  [0, 2, 0, 438, 51, 0],  [0, 0, 0, 29, 503, 0],  [0, 0, 0, 0, 0, 537]]","0.971, 0.931, 0.879 0.970, 0.974, 0.961 0.953, 0.968",0.951


Gaussian Naive Bayes

In [11]:
# Baseline: a single GaussianNB on the continuous features.
# No random_state is passed here, unlike the SVC and decision-tree runs.
gaussNB_results = test_clf(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=GaussianNB,
)

In [12]:
# Render the GaussianNB results table as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(gaussNB_results.to_html(escape=False)))

Unnamed: 0,classifier:,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,Gaussian NB,0.77,0.958,0.788,0.792,0.769,"[[416, 38, 42, 0, 0, 0],  [9, 451, 11, 0, 0, 0],  [80, 83, 257, 0, 0, 0],  [0, 7, 0, 368, 111, 5],  [0, 15, 0, 54, 455, 8],  [0, 3, 0, 211, 0, 323]]","0.765, 0.686, 0.577 0.683, 0.704, 0.786 0.815, 0.698",0.726


Naive Bayes classifier for categorical features

In [13]:
# Baseline: a single CategoricalNB, which is run on the discretized features.
# min_categories is handed to test_clf as a keyword argument
# (presumably forwarded to the CategoricalNB constructor — confirm in src.test_utils).
catNB_results = test_clf(
    x_discrete, y,
    x_train_discrete, y_train,
    x_test_discrete, y_test,
    classifier=CategoricalNB,
    min_categories=min_categories,
)

In [14]:
# Render the CategoricalNB results table as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(catNB_results.to_html(escape=False)))

Unnamed: 0,classifier:,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,Categorical NB,0.87,0.981,0.898,0.871,0.862,"[[452, 8, 36, 0, 0, 0],  [24, 428, 19, 0, 0, 0],  [84, 63, 273, 0, 0, 0],  [0, 2, 0, 377, 111, 1],  [0, 1, 0, 33, 498, 0],  [0, 0, 0, 0, 0, 537]]","0.868, 0.824, 0.779 0.806, 0.811, 0.844 0.896, 0.832",0.84


Decision Tree Classifier

In [15]:
# Baseline: a single DecisionTreeClassifier on the continuous features,
# seeded with the shared random_state.
dt_results = test_clf(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=DecisionTreeClassifier,
    random_state=random_state,
)

In [16]:
# Render the decision-tree results table as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(dt_results.to_html(escape=False)))

Unnamed: 0,classifier:,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,Decision Tree,0.861,0.914,0.764,0.861,0.857,"[[451, 26, 19, 0, 0, 0],  [66, 374, 31, 0, 0, 0],  [25, 57, 338, 0, 0, 0],  [0, 0, 0, 374, 117, 0],  [0, 0, 0, 69, 463, 0],  [0, 0, 0, 0, 0, 537]]","0.901, 0.845, 0.800 0.873, 0.891, 0.879 0.888, 0.884",0.863


In [17]:
# Stack the four single-model result tables into one frame with a fresh 0..n-1 index.
single_model_results_df = pd.concat(
    [
        svc_results,
        dt_results,
        gaussNB_results,
        catNB_results,
    ],
    ignore_index=True,
)

In [19]:
# Render the combined single-model results as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(single_model_results_df.to_html(escape=False)))

Unnamed: 0,classifier:,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,SVC,0.95,0.998,0.99,0.952,0.949,"[[488, 5, 3, 0, 0, 0],  [20, 451, 0, 0, 0, 0],  [10, 26, 384, 0, 0, 0],  [0, 2, 0, 438, 51, 0],  [0, 0, 0, 29, 503, 0],  [0, 0, 0, 0, 0, 537]]","0.971, 0.931, 0.879 0.970, 0.974, 0.961 0.953, 0.968",0.951
1,Decision Tree,0.861,0.914,0.764,0.861,0.857,"[[451, 26, 19, 0, 0, 0],  [66, 374, 31, 0, 0, 0],  [25, 57, 338, 0, 0, 0],  [0, 0, 0, 374, 117, 0],  [0, 0, 0, 69, 463, 0],  [0, 0, 0, 0, 0, 537]]","0.901, 0.845, 0.800 0.873, 0.891, 0.879 0.888, 0.884",0.863
2,Gaussian NB,0.77,0.958,0.788,0.792,0.769,"[[416, 38, 42, 0, 0, 0],  [9, 451, 11, 0, 0, 0],  [80, 83, 257, 0, 0, 0],  [0, 7, 0, 368, 111, 5],  [0, 15, 0, 54, 455, 8],  [0, 3, 0, 211, 0, 323]]","0.765, 0.686, 0.577 0.683, 0.704, 0.786 0.815, 0.698",0.726
3,Categorical NB,0.87,0.981,0.898,0.871,0.862,"[[452, 8, 36, 0, 0, 0],  [24, 428, 19, 0, 0, 0],  [84, 63, 273, 0, 0, 0],  [0, 2, 0, 377, 111, 1],  [0, 1, 0, 33, 498, 0],  [0, 0, 0, 0, 0, 537]]","0.868, 0.824, 0.779 0.806, 0.811, 0.844 0.896, 0.832",0.84


### Testy na modelach zespołowych z biblioteki scikit-learn
(n - liczba modeli w modelu zespołowym)

Random Forest Classifier

In [7]:
# scikit-learn RandomForestClassifier on the continuous features.
# Two max_features settings are tried with two forest sizes; no base model is given (model=None).
# NOTE(review): the plan at the top of the notebook lists 5, 10, 50 and 100 estimators;
# only 5 and 10 are run here.
rf_results = test_ensemble(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=RandomForestClassifier,
    max_features_values=[1.0, "sqrt"],
    model=None,
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [8]:
# Render the random-forest results table as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(rf_results.to_html(escape=False)))

Unnamed: 0,Model,n_estimators,max_features,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,Random Forest,5,1.0,0.877,0.97,0.9,0.878,0.872,"[[473, 10, 13, 0, 0, 0],  [76, 368, 27, 0, 0, 0],  [22, 59, 339, 0, 0, 0],  [0, 0, 0, 389, 102, 0],  [0, 0, 0, 54, 478, 0],  [0, 0, 0, 0, 0, 537]]","0.918, 0.888, 0.825 0.932, 0.939, 0.898 0.896, 0.924",0.896
1,Random Forest,10,1.0,0.879,0.981,0.927,0.88,0.875,"[[477, 8, 11, 0, 0, 0],  [89, 361, 21, 0, 0, 0],  [12, 56, 352, 0, 0, 0],  [0, 0, 0, 395, 96, 0],  [0, 0, 0, 65, 467, 0],  [0, 0, 0, 0, 0, 537]]","0.917, 0.903, 0.826 0.940, 0.947, 0.909 0.899, 0.924",0.904
2,Random Forest,5,sqrt,0.875,0.981,0.917,0.88,0.869,"[[455, 32, 9, 0, 0, 0],  [90, 369, 12, 0, 0, 0],  [43, 57, 320, 0, 0, 0],  [0, 2, 0, 402, 87, 0],  [0, 0, 0, 36, 496, 0],  [0, 0, 0, 0, 0, 537]]","0.920, 0.907, 0.842 0.873, 0.938, 0.919 0.920, 0.925",0.906
3,Random Forest,10,sqrt,0.899,0.99,0.951,0.9,0.895,"[[467, 19, 10, 0, 0, 0],  [61, 398, 12, 0, 0, 0],  [28, 52, 340, 0, 0, 0],  [0, 0, 0, 434, 57, 0],  [0, 0, 0, 59, 473, 0],  [0, 0, 0, 0, 0, 537]]","0.928, 0.925, 0.855 0.888, 0.953, 0.938 0.936, 0.941",0.922


Bagging Classifier

In [None]:
# scikit-learn BaggingClassifier with an SVC base estimator, on the continuous features.
# The base model is passed as an instance; probability=True makes SVC expose predict_proba
# (presumably needed for the ROC/PR AUC metrics — confirm in src.test_utils).
bag_svc_results = test_ensemble(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=BaggingClassifier,
    model=SVC(probability=True),
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Render the bagged-SVC results table as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(bag_svc_results.to_html(escape=False)))

In [None]:
# scikit-learn BaggingClassifier with a DecisionTreeClassifier base estimator
# (passed as an instance), on the continuous features.
bag_dt_results = test_ensemble(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=BaggingClassifier,
    model=DecisionTreeClassifier(),
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Render the bagged-decision-tree results table as HTML; escape=False leaves any HTML markup inside cells unescaped.
display(HTML(bag_dt_results.to_html(escape=False)))

In [None]:
# scikit-learn BaggingClassifier with a CategoricalNB base estimator, on the discretized features.
# The base model instance is built with the same min_categories list used for discretization.
bag_catNB_results = test_ensemble(
    x_discrete, y,
    x_train_discrete, y_train,
    x_test_discrete, y_test,
    classifier=BaggingClassifier,
    model=CategoricalNB(min_categories=min_categories),
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Print a caption, then render the bagged-CategoricalNB results table as HTML
# (escape=False leaves any HTML markup inside cells unescaped).
print("Bagging Categorical Naive Bayes:")
display(HTML(bag_catNB_results.to_html(escape=False)))

In [None]:
# scikit-learn BaggingClassifier with a GaussianNB base estimator
# (passed as an instance), on the continuous features.
bag_gaussNB_results = test_ensemble(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=BaggingClassifier,
    model=GaussianNB(),
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Print a caption, then render the bagged-GaussianNB results table as HTML
# (escape=False leaves any HTML markup inside cells unescaped).
print("Bagging Gaussian Naive Bayes:")
display(HTML(bag_gaussNB_results.to_html(escape=False)))

### Testy na przygotowanej implementacji modeli zespołowych

Model zespołowy modeli SVC

In [None]:
# Project Ensemble class with SVC base models, on the continuous features.
# Here the base model is passed as a class, whereas the BaggingClassifier runs pass an instance.
# NOTE(review): the plan at the top of the notebook lists 5, 10, 50 and 100 models per ensemble;
# only 5 and 10 are run here.
ens_svc_results = test_ensemble(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=Ensemble,
    model=SVC,
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Print a caption, then render the Ensemble-of-SVC results table as HTML
# (escape=False leaves any HTML markup inside cells unescaped).
print("Ensemble SVC:")
display(HTML(ens_svc_results.to_html(escape=False)))

Model zespołowy modeli Gaussian Naive Bayes

In [None]:
# Project Ensemble class with GaussianNB base models (passed as a class), on the continuous features.
ens_gaussNB_results = test_ensemble(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=Ensemble,
    model=GaussianNB,
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Print a caption, then render the Ensemble-of-GaussianNB results table as HTML
# (escape=False leaves any HTML markup inside cells unescaped).
print("Ensemble Gaussian Naive Bayes:")
display(HTML(ens_gaussNB_results.to_html(escape=False)))

Model zespołowy modeli Naive Bayes for categorical features

In [None]:
# Project Ensemble class with CategoricalNB base models (passed as a class), on the discretized features.
# NOTE(review): unlike the single-model and bagging CategoricalNB runs, min_categories is not
# passed here — confirm whether Ensemble / test_ensemble can forward it to the base models.
ens_catNB_results = test_ensemble(
    x_discrete, y,
    x_train_discrete, y_train,
    x_test_discrete, y_test,
    classifier=Ensemble,
    model=CategoricalNB,
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Print a caption, then render the Ensemble-of-CategoricalNB results table as HTML
# (escape=False leaves any HTML markup inside cells unescaped).
print("Ensemble Categorical Naive Bayes:")
display(HTML(ens_catNB_results.to_html(escape=False)))

Model zespołowy modeli Decision Tree Classifier

In [None]:
# Project Ensemble class with DecisionTreeClassifier base models (passed as a class),
# on the continuous features.
# Uses the shared notebook seed instead of a hard-coded 4, so this run is seeded
# the same way as every other seeded experiment in the notebook.
ens_dt_results = test_ensemble(
    x, y,
    x_train, y_train,
    x_test, y_test,
    classifier=Ensemble,
    model=DecisionTreeClassifier,
    n_estimators_values=[5, 10],
    random_state=random_state,
)

In [None]:
# Print a caption, then render the Ensemble-of-decision-trees results table as HTML
# (escape=False leaves any HTML markup inside cells unescaped).
print("Ensemble Decision Tree:")
display(HTML(ens_dt_results.to_html(escape=False)))