# ZUM - Projekt

Import potrzebnych narzędzi.

In [122]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, average_precision_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

Import przygotowanej klasy Ensemble.

In [21]:
from src.ensembles import Ensemble

Plan badań
1. Modele zespołowe dla każdego z algorytmów (SVC, DecisionTreeClassifier,  CategoricalNB, GaussianNB)
2. Dla każdego z modelów zespołowych testy dla 5, 10, 50, 100 modeli w zespole.
3. Analogiczne testy dla Bagging Classifier
4. Analogiczne testy dla Random Forest Classifier (liczba drzew)
5. Testy na algorytmach konwencjonalnych
6. Wszystko wyżej dla CV = 10
7. Wszystko wyżej powtórzone dla dwóch zbiorów danych.

** dodatkowo różne opcje podzialu atrybutow na modele w modelach zespolowych

w sumie (4*4 + 4*4 + 4 + 4)*2 = 80

## Human activity

In [22]:
random_state = 3

Załadowanie danych

In [125]:
x_train = np.loadtxt("human_activity/X_train.txt", dtype = float)
x_test = np.loadtxt("human_activity/X_test.txt", dtype = float)
y_train = np.loadtxt("human_activity/y_train.txt", dtype = int)
y_test = np.loadtxt("human_activity/y_test.txt", dtype = int)

x = np.vstack([x_train, x_test])
y = np.append(y_train, y_test)

#x_train, x_test, y_train, y_test = train_test_split(
#    x, y, test_size=0.99, random_state=random_state
#)

[1 2 3 4 5 6]


Funkcja pomocnicza do dyskretyzacji danych

In [24]:
def find_intervals(x_train, group_vector):  # auxilary values for data disrcetization
    intervals = np.array([np.zeros(i - 1) for i in group_vector])

    for i, features in enumerate(x_train.T):
        max_value = max(features)
        min_value = min(features)
        section_size = (max_value - min_value) / group_vector[i]
        intervals[i] = np.array(
            [min_value + section_size * j for j in range(1, group_vector[i])]
        )
    return intervals

Funkcje do testowania klasyfikatorów

In [136]:
def test_classifier(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train, y_train)
    
    accuracy_score = clf.score(x_test, y_test)
    y_pred_proba = clf.predict_proba(x_test)
    roc_auc = roc_auc_score(y_test, y_pred_proba, average='macro', multi_class='ovr')
    pr_auc = average_precision_score(y_test, y_pred_proba, average='macro')
    y_pred = clf.predict(x_test)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return accuracy_score, roc_auc, pr_auc, precision, recall, conf_matrix

In [135]:
def test_cv_classifier(clf, x, y, n=10):
   scores = cross_val_score(clf, x, y, cv=n)
   scores_mean = scores.mean()
   return scores, scores_mean

In [147]:
def print_results(accuracy_score, roc_auc, pr_auc, precision, recall, conf_matrix, scores, scores_mean):
    print("Accuracy: {:.3f}".format(accuracy_score))
    print("ROC AUC: {:.3f}".format(roc_auc))
    print("PR AUC: {:.3f}".format(pr_auc))
    print("Precision: {:.3f}".format(precision))
    print("Recall: {:.3f}".format(recall))
    print("Confusion matrix:\n", conf_matrix)
    
    print("CV scores:", end=" ")
    for score in scores:
        print("{:.3f}".format(score), end=" ")
    print()
    
    print("CV mean score: {:.3f}".format(scores_mean))

### Testy na pojedynczych modelach

SVC

In [152]:
clf_svc = SVC(probability=True)
svc_accuracy_score, svc_roc_auc, svc_pr_auc, svc_precision, svc_recall,\
    svc_conf_matrix = test_classifier(clf_svc, x_train, y_train, x_test, y_test)
svc_scores, svc_scores_mean = test_cv_classifier(clf_svc, x, y, n=10)
print("SVC:")
print_results(svc_accuracy_score, svc_roc_auc, svc_pr_auc, svc_precision, svc_recall,\
    svc_conf_matrix, svc_scores, svc_scores_mean)


SVC:
Accuracy: 0.950
ROC AUC: 0.998
PR AUC: 0.990
Precision: 0.952
Recall: 0.949
Confusion matrix:
 [[488   5   3   0   0   0]
 [ 20 451   0   0   0   0]
 [ 10  26 384   0   0   0]
 [  0   2   0 438  51   0]
 [  0   0   0  29 503   0]
 [  0   0   0   0   0 537]]
CV scores: 0.971 0.931 0.879 0.947 0.970 0.974 0.961 0.956 0.953 0.968 
CV mean score: 0.951


Gaussian Naive Bayes

In [151]:
clf_gaussNB = GaussianNB()
gaussNB_accuracy_score, gaussNB_roc_auc, gaussNB_pr_auc, gaussNB_precision, gaussNB_recall,\
    gaussNB_conf_matrix = test_classifier(clf_gaussNB, x_train, y_train, x_test, y_test)
gaussNB_scores, gaussNB_scores_mean = test_cv_classifier(clf_gaussNB, x, y, n=10)
print("Gaussian Naive Bayes:")
print_results(gaussNB_accuracy_score, gaussNB_roc_auc, gaussNB_pr_auc, gaussNB_precision, gaussNB_recall,\
    gaussNB_conf_matrix, gaussNB_scores, gaussNB_scores_mean)

Gaussian Naive Bayes:
Accuracy: 0.770
ROC AUC: 0.958
PR AUC: 0.788
Precision: 0.792
Recall: 0.769
Confusion matrix:
 [[416  38  42   0   0   0]
 [  9 451  11   0   0   0]
 [ 80  83 257   0   0   0]
 [  0   7   0 368 111   5]
 [  0  15   0  54 455   8]
 [  0   3   0 211   0 323]]
CV scores: 0.765 0.686 0.577 0.754 0.683 0.704 0.786 0.792 0.815 0.698 
CV mean score: 0.726


Naive Bayes classifier for categorical features

In [100]:
intervals = find_intervals(x_train, [4] * 561)
x_train_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_train.T)]).T
x_test_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_test.T)]).T
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

In [150]:
clf_catNB = CategoricalNB(min_categories=[4] * 561)
catNB_accuracy_score, catNB_roc_auc, catNB_pr_auc, catNB_precision, catNB_recall,\
    catNB_conf_matrix = test_classifier(clf_catNB, x_train_discrete, y_train, x_test_discrete, y_test)
catNB_scores, catNB_scores_mean = test_cv_classifier(clf_svc, x_discrete, y, n=10)
print("Categorical Naive Bayes:")
print_results(catNB_accuracy_score, catNB_roc_auc, catNB_pr_auc, catNB_precision, catNB_recall,\
    catNB_conf_matrix, catNB_scores, catNB_scores_mean)

Categorical Naive Bayes:
Accuracy: 0.870
ROC AUC: 0.981
PR AUC: 0.898
Precision: 0.871
Recall: 0.862
Confusion matrix:
 [[452   8  36   0   0   0]
 [ 24 428  19   0   0   0]
 [ 84  63 273   0   0   0]
 [  0   2   0 377 111   1]
 [  0   1   0  33 498   0]
 [  0   0   0   0   0 537]]
CV scores: 0.954 0.917 0.859 0.951 0.964 0.970 0.937 0.937 0.950 0.966 
CV mean score: 0.940


Decision Tree Classifier

In [149]:
clf_dt = DecisionTreeClassifier()
dt_accuracy_score, dt_roc_auc, dt_pr_auc, dt_precision, dt_recall,\
    dt_conf_matrix = test_classifier(clf_dt, x_train, y_train, x_test, y_test)
dt_scores, dt_scores_mean = test_cv_classifier(clf_dt, x, y, n=10)
print("Decision Tree Classifier:")
print_results(dt_accuracy_score, dt_roc_auc, dt_pr_auc, dt_precision, dt_recall,\
    dt_conf_matrix, dt_scores, dt_scores_mean)

Decision Tree Classifier:
Accuracy: 0.856
ROC AUC: 0.912
PR AUC: 0.758
Precision: 0.856
Recall: 0.853
Confusion matrix:
 [[439  35  22   0   0   0]
 [ 89 351  31   0   0   0]
 [ 15  49 356   0   0   0]
 [  0   0   0 377 114   0]
 [  0   0   0  68 464   0]
 [  0   0   0   0   0 537]]
CV scores: 0.893 0.839 0.822 0.850 0.900 0.874 0.883 0.817 0.885 0.894 
CV mean score: 0.866


### Testy na modelach zespołowych z biblioteki scikit-learn

Random Forest Classifier

In [148]:
clf_rf = RandomForestClassifier(n_estimators=10, random_state=random_state)
rf_accuracy_score, rf_roc_auc, rf_pr_auc, rf_precision, rf_recall,\
    rf_conf_matrix = test_classifier(clf_rf, x_train, y_train, x_test, y_test)
rf_scores, rf_scores_mean = test_cv_classifier(clf_rf, x, y, n=10)
print("Random Forest Classifier:")
print_results(rf_accuracy_score, rf_roc_auc, rf_pr_auc, rf_precision, rf_recall,\
    rf_conf_matrix, rf_scores, rf_scores_mean)

Random Forest Classifier:
Accuracy: 0.899
ROC AUC: 0.990
PR AUC: 0.951
Precision: 0.900
Recall: 0.895
Confusion matrix:
 [[467  19  10   0   0   0]
 [ 61 398  12   0   0   0]
 [ 28  52 340   0   0   0]
 [  0   0   0 434  57   0]
 [  0   0   0  59 473   0]
 [  0   0   0   0   0 537]]
CV scores: 0.928 0.925 0.855 0.948 0.888 0.953 0.938 0.904 0.936 0.941 
CV mean score: 0.922


Bagging Classifier

In [154]:
clf_bag_svc = BaggingClassifier(estimator=SVC(probability=True), n_estimators=10, random_state=random_state)
bag_svc_accuracy_score, bag_svc_roc_auc, bag_svc_pr_auc, bag_svc_precision, bag_svc_recall,\
    bag_svc_conf_matrix = test_classifier(clf_bag_svc, x_train, y_train, x_test, y_test)
bag_svc_scores, bag_svc_scores_mean = test_cv_classifier(clf_bag_svc, x, y, n=10)
print("Bagging SVC Classifier:")
print_results(bag_svc_accuracy_score, bag_svc_roc_auc, bag_svc_pr_auc, bag_svc_precision, bag_svc_recall,\
    bag_svc_conf_matrix, bag_svc_scores, bag_svc_scores_mean)

Bagging SVC Classifier:
Accuracy: 0.951
ROC AUC: 0.998
PR AUC: 0.990
Precision: 0.952
Recall: 0.950
Confusion matrix:
 [[487   4   5   0   0   0]
 [ 16 453   2   0   0   0]
 [  8  24 388   0   0   0]
 [  0   2   0 438  51   0]
 [  0   0   0  32 500   0]
 [  0   0   0   0   0 537]]
CV scores: 0.971 0.922 0.878 0.946 0.970 0.975 0.958 0.958 0.955 0.970 
CV mean score: 0.950


In [155]:
clf_bag_dt = BaggingClassifier(estimator=DecisionTreeClassifier(),n_estimators=10, random_state=random_state)
bag_dt_accuracy_score, bag_dt_roc_auc, bag_dt_pr_auc, bag_dt_precision, bag_dt_recall,\
    bag_dt_conf_matrix = test_classifier(clf_bag_dt, x_train, y_train, x_test, y_test)
bag_dt_scores, bag_dt_scores_mean = test_cv_classifier(clf_bag_dt, x, y, n=10)
print("Bagging Decision Tree Classifier:")
print_results(bag_dt_accuracy_score, bag_dt_roc_auc, bag_dt_pr_auc, bag_dt_precision, bag_dt_recall,\
    bag_dt_conf_matrix, bag_dt_scores, bag_dt_scores_mean)

Bagging Decision Tree Classifier:
Accuracy: 0.881
ROC AUC: 0.979
PR AUC: 0.925
Precision: 0.882
Recall: 0.877
Confusion matrix:
 [[475   7  14   0   0   0]
 [ 84 365  22   0   0   0]
 [ 12  54 354   0   0   0]
 [  0   0   0 397  94   0]
 [  0   0   0  64 468   0]
 [  0   0   0   0   0 537]]
CV scores: 0.914 0.890 0.828 0.920 0.939 0.958 0.907 0.849 0.901 0.927 
CV mean score: 0.903


In [156]:
intervals = find_intervals(x_train, [4] * 561)
x_train_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_train.T)]).T
x_test_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_test.T)]).T
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

In [157]:
clf_bag_catNB = BaggingClassifier(estimator=CategoricalNB(min_categories=[4]*561),n_estimators=10, random_state=random_state)
bag_catNB_accuracy_score, bag_catNB_roc_auc, bag_catNB_pr_auc, \
    bag_catNB_precision, bag_catNB_recall, bag_catNB_conf_matrix \
        = test_classifier(clf_bag_catNB, x_train_discrete, y_train, x_test_discrete, y_test)
bag_catNB_scores, bag_catNB_scores_mean = test_cv_classifier(clf_bag_catNB, x_discrete, y, n=10)
print("Bagging Categorical Naive Bayes:")
print_results(bag_catNB_accuracy_score, bag_catNB_roc_auc, bag_catNB_pr_auc, \
    bag_catNB_precision, bag_catNB_recall, bag_catNB_conf_matrix, \
        bag_catNB_scores, bag_catNB_scores_mean)

Bagging Categorical Naive Bayes:
Accuracy: 0.872
ROC AUC: 0.981
PR AUC: 0.903
Precision: 0.873
Recall: 0.864
Confusion matrix:
 [[452   7  37   0   0   0]
 [ 24 430  17   0   0   0]
 [ 85  61 274   0   0   0]
 [  0   2   0 379 109   1]
 [  0   1   0  32 499   0]
 [  0   0   0   0   0 537]]
CV scores: 0.864 0.820 0.777 0.854 0.805 0.810 0.843 0.890 0.897 0.832 
CV mean score: 0.839


In [158]:
clf_bag_gaussNB = BaggingClassifier(estimator=GaussianNB(),n_estimators=10, random_state=random_state)
bag_gaussNB_accuracy_score, bag_gaussNB_roc_auc, bag_gaussNB_pr_auc, \
    bag_gaussNB_precision, bag_gaussNB_recall, bag_dt_conf_matrix \
        = test_classifier(clf_bag_gaussNB, x_train, y_train, x_test, y_test)
bag_gaussNB_scores, bag_gaussNB_scores_mean = test_cv_classifier(clf_bag_gaussNB, x, y, n=10)
print("Bagging Gaussian Naive Bayes:")
print_results(bag_gaussNB_accuracy_score, bag_gaussNB_roc_auc, bag_gaussNB_pr_auc, \
    bag_gaussNB_precision, bag_gaussNB_recall, bag_dt_conf_matrix, \
        bag_gaussNB_scores, bag_gaussNB_scores_mean)

Bagging Gaussian Naive Bayes:
Accuracy: 0.808
ROC AUC: 0.964
PR AUC: 0.829
Precision: 0.819
Recall: 0.801
Confusion matrix:
 [[422  31  43   0   0   0]
 [  9 451  11   0   0   0]
 [ 80  83 257   0   0   0]
 [  0   7   0 268 213   3]
 [  1  15   0  19 493   4]
 [  0   6   0  40   0 491]]
CV scores: 0.762 0.679 0.585 0.756 0.688 0.703 0.800 0.792 0.838 0.705 
CV mean score: 0.731


### Testy na przygotowanej implementacji modeli zespołowych

Model zespołowy modeli SVC

In [None]:
clf_ens_svc = Ensemble(
    SVC, 10, random_state=random_state
)
ens_svc_accuracy_score, ens_svc_roc_auc, ens_svc_pr_auc, \
    ens_svc_precision, ens_svc_recall, ens_svc_conf_matrix \
        = test_classifier(clf_ens_svc, x_train, y_train, x_test, y_test)
ens_svc_scores, ens_svc_scores_mean = test_cv_classifier(clf_ens_svc, x, y, n=10)
print("Ensemble SVC:")
print_results(ens_svc_accuracy_score, ens_svc_roc_auc, ens_svc_pr_auc, \
    ens_svc_precision, ens_svc_recall, ens_svc_conf_matrix, \
        ens_svc_scores, ens_svc_scores_mean)

Model zespołowy modeli Gaussian Naive Bayes

In [None]:
clf_ens_gaussNB = Ensemble(
    GaussianNB, 10, random_state=random_state
)
ens_gaussNB_accuracy_score, ens_gaussNB_roc_auc, ens_gaussNB_pr_auc, \
    ens_gaussNB_precision, ens_gaussNB_recall, ens_gaussNB_conf_matrix \
        = test_classifier(clf_ens_gaussNB, x_train, y_train, x_test, y_test)
ens_gaussNB_scores, ens_gaussNB_scores_mean = test_cv_classifier(clf_ens_gaussNB, x, y, n=10)
print("Ensemble Gaussian Naive Bayes:")
print_results(ens_gaussNB_accuracy_score, ens_gaussNB_roc_auc, ens_gaussNB_pr_auc, \
    ens_gaussNB_precision, ens_gaussNB_recall, ens_gaussNB_conf_matrix, \
        ens_gaussNB_scores, ens_gaussNB_scores_mean)

Model zespołowy modeli Naive Bayes for categorical features

In [None]:
intervals = find_intervals(x_train, [4] * 561)
x_train_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_train.T)]).T
x_test_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_test.T)]).T
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

In [None]:
clf_ens_catNB = Ensemble(
    CategoricalNB, 10, random_state=random_state, min_categories=[4] * 561
)
ens_catNB_accuracy_score, ens_catNB_roc_auc, ens_catNB_pr_auc, \
    ens_catNB_precision, ens_catNB_recall, ens_catNB_conf_matrix \
        = test_classifier(clf_ens_catNB, x_train, y_train, x_test, y_test)
ens_catNB_scores, ens_catNB_scores_mean = test_cv_classifier(clf_ens_catNB, x, y, n=10)
print("Ensemble Categorical Naive Bayes:")
print_results(ens_catNB_accuracy_score, ens_catNB_roc_auc, ens_catNB_pr_auc, \
    ens_catNB_precision, ens_catNB_recall, ens_catNB_conf_matrix, \
        ens_catNB_scores, ens_catNB_scores_mean)

Model zespołowy modeli Decision Tree Classifier

In [None]:
clf_ens_dt = Ensemble(
    DecisionTreeClassifier, 10, max_attributes= 30, random_state=random_state
)
ens_dt_accuracy_score, ens_dt_roc_auc, ens_dt_pr_auc, \
    ens_dt_precision, ens_dt_recall, ens_dt_conf_matrix \
        = test_classifier(clf_ens_dt, x_train, y_train, x_test, y_test)
ens_dt_scores, ens_dt_scores_mean = test_cv_classifier(clf_ens_dt, x, y, n=10)
print("Ensemble Decision Tree Classifier:")
print_results(ens_dt_accuracy_score, ens_dt_roc_auc, ens_dt_pr_auc, \
    ens_dt_precision, ens_dt_recall, ens_dt_conf_matrix, \
        ens_dt_scores, ens_dt_scores_mean)