# ZUM - Projekt

Import potrzebnych narzędzi.

In [122]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, average_precision_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from IPython.display import display, HTML

Import przygotowanej klasy Ensemble.

In [129]:
from src.ensembles import Ensemble

Plan badań
1. Modele zespołowe dla każdego z algorytmów (SVC, DecisionTreeClassifier,  CategoricalNB, GaussianNB)
2. Dla każdego z modelów zespołowych testy dla 5, 10, 50, 100 modeli w zespole.
3. Analogiczne testy dla Bagging Classifier
4. Analogiczne testy dla Random Forest Classifier (liczba drzew)
5. Testy na algorytmach konwencjonalnych
6. Wszystko wyżej dla CV = 10
7. Wszystko wyżej powtórzone dla dwóch zbiorów danych.

** dodatkowo różne opcje podzialu atrybutow na modele w modelach zespolowych

w sumie (4*4 + 4*4 + 4 + 4)*2 = 80

## Human activity

In [130]:
random_state = 3

Załadowanie danych

In [None]:
x_train = np.loadtxt("human_activity/X_train.txt", dtype = float)
x_test = np.loadtxt("human_activity/X_test.txt", dtype = float)
y_train = np.loadtxt("human_activity/y_train.txt", dtype = int)
y_test = np.loadtxt("human_activity/y_test.txt", dtype = int)

x = np.vstack([x_train, x_test])
y = np.append(y_train, y_test)

#x_train, x_test, y_train, y_test = train_test_split(
#    x, y, test_size=0.99, random_state=random_state
#)

Funkcja pomocnicza do dyskretyzacji danych

In [110]:
def find_intervals(x_train, group_vector):  # auxilary values for data disrcetization
    intervals = np.array([np.zeros(i - 1) for i in group_vector])

    for i, features in enumerate(x_train.T):
        max_value = max(features)
        min_value = min(features)
        section_size = (max_value - min_value) / group_vector[i]
        intervals[i] = np.array(
            [min_value + section_size * j for j in range(1, group_vector[i])]
        )
    return intervals

In [111]:
intervals = find_intervals(x_train, [4] * 561)
x_train_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_train.T)]).T
x_test_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_test.T)]).T
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

Funkcje do testowania klasyfikatorów

In [None]:
def test_classifier(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train, y_train)
    
    accuracy_score = clf.score(x_test, y_test)
    y_pred_proba = clf.predict_proba(x_test)
    roc_auc = roc_auc_score(y_test, y_pred_proba, average='macro', multi_class='ovr')
    pr_auc = average_precision_score(y_test, y_pred_proba, average='macro')
    y_pred = clf.predict(x_test)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return accuracy_score, roc_auc, pr_auc, precision, recall, conf_matrix

In [None]:
def test_cv_classifier(clf, x, y, n=10):
   scores = cross_val_score(clf, x, y, cv=n)
   scores_mean = scores.mean()
   return scores, scores_mean

Funkcja do wypisywania wyników

In [135]:
def print_results(accuracy_score, roc_auc, pr_auc, precision, recall, conf_matrix, scores, scores_mean):
    print("Accuracy: {:.3f}".format(accuracy_score))
    print("ROC AUC: {:.3f}".format(roc_auc))
    print("PR AUC: {:.3f}".format(pr_auc))
    print("Precision: {:.3f}".format(precision))
    print("Recall: {:.3f}".format(recall))
    print("Confusion matrix:\n", conf_matrix)
    
    print("CV scores:", end=" ")
    for score in scores:
        print("{:.3f}".format(score), end=" ")
    print()
    
    print("CV mean score: {:.3f}".format(scores_mean))

Funkcje do formatowania macierzy pomyłek i wyników z CV do lepszej prezentacji
w formie tabeli

In [None]:
def format_conf_matrix(conf_matrix):
    formatted_matrix = '[' + ',<br> '.join(['[' + ', '.join(map(str, row)) + ']' for row in conf_matrix]) + ']'
    return formatted_matrix

# Define a function to format CV scores into multiple lines
def format_cv_scores(cv_scores):
    lines = []
    for i in range(0, len(cv_scores), 4):
        line = ', '.join(f'{score:.3f}' for score in cv_scores[i:i+3])
        lines.append(line)
    return '<br>'.join(lines)

Funkcja do testów modeli zespołowych

In [162]:
def test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier,
    model = None, 
    n_estimators_values=[5,10,50,100],
    random_state = random_state,
    min_categories = [4] * 561
    ):
    results = []
    n = n_estimators_values
    for n_estimators in n:
        if classifier == RandomForestClassifier:
            clf = classifier(n_estimators=n_estimators, random_state=random_state)
            accuracy_score, roc_auc, pr_auc, precision, recall, conf_matrix = test_classifier(clf, x_train, y_train, x_test, y_test)
            scores, scores_mean = test_cv_classifier(clf, x, y, n=10)
        elif classifier == Ensemble:
            if model == CategoricalNB:
                clf = classifier(classifier_constructor=model, classifiers_number=n_estimators, random_state=random_state, min_categories=min_categories)
            else:
                clf = classifier(classifier_constructor=model, classifiers_number=n_estimators, random_state=random_state)
            accuracy_score, roc_auc, pr_auc, precision, recall, conf_matrix = test_classifier(clf, x_train, y_train, x_test, y_test)
            scores, scores_mean = test_cv_classifier(clf, x, y, n=10)
        else:
            clf = classifier(estimator=model, n_estimators=n_estimators, random_state=random_state)
            accuracy_score, roc_auc, pr_auc, precision, recall, conf_matrix = test_classifier(clf, x_train, y_train, x_test, y_test)
            scores, scores_mean = test_cv_classifier(clf, x, y, n=10)
            
        
        results.append({
            'n': n_estimators,
            'Accuracy': round(accuracy_score, 3),
            'ROC AUC': round(roc_auc, 3),
            'PR AUC': round(pr_auc, 3),
            'Precision': round(precision, 3),
            'Recall': round(recall, 3),
            'Confusion matrix': format_conf_matrix(conf_matrix),
            'CV scores': format_cv_scores(scores),
            'CV mean score': round(scores_mean, 3)
        })

    results_df = pd.DataFrame(results)
    return results_df

### Testy na pojedynczych modelach

SVC

In [167]:
clf_svc = SVC(probability=True)
svc_accuracy_score, svc_roc_auc, svc_pr_auc, svc_precision, svc_recall,\
    svc_conf_matrix = test_classifier(clf_svc, x_train, y_train, x_test, y_test)
svc_scores, svc_scores_mean = test_cv_classifier(clf_svc, x, y, n=10)
print("SVC:")
print_results(svc_accuracy_score, svc_roc_auc, svc_pr_auc, svc_precision, svc_recall,\
    svc_conf_matrix, svc_scores, svc_scores_mean)


SVC:
Accuracy: 0.950
ROC AUC: 0.998
PR AUC: 0.990
Precision: 0.952
Recall: 0.949
Confusion matrix:
 [[488   5   3   0   0   0]
 [ 20 451   0   0   0   0]
 [ 10  26 384   0   0   0]
 [  0   2   0 438  51   0]
 [  0   0   0  29 503   0]
 [  0   0   0   0   0 537]]
CV scores: 0.971 0.931 0.879 0.947 0.970 0.974 0.961 0.956 0.953 0.968 
CV mean score: 0.951


Gaussian Naive Bayes

In [168]:
clf_gaussNB = GaussianNB()
gaussNB_accuracy_score, gaussNB_roc_auc, gaussNB_pr_auc, gaussNB_precision, gaussNB_recall,\
    gaussNB_conf_matrix = test_classifier(clf_gaussNB, x_train, y_train, x_test, y_test)
gaussNB_scores, gaussNB_scores_mean = test_cv_classifier(clf_gaussNB, x, y, n=10)
print("Gaussian Naive Bayes:")
print_results(gaussNB_accuracy_score, gaussNB_roc_auc, gaussNB_pr_auc, gaussNB_precision, gaussNB_recall,\
    gaussNB_conf_matrix, gaussNB_scores, gaussNB_scores_mean)

Gaussian Naive Bayes:
Accuracy: 0.770
ROC AUC: 0.958
PR AUC: 0.788
Precision: 0.792
Recall: 0.769
Confusion matrix:
 [[416  38  42   0   0   0]
 [  9 451  11   0   0   0]
 [ 80  83 257   0   0   0]
 [  0   7   0 368 111   5]
 [  0  15   0  54 455   8]
 [  0   3   0 211   0 323]]
CV scores: 0.765 0.686 0.577 0.754 0.683 0.704 0.786 0.792 0.815 0.698 
CV mean score: 0.726


Naive Bayes classifier for categorical features

In [169]:
clf_catNB = CategoricalNB(min_categories=[4] * 561)
catNB_accuracy_score, catNB_roc_auc, catNB_pr_auc, catNB_precision, catNB_recall,\
    catNB_conf_matrix = test_classifier(clf_catNB, x_train_discrete, y_train, x_test_discrete, y_test)
catNB_scores, catNB_scores_mean = test_cv_classifier(clf_svc, x_discrete, y, n=10)
print("Categorical Naive Bayes:")
print_results(catNB_accuracy_score, catNB_roc_auc, catNB_pr_auc, catNB_precision, catNB_recall,\
    catNB_conf_matrix, catNB_scores, catNB_scores_mean)

Categorical Naive Bayes:
Accuracy: 0.870
ROC AUC: 0.981
PR AUC: 0.898
Precision: 0.871
Recall: 0.862
Confusion matrix:
 [[452   8  36   0   0   0]
 [ 24 428  19   0   0   0]
 [ 84  63 273   0   0   0]
 [  0   2   0 377 111   1]
 [  0   1   0  33 498   0]
 [  0   0   0   0   0 537]]
CV scores: 0.954 0.917 0.859 0.951 0.964 0.970 0.937 0.937 0.950 0.966 
CV mean score: 0.940


Decision Tree Classifier

In [170]:
clf_dt = DecisionTreeClassifier()
dt_accuracy_score, dt_roc_auc, dt_pr_auc, dt_precision, dt_recall,\
    dt_conf_matrix = test_classifier(clf_dt, x_train, y_train, x_test, y_test)
dt_scores, dt_scores_mean = test_cv_classifier(clf_dt, x, y, n=10)
print("Decision Tree Classifier:")
print_results(dt_accuracy_score, dt_roc_auc, dt_pr_auc, dt_precision, dt_recall,\
    dt_conf_matrix, dt_scores, dt_scores_mean)

Decision Tree Classifier:
Accuracy: 0.855
ROC AUC: 0.911
PR AUC: 0.755
Precision: 0.853
Recall: 0.851
Confusion matrix:
 [[436  36  24   0   0   0]
 [ 67 359  45   0   0   0]
 [ 24  48 348   0   0   0]
 [  0   0   0 378 113   0]
 [  0   0   0  70 462   0]
 [  0   0   0   0   0 537]]
CV scores: 0.901 0.848 0.802 0.851 0.871 0.897 0.880 0.810 0.887 0.881 
CV mean score: 0.863


In [171]:
# Create a DataFrame to store the results
results_df = pd.DataFrame({
    'Model': ['SVC', 'Gaussian Naive Bayes', 'Categorical Naive Bayes', 'Decision Tree'],
    'Accuracy': [svc_accuracy_score, gaussNB_accuracy_score, catNB_accuracy_score, dt_accuracy_score],
    'ROC AUC': [svc_roc_auc, gaussNB_roc_auc, catNB_roc_auc, dt_roc_auc],
    'PR AUC': [svc_pr_auc, gaussNB_pr_auc, catNB_pr_auc, dt_pr_auc],
    'Precision': [svc_precision, gaussNB_precision, catNB_precision, dt_precision],
    'Recall': [svc_recall, gaussNB_recall, catNB_recall, dt_recall],
    'Confusion Matrix': [svc_conf_matrix, gaussNB_conf_matrix, catNB_conf_matrix, dt_conf_matrix],
    'CV Scores': [svc_scores, gaussNB_scores, catNB_scores, dt_scores],
    'CV Scores Mean': [svc_scores_mean, gaussNB_scores_mean, catNB_scores_mean, dt_scores_mean]
})

                     Model  Accuracy   ROC AUC    PR AUC  Precision    Recall  \
0                      SVC  0.950458  0.997831  0.990187   0.951999  0.948873   
1     Gaussian Naive Bayes  0.770275  0.957646  0.788423   0.792466  0.769066   
2  Categorical Naive Bayes  0.870377  0.980578  0.897657   0.871240  0.862318   
3            Decision Tree  0.855107  0.911203  0.754769   0.853245  0.851348   

                                    Confusion Matrix  \
0  [[488, 5, 3, 0, 0, 0], [20, 451, 0, 0, 0, 0], ...   
1  [[416, 38, 42, 0, 0, 0], [9, 451, 11, 0, 0, 0]...   
2  [[452, 8, 36, 0, 0, 0], [24, 428, 19, 0, 0, 0]...   
3  [[436, 36, 24, 0, 0, 0], [67, 359, 45, 0, 0, 0...   

                                           CV Scores  CV Scores Mean  
0  [0.970873786407767, 0.9310679611650485, 0.8786...        0.950968  
1  [0.7650485436893204, 0.6864077669902913, 0.576...        0.726087  
2  [0.954368932038835, 0.916504854368932, 0.85922...        0.940482  
3  [0.9009708737864077, 0.847

### Testy na modelach zespołowych z biblioteki scikit-learn
(n - liczba modeli w modelu zespołowym)

In [172]:
display(HTML(results_df.to_html(escape=False)))

Unnamed: 0,Model,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion Matrix,CV Scores,CV Scores Mean
0,SVC,0.950458,0.997831,0.990187,0.951999,0.948873,"[[488, 5, 3, 0, 0, 0], [20, 451, 0, 0, 0, 0], [10, 26, 384, 0, 0, 0], [0, 2, 0, 438, 51, 0], [0, 0, 0, 29, 503, 0], [0, 0, 0, 0, 0, 537]]","[0.970873786407767, 0.9310679611650485, 0.8786407766990292, 0.9466019417475728, 0.9699029126213592, 0.9737864077669903, 0.9611650485436893, 0.9563106796116505, 0.9533980582524272, 0.967930029154519]",0.950968
1,Gaussian Naive Bayes,0.770275,0.957646,0.788423,0.792466,0.769066,"[[416, 38, 42, 0, 0, 0], [9, 451, 11, 0, 0, 0], [80, 83, 257, 0, 0, 0], [0, 7, 0, 368, 111, 5], [0, 15, 0, 54, 455, 8], [0, 3, 0, 211, 0, 323]]","[0.7650485436893204, 0.6864077669902913, 0.5766990291262136, 0.7543689320388349, 0.683495145631068, 0.7038834951456311, 0.7864077669902912, 0.7922330097087379, 0.8145631067961165, 0.6977648202137998]",0.726087
2,Categorical Naive Bayes,0.870377,0.980578,0.897657,0.87124,0.862318,"[[452, 8, 36, 0, 0, 0], [24, 428, 19, 0, 0, 0], [84, 63, 273, 0, 0, 0], [0, 2, 0, 377, 111, 1], [0, 1, 0, 33, 498, 0], [0, 0, 0, 0, 0, 537]]","[0.954368932038835, 0.916504854368932, 0.8592233009708737, 0.9514563106796117, 0.9640776699029127, 0.9699029126213592, 0.9368932038834952, 0.9368932038834952, 0.9495145631067962, 0.9659863945578231]",0.940482
3,Decision Tree,0.855107,0.911203,0.754769,0.853245,0.851348,"[[436, 36, 24, 0, 0, 0], [67, 359, 45, 0, 0, 0], [24, 48, 348, 0, 0, 0], [0, 0, 0, 378, 113, 0], [0, 0, 0, 70, 462, 0], [0, 0, 0, 0, 0, 537]]","[0.9009708737864077, 0.8475728155339806, 0.8019417475728156, 0.8514563106796117, 0.870873786407767, 0.8970873786407767, 0.8796116504854369, 0.8097087378640777, 0.887378640776699, 0.8814382896015549]",0.862804


Random Forest Classifier

In [165]:
rf_results = test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier = RandomForestClassifier,
    model = None, 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [166]:
display(HTML(rf_results.to_html(escape=False)))

Unnamed: 0,n,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,5,0.875,0.981,0.917,0.88,0.869,"[[455, 32, 9, 0, 0, 0],  [90, 369, 12, 0, 0, 0],  [43, 57, 320, 0, 0, 0],  [0, 2, 0, 402, 87, 0],  [0, 0, 0, 36, 496, 0],  [0, 0, 0, 0, 0, 537]]","0.920, 0.907, 0.842 0.873, 0.938, 0.919 0.920, 0.925",0.906
1,10,0.899,0.99,0.951,0.9,0.895,"[[467, 19, 10, 0, 0, 0],  [61, 398, 12, 0, 0, 0],  [28, 52, 340, 0, 0, 0],  [0, 0, 0, 434, 57, 0],  [0, 0, 0, 59, 473, 0],  [0, 0, 0, 0, 0, 537]]","0.928, 0.925, 0.855 0.888, 0.953, 0.938 0.936, 0.941",0.922


Bagging Classifier

In [None]:
bag_svc_results = test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier = BaggingClassifier,
    model = SVC(probability=True), 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [None]:
display(HTML(bag_svc_results.to_html(escape=False)))

In [None]:
bag_dt_results = test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier = BaggingClassifier,
    model = DecisionTreeClassifier(), 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [None]:
display(HTML(bag_dt_results.to_html(escape=False)))

In [113]:
bag_catNB_results = test_ensemble( 
    x_discrete,
    y,
    x_train_discrete,
    y_train,
    x_test_discrete,
    y_test,
    classifier = BaggingClassifier,
    model = CategoricalNB(min_categories=[4]*561), 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [118]:
print("Bagging Categorical Naive Bayes:")
display(HTML(bag_catNB_results.to_html(escape=False)))

Bagging Categorical Naive Bayes:


Unnamed: 0,n,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,5,0.871,0.981,0.903,0.873,0.863,"[[453, 6, 37, 0, 0, 0],  [25, 429, 17, 0, 0, 0],  [85, 61, 274, 0, 0, 0],  [0, 2, 0, 375, 113, 1],  [0, 1, 0, 31, 500, 0],  [0, 0, 0, 0, 0, 537]]","0.867, 0.820, 0.774 0.807, 0.812, 0.843 0.895, 0.833",0.839
1,10,0.872,0.981,0.903,0.873,0.864,"[[452, 7, 37, 0, 0, 0],  [24, 430, 17, 0, 0, 0],  [85, 61, 274, 0, 0, 0],  [0, 2, 0, 379, 109, 1],  [0, 1, 0, 32, 499, 0],  [0, 0, 0, 0, 0, 537]]","0.864, 0.820, 0.777 0.805, 0.810, 0.843 0.897, 0.832",0.839


In [115]:
bag_gaussNB_results = test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier = BaggingClassifier,
    model = GaussianNB(), 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [117]:
print("Bagging Gaussian Naive Bayes:")
display(HTML(bag_gaussNB_results.to_html(escape=False)))

Bagging Gaussian Naive Bayes:


Unnamed: 0,n,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,5,0.791,0.958,0.815,0.8,0.786,"[[427, 25, 44, 0, 0, 0],  [10, 450, 11, 0, 0, 0],  [78, 82, 260, 0, 0, 0],  [0, 7, 0, 251, 228, 5],  [1, 16, 0, 19, 491, 5],  [0, 4, 0, 80, 0, 453]]","0.767, 0.707, 0.582 0.698, 0.709, 0.799 0.870, 0.758",0.747
1,10,0.808,0.964,0.829,0.819,0.801,"[[422, 31, 43, 0, 0, 0],  [9, 451, 11, 0, 0, 0],  [80, 83, 257, 0, 0, 0],  [0, 7, 0, 268, 213, 3],  [1, 15, 0, 19, 493, 4],  [0, 6, 0, 40, 0, 491]]","0.762, 0.679, 0.585 0.688, 0.703, 0.800 0.838, 0.705",0.731


### Testy na przygotowanej implementacji modeli zespołowych

Model zespołowy modeli SVC

In [132]:
ens_svc_results = test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier = Ensemble,
    model = SVC, 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [137]:
print("Ensemble SVC:")
display(HTML(ens_svc_results.to_html(escape=False)))

Ensemble SVC:


Unnamed: 0,n,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,5,0.862,0.978,0.912,0.873,0.858,"[[470, 3, 23, 0, 0, 0],  [25, 430, 16, 0, 0, 0],  [51, 51, 318, 0, 0, 0],  [0, 2, 0, 342, 135, 12],  [0, 0, 0, 35, 497, 0],  [0, 0, 0, 6, 47, 484]]","0.871, 0.852, 0.789 0.889, 0.877, 0.905 0.863, 0.856",0.864
1,10,0.876,0.986,0.938,0.893,0.871,"[[471, 2, 23, 0, 0, 0],  [28, 433, 10, 0, 0, 0],  [38, 55, 327, 0, 0, 0],  [0, 2, 0, 303, 185, 1],  [0, 0, 0, 10, 522, 0],  [0, 0, 0, 0, 10, 527]]","0.948, 0.923, 0.861 0.952, 0.945, 0.920 0.914, 0.905",0.917


Model zespołowy modeli Gaussian Naive Bayes

In [173]:
ens_gaussNB_results = test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier = Ensemble,
    model = GaussianNB, 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [174]:
print("Ensemble Gaussian Naive Bayes:")
display(HTML(ens_gaussNB_results.to_html(escape=False)))

Ensemble Gaussian Naive Bayes:


Unnamed: 0,n,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,5,0.629,0.931,0.766,0.704,0.639,"[[395, 44, 57, 0, 0, 0],  [24, 433, 14, 0, 0, 0],  [110, 64, 246, 0, 0, 0],  [0, 5, 0, 413, 59, 14],  [0, 12, 0, 240, 262, 18],  [0, 5, 0, 408, 18, 106]]","0.641, 0.540, 0.516 0.537, 0.511, 0.617 0.608, 0.543",0.568
1,10,0.737,0.943,0.797,0.767,0.737,"[[396, 53, 47, 0, 0, 0],  [14, 445, 12, 0, 0, 0],  [104, 72, 244, 0, 0, 0],  [0, 5, 0, 414, 59, 13],  [0, 12, 0, 245, 259, 16],  [0, 6, 0, 114, 4, 413]]","0.718, 0.575, 0.551 0.566, 0.531, 0.663 0.679, 0.561",0.609


Model zespołowy modeli Naive Bayes for categorical features

In [144]:
ens_catNB_results = test_ensemble( 
    x_discrete,
    y,
    x_train_discrete,
    y_train,
    x_test_discrete,
    y_test,
    classifier = Ensemble,
    model = CategoricalNB, 
    n_estimators_values=[5,10],
    random_state = random_state
    )

In [145]:
print("Ensemble Categorical Naive Bayes:")
display(HTML(ens_catNB_results.to_html(escape=False)))

Ensemble Categorical Naive Bayes:


Unnamed: 0,n,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,5,0.791,0.95,0.81,0.795,0.784,"[[409, 33, 54, 0, 0, 0],  [50, 398, 23, 0, 0, 0],  [99, 59, 262, 0, 0, 0],  [0, 2, 0, 328, 129, 32],  [0, 1, 0, 70, 461, 0],  [0, 1, 0, 9, 55, 472]]","0.793, 0.709, 0.709 0.785, 0.792, 0.791 0.794, 0.725",0.767
1,10,0.822,0.957,0.842,0.831,0.813,"[[430, 24, 42, 0, 0, 0],  [39, 410, 22, 0, 0, 0],  [106, 57, 257, 0, 0, 0],  [0, 2, 0, 302, 176, 11],  [0, 1, 0, 32, 499, 0],  [0, 1, 0, 0, 12, 524]]","0.875, 0.799, 0.824 0.837, 0.855, 0.829 0.874, 0.778",0.836


Model zespołowy modeli Decision Tree Classifier

In [163]:
ens_dt_results = test_ensemble( 
    x,
    y,
    x_train,
    y_train,
    x_test,
    y_test,
    classifier = Ensemble,
    model = DecisionTreeClassifier, 
    n_estimators_values=[5,10],
    random_state = 4
    )

In [164]:
print("Ensemble Decision Tree:")
display(HTML(ens_dt_results.to_html(escape=False)))

Ensemble Decision Tree:


Unnamed: 0,n,Accuracy,ROC AUC,PR AUC,Precision,Recall,Confusion matrix,CV scores,CV mean score
0,5,0.861,0.976,0.893,0.86,0.856,"[[438, 25, 33, 0, 0, 0],  [46, 407, 18, 0, 0, 0],  [66, 45, 309, 0, 0, 0],  [0, 0, 0, 395, 88, 8],  [0, 0, 0, 71, 455, 6],  [0, 0, 0, 1, 3, 533]]","0.830, 0.830, 0.841 0.853, 0.851, 0.879 0.864, 0.756",0.838
1,10,0.897,0.987,0.936,0.9,0.891,"[[468, 8, 20, 0, 0, 0],  [60, 402, 9, 0, 0, 0],  [57, 51, 312, 0, 0, 0],  [0, 0, 0, 432, 54, 5],  [0, 1, 0, 28, 497, 6],  [0, 0, 0, 1, 4, 532]]","0.900, 0.870, 0.881 0.907, 0.899, 0.925 0.915, 0.859",0.895
