# ZUM - Projekt

Import potrzebnych narzędzi.

In [20]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import cross_val_score

Import przygotowanej klasy Ensemble.

In [21]:
from src.ensembles import Ensemble

Plan badań
1. Modele zespołowe dla każdego z algorytmów (SVC, DecisionTreeClassifier,  CategoricalNB, GaussianNB)
2. Dla każdego z modeli zespołowych testy dla 5, 10, 50, 100 modeli w zespole.
3. Analogiczne testy dla Bagging Classifier
4. Analogiczne testy dla Random Forest Classifier (liczba drzew)
5. Testy na algorytmach konwencjonalnych
6. Wszystko wyżej dla CV = 10
7. Wszystko wyżej powtórzone dla dwóch zbiorów danych.

\*\* Dodatkowo: różne opcje podziału atrybutów na modele w modelach zespołowych.

W sumie: $(4 \cdot 4 + 4 \cdot 4 + 4 + 4) \cdot 2 = 80$

## Human activity

In [22]:
# Shared seed handed to the seeded estimators in this notebook (random forest,
# bagging and Ensemble models) so that repeated runs give comparable results.
random_state = 3

Załadowanie danych

In [23]:
# Read the pre-split Human Activity files: feature matrices as floats,
# class labels as integers.
x_train = np.loadtxt("human_activity/X_train.txt", dtype=float)
y_train = np.loadtxt("human_activity/y_train.txt", dtype=int)
x_test = np.loadtxt("human_activity/X_test.txt", dtype=float)
y_test = np.loadtxt("human_activity/y_test.txt", dtype=int)

# Full data set (train rows first, then test rows) used by the
# cross-validation runs.
x = np.vstack((x_train, x_test))
y = np.append(y_train, y_test)

# Note: an alternative re-split of (x, y) with train_test_split was tried
# here and is disabled; the split stored in the files is the one in use.

Funkcja pomocnicza do dyskretyzacji danych

In [24]:
def find_intervals(x_train, group_vector):
    """Compute equal-width bin edges used to discretize each feature column.

    For column ``i`` of ``x_train`` the range ``[min, max]`` is divided into
    ``group_vector[i]`` sections of equal width and the
    ``group_vector[i] - 1`` interior boundaries are returned. The result can
    be passed column by column as ``bins`` to ``np.digitize``.

    Parameters
    ----------
    x_train : array-like of shape (n_samples, n_features)
        Data from which the per-feature minimum and maximum are taken.
    group_vector : sequence of int
        Number of sections for every feature; must have ``n_features`` items.

    Returns
    -------
    numpy.ndarray
        A 2-D float array of shape ``(n_features, k - 1)`` when every feature
        uses the same number of sections ``k``. Otherwise a 1-D object array
        whose ``i``-th element is the float array of edges for feature ``i``.

    Raises
    ------
    ValueError
        If ``group_vector`` does not have one entry per feature column.
    """
    x_train = np.asarray(x_train)
    n_features = x_train.shape[1]
    if len(group_vector) != n_features:
        raise ValueError(
            f"group_vector has {len(group_vector)} entries, "
            f"but x_train has {n_features} feature columns"
        )

    # Column-wise extremes in one pass instead of Python-level max()/min().
    lows = x_train.min(axis=0)
    highs = x_train.max(axis=0)

    edges = []
    for low, high, n_sections in zip(lows, highs, group_vector):
        section_size = (high - low) / n_sections
        edges.append(low + section_size * np.arange(1, n_sections))

    # Uniform section counts stack into a regular 2-D array (the historical
    # return shape). Differing counts cannot be stacked, so they are stored
    # in an object array that still supports ``intervals[i]`` indexing.
    if len({len(feature_edges) for feature_edges in edges}) <= 1:
        return np.array(edges, dtype=float)

    ragged = np.empty(n_features, dtype=object)
    for i, feature_edges in enumerate(edges):
        ragged[i] = feature_edges
    return ragged

### Testy na pojedynczych modelach

SVC

In [60]:
# Baseline: a single SVC with default hyper-parameters.
clf_svc = SVC()
clf_svc.fit(x_train, y_train)
score = clf_svc.score(x_test, y_test)  # accuracy on the hold-out split
# 10-fold cross-validation over the combined (x, y) data.
svc_scores = cross_val_score(clf_svc, x, y, cv=10)
# Labels now name the model that is actually evaluated (they used to say
# "Decision Tree Classifier").
print("SVC CV scores:", svc_scores)
print("SVC CV mean score:")
print(svc_scores.mean())

Decision Tree Classifier CV scores: [0.97087379 0.93106796 0.87864078 0.94660194 0.96990291 0.97378641
 0.96116505 0.95631068 0.95339806 0.96793003]
Decision Tree Classifier CV mean score:
0.9509677601970055


Gaussian Naive Bayes

In [61]:
# Baseline: a single Gaussian Naive Bayes classifier on the continuous features.
clf_gaussNB = GaussianNB()
clf_gaussNB.fit(x_train, y_train)
score = clf_gaussNB.score(x_test, y_test)  # accuracy on the hold-out split
# 10-fold cross-validation over the combined (x, y) data.
gaussNB_scores = cross_val_score(clf_gaussNB, x, y, cv=10)
# Labels now name the model that is actually evaluated (they used to say
# "Decision Tree Classifier").
print("Gaussian NB CV scores:", gaussNB_scores)
print("Gaussian NB CV mean score:")
print(gaussNB_scores.mean())

Decision Tree Classifier CV scores: [0.76504854 0.68640777 0.57669903 0.75436893 0.68349515 0.7038835
 0.78640777 0.79223301 0.81456311 0.69776482]
Decision Tree Classifier CV mean score:
0.7260871616330304


Naive Bayes classifier for categorical features

In [100]:
# Discretize every feature into 4 equal-width bins (codes 0..3). The bin
# edges are derived from the training split only and reused for the test
# split. The number of features is read from the data instead of being
# hard-coded, so the cell keeps working if the feature set changes.
intervals = find_intervals(x_train, [4] * x_train.shape[1])
x_train_discrete = np.array(
    [np.digitize(column, bins=intervals[i]) for i, column in enumerate(x_train.T)]
).T
x_test_discrete = np.array(
    [np.digitize(column, bins=intervals[i]) for i, column in enumerate(x_test.T)]
).T
# Same row order as x / y: training rows first, then test rows.
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

In [101]:
# Sanity check of the discretized data: overall shapes, the distinct bin
# codes found in the first five feature columns, and the class labels.
print("x_discrete shape:", x_discrete.shape)
print("y shape:", y.shape)
n_preview = 5  # raise to inspect more feature columns
for col in range(n_preview):
    levels = np.unique(x_discrete[:, col])
    print(f"Feature {col} unique values in x_discrete: {levels}")
print("y unique:", np.unique(y))

x_discrete shape: (10299, 561)
y shape: (10299,)
Feature 0 unique values in x_discrete: [0 1 2 3]
Feature 1 unique values in x_discrete: [0 1 2 3]
Feature 2 unique values in x_discrete: [0 1 2 3]
Feature 3 unique values in x_discrete: [0 1 2 3]
Feature 4 unique values in x_discrete: [0 1 2 3]
y unique: [1 2 3 4 5 6]


In [None]:
# Baseline: Categorical Naive Bayes on the discretized features.
# min_categories=4 declares all four bin codes (0..3) for every feature, so a
# cross-validation training fold that happens to miss one code can still
# score samples carrying it.
clf_catNB = CategoricalNB(min_categories=4)
clf_catNB.fit(x_train_discrete, y_train)
# Evaluate on the hold-out split only; scoring on x_discrete would include
# the rows the model was just trained on.
score = clf_catNB.score(x_test_discrete, y_test)
print(score)
# 10-fold cross-validation over the combined discretized data.
catNB_scores = cross_val_score(clf_catNB, x_discrete, y, cv=10)
print("Categorical NB CV scores:", catNB_scores)
print("Categorical NB CV mean score:")
print(catNB_scores.mean())

Decision Tree Classifier

In [44]:
# Baseline: a single decision tree. It is seeded with the notebook-wide
# random_state so that re-running the cell reproduces the same scores.
clf_dt = DecisionTreeClassifier(random_state=random_state)
clf_dt.fit(x_train, y_train)
score = clf_dt.score(x_test, y_test)  # accuracy on the hold-out split
# 10-fold cross-validation over the combined (x, y) data.
dt_scores = cross_val_score(clf_dt, x, y, cv=10)
print("Decision Tree Classifier CV scores:", dt_scores)
print("Decision Tree Classifier CV mean score:")
print(dt_scores.mean())

Decision Tree Classifier CV scores: [0.89223301 0.84368932 0.80970874 0.8592233  0.88058252 0.89708738
 0.90097087 0.81553398 0.9        0.88435374]
Decision Tree Classifier CV mean score:
0.8683382867710192


### Testy na modelach zespołowych z biblioteki scikit-learn

Random Forest Classifier

In [104]:
# Random forest of 10 trees, seeded with the notebook-wide random_state.
clf_rf = RandomForestClassifier(random_state=random_state, n_estimators=10)
# Fit on the training split and take the accuracy on the hold-out split.
score = clf_rf.fit(x_train, y_train).score(x_test, y_test)
# 10-fold cross-validation over the combined (x, y) data.
rf_scores = cross_val_score(clf_rf, x, y, cv=10)
print("RandomForestClassifier CV scores:", rf_scores)
print("RandomForestClassifier CV mean score:", rf_scores.mean(), sep="\n")

RandomForestClassifier CV scores: [0.92815534 0.92524272 0.85533981 0.94757282 0.88834951 0.95339806
 0.93786408 0.9038835  0.93592233 0.94071914]
RandomForestClassifier CV mean score:
0.9216447300140583


Bagging Classifier

In [105]:
# Bagging of 10 SVC base estimators, seeded with the notebook-wide random_state.
clf_bag_svc = BaggingClassifier(
    estimator=SVC(),
    n_estimators=10,
    random_state=random_state,
)
# Fit on the training split and take the accuracy on the hold-out split.
score = clf_bag_svc.fit(x_train, y_train).score(x_test, y_test)
# 10-fold cross-validation over the combined (x, y) data.
bag_svc_scores = cross_val_score(clf_bag_svc, x, y, cv=10)
print("Bagging Classifier with SVC estimator CV scores:", bag_svc_scores)
print(
    "Bagging Classifier with SVC estimator CV mean score:",
    bag_svc_scores.mean(),
    sep="\n",
)

Bagging Classifier with SVC estimator CV scores: [0.97184466 0.92815534 0.87669903 0.94466019 0.96893204 0.97378641
 0.95825243 0.95825243 0.95048544 0.96112731]
Bagging Classifier with SVC estimator CV mean score:
0.9492195269231134


In [106]:
# Bagging of 10 decision trees, seeded with the notebook-wide random_state.
clf_bag_dt = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=random_state,
)
# Fit on the training split and take the accuracy on the hold-out split.
score = clf_bag_dt.fit(x_train, y_train).score(x_test, y_test)
# 10-fold cross-validation over the combined (x, y) data.
bag_dt_scores = cross_val_score(clf_bag_dt, x, y, cv=10)
print("Bagging Classifier with Decision Tree estimator CV scores:", bag_dt_scores)
print(
    "Bagging Classifier with Decision Tree estimator CV mean score:",
    bag_dt_scores.mean(),
    sep="\n",
)

Bagging Classifier with Decision Tree estimator CV scores: [0.91359223 0.89029126 0.82815534 0.92038835 0.93883495 0.95825243
 0.90679612 0.84854369 0.90097087 0.9271137 ]
Bagging Classifier with Decision Tree estimator CV mean score:
0.9032938945342351


In [33]:
# Discretize every feature into 4 equal-width bins (codes 0..3). The bin
# edges are derived from the training split only and reused for the test
# split. The number of features is read from the data instead of being
# hard-coded, so the cell keeps working if the feature set changes.
intervals = find_intervals(x_train, [4] * x_train.shape[1])
x_train_discrete = np.array(
    [np.digitize(column, bins=intervals[i]) for i, column in enumerate(x_train.T)]
).T
x_test_discrete = np.array(
    [np.digitize(column, bins=intervals[i]) for i, column in enumerate(x_test.T)]
).T
# Same row order as x / y: training rows first, then test rows.
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

In [34]:
# Bagging of 10 Categorical Naive Bayes models on the discretized features.
# min_categories=4 declares all four bin codes (0..3) for every feature, so a
# bootstrap sample or CV fold that misses one code can still score samples
# carrying it.
clf_bag_catNB = BaggingClassifier(
    estimator=CategoricalNB(min_categories=4),
    n_estimators=10,
    random_state=random_state,
)
clf_bag_catNB.fit(x_train_discrete, y_train)
score = clf_bag_catNB.score(x_test_discrete, y_test)
print(score)
# Cross-validate on the discretized data: CategoricalNB needs non-negative
# integer category codes, so the continuous matrix x cannot be used here.
bag_catNB_scores = cross_val_score(clf_bag_catNB, x_discrete, y, cv=10)
print("Bagging Classifier with Categorical NB estimator CV scores:", bag_catNB_scores)
print("Bagging Classifier with Categorical NB estimator CV mean score:")
print(bag_catNB_scores.mean())

0.8724126230064473


In [107]:
# Bagging of 10 Gaussian Naive Bayes models on the continuous features.
clf_bag_gaussNB = BaggingClassifier(
    estimator=GaussianNB(), n_estimators=10, random_state=random_state
)
clf_bag_gaussNB.fit(x_train, y_train)
score = clf_bag_gaussNB.score(x_test, y_test)  # accuracy on the hold-out split
print(score)
# 10-fold cross-validation over the combined (x, y) data.
bag_gaussNB_scores = cross_val_score(clf_bag_gaussNB, x, y, cv=10)
# Labels now name the estimator that is actually bagged (they used to say
# "Decision Tree").
print("Bagging Classifier with Gaussian NB estimator CV scores:", bag_gaussNB_scores)
print("Bagging Classifier with Gaussian NB estimator CV mean score:")
print(bag_gaussNB_scores.mean())

0.8082796063793688
Bagging Classifier with Decision Tree estimator CV scores: [0.76213592 0.67864078 0.58543689 0.75631068 0.68834951 0.70291262
 0.8        0.79223301 0.83786408 0.70456754]
Bagging Classifier with Decision Tree estimator CV mean score:
0.7308451036447867


### Testy na przygotowanej implementacji modeli zespołowych

Model zespołowy modeli SVC

In [36]:
# Project Ensemble built from the SVC class with 10 members.
clf_ens_svc = Ensemble(
    SVC, 10, random_state=random_state
)
clf_ens_svc.fit(x_train, y_train)
score = clf_ens_svc.score(x_test, y_test)  # accuracy on the hold-out split
print(score)
# 10-fold cross-validation over the combined (x, y) data.
ens_svc_scores = cross_val_score(clf_ens_svc, x, y, cv=10)
# Labels now describe this model (they used to read "Bagging Classifier with
# Decision Tree estimator").
print("Ensemble of SVC CV scores:", ens_svc_scores)
print("Ensemble of SVC CV mean score:")
print(ens_svc_scores.mean())

0.8764845605700713


Model zespołowy modeli Gaussian Naive Bayes

In [37]:
# Project Ensemble built from the GaussianNB class with 10 members.
clf_ens_gaussNB = Ensemble(
    GaussianNB, 10, random_state=random_state
)
clf_ens_gaussNB.fit(x_train, y_train)
score = clf_ens_gaussNB.score(x_test, y_test)  # accuracy on the hold-out split
print(score)
# 10-fold cross-validation over the combined (x, y) data.
ens_gaussNB_scores = cross_val_score(clf_ens_gaussNB, x, y, cv=10)
# Labels now describe this model (they used to read "Bagging Classifier with
# Decision Tree estimator").
print("Ensemble of Gaussian NB CV scores:", ens_gaussNB_scores)
print("Ensemble of Gaussian NB CV mean score:")
print(ens_gaussNB_scores.mean())

0.7366813708856464


Model zespołowy modeli Naive Bayes for categorical features

In [38]:
# Discretize every feature into 4 equal-width bins (codes 0..3). The bin
# edges are derived from the training split only and reused for the test
# split. The number of features is read from the data instead of being
# hard-coded, so the cell keeps working if the feature set changes.
intervals = find_intervals(x_train, [4] * x_train.shape[1])
x_train_discrete = np.array(
    [np.digitize(column, bins=intervals[i]) for i, column in enumerate(x_train.T)]
).T
x_test_discrete = np.array(
    [np.digitize(column, bins=intervals[i]) for i, column in enumerate(x_test.T)]
).T
# Same row order as x / y: training rows first, then test rows.
x_discrete = np.vstack([x_train_discrete, x_test_discrete])

In [40]:
# Project Ensemble built from the CategoricalNB class with 10 members, on the
# discretized features. min_categories lists 4 categories for each feature
# column; its length follows the data instead of a hard-coded feature count.
clf_ens_catNB = Ensemble(
    CategoricalNB,
    10,
    random_state=random_state,
    min_categories=[4] * x_train_discrete.shape[1],
)
clf_ens_catNB.fit(x_train_discrete, y_train)
score = clf_ens_catNB.score(x_test_discrete, y_test)
print(score)
# Cross-validate on the discretized data: CategoricalNB needs non-negative
# integer category codes, so the continuous matrix x cannot be used here.
ens_catNB_scores = cross_val_score(clf_ens_catNB, x_discrete, y, cv=10)
# Labels now describe this model (they used to read "Bagging Classifier with
# Decision Tree estimator").
print("Ensemble of Categorical NB CV scores:", ens_catNB_scores)
print("Ensemble of Categorical NB CV mean score:")
print(ens_catNB_scores.mean())

0.8218527315914489


Model zespołowy modeli Decision Tree Classifier

In [41]:
# Project Ensemble built from the DecisionTreeClassifier class with 10
# members, with max_attributes set to 30.
clf_ens_dt = Ensemble(
    DecisionTreeClassifier, 10, max_attributes=30, random_state=random_state
)
clf_ens_dt.fit(x_train, y_train)
score = clf_ens_dt.score(x_test, y_test)  # accuracy on the hold-out split
print(score)
# 10-fold cross-validation over the combined (x, y) data.
ens_dt_scores = cross_val_score(clf_ens_dt, x, y, cv=10)
# Labels now describe this model (they used to read "Bagging Classifier with
# Decision Tree estimator").
print("Ensemble of Decision Trees CV scores:", ens_dt_scores)
print("Ensemble of Decision Trees CV mean score:")
print(ens_dt_scores.mean())

0.8676620291822192
[0.91407767 0.89514563 0.92135922 0.90436893 0.89266634]


In [42]:
# Decision-tree Ensemble with max_attributes left at the Ensemble default.
# It gets its own name: it used to be stored in clf_ens_svc, which silently
# replaced the SVC ensemble with a decision-tree one.
clf_ens_dt_default = Ensemble(
    DecisionTreeClassifier, 10, random_state=random_state
)
clf_ens_dt_default.fit(x_train, y_train)
score = clf_ens_dt_default.score(x_test, y_test)  # accuracy on the hold-out split
print(score)

0.8680013573125213
