# ZUM - Projekt

Import potrzebnych narzędzi.

In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import cross_val_score

Import przygotowanej klasy Ensemble.

In [3]:
from src.ensembles import Ensemble

Plan badań
1. Modele zespołowe dla każdego z algorytmów (SVC, DecisionTreeClassifier,  CategoricalNB, GaussianNB)
2. Dla każdego z modelów zespołowych testy dla 5, 10, 50, 100 modeli w zespole.
3. Analogiczne testy dla Bagging Classifier
4. Analogiczne testy dla Random Forest Classifier (liczba drzew)
5. Testy na algorytmach konwencjonalnych
6. Wszystko wyżej dla CV = 10
7. Wszystko wyżej powtórzone dla dwóch zbiorów danych.

** dodatkowo różne opcje podzialu atrybutow na modele w modelach zespolowych

w sumie (4*4 + 4*4 + 4 + 4)*2 = 80

## Human activity

In [5]:
random_state = 3

Załadowanie danych

In [6]:
x_train = np.loadtxt("human_activity/X_train.txt", dtype = float)
x_test = np.loadtxt("human_activity/X_test.txt", dtype = float)
y_train = np.loadtxt("human_activity/y_train.txt", dtype = int)
y_test = np.loadtxt("human_activity/y_test.txt", dtype = int)

x = np.vstack([x_train, x_test])
y = np.append(y_train, y_test)
#x_train, x_test, y_train, y_test = train_test_split(
#    x, y, test_size=0.99, random_state=random_state
#)

Funkcja pomocnicza do dyskretyzacji danych

In [11]:
def find_intervals(x_train, group_vector):  # auxilary values for data disrcetization
    intervals = np.array([np.zeros(i - 1) for i in group_vector])

    for i, features in enumerate(x_train.T):
        max_value = max(features)
        min_value = min(features)
        section_size = (max_value - min_value) / group_vector[i]
        intervals[i] = np.array(
            [min_value + section_size * j for j in range(1, group_vector[i])]
        )
    return intervals

### Testy na pojedynczych modelach

SVC

In [None]:
clf_svc = SVC()
clf_svc.fit(x_train, y_train)
score = clf_svc.score(x_test, y_test)
print(score)

Gaussian Naive Bayes

In [None]:
clf_gaussNB = GaussianNB()
clf_gaussNB.fit(x_train, y_train)
score =  clf_gaussNB.score(x_test, y_test)
print(score)

Naive Bayes classifier for categorical features

In [None]:
intervals = find_intervals(x_train, [4] * 30)
x_train_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_train.T)]).T
x_test_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_test.T)]).T

In [None]:
clf_catNB = CategoricalNB()
clf_catNB.fit(x_train_discrete, y_train)
score =  clf_catNB.score(x_test_discrete, y_test)
print(score)

Decision Tree Classifier

In [None]:
clf_dt = DecisionTreeClassifier()
clf_dt.fit(x_train, y_train)
score = clf_dt.score(x_test, y_test)
print(score)
scores = cross_val_score(clf_dt, x, y, cv=5)
print(scores)

### Testy na modelach zespołowych z biblioteki scikit-learn

Random Forest Classifier

In [None]:
clf_rf = RandomForestClassifier(n_estimators=10, random_state=random_state)
clf_rf.fit(x_train, y_train)
score = clf_rf.score(x_test, y_test)
print(score)

Baggin Classifier

In [7]:
clf_bag_svc = BaggingClassifier(estimator=SVC(),n_estimators=10, random_state=random_state)
clf_bag_svc.fit(x_train, y_train)
score = clf_bag_svc.score(x_test, y_test)
print(score)

0.9501187648456056


In [8]:
clf_bag_dt = BaggingClassifier(estimator=DecisionTreeClassifier(),n_estimators=10, random_state=random_state)
clf_bag_dt.fit(x_train, y_train)
score = clf_bag_dt.score(x_test, y_test)
print(score)

0.8808958262639973


In [12]:
intervals = find_intervals(x_train, [4] * 30)
x_train_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_train.T)]).T
x_test_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_test.T)]).T

IndexError: list index out of range

In [None]:
clf_bag_catNB = CategoricalNB(estimator=SVC(),n_estimators=10, random_state=random_state)
clf_bag_catNB.fit(x_train, y_train)
score = clf_bag_catNB.score(x_test, y_test)
print(score)

In [None]:
clf_bag_gaussNB = GaussianNB(estimator=SVC(),n_estimators=10, random_state=random_state)
clf_bag_gaussNB.fit(x_train, y_train)
score = clf_bag_gaussNB.score(x_test, y_test)
print(score)

### Testy na przygotowanej implementacji modeli zespołowych

Model zespołowy modeli SVC

In [None]:
clf_ens_svc = Ensemble(
    SVC, 10, random_state=random_state
)
clf_ens_svc.fit(x_train, y_train)
score = clf_ens_svc.score(x_test, y_test)
print(score)

Model zespołowy modeli Gaussian Naive Bayes

In [None]:
clf_ens_gaussNB = Ensemble(
    GaussianNB, 10, random_state=random_state
)
clf_ens_gaussNB.fit(x_train, y_train)
score = clf_ens_gaussNB.score(x_test, y_test)
print(score)

Model zespołowy modeli Naive Bayes for categorical features

In [None]:
intervals = find_intervals(x_train, [4] * 30)
x_train_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_train.T)]).T
x_test_discrete = np.array([np.digitize(column , bins=intervals[i]) for i, column in enumerate(x_test.T)]).T

In [None]:
clf_ens_catNB = Ensemble(
    CategoricalNB, 10, random_state=random_state, min_categories=[4] * 30
)
clf_ens_catNB.fit(x_train_discrete, y_train)
score = clf_ens_catNB.score(x_test_discrete, y_test)
print(score)

Model zespołowy modeli Decision Tree Classifier

In [None]:
clf_ens_dt = Ensemble(
    DecisionTreeClassifier, 10, max_attributes= 30, random_state=random_state
)
clf_ens_dt.fit(x_train, y_train)
score = clf_ens_dt.score(x_test, y_test)
print(score)
scores = cross_val_score(clf_ens_dt, x, y, cv=5)
print(scores)

0.8571428571428571
[0.88883495 0.82864078 0.87330097 0.8592233  0.87178242]
0.8639294197488971
[0.90631068 0.88737864 0.9131068  0.90679612 0.89218067]


In [None]:
clf_ens_svc = Ensemble(
    DecisionTreeClassifier, 10, random_state=random_state
)
clf_ens_svc.fit(x_train, y_train)
score = clf_ens_svc.score(x_test, y_test)
print(score)

0.8988802171700034
0.8703766542246352
