In [1]:
import warnings
import numpy as np
import pandas as pd
import xgboost
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Wielowarstwowa sieć neuronowa

(*Multilayer perceptron*, *feedforward neural network*)



**Uwaga:** "Input layer", pomimo tego, że ma w nazwie słowo "warstwa", tak naprawdę nie jest żadną warstwą sieci... To są po prostu dane wejściowe... Niestety w literaturze przyjęło się nazywanie tego w ten sposób, co jest mylące :(


Sieci uczy się metodą spadku gradientu (a dokładniej pewnymi wariantami tej metody). Uczenie wykorzystuje algorytm **propagacji wstecznej** (https://en.wikipedia.org/wiki/Backpropagation).

<br>

<br>

<br>

**Uwaga!** Sieci neuronowe absolutnie zawsze wymagają zestandaryzowanych danych! Niezależnie od tego czy wykorzystujemy regularyzację czy nie i niezależnie od typu sieci!

<br>

<br>

### Fakt matematyczny: jednowarstwową siecią możemy otrzymać dowolny kształt. 

Co z tego wynika? To, że (teoretycznie) zawsze wystarczy sieć jednowarstwowa (odpowiednio duża). W praktyce rzeczywiście z reguły wystarcza jedna warstwa, ale mimo wszystko zawsze warto sprawdzić, czy 2 (lub 3) nie zadziałają przypadkiem lepiej. Przy czym jeżeli dla dwóch warstw jest gorzej, to nie ma sensu sprawdzać dla większej liczby.

# Zad
* Wczytaj zbiór danych - pima-indians-diabetes.data
* Podziel dane na train test
* Wykonaj uczenie modeli (dobierz najlepsze parametry)
    * LogisticRegression
    * LinearSVC
    * SVC
    * KNeighborsClassifier
    * DecisionTreeClassifier
    * RandomForestClassifier
    * BaggingClassifier
    * ExtraTreesClassifier
    * AdaBoostClassifier
    * GradientBoostingClassifier
    * VotingClassifier
    * xgboost.XGBClassifier
* Porównaj wyniki na zbiorze uczącym    

In [2]:
# Load the diabetes table (the header row is skipped). Columns 0-7 become the
# feature matrix and column 8 the label vector.
dataset = np.genfromtxt('data/diabetes.csv', delimiter=',', skip_header=1)
X, Y = dataset[:, :8], dataset[:, 8]

# Quick sanity check: matrix dimensions and the mean of the label column.
print(X.shape)
print(Y.mean())

(768, 8)
0.3489583333333333


In [3]:
# Hold-out split used for the final comparison of the tuned models.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Cross-validation splitter shared by the grid searches.
# Note: `seed` is rebound here, so cells below see seed == 123.
seed = 123
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [4]:
# Each entry maps a display name to (estimator, parameter grid). Grid keys carry
# the 'classifier__' prefix because the estimator is wrapped in a Pipeline step
# named 'classifier' when the grid search is run.
#
# Estimators that draw random numbers are given random_state=seed so that a
# fresh Restart & Run All reproduces the same scores.
models_and_parameters = {
    'LogisticRegression': (LogisticRegression(max_iter=1000), {'classifier__C': [0.1, 1, 10]}),
    'LinearSVC': (LinearSVC(max_iter=10000), {'classifier__C': [0.1, 1, 10]}),
    'SVC': (SVC(), {'classifier__C': [0.1, 1, 10], 'classifier__gamma': [0.1, 1, 10]}),
    'KNeighborsClassifier': (KNeighborsClassifier(), {'classifier__n_neighbors': [3, 5, 7]}),
    'DecisionTreeClassifier': (DecisionTreeClassifier(random_state=seed),
                               {'classifier__max_depth': [3, 5, 7]}),
    'RandomForestClassifier': (RandomForestClassifier(random_state=seed),
                               {'classifier__n_estimators': [100, 200, 300]}),
    'BaggingClassifier': (BaggingClassifier(random_state=seed),
                          {'classifier__n_estimators': [10, 20, 30]}),
    'ExtraTreesClassifier': (ExtraTreesClassifier(random_state=seed),
                             {'classifier__n_estimators': [100, 200, 300]}),
    'AdaBoostClassifier': (AdaBoostClassifier(random_state=seed),
                           {'classifier__n_estimators': [50, 100, 200]}),
    'GradientBoostingClassifier': (GradientBoostingClassifier(random_state=seed),
                                   {'classifier__n_estimators': [100, 200, 300]}),
    # The voting ensemble has no hyper-parameters of its own in this search.
    # Its logistic regression gets the same iteration budget as the stand-alone
    # one above, because the search also tries unscaled features.
    'VotingClassifier': (VotingClassifier(
        estimators=[('lr', LogisticRegression(max_iter=1000)),
                    ('rf', RandomForestClassifier(random_state=seed)),
                    ('gnb', GaussianNB())],
        voting='soft'), {}),
    'XGBClassifier': (xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                      {'classifier__n_estimators': [100, 200, 300]})
}

# Let every search also decide whether standardising the features helps:
# 'preprocessing' is the name of the first Pipeline step, None disables it.
for _, model_param_grid in models_and_parameters.values():
    model_param_grid['preprocessing'] = [StandardScaler(), None]

In [5]:
# Tune every candidate with cross-validated grid search and keep the refitted
# best pipeline of each one for the test-set comparison.
best_models = []

for model_name, (model, params) in models_and_parameters.items():
    # The scaler placed here is only the initial value of the step; the grid
    # contains a 'preprocessing' entry that may replace it.
    steps = [('preprocessing', StandardScaler()), ('classifier', model)]
    pipe = Pipeline(steps)

    # Warnings raised while fitting are silenced to keep the output readable.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid = GridSearchCV(pipe, params, cv=kfold, return_train_score=True)
        grid.fit(X_train, y_train)

    best_models.append((model_name, grid.best_estimator_))

    print(f"Best parameters for {model_name}: {grid.best_params_}")
    print(f"Best cross-validation score for {model_name}: {grid.best_score_}")
    print("-" * 20)

Best parameters for LogisticRegression: {'classifier__C': 1, 'preprocessing': StandardScaler()}
Best cross-validation score for LogisticRegression: 0.7684751570531125
--------------------
Best parameters for LinearSVC: {'classifier__C': 0.1, 'preprocessing': StandardScaler()}
Best cross-validation score for LinearSVC: 0.7645726251665714
--------------------
Best parameters for SVC: {'classifier__C': 1, 'classifier__gamma': 0.1, 'preprocessing': StandardScaler()}
Best cross-validation score for SVC: 0.7490576813249572
--------------------
Best parameters for KNeighborsClassifier: {'classifier__n_neighbors': 5, 'preprocessing': StandardScaler()}
Best cross-validation score for KNeighborsClassifier: 0.7354654483152484
--------------------
Best parameters for DecisionTreeClassifier: {'classifier__max_depth': 5, 'preprocessing': None}
Best cross-validation score for DecisionTreeClassifier: 0.7237578526556254
--------------------
Best parameters for RandomForestClassifier: {'classifier__n_es

In [6]:
def score_model(model, X=None, y=None):
    """Evaluate a fitted binary classifier on a labelled data set.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and either ``decision_function`` or
        ``predict_proba``.
    X, y : optional
        Features and true labels to score on. When omitted, the notebook-level
        ``X_test`` / ``y_test`` are used, looked up at call time.

    Returns
    -------
    tuple
        ``(precision, recall, f1, accuracy, roc_auc)``.
    """
    # Resolve the defaults on every call so the function follows whatever
    # X_test / y_test currently are in the notebook namespace.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = model.predict(X)
    precision = metrics.precision_score(y, y_pred)
    recall = metrics.recall_score(y, y_pred)
    f1 = metrics.f1_score(y, y_pred)
    accuracy = metrics.accuracy_score(y, y_pred)

    # ROC AUC needs a continuous score. Choose the source by capability instead
    # of a bare `except`, so a genuine failure inside decision_function or
    # roc_auc_score is reported rather than silently replaced by a fallback.
    if hasattr(model, "decision_function"):
        y_score = model.decision_function(X)
    else:
        y_score = model.predict_proba(X)[:, 1]
    roc_auc = metrics.roc_auc_score(y, y_score)

    return precision, recall, f1, accuracy, roc_auc

In [7]:
# Empty results table; one row per evaluated model is added afterwards.
df = pd.DataFrame(
    columns=[
        'Method',
        'precision_score',
        'recall_score',
        'f1_score',
        'accuracy',
        'roc_auc_score',
    ]
)

In [8]:
# Score every tuned model on the hold-out set; the row label equals the
# model's position in `best_models`.
for row_idx, (model_name, model) in enumerate(best_models):
    df.loc[row_idx] = [model_name, *score_model(model)]
df.head(12)

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy,roc_auc_score
0,LogisticRegression,0.734177,0.630435,0.678363,0.783465,0.828435
1,LinearSVC,0.734177,0.630435,0.678363,0.783465,0.829777
2,SVC,0.705128,0.597826,0.647059,0.76378,0.835749
3,KNeighborsClassifier,0.595238,0.543478,0.568182,0.700787,0.747585
4,DecisionTreeClassifier,0.698413,0.478261,0.567742,0.73622,0.776067
5,RandomForestClassifier,0.666667,0.652174,0.659341,0.755906,0.814513
6,BaggingClassifier,0.694118,0.641304,0.666667,0.767717,0.810722
7,ExtraTreesClassifier,0.6875,0.597826,0.639535,0.755906,0.818673
8,AdaBoostClassifier,0.625,0.597826,0.611111,0.724409,0.787607
9,GradientBoostingClassifier,0.663043,0.663043,0.663043,0.755906,0.814278


# MLPClassifier

Dodajmy model sieci neuronowej

In [9]:
# Baseline network with two hidden layers (20 and 10 units), fitted directly
# on X_train without any scaling step.
model = MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=1000, random_state=123)
model.fit(X_train, y_train)

# Probability of the positive class; rounding it gives the hard prediction.
y_pred = model.predict_proba(X_test)[:, 1]
predictions = np.round(y_pred)

accuracy = metrics.accuracy_score(y_test, predictions)
auc = metrics.roc_auc_score(y_true=y_test, y_score=y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0), "AUC: ", auc)

Accuracy: 70.87% AUC:  0.7157139023081052


# Zad
Wykonaj Walidację krzyżową

In [10]:
# Cross-validated search over the MLP training settings.
# random_state fixes the weight initialisation and batch shuffling, so the
# selected parameters and scores are the same on every run (the baseline MLP
# above uses the same seed).
# NOTE: max_iter=5000 given here is always overridden by
# 'classifier__max_iter' in the grid below, for the search and for the refit.
pipe = Pipeline([
    ('preprocessing', StandardScaler()),
    ('classifier', MLPClassifier(max_iter=5000, random_state=123)),
])

param_grid = {
    # None disables the scaling step, so both variants are compared.
    'preprocessing': [StandardScaler(), None],
    'classifier__hidden_layer_sizes': [(20, 10)],
    'classifier__learning_rate_init': [0.001, 0.01, 0.1],
    'classifier__max_iter': [100],
    'classifier__batch_size': [8, 16, 32],
}

grid_2 = GridSearchCV(pipe, param_grid, cv=kfold, return_train_score=True)
# Warnings raised while fitting are silenced to keep the output readable.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_2.fit(X_train, y_train)

grid_2.best_params_

{'classifier__batch_size': 32,
 'classifier__hidden_layer_sizes': (20, 10),
 'classifier__learning_rate_init': 0.001,
 'classifier__max_iter': 100,
 'preprocessing': StandardScaler()}

In [11]:
# Add the tuned MLP as a new row labelled with the current row count.
# Re-running this cell adds the row again.
mlp_scores = score_model(grid_2.best_estimator_)
df.loc[len(df)] = ['MLP', *mlp_scores]

In [12]:
# Display the comparison table after the MLP row was appended.
df

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy,roc_auc_score
0,LogisticRegression,0.734177,0.630435,0.678363,0.783465,0.828435
1,LinearSVC,0.734177,0.630435,0.678363,0.783465,0.829777
2,SVC,0.705128,0.597826,0.647059,0.76378,0.835749
3,KNeighborsClassifier,0.595238,0.543478,0.568182,0.700787,0.747585
4,DecisionTreeClassifier,0.698413,0.478261,0.567742,0.73622,0.776067
5,RandomForestClassifier,0.666667,0.652174,0.659341,0.755906,0.814513
6,BaggingClassifier,0.694118,0.641304,0.666667,0.767717,0.810722
7,ExtraTreesClassifier,0.6875,0.597826,0.639535,0.755906,0.818673
8,AdaBoostClassifier,0.625,0.597826,0.611111,0.724409,0.787607
9,GradientBoostingClassifier,0.663043,0.663043,0.663043,0.755906,0.814278


# Wczytaj dane treningowe i testowe

In [13]:
# Read in the data
train_set = pd.read_csv('data/adult/adult.data', sep=", ", header=None, engine='python')
test_set = pd.read_csv('data/adult/adult.test', sep=", ", skiprows=1,
                       header=None, engine='python')  

In [14]:
# Assign column names to the data
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
              'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
              'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

In [15]:
# Modify data to make it easier to work with
train = train_set.replace('?', np.nan).dropna()
test = test_set.replace('?', np.nan).dropna()

dataset = pd.concat([train, test])

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    dataset['wage_class'] = dataset.wage_class.replace({'<=50K.': 0, '<=50K': 0, '>50K.': 1, '>50K': 1})
    dataset.drop(["fnlwgt"], axis=1, inplace=True)
    dataset.drop(["education"], axis=1, inplace=True)
    x = dataset.groupby('native_country')["wage_class"].mean()
    d = dict(pd.cut(x[x.index != " United-States"], 5, labels=range(5)))
    dataset['native_country'] = dataset['native_country'].replace(d)
    dataset = pd.get_dummies(dataset, drop_first=True)

In [16]:
# Split the data back into the original train and test sets
train = dataset.iloc[:train.shape[0]]
test = dataset.iloc[train.shape[0]:]

X_train = train.drop("wage_class", axis=1)
y_train = train.wage_class

X_test = test.drop("wage_class", axis=1)
y_test = test.wage_class

In [17]:
# Report the dimensions of the training and test feature matrices.
for features in (X_train, X_test):
    print(features.shape)

(30162, 41)
(15060, 41)


# Zad
Porównaj wyniki sieci na:
* oryginalnych danych 
* na wystandaryzowanych

In [18]:
# Empty results table for the scaled-vs-unscaled network comparison.
df_nn = pd.DataFrame(
    columns=[
        'Method',
        'precision_score',
        'recall_score',
        'f1_score',
        'accuracy',
        'roc_auc_score',
    ]
)

In [19]:
# Network fitted on the features as they are (no scaling step).
model = MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=1000, random_state=123)
model.fit(X_train, y_train)
df_nn.loc[0] = ['MLP - original data', *score_model(model)]

In [20]:
# Standardise using statistics learned from the training part only.
# X_train / X_test are overwritten with the scaled arrays, so re-running this
# cell scales data that is already scaled.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Same architecture and seed as the unscaled run, for a like-for-like comparison.
model = MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=1000, random_state=123)
model.fit(X_train, y_train)
df_nn.loc[1] = ['MLP - scaled data', *score_model(model)]

In [21]:
# Display the comparison of the network on original vs. scaled data.
df_nn

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy,roc_auc_score
0,MLP - original data,0.700437,0.65027,0.674422,0.84575,0.898022
1,MLP - scaled data,0.722331,0.596216,0.653243,0.844489,0.900495
