# Prosty przykład na Gradient Boosting

Wczytanie potrzebnych bibliotek

Zad.4.
Użyj funkcji load_dataset i prepare_dataset, aby wczytać zbiór glass, dokonaj jego standaryzacji (StandardScaler()) i naucz algorytm xgboost na tych danych (zbiór treningowy min. 80%). Spróbuj zminimalizować przeuczenie się algorytmu (parametry do ustawienia w prezentacji). Jako wynik końcowy wyświetl: gini na zbiorze treningowym i na zbiorze testowym.

In [70]:
# Fetch the example datasets; load_dataset() below reads its CSV files from the ML-datasets/ directory.
# When the directory already exists git only reports an error (see the captured output) and the cell does not raise.
!git clone https://github.com/matzim95/ML-datasets

fatal: destination path 'ML-datasets' already exists and is not an empty directory.


In [71]:
import pandas as pd

def load_dataset(filename, class_column, index_col=None):
    """Load ``ML-datasets/<filename>.csv`` and integer-encode its label column.

    Parameters
    ----------
    filename : str
        CSV file name without the ``.csv`` extension.
    class_column : str
        Name of the column holding the original class labels.
    index_col : optional
        Forwarded to ``pandas.read_csv`` as ``index_col``.

    Returns
    -------
    dataset : pandas.DataFrame
        The loaded frame with ``class_column`` removed and an integer
        ``'class'`` column (category codes) appended as the last column.
    classes : numpy.ndarray
        The distinct labels ordered so that ``classes[code]`` is the label
        encoded as ``code``. Missing labels are encoded as ``-1`` and do not
        appear in ``classes``.

    Raises
    ------
    KeyError
        If ``class_column`` is not a column of the loaded file.
    """
    dataset = pd.read_csv(f'ML-datasets/{filename}.csv', index_col=index_col)
    # Pop the raw labels first so that a label column which is itself named
    # 'class' is not overwritten by the encoded column added below.
    labels = dataset.pop(class_column).astype('category')
    dataset['class'] = labels.cat.codes
    # Use the category order (what the codes index into), not the order of
    # first appearance that Series.unique() would give.
    classes = labels.cat.categories.to_numpy()
    return dataset, classes

def prepare_dataset(dataset_name):
    """Load one of the known datasets and split it into features and target.

    ``dataset_name`` must be ``'iris'``, ``'wine'`` or ``'glass'``; any other
    name raises ``KeyError``. Returns ``(X, y, classes)`` where ``X`` is the
    feature frame, ``y`` the integer-encoded ``'class'`` column removed from
    it, and ``classes`` the label array returned by ``load_dataset``.
    """
    # Per-dataset arguments for load_dataset: label column and index column.
    loader_options = {
        'iris': dict(class_column='species', index_col=None),
        'wine': dict(class_column='Class', index_col=None),
        'glass': dict(class_column='Type', index_col='ID'),
    }
    options = loader_options[dataset_name]
    features, classes = load_dataset(dataset_name, **options)
    # Whatever remains after removing the encoded target is the feature matrix.
    target = features.pop('class')
    return features, target, classes

In [72]:
# Load the glass dataset: feature frame, integer-encoded target and the class labels.
X, y, classes = prepare_dataset('glass')

Bagging
---

In [73]:
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Normalizer, StandardScaler


In [74]:
# Two candidate preprocessors, both imported from sklearn.preprocessing in the cell above.
normalizer = Normalizer()
standarizer = StandardScaler()
# Select the standardiser as the active preprocessor.
# NOTE(review): the later preprocessing cell passes `standarizer` directly, so
# `preprocessor` appears unused in the visible cells — confirm before removing.
preprocessor = standarizer

## Testy na zbiorach danych

In [75]:

def preprocess_data(X, preprocessor=None):
    """Return ``X`` transformed by ``preprocessor`` with its labels preserved.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame to transform. It is not modified.
    preprocessor : object with ``fit`` and ``transform``, optional
        Transformer to fit on ``X`` and apply to it. When ``None`` the input
        frame is returned unchanged (the same object).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the transformed values under the index and
        column labels of ``X``.
    """
    if preprocessor is None:
        return X
    # Fit and transform the same DataFrame. Fitting on the frame but
    # transforming ``X.values`` makes scikit-learn warn that "X does not have
    # valid feature names", because the names seen during fit are missing.
    preprocessor.fit(X)
    transformed = preprocessor.transform(X)
    # Build a fresh frame instead of assigning into a copy through ``[:]``, so
    # the result dtype comes from the transformer output, not from X's columns.
    return pd.DataFrame(transformed, index=X.index, columns=X.columns)

In [76]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

def calculate_metrics(target, prediction, average='macro'):
    """Compute accuracy, precision, recall, F1 and the misclassification count.

    Parameters
    ----------
    target, prediction : array-like
        True and predicted labels of equal length (lists, arrays or Series).
    average : str or None
        Forwarded to ``precision_score``, ``recall_score`` and ``f1_score``:
          None       - a separate score for every class
          'binary'   - score for the label selected via `pos_label`
          'micro'    - computed from the total TP, FN, FP counts
          'macro'    - unweighted mean over the classes
          'weighted' - weighted mean over the classes

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, mislabeled, total)`` where
        ``mislabeled`` is the number of positions at which the prediction
        differs from the target and ``total`` is ``len(target)``.
    """
    accuracy = accuracy_score(target, prediction)
    precision = precision_score(target, prediction, average=average)
    recall = recall_score(target, prediction, average=average)
    f1 = f1_score(target, prediction, average=average)
    # Compare position by position as arrays: comparing two plain lists with
    # ``!=`` yields a single bool (no ``.sum()``), and two Series would be
    # compared by index label, not by position.
    mislabeled = (np.asarray(target) != np.asarray(prediction)).sum()
    total = len(target)
    return accuracy, precision, recall, f1, mislabeled, total

def print_results(metrics, classifier_id='classifier'):
    """Print a short text report for one classifier.

    ``metrics`` is indexed by position: 0 accuracy, 1 precision, 2 recall,
    3 F1 score, 4 number of mislabeled samples, 5 total number of samples
    (the order returned by ``calculate_metrics``). ``classifier_id`` is shown
    in the report header. Nothing is returned.
    """
    print(f'Results for {classifier_id}')
    print('----')
    # Labels are padded by hand so that the values line up in one column.
    score_rows = (
        ('  Accuracy:  ', 0),
        ('  Precision: ', 1),
        ('  Recall:    ', 2),
        ('  F1 score:  ', 3),
    )
    for label, position in score_rows:
        print(f'{label}{metrics[position]}')
    print(f'  Mislabeled {metrics[4]} out of {metrics[5]}')
    # A newline on top of print's own: leaves two blank lines after the report.
    print('\n')
    
def plot_confusion_matrix(confusion_matrix, classes, title=None,
                          title_appendix='',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and return its Axes.

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        Matrix to draw; rows are labelled 'True label', columns
        'Predicted label'.
    classes : sequence
        Tick labels used on both axes.
    title : str, optional
        Plot title. When ``None`` the title is ``'Confusion matrix '``
        followed by the appendix.
    title_appendix : str
        Extra text for the default title; wrapped in parentheses when
        non-empty.
    cmap : matplotlib colormap
        Colormap passed to ``imshow``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    # Plot title
    suffix = f'({title_appendix})' if title_appendix else title_appendix
    heading = f'Confusion matrix {suffix}' if title is None else title

    fig, ax = plt.subplots()
    # Draw the matrix and add the colour bar beside it
    heatmap = ax.imshow(confusion_matrix, cmap=cmap)
    ax.figure.colorbar(heatmap, ax=ax)

    # Axis decorations: one tick per row/column, labels rotated about their anchor
    label_style = {'rotation': 45, 'ha': 'right', 'rotation_mode': 'anchor'}
    ax.set_xticks(np.arange(confusion_matrix.shape[1]))
    ax.set_xticklabels(classes, **label_style)
    ax.set_yticks(np.arange(confusion_matrix.shape[0]))
    ax.set_yticklabels(classes, **label_style)
    ax.set_title(heading)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Cell labels: two decimals for float matrices, integers otherwise; text
    # turns white on cells above half of the maximum so it stays readable.
    number_format = '.2f' if confusion_matrix.dtype == 'float' else 'd'
    half_max = confusion_matrix.max() / 2
    for row_idx, row in enumerate(confusion_matrix):
        for col_idx, value in enumerate(row):
            text_colour = 'white' if value > half_max else 'black'
            ax.text(col_idx, row_idx, format(value, number_format),
                    ha='center', va='center', color=text_colour)
    fig.tight_layout()

    return ax

def normalize_confusion_matrix(confusion_matrix):
    """Scale every row of a confusion matrix so that it sums to 1.

    Parameters
    ----------
    confusion_matrix : numpy.ndarray
        2-D matrix of counts, one row per true class.

    Returns
    -------
    numpy.ndarray
        Float matrix of the same shape in which each row is divided by its
        row total. A row whose total is 0 (a class with no true samples) is
        returned as all zeros instead of NaN.
    """
    counts = confusion_matrix.astype('float')
    row_totals = counts.sum(axis=1, keepdims=True)
    # Divide only where the row total is non-zero; the pre-filled zeros remain
    # for empty rows, which avoids the 0/0 -> NaN result and its RuntimeWarning.
    return np.divide(counts, row_totals,
                     out=np.zeros_like(counts), where=row_totals != 0)

In [77]:
# Reload the glass data so this cell starts from the raw frame, then standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split made further down,
# so test-set statistics take part in the preprocessing — consider fitting on the training part only.
X, y, classes = prepare_dataset('glass')
X = preprocess_data(X, standarizer)
# Summary statistics of the transformed features (means near 0 in the output below).
X.describe()

  "X does not have valid feature names, but"


Unnamed: 0,refractive index,Sodium,Magnesium,Aluminum,Silicon,Potassium,Calcium,Barium,Iron
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,-2.877449e-14,2.17998e-15,-2.801497e-16,-3.434428e-16,9.966067e-16,7.47066e-17,-3.137418e-16,-1.763906e-16,-6.121791e-17
std,1.002345,1.002345,1.002345,1.002345,1.002345,1.002345,1.002345,1.002345,1.002345
min,-2.381516,-3.286943,-1.865511,-2.318616,-3.676472,-0.7639186,-2.484084,-0.3528768,-0.5864509
25%,-0.6082728,-0.614158,-0.3957744,-0.511756,-0.4800288,-0.5756501,-0.5049657,-0.3528768,-0.5864509
50%,-0.2262293,-0.1323817,0.5527787,-0.1704602,0.1799655,0.08905322,-0.2514132,-0.3528768,-0.5864509
75%,0.2614331,0.5120326,0.636168,0.3715977,0.5649621,0.173582,0.1518057,-0.3528768,0.4422417
max,5.137232,4.875637,1.254639,4.125851,3.570524,8.780145,5.094318,5.99721,4.659881


In [78]:
# NOTE(review): the alias `npm` looks like a typo for `np`; no visible cell refers to `npm` — confirm and fix.
# NOTE(review): pandas and accuracy_score are already imported in earlier cells.
import numpy as npm
import pandas as pd
import xgboost as xgb 

from sklearn.metrics import roc_auc_score, accuracy_score # load the success metrics
from sklearn.model_selection import train_test_split

# Show up to 250 columns when a DataFrame is displayed.
pd.options.display.max_columns = 250

#### Funkcje pomocnicze

In [79]:
def get_feats(df):
    """Return the column names of ``df`` to use as model features.

    The columns 'ID_code' and 'target' are left out so that they are not fed
    to the model; all other columns are kept in their original order.
    """
    excluded = ('ID_code', 'target')
    return [column for column in df.columns if column not in excluded]

def get_X(df):
    """Return the feature matrix X: the values of the columns chosen by ``get_feats``."""
    feature_columns = get_feats(df)
    feature_frame = df[feature_columns]
    return feature_frame.values

def get_y(df, target_var='target'):
    """Return the target vector y: the values of column ``target_var`` ('target' by default)."""
    target_column = df[target_var]
    return target_column.values

#### Podział zbioru do nauki modelu, testowy i walidacyjny

In [80]:
# Display the integer-encoded target returned by prepare_dataset (indexed by ID).
y

ID
1      0
2      0
3      0
4      0
5      0
      ..
210    3
211    3
212    3
213    3
214    3
Name: class, Length: 214, dtype: int8

In [81]:
# Convert to plain NumPy arrays. np.asarray leaves an existing array as it is,
# so this cell can be re-run; calling .to_numpy() a second time would fail
# because X and y are already arrays by then.
X, y = np.asarray(X), np.asarray(y)
# Hold out 20% of the rows as the test set (fixed seed for reproducibility);
# stratify=y keeps the class proportions the same in both parts.
# Only a train/test split is made here; no separate validation set is created.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2019, stratify = y)

print('TRAIN:',X_train.shape, y_train.shape)
print('TEST:',X_test.shape, y_test.shape)

TRAIN: (171, 9) (171,)
TEST: (43, 9) (43,)


#### Walidacja modelu
Zdefiniujmy funkcje, dzięki którym od razu sprawdzimy, jak model działa.<br>
W tym przypadku przyglądnijmy się metryce GINI.

In [88]:
def create_measures(y, y_pred):
    """Return a one-row DataFrame with the AUC and GINI of a prediction.

    ``y`` holds the true labels and ``y_pred`` the scores passed to
    ``roc_auc_score`` with ``multi_class='ovr'``. GINI is computed as
    ``2 * AUC - 1``; both values are rounded to 4 decimal places.
    """
    auc = roc_auc_score(y, y_pred, multi_class='ovr')
    gini = 2*auc - 1

    columns = {'AUC': [round(auc,4)], 'GINI': [round(gini,4)]}
    return pd.DataFrame.from_dict(columns)

def calculating_metrics(X_train, X_test, y_train, y_test, estimator=None):
    """Build a TRAIN/TEST table of AUC and GINI for a fitted classifier.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices of the training and test parts.
    y_train, y_test : array-like
        Matching true labels.
    estimator : fitted classifier with ``predict_proba``, optional
        Model to evaluate. When omitted, the notebook-level variable
        ``model`` is used, so existing four-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Two rows indexed 'TRAIN' and 'TEST' with the columns produced by
        ``create_measures`` (AUC and GINI).
    """
    # Prefer an explicitly passed estimator; falling back to the global keeps
    # the original behaviour but depends on which model cell ran last.
    clf = model if estimator is None else estimator
    train = create_measures(y_train, clf.predict_proba(X_train))
    test = create_measures(y_test, clf.predict_proba(X_test))

    measures = pd.concat([train, test]).set_index([pd.Index(['TRAIN', 'TEST'])])

    return measures

#### Budowa modelu XGB na domyślnych parametrach

In [89]:
# Baseline: XGBoost classifier created without any explicit hyperparameters.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)  
# calculating_metrics reads the notebook-level `model` fitted just above.
measures = calculating_metrics(X_train, X_test, y_train, y_test)
# The output below shows GINI 1.0 on TRAIN against about 0.71 on TEST, i.e. the baseline overfits.
measures

Unnamed: 0,AUC,GINI
TRAIN,1.0,1.0
TEST,0.8527,0.7053


#### Budowa modelu XGB na zmienionych parametrach

In [None]:
# XGBoost with a depth limit plus row and column subsampling.
model = xgb.XGBClassifier(max_depth = 5, n_estimators=150, subsample = 0.75, colsample_bytree=0.75)
model.fit(X_train, y_train)
# The notebook only creates a train/test split (X_val / y_val are never defined)
# and calculating_metrics takes the four train/test arrays, so pass exactly those.
measures = calculating_metrics(X_train, X_test, y_train, y_test)
measures

#### Budowa modelu LightGBM na zmienionych parametrach

In [None]:
import lightgbm as lgb

# LightGBM counterpart of the tuned XGBoost model above.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is > 0 — confirm
# whether row subsampling is actually intended to be active here.
model = lgb.LGBMClassifier(max_depth = 5, n_estimators = 150, subsample=0.75)
model.fit(X_train, y_train)
# The notebook only creates a train/test split (X_val / y_val are never defined)
# and calculating_metrics takes the four train/test arrays, so pass exactly those.
measures = calculating_metrics(X_train, X_test, y_train, y_test)
measures

Widać, że różnica między XGB a LightGBM jest nieznaczna.

In [None]:
# Display the feature matrix (converted to a NumPy array in the split cell).
X

In [None]:
# Display the target vector (converted to a NumPy array in the split cell).
y