Zespoły klasyfikatorów
----

In [1]:
!git clone https://github.com/matzim95/ML-datasets

fatal: destination path 'ML-datasets' already exists and is not an empty directory.


In [7]:
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

def load_dataset(filename, class_column, index_col=None):
    dataset = pd.read_csv(f'/content/drive/MyDrive/Klasyfikacja/zajecia/group/ML-datasets/{filename}.csv', index_col=index_col)
    dataset['class'] = dataset[class_column].astype('category').cat.codes
    classes = dataset.pop(class_column).unique()
    return dataset, classes

def prepare_dataset(dataset_name):
    params = {'iris': {'class_column': 'species', 'index_col': None},
              'wine': {'class_column': 'Class', 'index_col': None},
              'glass': {'class_column': 'Type', 'index_col': 'ID'}}
    dataset, classes = load_dataset(dataset_name,
                                    **params[dataset_name])
    y = dataset.pop('class')
    X = dataset
    return X, y, classes

Przetestujmy jak działa bagging:

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix

# average:
#   None - wyniki dla każdej klasy osobno
#   'binary' - dla wybranej w `pos_label` etykiety
#   'micro' - dla całkowitej liczby TP, FN, FP
#   'macro' - nieważona średnia dla każdej klasy
#   'weighted' - ważona średnia dla każdej klasy

def calculate_metrics(target, prediction, average='macro'):
    accuracy = accuracy_score(target, prediction)
    precision = precision_score(target, prediction, average=average)
    recall = recall_score(target, prediction, average=average)
    f1 = f1_score(target, prediction, average=average)
    mislabeled = (target != prediction).sum()
    total = len(target)
    return accuracy, precision, recall, f1, mislabeled, total

def print_results(metrics, classifier_id='classifier'):
    print(f'Results for {classifier_id}')
    print('----')
    print(f'  Accuracy:  {metrics[0]}')
    print(f'  Precision: {metrics[1]}')
    print(f'  Recall:    {metrics[2]}')
    print(f'  F1 score:  {metrics[3]}')
    print(f'  Mislabeled {metrics[4]} out of {metrics[5]}')
    print('\n')

def plot_confusion_matrix(cm, classes, title=None, title_appendix = '', cmap = plt.cm.Blues):

  # Obsługa tytułu
  if title_appendix:
    title_appendix = f'({title_appendix})'
  
  if title is None:
    title = f'Confusion Matrix {title_appendix}'

  fig, ax = plt.subplots(figsize = (10,5))

  img = ax.imshow(cm, cmap = cmap)

  ax.figure.colorbar(img, ax = ax)

  ax.set_xticks(np.arange(cm.shape[1]))
  ax.set_xticklabels(classes, rotation = 45)

  ax.set_yticks(np.arange(cm.shape[0]))
  ax.set_yticklabels(classes, rotation = 45)

  ax.set_title(title)
  ax.set_ylabel('True label')
  ax.set_xlabel('Predicted label')

  fmt = '.2f' if cm.dtype =='float' else 'd'

  for y, row in enumerate(cm):
    for x, cell in enumerate(row):
      ax.text(x,y, format(cell, fmt), ha ='center', color = 'white' if cell > cm.max()/2 else 'black')

  fig.tight_layout()

  return ax

def normalize_confusion_matrix(confusion_matrix):
  return confusion_matrix.astype('float') /confusion_matrix.sum(axis = 1)[np.newaxis].T


from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold
def count_model(classifier, kfold, X,y, classes, params = None, scaling = None, numeric_features = None, results = None, plot = None, print_params = None):
  
  if params is None:
    params = {}
    clf = classifier(**params)
  else:
    clf = classifier(**params)

  predicted = []
  target = []

  if scaling == 'minmax':
    scaler = MinMaxScaler()
  elif scaling == 'standard':
    scaler = StandardScaler()
  else:
    print('Wybierz scaler z listy: minmax, standard')

  for train_index, test_index in kfold.split(X,y):
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()

    if scaling is not None:
      X_train_scaled = X_train.copy()
      X_test_scaled = X_test.copy()

      X_train_scaled[numeric_features] = scaler.fit_transform(X_train[numeric_features])
      X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

      X_train = X_train_scaled
      X_test = X_test_scaled

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    target = np.concatenate((target, y_test))
    predicted = np.concatenate((predicted, y_pred))

  if print_params:
    print('PARAMS:\n\n', clf.get_params(),'\n')

  if results:
    print('RESULTS:\n')
    print_results(calculate_metrics(target, predicted))

  if plot:
    print('PLOT:\n')
    plot_confusion_matrix(confusion_matrix(target, predicted), classes)
    plot_confusion_matrix(normalize_confusion_matrix(confusion_matrix(target, predicted)), classes)

In [8]:
X, y, classes = load_dataset('wine', "")

TypeError: load_dataset() missing 1 required positional argument: 'class_column'

## Budowa modelu na danych rzeczywistych

#### Bagging Classifier:

#### Random Forrest

Importances

#### XGBoost:

!pip install xgboost

In [139]:
import xgboost as xgb

Importances

## Granice decyzyjne na sztucznie wygenerowanym zbiorze:

#### Random Forest


#### Boosting
