# Języki Programowania Python i R


## dr inż. Patryk Jasik
### Division of Theoretical Physics and Quantum Information
### Institute of Physics and Computer Science
### Faculty of Applied Physics and Mathematics
### Gdansk University of Technology

# scikit-learn docs
## https://scikit-learn.org/stable/

In [1]:
#%config Completer.use_jedi = False

**Classification** is the problem of identifying which of a set of categories (sub-populations) an observation (or observations) belongs to.

In [2]:
#loading the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [3]:
#loading the dataset
wine = pd.read_csv("data/winequality-all.csv", comment="#")
wine.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,3,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,3,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,3,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,4,red
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,3,red


In [4]:
# the goal - classification of wines according to the 'response' variable
# 'response' determines the quality of the wine (median of three experts' opinions)
# scale from 0 (very bad) to 10 (excellent)
# let's examine the class size distribution
response_levels = wine["response"].value_counts()
response_levels.iloc[np.argsort(response_levels.index)]

1      30
2     206
3    1752
4    2323
5     856
6     148
7       5
Name: response, dtype: int64

In [7]:
# the distribution is not even, the values 8, 9 and 10 are missing
# Therefore, we propose two classes: response <5 (bad wines) and response> = 5 (good wines)

#homework
#wine["quality"] = pd.cut(wine["response"], [0, 4, 5, 10], right=False, labels=["bad","mid", "good"])

wine["quality"] = pd.cut(wine["response"], [0, 5, 10], right=False, labels=["bad", "good"])
wine["quality"].value_counts()


# uwaga, moze sie zdarzyc, ze model bedzie rozpoznawal elementu jedynie jednej klasy (szczegolne zagrozenie, jezeli jedna klasa przewaza: 75%, 90%, 99%...)

bad     4311
good    1009
Name: quality, dtype: int64

In [6]:
wine

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,response,color,quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,3,red,bad
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,3,red,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,3,red,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,4,red,bad
4,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,3,red,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,4,white,bad
5316,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,3,white,bad
5317,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,4,white,bad
5318,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,5,white,good


In [8]:
#we prepare dataset for analysis and modeling
#predictors
X = wine.iloc[:, 0:11]
X.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4


In [9]:
#the target variable
y = wine["quality"]
y[0:30]

0      bad
1      bad
2      bad
3      bad
4      bad
5      bad
6     good
7     good
8      bad
9      bad
10     bad
11     bad
12     bad
13     bad
14    good
15     bad
16     bad
17     bad
18     bad
19     bad
20     bad
21     bad
22     bad
23     bad
24     bad
25     bad
26     bad
27     bad
28     bad
29     bad
Name: quality, dtype: category
Categories (2, object): ['bad' < 'good']

In [21]:
#skoro mamy do czynienia z klasyfikacją binarną ("złe" - "dobre") - y(i) należy do zbioru {0,1}
#to warto przekodować wartości zmiennej y na zbiór liczb całkowitych
from sklearn.preprocessing import OrdinalEncoder

# metody do liczenia wola nie pracowac ze zeminnymi kategroycznimi, wola raczej jakies liczbowe
# trzeba jednak uwazac jakie wartosci liczbowe przypisuje sie kategoriom
# np. [bad, mid, good -> [0, 1, 2] faoryzuje '2', przez co moze zaburzyc model
# lepiej reprezentowac kategorie przy pomocy wektrów: [bad, mid, good] -> [[1,0,0], [0,1,0], [0,0,1]] (tzw. one hot encorder; kazdy wektor ma taka sama wage)
# to jest klasyfikacja wieloklasowa

# tutaj zostanie przedstawione przy pomocy 0 i 1 (zle i dobre); to jest klasyfikacja binarna
oe = OrdinalEncoder(categories = [['bad', 'good']], # kategorie
                   handle_unknown = 'use_encoded_value', # co sie stanie, jezeli zadna z obserwacji nie bedzie pasowac z kat. podanych wyzej
                   unknown_value = np.NaN) # ... to użuj 'NaN'

# mozna tez uzywac innego encodera; np. James-Stein Encoder: patrzy na caly zbior i na podstawie wiekszej ilosci danych dobiera wartosci dla kategorii
# jest lepsze dla tego, ze uwzglednia sie rozklad innych zmiennych

In [11]:
oe

In [15]:
type(y)

pandas.core.series.Series

<div class="alert alert-block alert-danger">
<b>Attention!</b> 
    
When coding the attributes, you should be aware that the new data may contain previously unseen values that need to be handled in some way. The default behavior of OrdinalEncoder in this case is to throw an exception, here we used to assign it a fixed value, where we chose to assign a null value. Then such a value can be replaced, for example, with a dominant from the dataset.
</div>

In [12]:
# using OrdinalEncoder we will code the target value
oe.fit(np.asanyarray(y).reshape(-1, 1))
yk = oe.transform(np.asanyarray(y).reshape(-1, 1)).flatten()

In [16]:
type(yk)

numpy.ndarray

In [18]:
yk.shape

(5320,)

In [13]:
np.asanyarray(y).reshape(-1, 1)

array([['bad'],
       ['bad'],
       ['bad'],
       ...,
       ['bad'],
       ['good'],
       ['bad']], dtype=object)

In [14]:
yk[1:30]

array([0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
#Division of the dataset into the training and test datasets
import sklearn.model_selection

np.arange(4)

array([0, 1, 2, 3])

In [22]:
X.shape

(5320, 11)

In [23]:
X.head()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4


In [24]:
# now we will randomly select indexes
np.arange(X.shape[0])

array([   0,    1,    2, ..., 5317, 5318, 5319])

In [37]:
# podzial zbioru danych; uwaga na wielkość podziału; 20% i 20%
# przy pechu moze sie okazać, że w podzbiorze będą (przeważały) rekordy tej samej kategorii (np. niemal same wina dobre)
# trzeba sprawdzić jaki jest udział kategorii w całym zbiorze i tak przygotować zbiory treningowe i testowe
# chcemy żeby udział każdej kategorii w zbiorach testowych/treningowych wygldądał tak samo jak w zbiorze ogólnym

idx_train, idx_test = sklearn.model_selection.train_test_split(np.arange(X.shape[0]),
                                                             test_size=0.2,
                                                             random_state=12345)
X_train, X_test = X.iloc[idx_train, :], X.iloc[idx_test, :]
y_train, y_test = y[idx_train], y[idx_test]
yk_train, yk_test = yk[idx_train], yk[idx_test]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4256, 11), (1064, 11), (4256,), (1064,))

In [27]:
wine.quality.value_counts()

bad     4311
good    1009
Name: quality, dtype: int64

In [28]:
#distribution of classes in the train dataset
y_train.value_counts()

bad     3436
good     820
Name: quality, dtype: int64

In [29]:
#distribution of classes in the test dataset
y_test.value_counts()

bad     875
good    189
Name: quality, dtype: int64

In [34]:
np.round((820/3436)*100,1) # w zbiorze treningowym udział dobrych win

23.9

In [32]:
np.round((189/875)*100,1) # w zbiorze testowym udział dobrych win

21.6

In [36]:
#distribution of classes in the dataset
np.round((1009/4311)*100,1) # w zbiorze ogólnym udział dobrych win

23.4

In [41]:
train_train_X, train_train_y, test_test_X, test_test_y = sklearn.model_selection.train_test_split(X, y, test_size=0.2, stratify=y) 
# stratify -> idealny podział

In [43]:
test_test_X.value_count()

AttributeError: 'Series' object has no attribute 'value_count'

In [None]:
## k-nearest neighbors algorithm (k-NN)
### https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

In [62]:
# k-nearest neighbors method -> jedna z prostrzych metod klasyfikacji; głosowanie większości sąsiadów - im większa ilość sąsiadów danej kategorii,
# tym więcej głosów, że przewidywana/szacowana jest taka jak większość; można dołożyć wagi np. w postaci odgległości w sensie odległości euklidesowej
import sklearn.neighbors

knn = sklearn.neighbors.KNeighborsClassifier() # knn to model; będzie to model bazaowy dla dalszych rozważań 
knn.fit(X_train, yk_train) # model trenuje na zestawie "train"

In [50]:
knn.get_params() # metryka minkowskiego to jest odległość euklidesowa, wagi uniform - nieistotne, k=5 sasiadów

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [52]:
# prediction based na train dataset
yk_pred_train = knn.predict(X_train) # wkładam zbiór treningowy

In [51]:
# prediction based na test dataset
yk_pred_test = knn.predict(X_test) # wkładam zbiór testowy

In [55]:
sklearn.metrics.accuracy_score(yk_train, yk_pred_train) # liczba poprawnie zaklasyfikowanych obsweracji do wszystkich obserwacji

0.8583176691729323

In [54]:
sklearn.metrics.accuracy_score(yk_test, yk_pred_test)

0.7913533834586466

### Confusion Matrix
### [true negative, false positive]
### [false negative, true positive]

### https://en.wikipedia.org/wiki/Confusion_matrix

In [56]:
#Confusion Matrix [[true negative, false positive], [false negative, true positive]]
sklearn.metrics.confusion_matrix(yk_test, yk_pred_test)

array([[805,  70],
       [152,  37]])

In [57]:
y_test.value_counts()

bad     875
good    189
Name: quality, dtype: int64

In [58]:
from sklearn.metrics import plot_confusion_matrix

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/opt/conda/lib/python3.10/site-packages/sklearn/metrics/__init__.py)

In [65]:
#plot_confusion_matrix(knn, X_train, yk_train)
#plt.show()
sklearn.metrics.ConfusionMatrixDisplay(knn, X_train, yk_train)
plt.show()

TypeError: ConfusionMatrixDisplay.__init__() takes 2 positional arguments but 4 were given

In [60]:
y_train.value_counts()

bad     3436
good     820
Name: quality, dtype: int64

In [61]:
plot_confusion_matrix(knn, X_test, yk_test)
plt.show()

NameError: name 'plot_confusion_matrix' is not defined

In [66]:
#Plot Receiver operating characteristic (ROC) curve.
sklearn.metrics.plot_roc_curve(knn, X_test, yk_test)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

AttributeError: module 'sklearn.metrics' has no attribute 'plot_roc_curve'

In [71]:
def fit_classifier(alg, X_train, X_test, y_train, y_test):
    """
    A method that trains a selected alg model on the training dataset (X_train, y_train),
    makes predictions on the both datasets (X_train, y_train) and (X_test, y_test)
    and validates it through four selected metrics: accuracy, precision, recall and F1.
    
    Parameters:
        alg: an object representing the selected algorithm,
                e.g. sklearn.neighbors.classification.KNeighborsClassifier algorithm used for classification
        X_train: pandas.core.frame.DataFrame
            training dataset - predictors
        X_test: pandas.core.frame.DataFrame
            test dataset - predictors
        y_train: pandas.core.series.Series or numpy.array
            training dataset - target variable
        y_test: pandas.core.series.Series or numpy.array
            test dataset - target variable
    
    Returns:
        dict: a dictionary containing the keys ACC, P, R and F1,
             for which model validation values have been determined, such as: accuracy, precision, recall and F1.
        
    """
      
    alg.fit(X_train, y_train)
    y_pred_train = alg.predict(X_train)
    y_pred_test = alg.predict(X_test)
    
    # ZAWSZE trzeba oceniac model pod kątem kilku metryk
    # metryki sprawdzic w dokumentacji do dokładnie oznaczają
    return {
        "ACC_train":  sklearn.metrics.accuracy_score(y_pred_train, y_train),
        "ACC_test": sklearn.metrics.accuracy_score(y_pred_test, y_test),
        "P_train":    sklearn.metrics.precision_score(y_pred_train, y_train), # P = tp/(tp+fp) -> zdolność rozpoznawania klasy positive
        "P_test":   sklearn.metrics.precision_score(y_pred_test, y_test),
        "R_train":    sklearn.metrics.recall_score(y_pred_train, y_train),
        "R_test":   sklearn.metrics.recall_score(y_pred_test, y_test),
        "F1_train":   sklearn.metrics.f1_score(y_pred_train, y_train), # metryka F1 jest mocna; jak F1 jest duża, to model jest dobry
        "F1_test":  sklearn.metrics.f1_score(y_pred_test, y_test)
    }

In [68]:
help(fit_classifier)

Help on function fit_classifier in module __main__:

fit_classifier(alg, X_train, X_test, y_train, y_test)
    A method that trains a selected alg model on the training dataset (X_train, y_train),
    makes predictions on the both datasets (X_train, y_train) and (X_test, y_test)
    and validates it through four selected metrics: accuracy, precision, recall and F1.
    
    Parameters:
        alg: an object representing the selected algorithm,
                e.g. sklearn.neighbors.classification.KNeighborsClassifier algorithm used for classification
        X_train: pandas.core.frame.DataFrame
            training dataset - predictors
        X_test: pandas.core.frame.DataFrame
            test dataset - predictors
        y_train: pandas.core.series.Series or numpy.array
            training dataset - target variable
        y_test: pandas.core.series.Series or numpy.array
            test dataset - target variable
    
    Returns:
        dict: a dictionary containing the keys ACC

In [72]:
#we create DataFrame with metrics
params = ["knn"]
res = [fit_classifier(sklearn.neighbors.KNeighborsClassifier(),
                      X_train, X_test, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [73]:
results = pd.DataFrame()
results = results.append(df_results)

  results = results.append(df_results)


In [74]:
results

Unnamed: 0,ACC_train,ACC_test,P_train,P_test,R_train,R_test,F1_train,F1_test
knn,0.858318,0.791353,0.404878,0.195767,0.742729,0.345794,0.524073,0.25


In [79]:
# jak widać, model dobrze rozpoznaje tylko elementy jednej klasy (negative; stąd w miarę duże ACCuracy)

#we check the metrics of the model for the dataset after standardization; czy na wyniki wpływa standaryzacja?
m = X.mean()
s = X.std()

In [77]:
X_train_std = (X_train - m)/s
X_test_std = (X_test - m)/s

In [80]:
X_train_std.describe()

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
count,4256.0,4256.0,4256.0,4256.0,4256.0,4256.0,4256.0,4256.0,4256.0,4256.0,4256.0
mean,-0.003757,-0.001321,-0.002622,0.00857,-0.000937,0.003233,0.010694,0.005759,0.009452,-1.3e-05,-0.002293
std,0.995667,0.999152,0.989494,1.004635,0.980603,0.993751,0.999382,1.000756,1.007276,0.986945,0.991881
min,-2.587902,-1.569881,-2.164312,-0.988511,-1.293694,-1.630811,-1.904192,-2.466797,-3.146689,-2.092634,-2.149566
25%,-0.617714,-0.678341,-0.46545,-0.721855,-0.507004,-0.788353,-0.688852,-0.767255,-0.6526,-0.690231,-0.884739
50%,-0.163055,-0.262289,-0.057723,-0.499642,-0.262859,-0.114386,0.050921,0.035306,-0.029078,-0.155982,-0.125843
75%,0.367381,0.391507,0.553867,0.544761,0.252559,0.615744,0.702625,0.75778,0.656796,0.445048,0.717375
max,6.581052,7.345516,9.116132,13.499798,11.130582,14.544381,5.740122,14.987228,4.896748,9.460499,3.668638


In [81]:
params = ["knn_std"]
res = [fit_classifier(sklearn.neighbors.KNeighborsClassifier(),
                          X_train_std, X_test_std, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [82]:
results = results.append(df_results)

  results = results.append(df_results)


In [85]:
results # w przypadku klasyfikacji (KNN) po standaryzacji wyniki uległy poprawie

Unnamed: 0,ACC_train,ACC_test,P_train,P_test,R_train,R_test,F1_train,F1_test
knn,0.858318,0.791353,0.404878,0.195767,0.742729,0.345794,0.524073,0.25
knn_std,0.883224,0.839286,0.587805,0.465608,0.75195,0.556962,0.659822,0.507205


In [84]:
# w związku z tym, że zmienna celu ma wartości 0 i 1, nie muszę dokownyać odwrotnej trasformacji (tak jak w przypadku regresji)

In [None]:
# confusion matrix
knn.fit(X_train_std, yk_train)
#knn.predict(X_test_std)

plot_confusion_matrix(knn, X_test_std, yk_test)
plt.show()

In [None]:
#Classification report
help(sklearn.metrics.classification_report)

In [None]:
yk_pred_test = knn.predict(X_test_std)

In [None]:
print(sklearn.metrics.classification_report(yk_test, yk_pred_test, target_names=['bad', 'good']))

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
sklearn.metrics.plot_roc_curve(knn, X_test_std, yk_test)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

In [None]:
# have a look on pairplot figure
X_std = (X-m)/s
sns.pairplot(X_std)
plt.show()

## Isolation Forest method for outliers detection
## https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# let's train the model and mark outliers
clf = IsolationForest(n_estimators=1000, random_state=12345)
clf.fit(X)
isf_pred = clf.predict(X)

In [None]:
isf_pred[:30]

In [None]:
# values -1 are the outliers
unique, counts = np.unique(isf_pred, return_counts=True)
print(np.asarray((unique, counts)).T)

In [None]:
len(isf_pred[isf_pred == 1])

In [None]:
len(isf_pred[isf_pred == -1])

In [None]:
np.round((len(isf_pred[isf_pred == -1])/X.shape[0])*100,1)

In [None]:
# we will use values 1 as a mask
X_wout_outl = X[isf_pred == 1]

In [None]:
X_wout_outl

In [None]:
yk_wout_outl = yk[isf_pred == 1]

In [None]:
len(yk_wout_outl)

In [None]:
sns.pairplot(X_wout_outl)
plt.show()

In [None]:
X_wout_outl.reset_index(drop=True, inplace=True)
#yk_wout_outl.reset_index(drop=True, inplace=True)


In [None]:
X_wout_outl

In [None]:
len(yk_wout_outl)

In [None]:
# let's train the model on dataset without outliers
idx_train, idx_test = sklearn.model_selection.train_test_split(np.arange(X_wout_outl.shape[0]),
                                                             test_size=0.2,
                                                             random_state=12345)

X_train_wo, X_test_wo = X_wout_outl.iloc[idx_train, :], X_wout_outl.iloc[idx_test, :]
yk_train_wo, yk_test_wo = yk_wout_outl[idx_train], yk_wout_outl[idx_test]

X_train_wo.shape, X_test_wo.shape, yk_train_wo.shape, yk_test_wo.shape

In [None]:
params = ["knn_isf"]
res = [fit_classifier(sklearn.neighbors.KNeighborsClassifier(),
                          X_train_wo, X_test_wo, yk_train_wo, yk_test_wo)]

df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
# we will stdandardize the dataset without outliers
m_wo = X_wout_outl.mean()
s_wo = X_wout_outl.std()
X_wo_std = (X_wout_outl - m_wo)/s_wo

In [None]:
X_train_wo_std = (X_train_wo - m_wo)/s_wo
X_test_wo_std = (X_test_wo - m_wo)/s_wo

In [None]:
params = ["knn_isf_std"]
res = [fit_classifier(sklearn.neighbors.KNeighborsClassifier(),
                          X_train_wo_std, X_test_wo_std, yk_train_wo, yk_test_wo)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
X_test_std

In [None]:
#let's create a set of classification models
#we start with one model and we will add other models later on
methods = pd.Series({
    "knn std cv": sklearn.neighbors.KNeighborsClassifier()
})

#evaluation function
def eval_function(X_train, X_test, y_train, y_test):
    cv_models = pd.concat([
        pd.Series(fit_classifier(alg,
                                 X_train, X_test, y_train, y_test)) for alg in methods], axis=1).T
    cv_models.index = methods.index
    return cv_models

In [None]:
#application of the evaluation function
#results summarizing the cross validation
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)

n_folds = 5

results_cv = [eval_function(X_std.iloc[train,:],
                            X_std.iloc[test,:],
                            yk[train],
                            yk[test]) for train, test in kf.split(X_std)]

sum(results_cv)/n_folds

In [None]:
#dictionary with metric form cross validation
dict_metrics_cv = (sum(results_cv)/n_folds).to_dict()

In [None]:
dict_metrics_cv

In [None]:
params = ["knn_std_cv"]
res = [{
 'ACC_train': dict_metrics_cv['ACC_train']['knn std cv'],
 'ACC_test': dict_metrics_cv['ACC_test']['knn std cv'],
 'P_train': dict_metrics_cv['P_train']['knn std cv'],
 'P_test': dict_metrics_cv['P_test']['knn std cv'],
 'R_train': dict_metrics_cv['R_train']['knn std cv'],
 'R_test': dict_metrics_cv['R_test']['knn std cv'],
 'F1_train': dict_metrics_cv['F1_train']['knn std cv'],
 'F1_test': dict_metrics_cv['F1_test']['knn std cv']
}]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
results_cv = [eval_function(X_wo_std.iloc[train,:],
                            X_wo_std.iloc[test,:],
                            yk_wout_outl[train],
                            yk_wout_outl[test]) for train, test in kf.split(X_wo_std)]

sum(results_cv)/n_folds

In [None]:
#dictionary with metric form cross validation
dict_metrics_cv = (sum(results_cv)/n_folds).to_dict()

In [None]:
params = ["knn_isf_std_cv"]
res = [{
 'ACC_train': dict_metrics_cv['ACC_train']['knn std cv'],
 'ACC_test': dict_metrics_cv['ACC_test']['knn std cv'],
 'P_train': dict_metrics_cv['P_train']['knn std cv'],
 'P_test': dict_metrics_cv['P_test']['knn std cv'],
 'R_train': dict_metrics_cv['R_train']['knn std cv'],
 'R_test': dict_metrics_cv['R_test']['knn std cv'],
 'F1_train': dict_metrics_cv['F1_train']['knn std cv'],
 'F1_test': dict_metrics_cv['F1_test']['knn std cv']
}]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

## Playing with hyperparameters of the models

In [None]:
# let's check how the accuracy depends on number of neighbors
tab_train = list()
tab_test = list()

for i in range(1,31):
    cl_model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=i) #creation of the model
    print(cl_model)
    cl_model.fit(X_train_std, yk_train) #training of the model
    
    y_tr_pred = cl_model.predict(X_train_std) #prediction for the train dataset
    y_te_pred = cl_model.predict(X_test_std) #prediction for the test dataset
    
    tab_train.append(sklearn.metrics.accuracy_score(yk_train, y_tr_pred))
    tab_test.append(sklearn.metrics.accuracy_score(yk_test, y_te_pred))

In [None]:
#and the winner is!!!
plt.figure(figsize=(14,7))
plt.plot(tab_train, label='train')
plt.plot(tab_test, label='test')
plt.legend()
plt.show()

In [None]:
params = ["knn10_std"]
res = [fit_classifier(sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
                          X_train_std, X_test_std, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
knn10 = sklearn.neighbors.KNeighborsClassifier(n_neighbors=10)
knn10.fit(X_train_std, yk_train)
knn10.predict(X_test_std)

plot_confusion_matrix(knn10, X_test_std, yk_test)
plt.show()

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
sklearn.metrics.plot_roc_curve(knn10, X_test_std, yk_test)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

In [None]:
# let's check how the F1 depends on number of neighbors
tab_train = list()
tab_test = list()

for i in range(1,31):
    cl_model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=i) #creation of the model
    print(cl_model)
    cl_model.fit(X_train_std, yk_train) #training of the model
    
    y_tr_pred = cl_model.predict(X_train_std) #prediction for the train dataset
    y_te_pred = cl_model.predict(X_test_std) #prediction for the test dataset
    
    tab_train.append(sklearn.metrics.f1_score(yk_train, y_tr_pred))
    tab_test.append(sklearn.metrics.f1_score(yk_test, y_te_pred))

In [None]:
#and now the winner is!!!
plt.figure(figsize=(14,7))
plt.plot(tab_train, label='train')
plt.plot(tab_test, label='test')
plt.legend()
plt.show()

In [None]:
params = ["knn13_std"]
res = [fit_classifier(sklearn.neighbors.KNeighborsClassifier(n_neighbors=13),
                          X_train_std, X_test_std, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
knn13 = sklearn.neighbors.KNeighborsClassifier(n_neighbors=13)
knn13.fit(X_train_std, yk_train)
knn13.predict(X_test_std)

plot_confusion_matrix(knn13, X_test_std, yk_test)
plt.show()

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
sklearn.metrics.plot_roc_curve(knn13, X_test_std, yk_test)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

# Decision Trees (DTs) and Random Forests (RFs)
## https://scikit-learn.org/stable/modules/tree.html

In [None]:
import sklearn.tree

In [None]:
# let's create the model based on decision tree
params = ["dt"]
res = [fit_classifier(sklearn.tree.DecisionTreeClassifier(),
                          X_train, X_test, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
dt = sklearn.tree.DecisionTreeClassifier()
dt.fit(X_train, yk_train)
dt.predict(X_test)

plot_confusion_matrix(dt, X_test, yk_test)
plt.show()

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
sklearn.metrics.plot_roc_curve(dt, X_test, yk_test)
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

In [None]:
dt.get_params()

In [None]:
dt.get_depth()

In [None]:
# plotting the tree
# sklearn.tree.plot_tree(dt)

In [None]:
# we definitely have to prune the tree
params = ["dt_maxd12"]
res = [fit_classifier(sklearn.tree.DecisionTreeClassifier(max_depth=12),
                          X_train, X_test, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
# and let's check what happened after standardization and max_depth decreasing
params = ["dt_maxd12_std"]
res = [fit_classifier(sklearn.tree.DecisionTreeClassifier(max_depth=12),
                          X_train_std, X_test_std, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
dt = sklearn.tree.DecisionTreeClassifier(max_depth=6)
dt.fit(X_train_std, yk_train)
dt.predict(X_test_std)

plot_confusion_matrix(dt, X_test_std, yk_test)
plt.show()

In [None]:
# plotting the tree
sklearn.tree.plot_tree(dt)

In [None]:
from sklearn.tree import export_text
r = export_text(dt, feature_names=['fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
       'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
       'pH', 'sulphates', 'alcohol'])

In [None]:
print(X.columns)

In [None]:
print(r)

In [None]:
#Random Forests
import sklearn.ensemble

In [None]:
params = ["rf"]
res = [fit_classifier(sklearn.ensemble.RandomForestClassifier(random_state=12345),
                          X_train, X_test, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
rf = sklearn.ensemble.RandomForestClassifier(random_state=12345)
rf.fit(X_train, yk_train)
rf.predict(X_test)

plot_confusion_matrix(rf, X_test, yk_test)
plt.show()

In [None]:
rf.get_params()

In [None]:
params = ["rf new"]
res = [fit_classifier(sklearn.ensemble.RandomForestClassifier(n_estimators=500,
                                                              max_depth=10,
                                                              random_state=12345),
                          X_train, X_test, yk_train, yk_test)]
df_results = pd.DataFrame(res, index=params)

In [None]:
results = results.append(df_results)
results

In [None]:
# Intro to XAI
# Feature importances

pd.Series(rf.feature_importances_, index = X.columns[0:11]).sort_values(ascending=False)


In [None]:
#conclusion - good wine is strong wine :)
wine.alcohol.groupby(wine.quality).mean()

And now, you can play with standardization, removing outliers, features selection, hyperparameters tuning, ...

You can do almost everything, which directing you to create stable, not overfitted, as simple as possible, as general as possible, ... model, which helps you to solve your problem.

And it is not so simple :)

## Few words about cross validation 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [None]:
# cross validation for several models based on std dataset
methods = pd.Series({
    "knn": sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
    "dt": sklearn.tree.DecisionTreeClassifier(max_depth=10),
    "rf": sklearn.ensemble.RandomForestClassifier(max_depth=10),
    "lr": LogisticRegression(),
    "SVC": SVC(),
    "SGD": SGDClassifier(),
    "GP": GaussianProcessClassifier(),
})

#evaluation function
def eval_function(X_train, X_test, y_train, y_test):
    cv_models = pd.concat([
        pd.Series(fit_classifier(alg,
                                 X_train, X_test, y_train, y_test)) for alg in methods], axis=1).T
    cv_models.index = methods.index
    return cv_models

kf = KFold(n_splits=5)

n_folds = 5

results_cv = [eval_function(X_std.iloc[train,:],
                            X_std.iloc[test,:],
                            yk[train],
                            yk[test]) for train, test in kf.split(X_std)]

sum(results_cv)/n_folds