# Języki Programowania Python i R


## dr inż. Patryk Jasik
### Division of Theoretical Physics and Quantum Information
### Institute of Physics and Computer Science
### Faculty of Applied Physics and Mathematics
### Gdansk University of Technology

# scikit-learn docs
## https://scikit-learn.org/stable/

In [None]:
#%config Completer.use_jedi = False

**Classification** is the problem of identifying which of a set of categories (sub-populations) an observation (or observations) belongs to.

In [None]:
#loading the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [None]:
#loading the dataset
wine = pd.read_csv("data/winequality-all.csv", comment="#")
wine.head()

In [None]:
# the goal - classification of wines according to the 'response' variable
# 'response' determines the quality of the wine (median of three experts' opinions)
# scale from 0 (very bad) to 10 (excellent)
# let's examine the class size distribution
# count the wines per score, then show the counts ordered by score
response_levels = wine["response"].value_counts()
response_levels.sort_index()

In [None]:
# the distribution is not even, the values 8, 9 and 10 are missing
# Therefore, we propose two classes: response < 5 (bad wines) and response >= 5 (good wines)

#homework
#wine["quality"] = pd.cut(wine["response"], [0, 4, 5, 11], right=False, labels=["bad","mid", "good"])

# bins are left-closed: [0, 5) -> "bad", [5, 11) -> "good"
# (the last edge is 11, not 10, so that the top score 10 is not turned into NaN)
wine["quality"] = pd.cut(wine["response"], [0, 5, 11], right=False, labels=["bad", "good"])
wine["quality"].value_counts()

In [None]:
wine

In [None]:
#we prepare dataset for analysis and modeling
#predictors
X = wine.iloc[:, 0:11]
X.head()

In [None]:
#the target variable
y = wine["quality"]
y[0:30]

In [None]:
# since we are dealing with binary classification ("bad" - "good"), i.e. y(i) belongs to {0, 1},
# it is worth recoding the values of y into numbers
from sklearn.preprocessing import OrdinalEncoder

# 'bad' -> 0, 'good' -> 1; values not in the list are encoded as NaN instead of raising
# (np.nan is used because the alias np.NaN was removed in NumPy 2.0)
oe = OrdinalEncoder(categories = [['bad', 'good']],
                   handle_unknown = 'use_encoded_value',
                   unknown_value = np.nan)

<div class="alert alert-block alert-danger">
<b>Attention!</b> 
    
When encoding attributes, be aware that new data may contain previously unseen values that have to be handled in some way. By default, OrdinalEncoder raises an exception in this case; here we instead map unseen values to a fixed value, and we chose a null value (NaN). Such a value can later be replaced, for example, with the most frequent value (the mode) in the dataset.
</div>

In [None]:
# encode the target: the encoder works on 2-D input, so y is reshaped to one column
y_column = np.asanyarray(y).reshape(-1, 1)
yk = oe.fit(y_column).transform(y_column).flatten()

In [None]:
np.asanyarray(y).reshape(-1, 1)

In [None]:
# same first 30 rows as shown for y above, so labels and codes can be compared side by side
yk[0:30]

In [None]:
#Division of the dataset into the training and test datasets
import sklearn.model_selection

np.arange(4)

In [None]:
X.shape

In [None]:
# now we will randomly select indexes
np.arange(X.shape[0])

In [None]:
# split row positions, then select by position everywhere (.iloc for pandas objects),
# so the split does not depend on the labels stored in the index
idx_train, idx_test = sklearn.model_selection.train_test_split(np.arange(X.shape[0]),
                                                             test_size=0.2,
                                                             random_state=12345)
X_train, X_test = X.iloc[idx_train, :], X.iloc[idx_test, :]
y_train, y_test = y.iloc[idx_train], y.iloc[idx_test]
yk_train, yk_test = yk[idx_train], yk[idx_test]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
wine.quality.value_counts()

In [None]:
#distribution of classes in the train dataset
y_train.value_counts()

In [None]:
#distribution of classes in the test dataset
y_test.value_counts()

In [None]:
np.round((820/3436)*100,1)

In [None]:
np.round((189/875)*100,1)

In [None]:
#distribution of classes in the dataset
np.round((1009/4311)*100,1)

## k-nearest neighbors algorithm (k-NN)
### https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

In [None]:
# k-nearest neighbors method
import sklearn.neighbors

knn = sklearn.neighbors.KNeighborsClassifier()
knn.fit(X_train, yk_train)

In [None]:
knn.get_params()

In [None]:
# prediction based on the train dataset
yk_pred_train = knn.predict(X_train)

In [None]:
# prediction based on the test dataset
yk_pred_test = knn.predict(X_test)

In [None]:
sklearn.metrics.accuracy_score(yk_train, yk_pred_train)

In [None]:
sklearn.metrics.accuracy_score(yk_test, yk_pred_test)

### Confusion Matrix
### [true negative, false positive]
### [false negative, true positive]

### https://en.wikipedia.org/wiki/Confusion_matrix

In [None]:
#Confusion Matrix [[true negative, false positive], [false negative, true positive]]
sklearn.metrics.confusion_matrix(yk_test, yk_pred_test)

In [None]:
y_test.value_counts()

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to an equivalent wrapper so that calls to it keep working.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw the confusion matrix of a fitted `estimator` on (X, y_true)."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# confusion matrix on the train dataset
# (plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it)
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(knn, X_train, yk_train)
plt.show()

In [None]:
y_train.value_counts()

In [None]:
# confusion matrix on the test dataset
# (plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it)
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(knn, X_test, yk_test)
plt.show()

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
# (plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay replaces it)
sklearn.metrics.RocCurveDisplay.from_estimator(knn, X_test, yk_test)
# dashed diagonal drawn for reference
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

In [None]:
def fit_classifier(alg, X_train, X_test, y_train, y_test):
    """
    Train a classifier and score it on both the training and the test dataset.

    The model alg is fitted on (X_train, y_train), predictions are made for
    X_train and for X_test, and each set of predictions is validated with
    four metrics: accuracy, precision, recall and F1.

    Parameters:
        alg: an object representing the selected algorithm,
                e.g. sklearn.neighbors.KNeighborsClassifier; it is fitted in place
        X_train: pandas.core.frame.DataFrame
            training dataset - predictors
        X_test: pandas.core.frame.DataFrame
            test dataset - predictors
        y_train: pandas.core.series.Series or numpy.array
            training dataset - target variable
        y_test: pandas.core.series.Series or numpy.array
            test dataset - target variable

    Returns:
        dict: a dictionary with the keys ACC_train, ACC_test, P_train, P_test,
             R_train, R_test, F1_train and F1_test, holding accuracy, precision,
             recall and F1 computed on the training and on the test dataset.

    """
    # imported here so the function does not rely on another import having
    # loaded the sklearn.metrics submodule as a side effect
    import sklearn.metrics

    alg.fit(X_train, y_train)
    y_pred_train = alg.predict(X_train)
    y_pred_test = alg.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in this order;
    # passing them the other way round exchanges precision and recall
    return {
        "ACC_train":  sklearn.metrics.accuracy_score(y_train, y_pred_train),
        "ACC_test": sklearn.metrics.accuracy_score(y_test, y_pred_test),
        "P_train":    sklearn.metrics.precision_score(y_train, y_pred_train),
        "P_test":   sklearn.metrics.precision_score(y_test, y_pred_test),
        "R_train":    sklearn.metrics.recall_score(y_train, y_pred_train),
        "R_test":   sklearn.metrics.recall_score(y_test, y_pred_test),
        "F1_train":   sklearn.metrics.f1_score(y_train, y_pred_train),
        "F1_test":  sklearn.metrics.f1_score(y_test, y_pred_test)
    }

In [None]:
help(fit_classifier)

In [None]:
# metrics of a default k-NN on the raw features, stored as a one-row DataFrame
res = [
    fit_classifier(
        sklearn.neighbors.KNeighborsClassifier(),
        X_train, X_test, yk_train, yk_test,
    )
]
params = ["knn"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# start the results table with the first row of metrics
# (DataFrame.append was removed in pandas 2.0, so it is not used to grow the table)
results = df_results.copy()

In [None]:
results

In [None]:
#we check the metrics of the model for the dataset after standardization
m = X.mean()
s = X.std()

In [None]:
# scale with the statistics of the training rows only, so that no information
# about the test rows leaks into the model
m_train = X_train.mean()
s_train = X_train.std()
X_train_std = (X_train - m_train)/s_train
X_test_std = (X_test - m_train)/s_train

In [None]:
X_train_std.describe()

In [None]:
# default k-NN again, this time on the standardized features
res = [
    fit_classifier(
        sklearn.neighbors.KNeighborsClassifier(),
        X_train_std, X_test_std, yk_train, yk_test,
    )
]
params = ["knn_std"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])

In [None]:
results

In [None]:
# confusion matrix
# the shared knn object is refitted here on the standardized training data
knn.fit(X_train_std, yk_train)
#knn.predict(X_test_std)

# plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(knn, X_test_std, yk_test)
plt.show()

In [None]:
#Classification report
help(sklearn.metrics.classification_report)

In [None]:
yk_pred_test = knn.predict(X_test_std)

In [None]:
print(sklearn.metrics.classification_report(yk_test, yk_pred_test, target_names=['bad', 'good']))

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
# (plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay replaces it)
sklearn.metrics.RocCurveDisplay.from_estimator(knn, X_test_std, yk_test)
# dashed diagonal drawn for reference
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

In [None]:
# have a look on pairplot figure
X_std = (X-m)/s
sns.pairplot(X_std)
plt.show()

## Isolation Forest method for outliers detection
## https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# fit the isolation forest on all predictors and label every row
# (rows labelled -1 are treated as outliers below)
clf = IsolationForest(n_estimators=1000, random_state=12345)
isf_pred = clf.fit(X).predict(X)

In [None]:
isf_pred[:30]

In [None]:
# values -1 are the outliers
# print one row per label: [label, number of rows with that label]
labels, label_counts = np.unique(isf_pred, return_counts=True)
print(np.column_stack((labels, label_counts)))

In [None]:
len(isf_pred[isf_pred == 1])

In [None]:
len(isf_pred[isf_pred == -1])

In [None]:
np.round((len(isf_pred[isf_pred == -1])/X.shape[0])*100,1)

In [None]:
# we will use values 1 as a mask
X_wout_outl = X[isf_pred == 1]

In [None]:
X_wout_outl

In [None]:
yk_wout_outl = yk[isf_pred == 1]

In [None]:
len(yk_wout_outl)

In [None]:
sns.pairplot(X_wout_outl)
plt.show()

In [None]:
# renumber the kept rows 0..n-1 so that index labels and row positions agree again
# (yk_wout_outl needs no such step: it is a plain NumPy array without an index)
X_wout_outl = X_wout_outl.reset_index(drop=True)


In [None]:
X_wout_outl

In [None]:
len(yk_wout_outl)

In [None]:
# split the outlier-free data: 20% of the rows are held out, with a fixed seed
positions = np.arange(X_wout_outl.shape[0])
idx_train, idx_test = sklearn.model_selection.train_test_split(
    positions, test_size=0.2, random_state=12345
)

X_train_wo = X_wout_outl.iloc[idx_train, :]
X_test_wo = X_wout_outl.iloc[idx_test, :]
yk_train_wo = yk_wout_outl[idx_train]
yk_test_wo = yk_wout_outl[idx_test]

X_train_wo.shape, X_test_wo.shape, yk_train_wo.shape, yk_test_wo.shape

In [None]:
# default k-NN on the outlier-free, unscaled split
res = [
    fit_classifier(
        sklearn.neighbors.KNeighborsClassifier(),
        X_train_wo, X_test_wo, yk_train_wo, yk_test_wo,
    )
]
params = ["knn_isf"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# we will standardize the dataset without outliers
# (column-wise mean and standard deviation of all outlier-free rows)
m_wo = X_wout_outl.mean()
s_wo = X_wout_outl.std()
X_wo_std = (X_wout_outl - m_wo)/s_wo

In [None]:
# scale with the statistics of the training rows only, so that no information
# about the test rows leaks into the model
m_wo_train = X_train_wo.mean()
s_wo_train = X_train_wo.std()
X_train_wo_std = (X_train_wo - m_wo_train)/s_wo_train
X_test_wo_std = (X_test_wo - m_wo_train)/s_wo_train

In [None]:
# default k-NN on the outlier-free, standardized split
res = [
    fit_classifier(
        sklearn.neighbors.KNeighborsClassifier(),
        X_train_wo_std, X_test_wo_std, yk_train_wo, yk_test_wo,
    )
]
params = ["knn_isf_std"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
X_test_std

In [None]:
#let's create a set of classification models
#we start with one model and we will add other models later on
methods = pd.Series({
    "knn std cv": sklearn.neighbors.KNeighborsClassifier()
})

#evaluation function
def eval_function(X_train, X_test, y_train, y_test):
    """
    Fit and score every model stored in the global `methods` Series.

    Returns:
        pandas.DataFrame: one row per model (indexed like `methods`),
            one column per metric returned by fit_classifier.
    """
    scores = [fit_classifier(model, X_train, X_test, y_train, y_test)
              for model in methods]
    return pd.DataFrame(scores, index=methods.index)

In [None]:
#application of the evaluation function
#results summarizing the cross validation
from sklearn.model_selection import KFold

# a single constant drives both the splitter and the averaging below,
# so the two can no longer get out of sync
n_folds = 5
kf = KFold(n_splits=n_folds)

# one table of metrics per fold
results_cv = [eval_function(X_std.iloc[train,:],
                            X_std.iloc[test,:],
                            yk[train],
                            yk[test]) for train, test in kf.split(X_std)]

# element-wise mean of the per-fold tables
sum(results_cv)/n_folds

In [None]:
#dictionary with metrics from cross validation (mean over the folds)
dict_metrics_cv = (sum(results_cv)/n_folds).to_dict()

In [None]:
dict_metrics_cv

In [None]:
# pull the cross-validated metrics of the 'knn std cv' model into a one-row DataFrame
metric_names = ['ACC_train', 'ACC_test', 'P_train', 'P_test',
                'R_train', 'R_test', 'F1_train', 'F1_test']
params = ["knn_std_cv"]
res = [{name: dict_metrics_cv[name]['knn std cv'] for name in metric_names}]
df_results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# cross validation on the standardized data without outliers: one table per fold
results_cv = []
for train, test in kf.split(X_wo_std):
    fold_table = eval_function(X_wo_std.iloc[train,:], X_wo_std.iloc[test,:],
                               yk_wout_outl[train], yk_wout_outl[test])
    results_cv.append(fold_table)

# element-wise mean of the per-fold tables
sum(results_cv)/n_folds

In [None]:
#dictionary with metrics from cross validation (mean over the folds)
dict_metrics_cv = (sum(results_cv)/n_folds).to_dict()

In [None]:
# pull the cross-validated metrics of the 'knn std cv' model into a one-row DataFrame
metric_names = ['ACC_train', 'ACC_test', 'P_train', 'P_test',
                'R_train', 'R_test', 'F1_train', 'F1_test']
params = ["knn_isf_std_cv"]
res = [{name: dict_metrics_cv[name]['knn std cv'] for name in metric_names}]
df_results = pd.DataFrame(res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

## Playing with hyperparameters of the models

In [None]:
# accuracy on the train and test split for 1, 2, ..., 30 neighbors
tab_train = []
tab_test = []

for n_neighbors in range(1, 31):
    cl_model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    print(cl_model)
    cl_model.fit(X_train_std, yk_train)

    # score the freshly fitted model on both datasets
    tab_train.append(sklearn.metrics.accuracy_score(yk_train, cl_model.predict(X_train_std)))
    tab_test.append(sklearn.metrics.accuracy_score(yk_test, cl_model.predict(X_test_std)))

In [None]:
#and the winner is!!!
# the i-th stored score belongs to n_neighbors = i + 1, so pass the x values explicitly;
# otherwise the curves are drawn against 0-based positions and the axis is off by one
k_values = range(1, len(tab_train) + 1)
plt.figure(figsize=(14,7))
plt.plot(k_values, tab_train, label='train')
plt.plot(k_values, tab_test, label='test')
plt.xlabel('n_neighbors')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# k-NN with 10 neighbors on the standardized split
res = [
    fit_classifier(
        sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
        X_train_std, X_test_std, yk_train, yk_test,
    )
]
params = ["knn10_std"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# fit k-NN with 10 neighbors and draw its confusion matrix on the test dataset
knn10 = sklearn.neighbors.KNeighborsClassifier(n_neighbors=10)
knn10.fit(X_train_std, yk_train)

# plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it
# (it makes the predictions itself, so no separate predict call is needed)
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(knn10, X_test_std, yk_test)
plt.show()

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
# (plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay replaces it)
sklearn.metrics.RocCurveDisplay.from_estimator(knn10, X_test_std, yk_test)
# dashed diagonal drawn for reference
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

In [None]:
# F1 on the train and test split for 1, 2, ..., 30 neighbors
tab_train = []
tab_test = []

for n_neighbors in range(1, 31):
    cl_model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    print(cl_model)
    cl_model.fit(X_train_std, yk_train)

    # score the freshly fitted model on both datasets
    tab_train.append(sklearn.metrics.f1_score(yk_train, cl_model.predict(X_train_std)))
    tab_test.append(sklearn.metrics.f1_score(yk_test, cl_model.predict(X_test_std)))

In [None]:
#and now the winner is!!!
# the i-th stored score belongs to n_neighbors = i + 1, so pass the x values explicitly;
# otherwise the curves are drawn against 0-based positions and the axis is off by one
k_values = range(1, len(tab_train) + 1)
plt.figure(figsize=(14,7))
plt.plot(k_values, tab_train, label='train')
plt.plot(k_values, tab_test, label='test')
plt.xlabel('n_neighbors')
plt.ylabel('F1')
plt.legend()
plt.show()

In [None]:
# k-NN with 13 neighbors on the standardized split
res = [
    fit_classifier(
        sklearn.neighbors.KNeighborsClassifier(n_neighbors=13),
        X_train_std, X_test_std, yk_train, yk_test,
    )
]
params = ["knn13_std"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# fit k-NN with 13 neighbors and draw its confusion matrix on the test dataset
knn13 = sklearn.neighbors.KNeighborsClassifier(n_neighbors=13)
knn13.fit(X_train_std, yk_train)

# plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it
# (it makes the predictions itself, so no separate predict call is needed)
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(knn13, X_test_std, yk_test)
plt.show()

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
# (plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay replaces it)
sklearn.metrics.RocCurveDisplay.from_estimator(knn13, X_test_std, yk_test)
# dashed diagonal drawn for reference
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

# Decision Trees (DTs) and Random Forests (RFs)
## https://scikit-learn.org/stable/modules/tree.html

In [None]:
import sklearn.tree

In [None]:
# decision tree with default settings on the raw features
res = [
    fit_classifier(
        sklearn.tree.DecisionTreeClassifier(),
        X_train, X_test, yk_train, yk_test,
    )
]
params = ["dt"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# fit a default decision tree and draw its confusion matrix on the test dataset
dt = sklearn.tree.DecisionTreeClassifier()
dt.fit(X_train, yk_train)

# plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it
# (it makes the predictions itself, so no separate predict call is needed)
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(dt, X_test, yk_test)
plt.show()

In [None]:
#Plot Receiver operating characteristic (ROC) curve.
# (plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay replaces it)
sklearn.metrics.RocCurveDisplay.from_estimator(dt, X_test, yk_test)
# dashed diagonal drawn for reference
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.show()

In [None]:
dt.get_params()

In [None]:
dt.get_depth()

In [None]:
# plotting the tree
# sklearn.tree.plot_tree(dt)

In [None]:
# we definitely have to prune the tree: limit its depth to 12
res = [
    fit_classifier(
        sklearn.tree.DecisionTreeClassifier(max_depth=12),
        X_train, X_test, yk_train, yk_test,
    )
]
params = ["dt_maxd12"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# the same depth-limited tree (max_depth=12), now on the standardized split
res = [
    fit_classifier(
        sklearn.tree.DecisionTreeClassifier(max_depth=12),
        X_train_std, X_test_std, yk_train, yk_test,
    )
]
params = ["dt_maxd12_std"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# fit a shallower tree (max_depth=6) on the standardized data and draw its confusion matrix
dt = sklearn.tree.DecisionTreeClassifier(max_depth=6)
dt.fit(X_train_std, yk_train)

# plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it
# (it makes the predictions itself, so no separate predict call is needed)
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(dt, X_test_std, yk_test)
plt.show()

In [None]:
# plotting the tree
sklearn.tree.plot_tree(dt)

In [None]:
from sklearn.tree import export_text
# take the feature names from the predictors themselves rather than a hand-typed list,
# so the printed rules cannot get out of sync with the data
r = export_text(dt, feature_names=list(X.columns))

In [None]:
print(X.columns)

In [None]:
print(r)

In [None]:
#Random Forests
import sklearn.ensemble

In [None]:
# random forest with default settings (fixed seed) on the raw features
res = [
    fit_classifier(
        sklearn.ensemble.RandomForestClassifier(random_state=12345),
        X_train, X_test, yk_train, yk_test,
    )
]
params = ["rf"]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# fit a default random forest (fixed seed) and draw its confusion matrix on the test dataset
rf = sklearn.ensemble.RandomForestClassifier(random_state=12345)
rf.fit(X_train, yk_train)

# plot_confusion_matrix was removed in scikit-learn 1.2; the display class replaces it
# (it makes the predictions itself, so no separate predict call is needed)
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(rf, X_test, yk_test)
plt.show()

In [None]:
rf.get_params()

In [None]:
# a larger but shallower forest: 500 trees, depth limited to 10
forest = sklearn.ensemble.RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    random_state=12345,
)
params = ["rf new"]
res = [fit_classifier(forest, X_train, X_test, yk_train, yk_test)]
df_results = pd.DataFrame(data=res, index=params)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to add rows
results = pd.concat([results, df_results])
results

In [None]:
# Intro to XAI
# Feature importances of the random forest, most important feature first

importances = pd.Series(rf.feature_importances_, index = X.columns[0:11])
importances.sort_values(ascending=False)


In [None]:
#conclusion - good wine is strong wine :)
# mean alcohol content within each quality class
wine.groupby(wine["quality"])["alcohol"].mean()

And now you can play with standardization, outlier removal, feature selection, hyperparameter tuning, ...

You can do almost anything that leads you towards a model that is stable, not overfitted, as simple as possible, as general as possible, ... and that helps you solve your problem.

And it is not so simple :)

## Few words about cross validation 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [None]:
# cross validation for several models based on std dataset
methods = pd.Series({
    "knn": sklearn.neighbors.KNeighborsClassifier(n_neighbors=10),
    "dt": sklearn.tree.DecisionTreeClassifier(max_depth=10),
    "rf": sklearn.ensemble.RandomForestClassifier(max_depth=10),
    "lr": LogisticRegression(),
    "SVC": SVC(),
    "SGD": SGDClassifier(),
    "GP": GaussianProcessClassifier(),
})

#evaluation function
def eval_function(X_train, X_test, y_train, y_test):
    """
    Fit and score every model stored in the global `methods` Series.

    Returns:
        pandas.DataFrame: one row per model (indexed like `methods`),
            one column per metric returned by fit_classifier.
    """
    rows = []
    for model in methods:
        rows.append(fit_classifier(model, X_train, X_test, y_train, y_test))
    return pd.DataFrame(rows, index=methods.index)

# a single constant drives both the splitter and the averaging below,
# so the two can no longer get out of sync
n_folds = 5
kf = KFold(n_splits=n_folds)

# one table of metrics (models x metrics) per fold
results_cv = [eval_function(X_std.iloc[train,:],
                            X_std.iloc[test,:],
                            yk[train],
                            yk[test]) for train, test in kf.split(X_std)]

# element-wise mean of the per-fold tables
sum(results_cv)/n_folds