In [1]:
import numpy as np
print(f'NumPy version: {np.__version__}')

import pandas as pd
print(f'Pandas version: {pd.__version__}')

import sklearn
print(f'Sklearn version: {sklearn.__version__}')

from sklearn.impute import SimpleImputer
from typing import List, Any
from collections import Counter

NumPy version: 1.18.5
Pandas version: 1.0.5
Sklearn version: 0.23.1


Folositi 4 seturi de date pentru probleme de clasificare, plecand de la repository-urile specificate in Cursul 6. Cel putin un set de date sa fie cu valori lipsa; pentru un alt set de date care are initial toate valorile, introduceti dvs. in mod artificial valori lipsa, suprascriind un anumit procent din valorile initiale (ex. `p=5%`, `p` parametru) cu `numpy.nan`. 

##  <u> Citirea seturilor de date </u> 

### Glass dataset

In [2]:
glass_df: pd.DataFrame = pd.read_csv("./data/glass.data", header=None)
glass_df = glass_df.drop(glass_df.columns[0], axis=1) 

print(f'Missing values: {np.isnan(glass_df.values.sum())}')
glass_df.head()

Missing values: False


Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


### Wine dataset

In [3]:
wine_df: pd.DataFrame = pd.read_csv("./data/wine.data", header=None)
    
print(f'Missing values: {np.isnan(wine_df.values.sum())}')
wine_df.head()

Missing values: False


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### Iris dataset

In [4]:
iris_df: pd.DataFrame = pd.read_csv("./data/iris.data", header = None)
    
print(f'Missing values: {np.isnan(iris_df.values[:,:-1].sum())}')
iris_df.head()

Missing values: False


Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Dermatology dataset

In [5]:
dermatology_df = pd.read_csv("./data/dermatology.data", na_values="?", header=None)

print(f'Missing values: {np.isnan(dermatology_df.values.sum())}')
dermatology_df.head()

Missing values: True


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,2,2,0,3,0,0,0,0,1,0,...,0,0,3,0,0,0,1,0,55.0,2
1,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8.0,1
2,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26.0,3
3,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40.0,1
4,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45.0,3


In [6]:
def set_nan_values(data_frame_values: np.ndarray, p: float) -> None:
    """
    Overwrites the array with p% np.nan values.
    
    :param data_frame_values: the given values
    :param p: the percentage of data to be overwrite
    
    :return: Nothing
    """
    nr: int = int(data_frame_values.size * (p / 100))
    lines: np.ndarray[int] = np.random.randint(data_frame_values.shape[0], size = nr)
    cols: np.ndarray[int] = np.random.randint(data_frame_values.shape[1], size = nr)
    data_frame_values[lines, cols] = np.nan
    return data_frame_values

wine_df.at[:,1:] = set_nan_values(wine_df.values[:,:-1], p = 2) 
assert np.isnan(wine_df.values.sum())

## <u> Missing value imputation </u> 
### SimpleImputer


Aplicati o metoda de missing value imputation, unde este cazul; documentati metoda folosita.
*Resurse*: Pentru missing value imputation, puteti urmari [Imputation of missing values](https://scikit-learn.org/stable/modules/impute.html), [How to Handle Missing Data with Python](https://machinelearningmastery.com/handle-missing-data-python/), [fancyimpute](https://github.com/iskandr/fancyimpute), [missingpy](https://github.com/epsilon-machine/missingpy).

In [7]:
def missing_value_imputation(data_set: pd.DataFrame) -> np.ndarray:
    """
    Imputes missing values from a data set using SimpleImputer from sklearn.
    
    :param data_set: the given data set to be imputed
    
    :retrun: initial data set without missing values
    """
    imp = SimpleImputer(strategy = "mean")
    return imp.fit_transform(data_set)

In [8]:
dermatology_transformed_values: np.ndarray = missing_value_imputation(dermatology_df)
assert not np.isnan(dermatology_transformed_values.sum())

wine_transformed_values: np.ndarray = missing_value_imputation(wine_df)
assert not np.isnan(wine_transformed_values.sum())

## <u> Modele de clasificare </u>

 Pentru fiecare set de date aplicati 5 modele de clasificare din scikit learn. Pentru fiecare raportati: acuratete, precision, recall, scorul F1 - a se vedea [sklearn.metrics](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics), [Precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall) - folosind 5 fold cross validation. Raportati mediile rezultatelor atat pentru fold-urile de antrenare, cat si pentru cele de testare. Rularile se vor face cu valori fixate ale hiperparametrilor. 

In [9]:
from sklearn.model_selection import train_test_split, cross_validate

from sklearn.metrics import accuracy_score, f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [10]:
def print_fold_cross_validation_scores(model: sklearn.base.BaseEstimator, X: np.ndarray, y: np.ndarray) -> None:
    """
    Apply cross_validation on the model and print the average scores (accuracy, precision, recall, F1) 
    for both train and test sets
    
    :param model: the given classification model
    :param X: data set without labels
    :param y: labels 
    
    :return: Nothing
    """
    results: Dict[str, np.ndarray] = cross_validate(model, X, y, scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'), 
                             return_train_score=True)
    
    print("\n Train set")
    print(f"Accuracy average score: {results['train_accuracy'].mean()}")
    print(f"Precision average score: {results['train_precision_macro'].mean()}")
    print(f"Recall average score: {results['train_recall_macro'].mean()}")
    print(f"F1 average score: {results['train_f1_macro'].mean()}")
    
    print("\n Test set")
    print(f"Accuracy average score: {results['test_accuracy'].mean()}")
    print(f"Precision average score: {results['test_precision_macro'].mean()}")
    print(f"Recall average score: {results['test_recall_macro'].mean()}")
    print(f"F1 average score: {results['test_f1_macro'].mean()}")

### KNeighborsClassifier

class `sklearn.neighbors.KNeighborsClassifier`<i>(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)</i>

Într-o problemă de clasificare, algoritmul kNN (_k_-nearest neighbors) identifică cei mai apropiați _k_ vecini ai fiecărui item neclasificat - fara sa tina cont de etichetele acestora - vecini localizați în setul de antrenare. Determinarea claselor din care fac parte itemii neclasificați se face prin votare, astfel: clasa itemului se consideră clasa în care aparțin majoritatea vecinilor.

Pentru determinarea distanței dintre itemi se pot utiliza mai multe metrici. Scikit-learn admite orice funcție Python ca și metrică, insa implicit folosește metrica _Minkowski_. Iată câteva exemple de metrici des utilizate în kNN:

- _distanța Minkowski_: $d_{st} = \sqrt[p]{\sum_{j=1}^n |x_{sj} - y_{tj}|^p}$  (_Obs._: p este un hiperparametru utilizat de Scikit-learn)
- _distanța Manhattan (City block)_: $d_{st} = \sum_{j=1}^n |x_{sj} - y_{tj}|$
- _distanța Euclideană_: $d(\textbf{x},\textbf{y}) = \sqrt{\sum_{i=1}^n (y_i - x_i)^2}$
- _distanța Mahalanobis_: $d(\textbf{x},\textbf{y}) = \sqrt{\sum_{i=1}^n \frac{(x_i - y_i)^2}{s_i^2}}$, unde $s_i$ este deviația standard a lui $x_i$ și $y_i$ în sample

[Vezi performanta modelului](#knn_performance)

In [11]:
def KNeighborsClassifier_stats( X: np.ndarray, y: np.ndarray) -> None:
    """
    Apply KNeighborsClassifier after scaling the data and 
    print the average scores (accuracy, precision, recall, F1).
    
    :param X: dataset without labels
    :param y: labels
    
    :return: Nothing
    """
    print("KNeighborsClassifier \n n_neighbors=3, p=1 \n")
    scaler = MinMaxScaler()
    X_scaled: np.ndarray = scaler.fit_transform(X)
    
    model: sklearn.base.BaseEstimator = KNeighborsClassifier(n_neighbors=3, p=1)
    print_fold_cross_validation_scores(model, X_scaled, y)

### DecisionTreeClassifier

class `sklearn.tree.DecisionTreeClassifier`<i>(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort='deprecated', ccp_alpha=0.0)</i>

DecisionTreeClassifie este o metodă non-parametrică de învățare supravegheată utilizată pentru clasificare și regresie. Scopul este de a crea un model care prezice valoarea unei variabile țintă prin învățarea unor reguli simple de decizie deduse din caracteristicile datelor.

Un arbore de decizie (_decision tree_) este o structură arborescentă tip flowchart unde un nod intern reprezintă un feature, ramura este un criteriu de decizie, iar fiecare frunză este un rezultat, o clasificare. Algoritmul Decision tree selectează cel mai bun feature folosind o metrică ASM (_Attribute Selection Measure_), convertește un nod feature la un nod tip criteriu de decizie, și partiționează (splits) datasetul în subseturi. Procesul se execută recursiv până arborele conține numai noduri criterii de decizie și noduri frunză rezultat. Cu cât arborele este mai adânc, cu atât sunt mai complexe criteriile de decizie și modelul are o acuratețe mai mare. 

<br>Pentru măsurarea calității unui split, Scikit-learn utilizează două metrici ASM:

- _impuritatea Gini_ (cât de des este etichetat greșit un element ales aleator dacă a fost etichetat folosind distribuția etichetelor dintr-un subset; poate determina overfitting-ul modelului): <br>$Gini(p) = 1 - \sum_{j=1}^c p_j^2$ <br>
- _entropia_ (similar cu Gini impurity, mai intensă d.p.d.v. computațional din cauza funcției logaritmice): <br>$H(p) = - \sum_{j=1}^c p_j \log p_j$

(unde c este numărul de clase (etichete), iar $p_j$ este subsetul etichetat cu clasă i, unde $j \in \{1, 2, ..., c\}$).

[Vezi performanta modelului](#dt_performance)

In [12]:
def DecisionTreeClassifier_stats(X: np.ndarray, y: np.ndarray):
    """
    Apply DecisionTreeClassifier after scaling the data and 
    print the average scores (accuracy, precision, recall, F1).
    
    :param X: dataset without labels
    :param y: labels
    
    :return: Nothing
    """
    print("DecisionTreeClassifier \n random_state=0 \n")
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    model = DecisionTreeClassifier(random_state=0)
    print_fold_cross_validation_scores(model, X_scaled, y)

### StochasticGradientDescentClassifier

class sklearn.linear_model.SGDClassifier <i>(loss='hinge', *, penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False, average=False)</i>

Clasa SGDClassifier implementează o rutină simplă de învățare descendentă în gradient stochastic, care acceptă diferite funcții de pierdere și penalități pentru clasificare.

SGDClassifier acceptă clasificarea multi-clasă prin combinarea mai multor clasificatoare binare într-o schemă „one versus all” (OVA). Pentru fiecare dintre clase, se învață un clasificator binar care discriminează între aceasta și toate celelalte clase. La momentul testării, calculăm scorul de încredere (adică distanțele semnate până la hiperplan) pentru fiecare clasificator și alegem clasa cu cea mai mare încredere.

Avantajele Stochastic Gradient Descent sunt:
- Eficienţă.
- Ușurința de implementare (o mulțime de oportunități pentru reglarea codului).

Insa dezavantajele Stochastic Gradient Descent includ:
- SGD necesită un număr de hiperparametri, cum ar fi parametrul de regularizare și numărul de iterații.
- SGD este sensibil la scalarea feature-ilor.

[Vezi prformanta modelului](#sgd_performance)

In [13]:
def StochasticGradientDescentClassifier_stats(X: np.ndarray, y: np.ndarray):
    """
    Apply StochasticGradientDescentClassifier after scaling the data and 
    print the average scores (accuracy, precision, recall, F1).
    
    :param X: dataset without labels
    :param y: labels
    
    :return: Nothing
    """
    print("StochasticGradientDescentClassifier \n loss=hinge, penalty=l2, max_iter=80 \n")
    
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    
    model = SGDClassifier(loss="hinge", penalty="l2", max_iter=300)
    print_fold_cross_validation_scores(model, X_scaled, y)

### RandomForestClassifier

class `sklearn.ensemble.RandomForestClassifier`<i>(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)<i>
    
Un random forest este un meta-estimator (se obține o predicție în urma mai multor predicții) care se potrivește cu un număr de clasificatori ai arborelui decizional pe diferite sub-eșantioane ale setului de date și utilizează media pentru a îmbunătăți precizia predictivă și pentru a controla supraadaptarea. Dimensiunea sub-eșantionului este controlată cu parametrul max_samples dacă bootstrap = True (implicit), altfel întregul set de date este utilizat pentru a construi fiecare arbore.
    
Un clasificator _Random forest_ se folosește de ipotezele emise de mai mulți arbori de decizie aleatori (_random trees_), obținuți în urma unui _random split_. Un random forest se obține prin construirea unui random tree pentru fiecare set de antrenare. Acești arbori funcționează ca un ansamblu; pentru fiecare dată de intrare se aplică modelele din ansamblu, și rezultatul final se obține agregând rezultatele prin votare.

<br>La fel ca la _Decision Tree classifier_, pentru măsurarea calității unui split, Scikit-learn utilizează două metrici:

- _impuritatea Gini_: $Gini(p) = 1 - \sum_{j=1}^c p_j^2$ <br>
- _entropia_: $H(p) = - \sum_{j=1}^c p_j \log p_j$

(unde c este numărul de clase (etichete), iar $p_j$ este subsetul etichetat cu clasă i, unde $j \in \{1, 2, ..., c\}$).
    
[Vezi performanta modelului](#rf_performance)

In [14]:
def RandomForestClassifier_stats(X: np.ndarray, y: np.ndarray):
    """
    Apply RandomForestClassifier after scaling the data and 
    print the average scores (accuracy, precision, recall, F1).
    
    :param X: dataset without labels
    :param y: labels
    
    :return: Nothing
    """
    print("RandomForestClassifier \n max_depth=2, random_state=0 \n")
    
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    model = RandomForestClassifier(max_depth=2, random_state=0)
    print_fold_cross_validation_scores(model, X_scaled, y)

### MultiLayerPerceptronClassifier

class `sklearn.neural_network.MLPClassifier`<i>(hidden_layer_sizes=(100, ), activation='relu', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5, max_iter=200, shuffle=True, random_state=None, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10, max_fun=15000)</i>

_Perceptronii_ sunt o clasă de clasificatori utilizați în învățarea supervizată, fiind un model matematic al unui neuron biologic. În particular, _perceptronii multistrat (MLP)_ formează rețele neuronale cu mai multe straturi de perceptroni: un strat de intrare, unul sau mai multe straturi intermediare (ascunse), și un strat de ieșire.

<br>Într-o rețea neuronală, o _funcție de activare_ definește ieșirea unui perceptron după ce este supus unui set de intrare. În forma lui cea mai simplă, funcția poate returna un rezultat binar (funcție liniară, output 0 sau 1): făcând analogie cu neuronul biologic, dacă trece un impuls electric prin axonul acestuia sau nu. În cazul rețelelor neuronale moderne care utilizează mai multe straturi de perceptroni, funcțiile de activare pot fi și non-binare (non-liniare). 

Scikit-learn admite funcții de activare de ambele tipuri în implementarea MLP classifier:
- _funcția identitate_: $f(x) = x$
- _tangenta hiperbolică_: $f(x) = \tanh(x) = \frac{\sinh(x)}{\cosh(x)} = \frac{e^x - e^{-x}}{e^x + e^{-x}}$
- _sigmoida logistică_: $f(x) = \frac{1}{1 + \exp(-x)}$
- _Rectified Linear Unit (ReLU)_: $f(x) = \max(0, x) = \begin{cases} 0 & \text{dacă } x \leq 0 \\ x & \text{dacă } x > 0 \end{cases}$

De asemenea, clasificatorul MLP din Scikit-learn utilizează și algoritmi de optimizare a ponderilor (solvers): _LBFGS_ (algoritm Quasi-Newton), _SGD_ (stochastic gradient descent) și _Adam_ (algoritm derivat din SGD, creat de Diederik P. Kingma și Jimmy Lei Ba).
Astfel, acest model optimizează funcția log-loss folosind LBFGS sau stochastic gradient descent.

[Vezi performanta modelului](#MLP_performance)

In [15]:
def MultiLayerPerceptronClassifier_stats(X: np.ndarray, y: np.ndarray):
    """
    Apply MultiLayerPerceptronClassifier after scaling the data and 
    print the average scores (accuracy, precision, recall, F1).
    
    :param X: dataset without labels
    :param y: labels
    
    :return: Nothing
    """
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    print("MultiLayerPerceptronClassifier \n max_iter=3000, alpha=1e-5, hidden_layer_sizes=(40, 20), random_state=0\n")
    model = MLPClassifier(max_iter=3000, alpha=1e-5, hidden_layer_sizes=(40, 20), random_state=0)
    print_fold_cross_validation_scores(model, X_scaled, y)

## <u> Aplicarea modelelor de clasificare </u>

### Glass dataset

In [16]:
X_glass: np.ndarray = glass_df.values[:, :-1]
Y_glass: np.ndarray = glass_df.values[:, -1]

#### KNeighborsClassifier

In [17]:
KNeighborsClassifier_stats(X_glass, Y_glass)

KNeighborsClassifier 
 n_neighbors=3, p=1 


 Train set
Accuracy average score: 0.8481368149054808
Precision average score: 0.8341999356042606
Recall average score: 0.8010774730535957
F1 average score: 0.8061701455087057

 Test set
Accuracy average score: 0.640531561461794
Precision average score: 0.5327259287553405
Recall average score: 0.5725198412698412
F1 average score: 0.5227413579269237


#### DecisionTreeClassifier

In [18]:
DecisionTreeClassifier_stats(X_glass, Y_glass)

DecisionTreeClassifier 
 random_state=0 


 Train set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0

 Test set
Accuracy average score: 0.5512735326688815
Precision average score: 0.536380676016226
Recall average score: 0.5242460317460317
F1 average score: 0.5007263654306666


#### StochasticGradientDescentClassifier

In [19]:
StochasticGradientDescentClassifier_stats(X_glass, Y_glass)

StochasticGradientDescentClassifier 
 loss=hinge, penalty=l2, max_iter=80 


 Train set
Accuracy average score: 0.5782877736978104
Precision average score: 0.591761996322853
Recall average score: 0.5303274368776864
F1 average score: 0.510623721045899

 Test set
Accuracy average score: 0.48139534883720925
Precision average score: 0.37356306459722016
Recall average score: 0.4185714285714286
F1 average score: 0.35174465080586514


#### RandomForestClassifier

In [20]:
RandomForestClassifier_stats(X_glass, Y_glass)

RandomForestClassifier 
 max_depth=2, random_state=0 


 Train set
Accuracy average score: 0.6775669794641642
Precision average score: 0.3714987822870393
Recall average score: 0.42126121463077987
F1 average score: 0.3920213407375831

 Test set
Accuracy average score: 0.5750830564784052
Precision average score: 0.31722948951209823
Recall average score: 0.368234126984127
F1 average score: 0.3318975697116091


#### MultiLayerPerceptronClassifier

In [21]:
MultiLayerPerceptronClassifier_stats(X_glass, Y_glass)

MultiLayerPerceptronClassifier 
 max_iter=3000, alpha=1e-5, hidden_layer_sizes=(40, 20), random_state=0


 Train set
Accuracy average score: 0.9591187270501835
Precision average score: 0.9630545484791817
Recall average score: 0.9704878500370304
F1 average score: 0.966399247707004

 Test set
Accuracy average score: 0.6867109634551495
Precision average score: 0.6692027392685287
Recall average score: 0.6811706349206348
F1 average score: 0.6343925192803044


### Wine dataset

In [22]:
X_wine: np.ndarray = wine_transformed_values[:, 1:]
Y_wine: np.ndarray = wine_transformed_values[:, 0]

#### KNeighborsClassifier

In [23]:
KNeighborsClassifier_stats(X_wine, Y_wine)

KNeighborsClassifier 
 n_neighbors=3, p=1 


 Train set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0

 Test set
Accuracy average score: 0.9944444444444445
Precision average score: 0.9955555555555555
Recall average score: 0.9944444444444445
F1 average score: 0.9948025987006496


#### DecisionTreeClassifier

In [24]:
DecisionTreeClassifier_stats(X_wine, Y_wine)

DecisionTreeClassifier 
 random_state=0 


 Train set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0

 Test set
Accuracy average score: 0.9382539682539683
Precision average score: 0.9438905538905539
Recall average score: 0.943968253968254
F1 average score: 0.9392268073704724


#### StochasticGradientDescentClassifier

In [25]:
StochasticGradientDescentClassifier_stats(X_wine, Y_wine)

StochasticGradientDescentClassifier 
 loss=hinge, penalty=l2, max_iter=80 


 Train set
Accuracy average score: 0.9971929478971733
Precision average score: 0.997183908045977
Recall average score: 0.9970551378446115
F1 average score: 0.9970869185090429

 Test set
Accuracy average score: 0.9609523809523809
Precision average score: 0.9654778554778554
Recall average score: 0.9669841269841271
F1 average score: 0.962777915414334


#### RandomForestClassifier

In [26]:
RandomForestClassifier_stats(X_wine, Y_wine)

RandomForestClassifier 
 max_depth=2, random_state=0 


 Train set
Accuracy average score: 0.9971929478971733
Precision average score: 0.997250566893424
Recall average score: 0.9976399331662489
F1 average score: 0.9974203892885919

 Test set
Accuracy average score: 0.9888888888888889
Precision average score: 0.9888111888111888
Recall average score: 0.9904761904761905
F1 average score: 0.9892204585537918


#### MultiLayerPerceptronClassifier

In [27]:
MultiLayerPerceptronClassifier_stats(X_wine, Y_wine)

MultiLayerPerceptronClassifier 
 max_iter=3000, alpha=1e-5, hidden_layer_sizes=(40, 20), random_state=0


 Train set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0

 Test set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0


### Iris dataset

In [28]:
X_iris: np.ndarray = iris_df.values[:, :-1]
Y_iris: np.ndarray = iris_df.values[:, -1]

#### KNeighborsClassifier

In [29]:
KNeighborsClassifier_stats(X_iris, Y_iris)

KNeighborsClassifier 
 n_neighbors=3, p=1 


 Train set
Accuracy average score: 0.9566666666666667
Precision average score: 0.9568900048746005
Recall average score: 0.9566666666666668
F1 average score: 0.9566598928205734

 Test set
Accuracy average score: 0.9533333333333334
Precision average score: 0.9572390572390572
Recall average score: 0.9533333333333334
F1 average score: 0.9531151110098477


#### DecisionTreeClassifier

In [30]:
DecisionTreeClassifier_stats(X_iris, Y_iris)

DecisionTreeClassifier 
 random_state=0 


 Train set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0

 Test set
Accuracy average score: 0.9600000000000002
Precision average score: 0.9622895622895623
Recall average score: 0.9600000000000002
F1 average score: 0.9598997493734336


#### StochasticGradientDescentClassifier

In [31]:
StochasticGradientDescentClassifier_stats(X_iris, Y_iris)

StochasticGradientDescentClassifier 
 loss=hinge, penalty=l2, max_iter=80 


 Train set
Accuracy average score: 0.9349999999999999
Precision average score: 0.9401815151983259
Recall average score: 0.9349999999999999
F1 average score: 0.9350202321830517

 Test set
Accuracy average score: 0.9400000000000001
Precision average score: 0.9472727272727273
Recall average score: 0.9399999999999998
F1 average score: 0.9399691147059567


#### RandomForestClassifier

In [32]:
RandomForestClassifier_stats(X_iris, Y_iris)

RandomForestClassifier 
 max_depth=2, random_state=0 


 Train set
Accuracy average score: 0.96
Precision average score: 0.9603590903462533
Recall average score: 0.96
F1 average score: 0.9599854085237091

 Test set
Accuracy average score: 0.9533333333333334
Precision average score: 0.9572390572390572
Recall average score: 0.9533333333333334
F1 average score: 0.9531151110098477


#### MultiLayerPerceptronClassifier

In [33]:
MultiLayerPerceptronClassifier_stats(X_iris, Y_iris)

MultiLayerPerceptronClassifier 
 max_iter=3000, alpha=1e-5, hidden_layer_sizes=(40, 20), random_state=0


 Train set
Accuracy average score: 0.9799999999999999
Precision average score: 0.9802275074803959
Recall average score: 0.9799999999999999
F1 average score: 0.9799942679833593

 Test set
Accuracy average score: 0.9733333333333334
Precision average score: 0.9755555555555556
Recall average score: 0.9733333333333333
F1 average score: 0.9731986531986532


### Dermatology dataset

In [34]:
X_dermatology: np.ndarray = dermatology_transformed_values[:, :-1]
Y_dermatology: np.ndarray = dermatology_transformed_values[:, -1]

#### KNeighborsClassifier

In [35]:
KNeighborsClassifier_stats(X_dermatology, Y_dermatology)

KNeighborsClassifier 
 n_neighbors=3, p=1 


 Train set
Accuracy average score: 0.983608396839497
Precision average score: 0.9812763343042443
Recall average score: 0.9824526862026863
F1 average score: 0.9816980914489996

 Test set
Accuracy average score: 0.9589781562384301
Precision average score: 0.9574395974395975
Recall average score: 0.9588250959990091
F1 average score: 0.9561565653253632


#### DecisionTreeClassifier

In [36]:
DecisionTreeClassifier_stats(X_dermatology, Y_dermatology)

DecisionTreeClassifier 
 random_state=0 


 Train set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0

 Test set
Accuracy average score: 0.9153276564235469
Precision average score: 0.9205887260887261
Recall average score: 0.9097541991020253
F1 average score: 0.9097358381982849


#### StochasticGradientDescentClassifier

In [37]:
StochasticGradientDescentClassifier_stats(X_dermatology, Y_dermatology)

StochasticGradientDescentClassifier 
 loss=hinge, penalty=l2, max_iter=80 


 Train set
Accuracy average score: 0.9938566552901025
Precision average score: 0.9936285523949195
Recall average score: 0.9933542647828363
F1 average score: 0.9933611114192702

 Test set
Accuracy average score: 0.9671603109959275
Precision average score: 0.9681776741776741
Recall average score: 0.9646845746845747
F1 average score: 0.9637026241126165


#### RandomForestClassifier

In [38]:
RandomForestClassifier_stats(X_dermatology, Y_dermatology)

RandomForestClassifier 
 max_depth=2, random_state=0 


 Train set
Accuracy average score: 0.8101103370891579
Precision average score: 0.6731768475122735
Recall average score: 0.6658080411651841
F1 average score: 0.6087286594845375

 Test set
Accuracy average score: 0.7923731951129211
Precision average score: 0.5619188034188035
Recall average score: 0.6455167055167055
F1 average score: 0.5883530107510742


#### MultiLayerPerceptronClassifier

In [39]:
MultiLayerPerceptronClassifier_stats(X_dermatology, Y_dermatology)

MultiLayerPerceptronClassifier 
 max_iter=3000, alpha=1e-5, hidden_layer_sizes=(40, 20), random_state=0


 Train set
Accuracy average score: 1.0
Precision average score: 1.0
Recall average score: 1.0
F1 average score: 1.0

 Test set
Accuracy average score: 0.9753424657534246
Precision average score: 0.9728399378399379
Recall average score: 0.9731746031746032
F1 average score: 0.9726293309763943


## <u> Performanta fiecarui model </u>

Raportati performanta fiecarui model, folosind 5 fold cross validation. Pentru fiecare din cele 5 rulari, cautati hiperparametrii optimi folosind 4-fold cross validation. Performanta modelului va fi raportata ca medie a celor  5 rulari.

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

<a id='knn_performance'></a>

### KNeighborsClassifier

In [41]:
def KNeighborsClassifier_performance(X: np.ndarray, y: np.ndarray) -> None:
    """
    Apply cross_val_score on the KNeighborsClassifier model with 5 fold cross validation 
    using the hyperparameters found with 4 fold cross validation
    
    :param X: data set without the labels
    :param Y: labels
    
    :returns: nothing
    """
    pipe = Pipeline([('scaler', MinMaxScaler()), ('knn', KNeighborsClassifier())])
    
    parameter_grid: Dict[str, List[Any]] = {'knn__n_neighbors': list(range(1, 10)), 
                      'knn__p': list(range(1, 5))}
    
    grid_search = GridSearchCV(pipe, param_grid = parameter_grid, scoring = 'accuracy', 
                        cv=4, n_jobs=4)
    
    grid_search.fit(X, y)
    print(f"Best params: {grid_search.best_params_}")
    
    scores: np.ndarray = cross_val_score(grid_search, X, y, cv=5)
    print(f'Accuracy: {scores.mean()}')
     
    grid_search = pd.DataFrame(grid_search.cv_results_)
    display(grid_search.head())

#### Glass dataset

In [42]:
KNeighborsClassifier_performance(X_glass, Y_glass)

Best params: {'knn__n_neighbors': 3, 'knn__p': 1}
Accuracy: 0.6125138427464009


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,param_knn__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002485,0.00035,0.00545,0.000779,1,1,"{'knn__n_neighbors': 1, 'knn__p': 1}",0.592593,0.703704,0.622642,0.603774,0.630678,0.043508,8
1,0.002111,0.000411,0.006201,0.001426,1,2,"{'knn__n_neighbors': 1, 'knn__p': 2}",0.574074,0.666667,0.603774,0.566038,0.602638,0.039549,19
2,0.001985,3e-06,0.008434,0.000784,1,3,"{'knn__n_neighbors': 1, 'knn__p': 3}",0.555556,0.648148,0.603774,0.584906,0.598096,0.03362,22
3,0.001737,0.000249,0.007191,0.000554,1,4,"{'knn__n_neighbors': 1, 'knn__p': 4}",0.555556,0.611111,0.603774,0.584906,0.588836,0.021461,26
4,0.002232,0.000744,0.006324,0.002115,2,1,"{'knn__n_neighbors': 2, 'knn__p': 1}",0.574074,0.722222,0.584906,0.660377,0.635395,0.060151,7


#### Wine dataset

In [43]:
KNeighborsClassifier_performance(X_wine, Y_wine)

Best params: {'knn__n_neighbors': 5, 'knn__p': 1}
Accuracy: 0.9944444444444445


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,param_knn__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002109,0.000541,0.004588,0.000645,1,1,"{'knn__n_neighbors': 1, 'knn__p': 1}",0.977778,1.0,1.0,1.0,0.994444,0.009623,3
1,0.002851,0.00133,0.003596,0.000214,1,2,"{'knn__n_neighbors': 1, 'knn__p': 2}",0.977778,1.0,0.977273,1.0,0.988763,0.011239,23
2,0.001859,0.000214,0.006199,0.000247,1,3,"{'knn__n_neighbors': 1, 'knn__p': 3}",0.955556,1.0,0.977273,1.0,0.983207,0.018465,28
3,0.001736,0.00043,0.008308,0.002679,1,4,"{'knn__n_neighbors': 1, 'knn__p': 4}",0.955556,1.0,0.977273,1.0,0.983207,0.018465,28
4,0.001984,0.000351,0.004712,0.000554,2,1,"{'knn__n_neighbors': 2, 'knn__p': 1}",0.977778,1.0,1.0,0.977273,0.988763,0.011239,23


#### Iris dataset

In [44]:
KNeighborsClassifier_performance(X_iris, Y_iris)

Best params: {'knn__n_neighbors': 3, 'knn__p': 3}
Accuracy: 0.9533333333333334


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,param_knn__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003348,0.002679699,0.004588,0.001933,1,1,"{'knn__n_neighbors': 1, 'knn__p': 1}",0.947368,0.921053,0.891892,1.0,0.940078,0.039773,34
1,0.001488,3.371748e-07,0.003224,0.000248,1,2,"{'knn__n_neighbors': 1, 'knn__p': 2}",0.973684,0.947368,0.891892,1.0,0.953236,0.040008,20
2,0.001612,0.0002148399,0.004215,0.000248,1,3,"{'knn__n_neighbors': 1, 'knn__p': 3}",0.973684,0.947368,0.918919,1.0,0.959993,0.030143,8
3,0.002109,0.001074573,0.004836,0.001014,1,4,"{'knn__n_neighbors': 1, 'knn__p': 4}",0.973684,0.947368,0.918919,1.0,0.959993,0.030143,8
4,0.003844,0.003045243,0.003844,0.000542,2,1,"{'knn__n_neighbors': 2, 'knn__p': 1}",0.973684,0.921053,0.918919,0.945946,0.9399,0.022211,35


#### Dermatology dataset

In [45]:
KNeighborsClassifier_performance(X_dermatology, Y_dermatology)

Best params: {'knn__n_neighbors': 6, 'knn__p': 1}
Accuracy: 0.9644576082932247


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,param_knn__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003589,0.000412,0.007564,0.000813,1,1,"{'knn__n_neighbors': 1, 'knn__p': 1}",0.978261,0.967391,0.945055,0.934066,0.956193,0.017508,11
1,0.00558,0.00365,0.007687,0.000431,1,2,"{'knn__n_neighbors': 1, 'knn__p': 2}",0.967391,0.978261,0.934066,0.923077,0.950699,0.022792,23
2,0.003224,0.000248,0.035588,0.007234,1,3,"{'knn__n_neighbors': 1, 'knn__p': 3}",0.934783,0.967391,0.923077,0.923077,0.937082,0.01814,30
3,0.0031,0.000215,0.03782,0.00494,1,4,"{'knn__n_neighbors': 1, 'knn__p': 4}",0.923913,0.967391,0.912088,0.923077,0.931617,0.021175,34
4,0.007439,0.006044,0.009177,0.001424,2,1,"{'knn__n_neighbors': 2, 'knn__p': 1}",0.945652,0.978261,0.945055,0.945055,0.953506,0.014294,14


<a id='dt_performance'></a>
### DecisionTreeClassifier

In [46]:
def DecisionTreeClassifier_performance(X: np.ndarray, y: np.ndarray):
    """
    Apply cross_val_score on the DecisionTreeClassifier model with 5 fold cross validation 
    using the hyperparameters found with 4 fold cross validation
    
    :param X: data set without the labels
    :param Y: labels
    
    :returns: nothing
    """
    pipe = Pipeline([('scaler', MinMaxScaler()), ('dt', DecisionTreeClassifier())])
    
    parameter_grid: Dict[str, List[Any]] = {'dt__criterion': ['entropy', 'gini'], 
                  'dt__max_depth': list(range(3,9)),
                  'dt__min_samples_leaf': list(range(10,20))}
    
    grid_search = GridSearchCV(pipe, param_grid = parameter_grid, scoring = 'accuracy', 
                        cv=5, n_jobs=4)
    
    grid_search.fit(X, y)
    print(f"Best params: {grid_search.best_params_}")
    
    scores: np.ndarray = cross_val_score(grid_search, X, y, cv=5)
    print(f'Accuracy: {scores.mean()}')
     
    grid_search = pd.DataFrame(grid_search.cv_results_)
    display(grid_search.head())

#### Glass dataset

In [47]:
DecisionTreeClassifier_performance(X_glass, Y_glass)

Best params: {'dt__criterion': 'gini', 'dt__max_depth': 5, 'dt__min_samples_leaf': 16}
Accuracy: 0.6121816168327796


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__criterion,param_dt__max_depth,param_dt__min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00238,0.000198,0.000893,0.0001986034,entropy,3,10,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.511628,0.465116,0.534884,0.604651,0.666667,0.556589,0.071115,93
1,0.002083,0.000198,0.000894,0.0001981755,entropy,3,11,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.55814,0.465116,0.534884,0.604651,0.642857,0.56113,0.060811,89
2,0.002281,0.000247,0.000692,0.0002410982,entropy,3,12,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.55814,0.488372,0.488372,0.604651,0.642857,0.556478,0.061744,96
3,0.001982,4e-06,0.000797,0.0002453753,entropy,3,13,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.55814,0.511628,0.488372,0.604651,0.642857,0.56113,0.057143,89
4,0.002678,0.000743,0.000992,7.629395e-07,entropy,3,14,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.55814,0.488372,0.488372,0.55814,0.642857,0.547176,0.057116,98


#### Wine dataset

In [48]:
DecisionTreeClassifier_performance(X_wine, Y_wine)

Best params: {'dt__criterion': 'entropy', 'dt__max_depth': 3, 'dt__min_samples_leaf': 10}
Accuracy: 0.9833333333333332


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__criterion,param_dt__max_depth,param_dt__min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002183,0.000506,0.000794,0.000243,entropy,3,10,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.944444,1.0,0.972222,1.0,1.0,0.983333,0.022222,1
1,0.002186,0.000239,0.001187,0.000399,entropy,3,11,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.944444,1.0,0.972222,1.0,1.0,0.983333,0.022222,1
2,0.002083,0.000199,0.000694,0.000243,entropy,3,12,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.944444,1.0,0.972222,1.0,1.0,0.983333,0.022222,1
3,0.001885,0.000199,0.000599,0.000197,entropy,3,13,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.944444,1.0,0.972222,1.0,1.0,0.983333,0.022222,1
4,0.001785,0.000397,0.000893,0.000198,entropy,3,14,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.944444,1.0,0.972222,1.0,1.0,0.983333,0.022222,1


#### Iris dataset

In [49]:
DecisionTreeClassifier_performance(X_iris, Y_iris)

Best params: {'dt__criterion': 'entropy', 'dt__max_depth': 3, 'dt__min_samples_leaf': 10}
Accuracy: 0.9333333333333332


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__criterion,param_dt__max_depth,param_dt__min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001588,0.000198,0.000892,0.000198,entropy,3,10,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.933333,0.966667,0.9,0.866667,1.0,0.933333,0.04714,1
1,0.002481,0.000541,0.001189,0.000397,entropy,3,11,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.933333,0.966667,0.9,0.866667,1.0,0.933333,0.04714,1
2,0.002082,0.000483,0.001194,0.000927,entropy,3,12,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.933333,0.966667,0.9,0.866667,1.0,0.933333,0.04714,1
3,0.001488,0.000314,0.000595,0.000198,entropy,3,13,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.933333,0.966667,0.9,0.866667,1.0,0.933333,0.04714,1
4,0.00149,4e-06,0.000693,0.000241,entropy,3,14,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.933333,0.966667,0.9,0.866667,1.0,0.933333,0.04714,1


#### Dermatology dataset

In [50]:
DecisionTreeClassifier_performance(X_dermatology, Y_dermatology)

Best params: {'dt__criterion': 'gini', 'dt__max_depth': 5, 'dt__min_samples_leaf': 15}
Accuracy: 0.9262865605331358


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dt__criterion,param_dt__max_depth,param_dt__min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002775,0.00051,0.000896,0.000367,entropy,3,10,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.756757,0.849315,0.808219,0.90411,0.90411,0.844502,0.056824,104
1,0.003671,0.00074,0.001891,0.001813,entropy,3,11,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.756757,0.821918,0.808219,0.90411,0.90411,0.839023,0.057414,109
2,0.003966,0.002569,0.001094,0.000726,entropy,3,12,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.756757,0.835616,0.808219,0.90411,0.90411,0.841762,0.056856,106
3,0.00367,0.001587,0.001192,0.000917,entropy,3,13,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.756757,0.835616,0.808219,0.90411,0.90411,0.841762,0.056856,106
4,0.002682,0.000864,0.000889,0.000196,entropy,3,14,"{'dt__criterion': 'entropy', 'dt__max_depth': ...",0.77027,0.835616,0.808219,0.90411,0.90411,0.844465,0.052937,105


<a id='sgd_performance'></a>
### StochasticGradientDescentClassifier

In [51]:
def SGDClassifier_performance(X: np.ndarray, y: np.ndarray):
    """
    Apply cross_val_score on the SGDClassifier model with 5 fold cross validation 
    using the hyperparameters found with 4 fold cross validation
    
    :param X: data set without the labels
    :param Y: labels
    
    :returns: nothing
    """
    pipe = Pipeline([('scaler', MinMaxScaler()), ('sgd', SGDClassifier())])
    
    parameter_grid: Dict[str, List[Any]] = {'sgd__loss': ['hinge', 'log', 'modified_huber', 'perceptron'], 
                  'sgd__penalty': ['l2', 'l1', 'elasticnet'],
                 'sgd__alpha': [0.0001, 0.0002]}
        
    grid_search = GridSearchCV(pipe, param_grid = parameter_grid, scoring = 'accuracy', 
                        cv=4, n_jobs=4)
    
    grid_search.fit(X, y)
    print(f"Best params: {grid_search.best_params_}")
    
    scores: np.ndarray = cross_val_score(grid_search, X, y, cv=5)
    print(f'Accuracy: {scores.mean()}')
     
    grid_search = pd.DataFrame(grid_search.cv_results_)
    display(grid_search.head())

#### Glass dataset

In [52]:
SGDClassifier_performance(X_glass, Y_glass)

Best params: {'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sgd__penalty': 'l2'}
Accuracy: 0.46821705426356586


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgd__alpha,param_sgd__loss,param_sgd__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.014509,0.003702,0.028272,0.027797,0.0001,hinge,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.333333,0.592593,0.584906,0.54717,0.5145,0.106,10
1,0.010046,0.000415,0.000744,0.000248,0.0001,hinge,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.333333,0.574074,0.54717,0.622642,0.519305,0.110725,9
2,0.013394,0.007161,0.001116,0.000214,0.0001,hinge,elasticnet,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.351852,0.5,0.471698,0.566038,0.472397,0.07756,18
3,0.010669,0.001335,0.000615,0.000206,0.0001,log,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.574074,0.555556,0.528302,0.641509,0.57486,0.041782,1
4,0.013763,0.001835,0.007564,0.011957,0.0001,log,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.425926,0.62963,0.528302,0.622642,0.551625,0.082874,3


#### Wine dataset

In [53]:
SGDClassifier_performance(X_wine, Y_wine)

Best params: {'sgd__alpha': 0.0002, 'sgd__loss': 'hinge', 'sgd__penalty': 'elasticnet'}
Accuracy: 0.9553968253968254


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgd__alpha,param_sgd__loss,param_sgd__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004837,0.001245,0.000988,0.000345,0.0001,hinge,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.911111,1.0,0.977273,1.0,0.972096,0.036412,5
1,0.007069,0.001071,0.000742,0.000426,0.0001,hinge,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.955556,0.977778,0.977273,0.977273,0.97197,0.009479,6
2,0.006073,0.001133,0.000874,0.000219,0.0001,hinge,elasticnet,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.911111,0.977778,0.977273,0.977273,0.960859,0.028722,14
3,0.005208,0.000555,0.000744,0.000248,0.0001,log,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.888889,0.955556,0.977273,0.977273,0.949747,0.036238,20
4,0.006071,0.001235,0.000744,0.000248,0.0001,log,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.955556,0.977778,0.977273,0.977273,0.97197,0.009479,6


#### Iris dataset

In [54]:
SGDClassifier_performance(X_iris, Y_iris)

Best params: {'sgd__alpha': 0.0002, 'sgd__loss': 'perceptron', 'sgd__penalty': 'elasticnet'}
Accuracy: 0.9266666666666665


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgd__alpha,param_sgd__loss,param_sgd__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004836,0.000645,0.001123,0.0004173915,0.0001,hinge,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.973684,0.947368,0.891892,0.945946,0.939723,0.029742,6
1,0.00657,0.001903,0.00149,0.001166242,0.0001,hinge,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.921053,0.947368,0.864865,0.972973,0.926565,0.040074,11
2,0.006076,0.001075,0.000496,5.430242e-07,0.0001,hinge,elasticnet,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.921053,0.921053,0.837838,0.972973,0.913229,0.048414,16
3,0.005084,0.000215,0.000868,0.0002151142,0.0001,log,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.921053,0.947368,0.891892,0.972973,0.933321,0.030152,7
4,0.004836,0.000734,0.000743,0.0004298154,0.0001,log,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.973684,0.921053,0.918919,0.864865,0.91963,0.038482,15


#### Dermatology dataset

In [55]:
SGDClassifier_performance(X_dermatology, Y_dermatology)

Best params: {'sgd__alpha': 0.0001, 'sgd__loss': 'perceptron', 'sgd__penalty': 'elasticnet'}
Accuracy: 0.967197334320622


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_sgd__alpha,param_sgd__loss,param_sgd__penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01178,0.003531,0.001364,0.000411,0.0001,hinge,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.967391,0.967391,0.956044,0.967033,0.964465,0.004864,16
1,0.015374,0.000783,0.000744,0.000248,0.0001,hinge,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.978261,0.98913,0.956044,0.945055,0.967123,0.017451,11
2,0.015374,0.003381,0.00062,0.000215,0.0001,hinge,elasticnet,"{'sgd__alpha': 0.0001, 'sgd__loss': 'hinge', '...",0.978261,0.945652,0.967033,0.967033,0.964495,0.011805,13
3,0.0217,0.011323,0.000869,0.000215,0.0001,log,l2,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.967391,0.978261,0.978022,0.967033,0.972677,0.005467,3
4,0.0248,0.006043,0.001117,0.000215,0.0001,log,l1,"{'sgd__alpha': 0.0001, 'sgd__loss': 'log', 'sg...",0.978261,1.0,0.967033,0.956044,0.975334,0.016263,2


<a id='rf_performance'></a>
### RandomForestClassifier

In [56]:
def RandomForestClassifier_performance(X: np.ndarray, y: np.ndarray) -> None:
    """
    Apply cross_val_score on the RandomForestClassifier model with 5 fold cross validation 
    using the hyperparameters found with 4 fold cross validation
    
    :param X: data set without the labels
    :param Y: labels
    
    :returns: nothing
    """
    pipe = Pipeline([('scaler', MinMaxScaler()), ('rf', RandomForestClassifier())])
    
    parameter_grid: Dict[str, List[Any]] = {'rf__n_estimators': list(range(1,10)), 
                'rf__criterion': ['entropy', 'gini'],
                'rf__max_depth': list(range(3,9))}
    
    grid_search = GridSearchCV(pipe, param_grid = parameter_grid, scoring = 'accuracy', 
                        cv=4, n_jobs=4)
    
    grid_search.fit(X, y)
    print(f"Best params: {grid_search.best_params_}")
    
    scores: np.ndarray = cross_val_score(grid_search, X, y, cv=5)
    print(f'Accuracy: {scores.mean()}')
     
    grid_search = pd.DataFrame(grid_search.cv_results_)
    display(grid_search.head())

#### Glass dataset

In [57]:
RandomForestClassifier_performance(X_glass, Y_glass)

Best params: {'rf__criterion': 'entropy', 'rf__max_depth': 8, 'rf__n_estimators': 8}
Accuracy: 0.6219269102990033


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__criterion,param_rf__max_depth,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004221,0.000252,0.00136,0.0002122609,entropy,3,1,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.518519,0.537037,0.490566,0.603774,0.537474,0.0417,107
1,0.007068,0.000215,0.001489,1.976862e-07,entropy,3,2,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.648148,0.555556,0.660377,0.584906,0.612247,0.043494,72
2,0.010292,0.000412,0.001488,6.078505e-07,entropy,3,3,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.5,0.611111,0.716981,0.641509,0.6174,0.077974,67
3,0.017235,0.008384,0.001736,0.0002478366,entropy,3,4,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.722222,0.592593,0.584906,0.603774,0.625874,0.05603,65
4,0.020459,0.005946,0.00248,0.0009272223,entropy,3,5,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.5,0.5,0.716981,0.679245,0.599057,0.099951,84


#### Wine dataset

In [58]:
RandomForestClassifier_performance(X_wine, Y_wine)

Best params: {'rf__criterion': 'entropy', 'rf__max_depth': 5, 'rf__n_estimators': 9}
Accuracy: 0.9722222222222221


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__criterion,param_rf__max_depth,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00496,0.001446,0.002108,0.001074,entropy,3,1,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.8,0.977778,1.0,0.954545,0.933081,0.078497,81
1,0.006572,0.000214,0.001488,0.000351,entropy,3,2,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.866667,0.866667,0.931818,0.954545,0.904924,0.039092,96
2,0.011156,0.00102,0.001487,2e-06,entropy,3,3,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.866667,0.977778,1.0,0.954545,0.949747,0.050588,61
3,0.011656,0.000556,0.004588,0.003768,entropy,3,4,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.911111,0.977778,0.977273,0.977273,0.960859,0.028722,50
4,0.025793,0.012653,0.002231,0.000427,entropy,3,5,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.822222,0.977778,0.954545,1.0,0.938636,0.069107,78


#### Iris dataset

In [59]:
RandomForestClassifier_performance(X_iris, Y_iris)

Best params: {'rf__criterion': 'entropy', 'rf__max_depth': 8, 'rf__n_estimators': 7}
Accuracy: 0.9666666666666668


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__criterion,param_rf__max_depth,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004836,0.001233,0.001488,0.000352,entropy,3,1,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.947368,0.947368,0.891892,1.0,0.946657,0.038229,71
1,0.011163,0.003934,0.001485,0.000347,entropy,3,2,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.947368,0.921053,0.864865,0.972973,0.926565,0.040074,106
2,0.017976,0.009243,0.001617,0.000223,entropy,3,3,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.947368,0.973684,0.891892,0.972973,0.946479,0.033251,78
3,0.013268,0.002539,0.012648,0.012338,entropy,3,4,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.947368,0.947368,0.945946,0.972973,0.953414,0.011307,41
4,0.020832,0.007213,0.00186,0.000412,entropy,3,5,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.947368,0.947368,0.945946,0.972973,0.953414,0.011307,41


#### Dermatology dataset

In [60]:
RandomForestClassifier_performance(X_dermatology, Y_dermatology)

Best params: {'rf__criterion': 'entropy', 'rf__max_depth': 8, 'rf__n_estimators': 9}
Accuracy: 0.9289522399111441


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__criterion,param_rf__max_depth,param_rf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007321,0.00054,0.004211,0.002638,entropy,3,1,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.782609,0.48913,0.736264,0.736264,0.686067,0.115265,108
1,0.011287,0.001544,0.002973,0.002009,entropy,3,2,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.782609,0.76087,0.78022,0.659341,0.74576,0.050601,106
2,0.015004,0.005925,0.001984,0.00035,entropy,3,3,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.891304,0.891304,0.824176,0.813187,0.854993,0.036519,95
3,0.044888,0.038508,0.002232,0.000248,entropy,3,4,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.891304,0.804348,0.868132,0.912088,0.868968,0.040419,90
4,0.018724,0.002586,0.017856,0.021412,entropy,3,5,"{'rf__criterion': 'entropy', 'rf__max_depth': ...",0.793478,0.869565,0.934066,0.956044,0.888288,0.063297,73


<a id='MLP_performance'></a>
### MultiLayerPerceptronClassifier

In [61]:
def MLPClassifier_performance(X: np.ndarray, y: np.ndarray) -> None:
    """
    Apply cross_val_score on the MLPClassifier model with 5 fold cross validation 
    using the hyperparameters found with 4 fold cross validation
    
    :param X: data set without the labels
    :param Y: labels
    
    :returns: nothing
    """
    pipe = Pipeline([('scaler', MinMaxScaler()), ('MLP', MLPClassifier())])
    
    parameter_grid: Dict[str, List[Any]] = {'MLP__activation': ['tanh', 'relu'],
                'MLP__solver': ['sgd', 'adam']}
        
    grid_search = GridSearchCV(pipe, param_grid = parameter_grid, scoring = 'accuracy', 
                        cv=4, n_jobs=4)
    
    grid_search.fit(X, y)
    print(f"Best params: {grid_search.best_params_}")
    
    scores: np.ndarray = cross_val_score(grid_search, X, y, cv=5)
    print(f'Accuracy: {scores.mean()}')
     
    grid_search = pd.DataFrame(grid_search.cv_results_)
    display(grid_search.head())

#### Glass dataset

In [62]:
MLPClassifier_performance(X_glass, Y_glass)

Best params: {'MLP__activation': 'relu', 'MLP__solver': 'adam'}
Accuracy: 0.505094130675526


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_MLP__activation,param_MLP__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.493767,0.015597,0.003346,0.001423,tanh,sgd,"{'MLP__activation': 'tanh', 'MLP__solver': 'sgd'}",0.407407,0.37037,0.358491,0.358491,0.37369,0.020062,4
1,0.409947,0.00829,0.004088,0.002059,tanh,adam,"{'MLP__activation': 'tanh', 'MLP__solver': 'ad...",0.407407,0.462963,0.54717,0.660377,0.519479,0.095359,2
2,0.38887,0.004484,0.001483,0.00035,relu,sgd,"{'MLP__activation': 'relu', 'MLP__solver': 'sgd'}",0.574074,0.425926,0.471698,0.396226,0.466981,0.067422,3
3,0.408084,0.004371,0.002482,0.000929,relu,adam,"{'MLP__activation': 'relu', 'MLP__solver': 'ad...",0.462963,0.481481,0.566038,0.660377,0.542715,0.07826,1


#### Wine dataset

In [63]:
MLPClassifier_performance(X_wine, Y_wine)

Best params: {'MLP__activation': 'relu', 'MLP__solver': 'adam'}
Accuracy: 0.9831746031746033


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_MLP__activation,param_MLP__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.433877,0.014449,0.002608,0.001545,tanh,sgd,"{'MLP__activation': 'tanh', 'MLP__solver': 'sgd'}",0.977778,1.0,1.0,0.954545,0.983081,0.018808,2
1,0.365181,0.023779,0.00186,0.001235,tanh,adam,"{'MLP__activation': 'tanh', 'MLP__solver': 'ad...",0.955556,1.0,0.977273,0.977273,0.977525,0.015716,3
2,0.329716,0.002388,0.003106,0.000212,relu,sgd,"{'MLP__activation': 'relu', 'MLP__solver': 'sgd'}",0.955556,1.0,0.954545,1.0,0.977525,0.022478,4
3,0.379441,0.010109,0.002239,0.001285,relu,adam,"{'MLP__activation': 'relu', 'MLP__solver': 'ad...",1.0,0.977778,1.0,0.977273,0.988763,0.011239,1


#### Iris dataset

In [64]:
MLPClassifier_performance(X_iris, Y_iris)

Best params: {'MLP__activation': 'tanh', 'MLP__solver': 'adam'}
Accuracy: 0.9199999999999999


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_MLP__activation,param_MLP__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.376714,0.008798,0.002979,0.001856,tanh,sgd,"{'MLP__activation': 'tanh', 'MLP__solver': 'sgd'}",0.736842,0.684211,0.648649,0.756757,0.706615,0.042691,4
1,0.315456,0.019947,0.002108,0.000538,tanh,adam,"{'MLP__activation': 'tanh', 'MLP__solver': 'ad...",0.868421,0.973684,0.891892,1.0,0.933499,0.05478,1
2,0.302316,0.00965,0.001981,0.000783,relu,sgd,"{'MLP__activation': 'relu', 'MLP__solver': 'sgd'}",0.684211,0.789474,0.648649,0.72973,0.713016,0.052674,3
3,0.359354,0.019945,0.001991,0.000918,relu,adam,"{'MLP__activation': 'relu', 'MLP__solver': 'ad...",0.868421,0.921053,0.945946,0.972973,0.927098,0.038533,2


#### Dermatology dataset

In [65]:
MLPClassifier_performance(X_dermatology, Y_dermatology)

Best params: {'MLP__activation': 'relu', 'MLP__solver': 'adam'}
Accuracy: 0.9726027397260273


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_MLP__activation,param_MLP__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.764337,0.033197,0.002976,0.000928,tanh,sgd,"{'MLP__activation': 'tanh', 'MLP__solver': 'sgd'}",0.945652,0.913043,0.89011,0.901099,0.912476,0.020801,3
1,0.827948,0.014102,0.001859,0.000953,tanh,adam,"{'MLP__activation': 'tanh', 'MLP__solver': 'ad...",0.98913,0.967391,0.956044,0.945055,0.964405,0.016314,2
2,0.938679,0.017589,0.003101,0.001235,relu,sgd,"{'MLP__activation': 'relu', 'MLP__solver': 'sgd'}",0.836957,0.869565,0.901099,0.901099,0.87718,0.026552,4
3,0.961868,0.01298,0.002728,0.000894,relu,adam,"{'MLP__activation': 'relu', 'MLP__solver': 'ad...",1.0,0.978261,0.956044,0.945055,0.96984,0.021126,1
