## Full Repo
https://github.com/MostafaBelo/Konecta_Assignments/tree/main

## Imports

In [1]:
import pandas as pd

import numpy as np
from matplotlib import pyplot as plt

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from scipy.stats import randint, uniform, loguniform

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, roc_auc_score

In [2]:
df = pd.read_csv("diabetes.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Number of times pregnant                                                  768 non-null    int64  
 1   Plasma glucose concentration a 2 hours in an oral glucose tolerance test  768 non-null    int64  
 2   Diastolic blood pressure (mm Hg)                                          768 non-null    int64  
 3   Triceps skin fold thickness (mm)                                          768 non-null    int64  
 4   2-Hour serum insulin (mu U/ml)                                            768 non-null    int64  
 5   Body mass index (weight in kg/(height in m)^2)                            768 non-null    float64
 6   Diabetes pedigree function                                         

## Preprocessing

In [3]:
df.describe()
# unreasonable values are some fo the zeros (which probably are the equivalent to null or mising for some of the medical columns)

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
df.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
((df.isnull()) | (df == 0)).sum()

Number of times pregnant                                                    111
Plasma glucose concentration a 2 hours in an oral glucose tolerance test      5
Diastolic blood pressure (mm Hg)                                             35
Triceps skin fold thickness (mm)                                            227
2-Hour serum insulin (mu U/ml)                                              374
Body mass index (weight in kg/(height in m)^2)                               11
Diabetes pedigree function                                                    0
Age (years)                                                                   0
Class variable                                                              500
dtype: int64

In [6]:
def list_uniques(df: pd.DataFrame, lim=30):
    for col in df.columns:
        uq_vals = df[col].unique()
        print(col)
        if lim != -1 and len(uq_vals) > lim:
            print("Too many values")
        else:
            print(uq_vals)
        print()

# No nulls/missing values found
# Target is already encoded as 0/1
list_uniques(df, -1)

Number of times pregnant
[ 6  1  8  0  5  3 10  2  4  7  9 11 13 15 17 12 14]

Plasma glucose concentration a 2 hours in an oral glucose tolerance test
[148  85 183  89 137 116  78 115 197 125 110 168 139 189 166 100 118 107
 103 126  99 196 119 143 147  97 145 117 109 158  88  92 122 138 102  90
 111 180 133 106 171 159 146  71 105 101 176 150  73 187  84  44 141 114
  95 129  79   0  62 131 112 113  74  83 136  80 123  81 134 142 144  93
 163 151  96 155  76 160 124 162 132 120 173 170 128 108 154  57 156 153
 188 152 104  87  75 179 130 194 181 135 184 140 177 164  91 165  86 193
 191 161 167  77 182 157 178  61  98 127  82  72 172  94 175 195  68 186
 198 121  67 174 199  56 169 149  65 190]

Diastolic blood pressure (mm Hg)
[ 72  66  64  40  74  50   0  70  96  92  80  60  84  30  88  90  94  76
  82  75  58  78  68 110  56  62  85  86  48  44  65 108  55 122  54  52
  98 104  95  46 102 100  61  24  38 106 114]

Triceps skin fold thickness (mm)
[35 29  0 23 32 45 19 47 38 30 41 3

In [7]:
# Removing missing values

df_cleaned = df.copy()

df_cleaned.drop(df_cleaned[df_cleaned["Plasma glucose concentration a 2 hours in an oral glucose tolerance test"] == 0].index, inplace=True)
df_cleaned.drop(df_cleaned[df_cleaned["Diastolic blood pressure (mm Hg)"] == 0].index, inplace=True)
# df_cleaned.drop(df_cleaned[df_cleaned["Triceps skin fold thickness (mm)"] == 0].index, inplace=True) # too many zeros
# df_cleaned.drop(df_cleaned[df_cleaned["2-Hour serum insulin (mu U/ml)"] == 0].index, inplace=True) # too many zeros
df_cleaned.drop(df_cleaned[df_cleaned["Body mass index (weight in kg/(height in m)^2)"] == 0].index, inplace=True)

df = df_cleaned.copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 724 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Number of times pregnant                                                  724 non-null    int64  
 1   Plasma glucose concentration a 2 hours in an oral glucose tolerance test  724 non-null    int64  
 2   Diastolic blood pressure (mm Hg)                                          724 non-null    int64  
 3   Triceps skin fold thickness (mm)                                          724 non-null    int64  
 4   2-Hour serum insulin (mu U/ml)                                            724 non-null    int64  
 5   Body mass index (weight in kg/(height in m)^2)                            724 non-null    float64
 6   Diabetes pedigree function                                              

In [8]:
## Handling outliers

df_z = (df - df.mean())/df.std()

(df_z.abs() > 3).sum()

Number of times pregnant                                                     4
Plasma glucose concentration a 2 hours in an oral glucose tolerance test     0
Diastolic blood pressure (mm Hg)                                             8
Triceps skin fold thickness (mm)                                             1
2-Hour serum insulin (mu U/ml)                                              18
Body mass index (weight in kg/(height in m)^2)                               5
Diabetes pedigree function                                                  10
Age (years)                                                                  3
Class variable                                                               0
dtype: int64

In [9]:
# Inspecting outliers

df[df_z.abs()["Diabetes pedigree function"] > 3]

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
4,0,137,40,35,168,43.1,2.288,33,1
45,0,180,66,39,0,42.0,1.893,25,1
58,0,146,82,0,0,40.5,1.781,44,0
228,4,197,70,39,744,36.7,2.329,31,0
330,8,118,72,19,0,23.1,1.476,46,0
370,3,173,82,48,465,38.4,2.137,25,1
395,2,127,58,24,275,27.7,1.6,25,0
445,0,180,78,63,14,59.4,2.42,25,1
593,2,82,52,22,115,28.5,1.699,25,0
621,2,92,76,20,0,24.2,1.698,28,0


In [10]:
for col in df.columns:
    print(col)
    print((3 * df[col].std()) + df[col].mean(), (-3 * df[col].std()) + df[col].mean())
    print(df[col].describe())
    print()

Number of times pregnant
13.954430556217089 -6.22238635732206
count    724.000000
mean       3.866022
std        3.362803
min        0.000000
25%        1.000000
50%        3.000000
75%        6.000000
max       17.000000
Name: Number of times pregnant, dtype: float64

Plasma glucose concentration a 2 hours in an oral glucose tolerance test
214.13268663667898 29.632506733486764
count    724.000000
mean     121.882597
std       30.750030
min       44.000000
25%       99.750000
50%      117.000000
75%      142.000000
max      199.000000
Name: Plasma glucose concentration a 2 hours in an oral glucose tolerance test, dtype: float64

Diastolic blood pressure (mm Hg)
109.540163451345 35.2609415210307
count    724.000000
mean      72.400552
std       12.379870
min       24.000000
25%       64.000000
50%       72.000000
75%       80.000000
max      122.000000
Name: Diastolic blood pressure (mm Hg), dtype: float64

Triceps skin fold thickness (mm)
68.6416380558638 -25.754897724372075
count    7

In [11]:
# Dropping outliers in relevant columns (according to medical relvance to the outlier thresholds)

cols = list(df.columns)
print(cols)

df_no_outliers = df.copy()

df_no_outliers.drop(df[(df_z.abs()[cols[:6]] > 3).any(axis=1)].index, inplace=True)

df = df_no_outliers.copy()

df

['Number of times pregnant', 'Plasma glucose concentration a 2 hours in an oral glucose tolerance test', 'Diastolic blood pressure (mm Hg)', 'Triceps skin fold thickness (mm)', '2-Hour serum insulin (mu U/ml)', 'Body mass index (weight in kg/(height in m)^2)', 'Diabetes pedigree function', 'Age (years)', 'Class variable']


Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [12]:
df.describe()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,3.837681,120.288406,72.231884,20.727536,72.4,32.11058,0.466939,33.257971,0.327536
std,3.261648,29.826095,11.626686,15.316818,90.265707,6.502858,0.313856,11.706875,0.469655
min,0.0,44.0,38.0,0.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,0.0,0.0,27.3,0.245,24.0,0.0
50%,3.0,115.0,72.0,23.0,42.5,32.0,0.3725,29.0,0.0
75%,6.0,138.75,80.0,32.0,126.0,36.1,0.61375,41.0,1.0
max,13.0,199.0,108.0,60.0,415.0,52.9,2.288,81.0,1.0


In [13]:
# Normalizing Values

cols = df.columns
X = df[cols[:-1]]
y = df["Class variable"]

X = (X-X.mean())/X.std()

X.describe()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years)
count,690.0,690.0,690.0,690.0,690.0,690.0,690.0,690.0
mean,3.346759e-17,6.950962e-17,-9.525392e-17,-1.081261e-16,-9.267949e-17,2.986339e-16,1.081261e-16,4.5696140000000007e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.176608,-2.557774,-2.944251,-1.353253,-0.8020765,-2.139149,-1.23923,-1.047075
25%,-0.8700146,-0.713751,-0.7080164,-1.353253,-0.8020765,-0.7397639,-0.7071377,-0.7908149
50%,-0.2568276,-0.177308,-0.01994412,0.148364,-0.3312443,-0.01700479,-0.3008999,-0.3637154
75%,0.6629529,0.6189746,0.6681281,0.7359534,0.5938025,0.6134872,0.4677657,0.6613233
max,2.809107,2.639018,3.076381,2.564009,3.795461,3.196967,5.802225,4.078119


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state = 42)

## Tuning, Training & Evaluation

In [15]:
store = []
def eval_and_collect(name, model, params, method="grid"):
    global store
    
    assert method in ["grid", "random"], "Invalid Method"

    if method == "grid":
        grid_search = GridSearchCV(estimator=model, param_grid=params, verbose=1, cv=5, n_jobs=-1, scoring="f1")

        grid_result = grid_search.fit(X_train, y_train)
        print(grid_search.best_params_)

        y_pred=grid_result.predict(X_test)

    elif method == "random":
        random_search = RandomizedSearchCV(estimator=model, param_distributions=params, verbose=1, cv=5, n_iter=400, n_jobs=-1, random_state=42, scoring="f1")

        random_result = random_search.fit(X_train, y_train)
        print(random_search.best_params_)

        y_pred=random_result.predict(X_test)

    print(classification_report(y_test, y_pred, target_names=["Non-Diabetic", "Diabetic"]))
    metrics = classification_report(y_test, y_pred, target_names=["Non-Diabetic", "Diabetic"], output_dict=True)
    store.append({"Name": name, "model": model, "metrics": metrics})

### SVM

In [16]:
svm = svm.SVC()
params_grid = {
    "kernel":["linear", "rbf", "sigmoid", "poly"],
    
    "C": [0.1, 1, 10],
    "gamma": ["scale", "auto"],
    "degree": [2, 3],

    "class_weight":["balanced"]
}
eval_and_collect("SVM", svm, params_grid, "grid")

Fitting 5 folds for each of 48 candidates, totalling 240 fits


{'C': 1, 'class_weight': 'balanced', 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

Non-Diabetic       0.87      0.77      0.82        92
    Diabetic       0.62      0.76      0.69        46

    accuracy                           0.77       138
   macro avg       0.75      0.77      0.75       138
weighted avg       0.79      0.77      0.77       138



### KNN

In [17]:
knn = KNeighborsClassifier()
params_grid = {
    "n_neighbors":range(4,31)
}
eval_and_collect("KNN", knn, params_grid, "grid")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
{'n_neighbors': 9}
              precision    recall  f1-score   support

Non-Diabetic       0.77      0.89      0.82        92
    Diabetic       0.68      0.46      0.55        46

    accuracy                           0.75       138
   macro avg       0.72      0.67      0.68       138
weighted avg       0.74      0.75      0.73       138



### Logistic Regression

In [18]:
lrg = LogisticRegression(solver="saga")
params_grid = [
    {'penalty': ['l1'], 'C': [0.1, 1, 10], 'solver': ['saga'], "class_weight":["balanced"],"random_state":[42]},
    {'penalty': ['l2'], 'C': [0.1, 1, 10], 'solver': ['saga'], "class_weight":["balanced"],"random_state":[42]},
    {'penalty': ['elasticnet'], 'C': [0.1, 1, 10], 'l1_ratio': [0.1, 0.5, 0.9], 'solver': ['saga'], "class_weight":["balanced"],"random_state":[42]},
]
eval_and_collect("Logistic Regression", lrg, params_grid, "grid")

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'C': 0.1, 'class_weight': 'balanced', 'l1_ratio': 0.9, 'penalty': 'elasticnet', 'random_state': 42, 'solver': 'saga'}
              precision    recall  f1-score   support

Non-Diabetic       0.84      0.83      0.84        92
    Diabetic       0.67      0.70      0.68        46

    accuracy                           0.78       138
   macro avg       0.76      0.76      0.76       138
weighted avg       0.79      0.78      0.78       138



### Decision Tree

In [19]:
clf = DecisionTreeClassifier()
params_grid = {
    "max_depth":np.linspace(1,21).astype(np.int32),
    "class_weight":["balanced"],
    "random_state":[42]
}
eval_and_collect("Decision Tree", clf, params_grid, "grid")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'class_weight': 'balanced', 'max_depth': np.int32(5), 'random_state': 42}
              precision    recall  f1-score   support

Non-Diabetic       0.89      0.64      0.75        92
    Diabetic       0.54      0.85      0.66        46

    accuracy                           0.71       138
   macro avg       0.72      0.74      0.70       138
weighted avg       0.78      0.71      0.72       138



### Random Forest

In [None]:
rf = RandomForestClassifier()
param_distributions = {
    'n_estimators': randint(100, 300),       # number of trees
    'max_depth': randint(2,15),
    'min_samples_split': randint(2, 10),    # min samples to split a node
    'min_samples_leaf': randint(1, 5),      # min samples in a leaf
    'max_features': ['sqrt', 'log2', None],  # features to consider at split
    "class_weight": ["balanced"],

    "random_state":[42],
}
eval_and_collect("Random Forest", rf, param_distributions, "random")
# best results found: {'class_weight': 'balanced', 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 152, 'random_state': 42}

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
{'class_weight': 'balanced', 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 152, 'random_state': 42}
              precision    recall  f1-score   support

Non-Diabetic       0.87      0.78      0.82        92
    Diabetic       0.64      0.76      0.69        46

    accuracy                           0.78       138
   macro avg       0.75      0.77      0.76       138
weighted avg       0.79      0.78      0.78       138



### XGBoost

In [None]:
pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
xgb_model = XGBClassifier()
param_distributions = {
    "n_estimators":randint(100,300),       # number of boosting rounds
    "learning_rate":loguniform(1e-5,1e-1),      # step size shrinkage
    "max_depth":randint(2,15),            # max depth of trees
    "subsample":uniform(.7,.2),          # fraction of samples per tree
    "colsample_bytree":uniform(.7,.2),   # fraction of features per tree
    "scale_pos_weight":[pos_weight],
    
    "eval_metric":["logloss"],
    "random_state":[42],
}
eval_and_collect("XGBoost", xgb_model, param_distributions, "random")
# best results found: {'colsample_bytree': np.float64(0.8972421488959206), 'eval_metric': 'logloss', 'learning_rate': np.float64(0.0062036438354928424), 'max_depth': 2, 'n_estimators': 260, 'random_state': 42, 'scale_pos_weight': np.float64(2.066666666666667), 'subsample': np.float64(0.7822413441744372)}

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
{'colsample_bytree': np.float64(0.8972421488959206), 'eval_metric': 'logloss', 'learning_rate': np.float64(0.0062036438354928424), 'max_depth': 2, 'n_estimators': 260, 'random_state': 42, 'scale_pos_weight': np.float64(2.066666666666667), 'subsample': np.float64(0.7822413441744372)}
              precision    recall  f1-score   support

Non-Diabetic       0.89      0.78      0.83        92
    Diabetic       0.65      0.80      0.72        46

    accuracy                           0.79       138
   macro avg       0.77      0.79      0.78       138
weighted avg       0.81      0.79      0.79       138



## Evaluation

In [34]:
flattened_store = {}
for item in store:
    combined = {}
    for k,v in item["metrics"].items():
        if isinstance(v, dict):
            for sub_key, sub_value in v.items():
                combined[f"{k}_{sub_key}"] = sub_value
        else:
            combined[k] = v

    flattened_store[item["Name"]] = combined
flattened_df = pd.DataFrame.from_dict(flattened_store, orient="index")
flattened_df

Unnamed: 0,Non-Diabetic_precision,Non-Diabetic_recall,Non-Diabetic_f1-score,Non-Diabetic_support,Diabetic_precision,Diabetic_recall,Diabetic_f1-score,Diabetic_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
SVM,0.865854,0.771739,0.816092,92.0,0.625,0.76087,0.686275,46.0,0.768116,0.745427,0.766304,0.751183,138.0,0.785569,0.768116,0.772819,138.0
KNN,0.766355,0.891304,0.824121,92.0,0.677419,0.456522,0.545455,46.0,0.746377,0.721887,0.673913,0.684788,138.0,0.73671,0.746377,0.731232,138.0
Logistic Regression,0.844444,0.826087,0.835165,92.0,0.666667,0.695652,0.680851,46.0,0.782609,0.755556,0.76087,0.758008,138.0,0.785185,0.782609,0.783727,138.0
Decision Tree,0.893939,0.641304,0.746835,92.0,0.541667,0.847826,0.661017,46.0,0.710145,0.717803,0.744565,0.703926,138.0,0.776515,0.710145,0.718229,138.0
Random Forest,0.86747,0.782609,0.822857,92.0,0.636364,0.76087,0.693069,46.0,0.775362,0.751917,0.771739,0.757963,138.0,0.790434,0.775362,0.779595,138.0
XGBoost,0.888889,0.782609,0.83237,92.0,0.649123,0.804348,0.718447,46.0,0.789855,0.769006,0.793478,0.775408,138.0,0.808967,0.789855,0.794395,138.0


In [37]:
summary = flattened_df.sort_values("weighted avg_f1-score", ascending=False)
summary["weighted avg_f1-score"]

XGBoost                0.794395
Logistic Regression    0.783727
Random Forest          0.779595
SVM                    0.772819
KNN                    0.731232
Decision Tree          0.718229
Name: weighted avg_f1-score, dtype: float64

In [38]:
summary

Unnamed: 0,Non-Diabetic_precision,Non-Diabetic_recall,Non-Diabetic_f1-score,Non-Diabetic_support,Diabetic_precision,Diabetic_recall,Diabetic_f1-score,Diabetic_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
XGBoost,0.888889,0.782609,0.83237,92.0,0.649123,0.804348,0.718447,46.0,0.789855,0.769006,0.793478,0.775408,138.0,0.808967,0.789855,0.794395,138.0
Logistic Regression,0.844444,0.826087,0.835165,92.0,0.666667,0.695652,0.680851,46.0,0.782609,0.755556,0.76087,0.758008,138.0,0.785185,0.782609,0.783727,138.0
Random Forest,0.86747,0.782609,0.822857,92.0,0.636364,0.76087,0.693069,46.0,0.775362,0.751917,0.771739,0.757963,138.0,0.790434,0.775362,0.779595,138.0
SVM,0.865854,0.771739,0.816092,92.0,0.625,0.76087,0.686275,46.0,0.768116,0.745427,0.766304,0.751183,138.0,0.785569,0.768116,0.772819,138.0
KNN,0.766355,0.891304,0.824121,92.0,0.677419,0.456522,0.545455,46.0,0.746377,0.721887,0.673913,0.684788,138.0,0.73671,0.746377,0.731232,138.0
Decision Tree,0.893939,0.641304,0.746835,92.0,0.541667,0.847826,0.661017,46.0,0.710145,0.717803,0.744565,0.703926,138.0,0.776515,0.710145,0.718229,138.0


XGBoost was found to be the best model (according the metric of weighted avg f1-score), although most of the results are comparable