# Machine Learning Concepts and Principles
## Software Defect Detection

> Lazaros Panitsidis & Konstantinos Kravaritis<br />
> MSc Data Science <br />
> International Hellenic University <br />
> lpanitsidis@ihu.edu.gr & kkravaritis@ihu.edu.gr

## Contents
1. [Useful Python Libraries](#0)
1. [Data Content](#1)
1. [Feature Engineering](#2)
     1. [Data Preprocessing](#3)
     1. [Visualization & Analysis](#4)
1. [Feature Selection and Random Forest Classification](#5)
     1. [Feature Selection by Correlation](#6)
     1. [Univariate feature selection (SelectKbest)](#7)
     1. [Recursive Feature Elimination (RFE)](#8)
     1. [Recursive Feature Elimination with Cross-Validation (RFECV)](#9)
     1. [Feature importances with a forest of trees](#10)
     1. [XGBoost Feature Importances](#11)
     1. [Minimum Redundancy & Maximum Relevance](#12)
1. [Feature extraction with PCA](#11)
1. [Summary](#12)

<a id='0'></a>
## Useful Python Libraries

In [1]:
## write all the pip commands to download the packages below

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import statistics as stats # https://docs.python.org/3/library/statistics.html#statistics.fmean
#import scipy.stats as spstats
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
#import warnings library
import warnings
# ignore all warnings
warnings.filterwarnings('ignore')
# Ignore ConvergenceWarning
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)
from sklearn.utils._testing import ignore_warnings
with warnings.catch_warnings():
    # Catch and ignore ConvergenceWarnings
    warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Filter FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Filter ConvergenceWarnings
warnings.simplefilter(action='ignore', category=ConvergenceWarning)

### Validation & Normalization methods ###
from sklearn.model_selection import cross_validate, cross_val_score , GridSearchCV , StratifiedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

### ML models ###
from sklearn.linear_model import LogisticRegression # C1
from sklearn.linear_model import SGDClassifier # C1 loss: log_loss => LogisticRegression with SGD
from sklearn.linear_model import Perceptron # C2
from sklearn.svm import SVC, LinearSVC # C3 , C4
from sklearn.tree import DecisionTreeClassifier # C5
from sklearn.ensemble import RandomForestClassifier # C6
from sklearn.neural_network import MLPClassifier # C7

### Metrics ###
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, make_scorer, classification_report
from imblearn.metrics import geometric_mean_score # https://imbalanced-learn.org/stable/references/generated/imblearn.metrics.geometric_mean_score.html
import time
import timeit # https://stackoverflow.com/questions/17579357/time-time-vs-timeit-timeit


### Pipeline ###
from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html


### Custom Modules ###
import sys

sys.path.append("..")

from functions.data_types import optimize_dtypes
from functions.dataframe_actions import df_info, df_clean
from functions.ml_training import train_classifiers, train_classifiers_tuned

## Data preprocessing

### read the .csv files and make dataframes

In [3]:
# to read .csv files from another directory
data_location = "" # /<path>

jm1 = pd.read_csv(data_location + "jm1.csv")
mc1 = pd.read_csv(data_location + "mc1.csv")
pc3 = pd.read_csv(data_location + "pc3.csv")

### clean the dataframes from non-numeric data

In [4]:
# drop all rows that cointain non numeric valeus
jm1 = df_clean(jm1)
mc1 = df_clean(mc1)
pc3 = df_clean(pc3)

### extract useful information about the dataframes

In [5]:
dataframes = [jm1, mc1, pc3]
dataframe_names = ["jm1", "mc1", "pc3"]
df_info(dataframes, dataframe_names)

----- information for  jm1  -----
jm1  :  (10880, 22) (rows, columns)
jm1  :  0 missing values
jm1  :  1973 duplicated values
jm1  : Value counts for  defects
defects
False    8777
True     2103
Name: count, dtype: int64
----- information for  mc1  -----
mc1  :  (9466, 39) (rows, columns)
mc1  :  0 missing values
mc1  :  7450 duplicated values
mc1  : Value counts for  c
c
False    9398
True       68
Name: count, dtype: int64
----- information for  pc3  -----
pc3  :  (1563, 38) (rows, columns)
pc3  :  0 missing values
pc3  :  124 duplicated values
pc3  : Value counts for  c
c
False    1403
True      160
Name: count, dtype: int64


#### Label Encoding

* Use map instead of LabelEncoder() to ensure that False is 0 and True is 1 in all dataframes.
* With LabelEncoder() it depends on the order that the labels appear in the dataframe.

In [6]:
# class_le = LabelEncoder()
# jm1['defects'] = class_le.fit_transform(jm1['defects'].values)
# print("Classes of Label Encoder:", class_le.classes_)

In [7]:
# Map the "size" ordinal feature to an integer value
map_lexicon = {False: 0, True: 1}
jm1['defects'] = jm1['defects'].map(map_lexicon)
mc1['c'] = mc1['c'].map(map_lexicon)
pc3['c'] = pc3['c'].map(map_lexicon)

#### find optimal data types for faster computation

In [8]:
jm1 = optimize_dtypes(jm1)
mc1 = optimize_dtypes(mc1)
pc3 = optimize_dtypes(pc3)

In [9]:
print("---------- Optimal Data Types ----------")
print(jm1.dtypes)
print("---------- Optimal Data Types ----------")
print(mc1.dtypes)
print("---------- Optimal Data Types ----------")
print(pc3.dtypes)

---------- Optimal Data Types ----------
loc                  float16
v(g)                 float16
ev(g)                float16
iv(g)                float16
n                    float16
v                    float32
l                    float16
d                    float16
i                    float16
e                    float32
b                    float16
t                    float32
lOCode                uint16
lOComment             uint16
lOBlank               uint16
locCodeAndComment      uint8
uniq_Op              float16
uniq_Opnd            float16
total_Op             float16
total_Opnd           float16
branchCount          float16
defects                uint8
dtype: object
---------- Optimal Data Types ----------
LOC_BLANK                            uint8
BRANCH_COUNT                        uint16
CALL_PAIRS                           uint8
LOC_CODE_AND_COMMENT                 uint8
LOC_COMMENTS                         uint8
CONDITION_COUNT                     uint16
CYCLOMAT

#### Define Feature variables (inputs or predictors) and Target variables

In [10]:
jm1_y = jm1.defects
jm1_x = jm1.drop('defects',axis = 1 )

mc1_y = mc1.c
mc1_x = mc1.drop('c',axis = 1 )

pc3_y = pc3.c
pc3_x = pc3.drop('c',axis = 1 )

## ML Models

#### define the classifiers

In [11]:
lr = LogisticRegression()
perc = Perceptron()
#linear_svm = SVC(kernel='linear', cache_size=30000) # training doesnt stop
linear_svm = LinearSVC(dual="auto")
rbf_svm = SVC(kernel='rbf', cache_size=30000)
tree = DecisionTreeClassifier()
rf = RandomForestClassifier()
mlp = MLPClassifier()

classifiers = [lr, perc, linear_svm, rbf_svm, tree, rf, mlp]

In [24]:
from scipy.stats import loguniform


# Hyperparameter grid for Logistic Regression
lr_param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__max_iter': [100, 500, 1000],
    'classifier__solver': ['lbfgs', 'sag', 'saga'],
}

# Hyperparameter grid for Perceptron
perc_param_grid = {
    'classifier__alpha': [0.01, 0.1, 1],
    'classifier__max_iter': [100, 500, 1000],
    'classifier__eta0': [0.1, 0.01, 0.001],
}

# Hyperparameter grid for Linear SVM
linear_svm_param_grid = {
    'classifier__C': loguniform(1e-5, 1e5),
    'classifier__max_iter': [100, 500, 1000],
}

# Hyperparameter grid for RBF SVM
rbf_svm_param_grid = {
    'classifier__C': loguniform(1e-5, 1e5),
    'classifier__gamma': ['scale', 'auto'] + list(loguniform(1e-5, 1e5).rvs(size=10)),
}

# Hyperparameter grid for Decision Tree
tree_param_grid = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Hyperparameter grid for Random Forest
rf_param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [None, 10, 20, 30, 40, 50],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Hyperparameter grid for MLP
mlp_param_grid = {
    'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50), (200)],
    'classifier__activation': ['logistic', 'tanh', 'relu'],
    'classifier__alpha': loguniform(1e-5, 1e0),
    'classifier__max_iter': [200, 500, 1000],
}

# # Combine all hyperparameter grids into a dictionary
# hyperparameter_grids = {
#     'Logistic Regression': lr_param_grid,
#     'Perceptron': perc_param_grid,
#     'Linear SVM': linear_svm_param_grid,
#     'RBF SVM': rbf_svm_param_grid,
#     'Decision Tree': tree_param_grid,
#     'Random Forest': rf_param_grid,
#     'MLP': mlp_param_grid,
# }

#### define the metrics

In [13]:
# Define multiple metrics
scoring = {'Accuracy': make_scorer(accuracy_score),
           'F1-score': make_scorer(f1_score, average='weighted'),
           'G-Mean score': make_scorer(geometric_mean_score, average='weighted')
          }

# time: start - end time or %timeit

#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html

# sum(['fit_time])

#### define the normalization methods

In [14]:
min_max_scaler = MinMaxScaler()
std_scaler = StandardScaler()

## If we use MinMaxScaler or StandardScaler, the feature names will be lost, so we do it mannually.

# x_scaled = (x - x.min(axis=0)) / (x.max(axis=0)-x.min(axis=0))
# x_scaled = (x - x.mean())/x.std()

#### define the Cross Validation folds method

In [15]:
#rng = np.random.RandomState(13) # random number generator , use it in every random state if shuffle=True for different results.Usefull to test a specific algorithm multiple times within a for loop.
cv=StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
search_cv = StratifiedKFold(n_splits=3, shuffle=False, random_state=None)

### N1: No Normalization

#### jm1

In [25]:
jm1_nn_results = train_classifiers(classifiers, jm1_x, jm1_y, cv, scoring)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [26]:
# convert to dataframe for easier visualization (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html)
df_jm1_nn_results = pd.DataFrame.from_dict(jm1_nn_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_jm1_nn_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.774449,0.735764,0.474667,0.635539
1,Perceptron,0.677482,0.616844,0.423554,0.096944
2,LinearSVC,0.809375,0.749041,0.459171,3.352216
3,SVC,0.806801,0.725206,0.405115,15.662385
4,DecisionTreeClassifier,0.713327,0.719154,0.542539,0.632987
5,RandomForestClassifier,0.794669,0.759966,0.511642,10.160682
6,MLPClassifier,0.723346,0.71116,0.523977,3.345064


In [27]:
jm1_nn_results_tuned = train_classifiers_tuned(classifiers, jm1_x, jm1_y, cv, search_cv, scoring, lr_param_grid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
# convert to dataframe for easier visualization (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html)
df_jm1_nn_results_tuned = pd.DataFrame.from_dict(jm1_nn_results_tuned, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_jm1_nn_results_tuned

#### mc1

In [17]:
mc1_nn_results = train_classifiers(classifiers, mc1_x, mc1_y, cv, scoring)

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    htt

In [18]:
# convert to dataframe for easier visualization
df_mc1_nn_results = pd.DataFrame.from_dict(mc1_nn_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_mc1_nn_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.990809,0.988658,0.202398,2.238232
1,Perceptron,0.962608,0.973534,0.08183,0.163795
2,LinearSVC,0.992711,0.989185,0.084428,7.863124
3,SVC,0.992816,0.989238,0.084437,0.986657
4,DecisionTreeClassifier,0.99345,0.993501,0.74224,0.21352
5,RandomForestClassifier,0.995035,0.994175,0.628787,3.383218
6,MLPClassifier,0.990915,0.988368,0.122763,3.351949


#### pc3

In [19]:
pc3_nn_results = train_classifiers(classifiers, pc3_x, pc3_y, cv, scoring)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
# convert to dataframe for easier visualization
df_pc3_nn_results = pd.DataFrame.from_dict(pc3_nn_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_pc3_nn_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.899551,0.873839,0.464719,0.208079
1,Perceptron,0.560619,0.623865,0.362446,0.070722
2,LinearSVC,0.901464,0.87491,0.464281,0.26171
3,SVC,0.897633,0.84921,0.303131,0.169468
4,DecisionTreeClassifier,0.851604,0.856297,0.586286,0.140456
5,RandomForestClassifier,0.900201,0.874475,0.464374,1.788494
6,MLPClassifier,0.748042,0.766518,0.519552,0.849904


### N2: Min-Max Normalization

#### jm1

In [21]:
jm1_mmn_results = train_classifiers(classifiers, jm1_x, jm1_y, cv, scoring, min_max_scaler)



In [22]:
# convert to dataframe for easier visualization
df_jm1_mmn_results = pd.DataFrame.from_dict(jm1_mmn_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_jm1_mmn_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.808915,0.743697,0.44623,0.540911
1,Perceptron,0.807904,0.752556,0.474086,0.143267
2,LinearSVC,0.808915,0.744953,0.449224,0.495969
3,SVC,0.805974,0.738094,0.435143,21.164718
4,DecisionTreeClassifier,0.71296,0.718899,0.542264,0.758319
5,RandomForestClassifier,0.793474,0.758333,0.50828,9.546427
6,MLPClassifier,0.806066,0.7525,0.471484,41.445235


#### mc1

In [23]:
mc1_mmn_results = train_classifiers(classifiers, mc1_x, mc1_y, cv, scoring, min_max_scaler)

In [24]:
# convert to dataframe for easier visualization
df_mc1_mmn_results = pd.DataFrame.from_dict(mc1_mmn_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_mc1_mmn_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.992922,0.989486,0.125447,0.537795
1,Perceptron,0.968199,0.976164,0.214581,0.287029
2,LinearSVC,0.992922,0.989486,0.125447,0.408467
3,SVC,0.992816,0.989238,0.084437,1.52179
4,DecisionTreeClassifier,0.994084,0.993993,0.743938,0.218422
5,RandomForestClassifier,0.994929,0.994076,0.628747,2.963271
6,MLPClassifier,0.994401,0.993002,0.529507,16.075832


#### pc3

In [25]:
pc3_mmn_results = train_classifiers(classifiers, pc3_x, pc3_y, cv, scoring, min_max_scaler)



In [26]:
# convert to dataframe for easier visualization
df_pc3_mmn_results = pd.DataFrame.from_dict(pc3_mmn_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_pc3_mmn_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.89763,0.856994,0.354157,0.086156
1,Perceptron,0.875223,0.849124,0.417129,0.067828
2,LinearSVC,0.896991,0.855507,0.346225,0.096604
3,SVC,0.897633,0.84921,0.303131,0.171203
4,DecisionTreeClassifier,0.854172,0.859807,0.604,0.147992
5,RandomForestClassifier,0.903391,0.877437,0.471319,1.831437
6,MLPClassifier,0.895081,0.861439,0.393374,4.198096


### N3: Feature Standardization

#### jm1

In [27]:
jm1_fs_results = train_classifiers(classifiers, jm1_x, jm1_y, cv, scoring, std_scaler)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [28]:
# convert to dataframe for easier visualization
df_jm1_fs_results = pd.DataFrame.from_dict(jm1_fs_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_jm1_fs_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.80864,0.753674,0.472044,0.772743
1,Perceptron,0.694577,0.686717,0.487946,0.126389
2,LinearSVC,0.809283,0.750465,0.462955,0.992979
3,SVC,0.806066,0.74369,0.449249,16.812753
4,DecisionTreeClassifier,0.713603,0.718956,0.540592,0.656724
5,RandomForestClassifier,0.791912,0.755929,0.503389,9.135368
6,MLPClassifier,0.804963,0.761566,0.498601,33.21329


#### mc1

In [29]:
mc1_fs_results = train_classifiers(classifiers, mc1_x, mc1_y, cv, scoring, std_scaler)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [30]:
# convert to dataframe for easier visualization
df_mc1_fs_results = pd.DataFrame.from_dict(mc1_fs_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_mc1_fs_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.994084,0.992329,0.451976,1.623574
1,Perceptron,0.989964,0.988491,0.265061,0.26914
2,LinearSVC,0.992816,0.989238,0.084437,1.440133
3,SVC,0.99419,0.992625,0.497669,2.611587
4,DecisionTreeClassifier,0.993767,0.993714,0.739792,0.32347
5,RandomForestClassifier,0.994824,0.993996,0.628722,3.164549
6,MLPClassifier,0.994296,0.993241,0.579864,27.647152


#### pc3

In [31]:
pc3_fs_results = train_classifiers(classifiers, pc3_x, pc3_y, cv, scoring, std_scaler)



In [32]:
# convert to dataframe for easier visualization
df_pc3_fs_results = pd.DataFrame.from_dict(pc3_fs_results, orient='index').reset_index().rename(columns={'index': 'Classifier'})
df_pc3_fs_results

Unnamed: 0,Classifier,Accuracy,F1-score,G-Mean score,Fit time
0,LogisticRegression,0.898908,0.870648,0.442991,0.169794
1,Perceptron,0.838171,0.830928,0.437503,0.087271
2,LinearSVC,0.900827,0.872541,0.448861,0.154795
3,SVC,0.898915,0.852271,0.318965,0.210638
4,DecisionTreeClassifier,0.847127,0.852825,0.579807,0.153533
5,RandomForestClassifier,0.902748,0.878238,0.481859,1.783597
6,MLPClassifier,0.895707,0.876195,0.502807,4.543466


In [33]:
# ## example without using train_classifiers function

# lr_fs_pipe = Pipeline([('std_scaler', StandardScaler()), ('lr', LogisticRegression())])

# jm1_lr_fs_scores = cross_validate(lr_fs_pipe, jm1_x, jm1_y,
#                         cv=cv, scoring=scoring,
#                         n_jobs=None, return_train_score=False)

# jm1_lr_fs_accuracy = stats.fmean(jm1_lr_fs_scores['test_Accuracy'])
# jm1_lr_fs_f1 = stats.fmean(jm1_lr_fs_scores['test_F1-score'])
# jm1_lr_fs_g_mean = stats.fmean(jm1_lr_fs_scores['test_G-Mean score'])
# jm1_lr_fs_fit_time = sum(jm1_lr_fs_scores['fit_time'])

In [34]:
# import pandas as pd

# # Your nested dictionary
# data = {
#     'LogisticRegression': {'Accuracy': 0.8086397058823529, 'F1-score': 0.17786116284767575, 'G-Mean score': 0.318193228263499, 'Fit time': 0.688051700592041},
#     'Perceptron': {'Accuracy': 0.6945772058823529, 'F1-score': 0.2432930071571351, 'G-Mean score': 0.41653117562185005, 'Fit time': 0.11531352996826172},
#     'SVC': {'Accuracy': 0.8060661764705882, 'F1-score': 0.1296596340306902, 'G-Mean score': 0.26974098862437146, 'Fit time': 23.230370044708252},
#     'DecisionTreeClassifier': {'Accuracy': 0.7123161764705882, 'F1-score': 0.2962553781685001, 'G-Mean score': 0.5028291471048385, 'Fit time': 0.7795755863189697},
#     'RandomForestClassifier': {'Accuracy': 0.7921875, 'F1-score': 0.24727746301227507, 'G-Mean score': 0.4057223922349434, 'Fit time': 9.655367851257324},
#     'MLPClassifier': {'Accuracy': 0.8061580882352942, 'F1-score': 0.24598826193493067, 'G-Mean score': 0.39262706986628954, 'Fit time': 28.254403591156006}
# }

# df = pd.DataFrame.from_dict(data, orient='index').reset_index().rename(columns={'index': 'Classifier'})

# df

In [35]:

# # Set 'classifier' column as the index for better plotting
# df.set_index('Classifier', inplace=True)

# # Plotting
# df.plot(kind='bar', figsize=(10, 6), rot=45, colormap='viridis')
# plt.title('Classifier Performance Metrics')
# plt.ylabel('Score')
# plt.xlabel('Classifier')
# plt.show()

In [36]:
# df['Accuracy'].plot(kind='bar', figsize=(13, 6), color='skyblue', rot=0)
# plt.title('Classifier Accuracy')
# plt.ylabel('Accuracy Score')
# plt.xlabel('Classifier')
# plt.show()

# ΕΡΩΤΉΣΕΙΣ

RepeatedStratifiedKFold or StratifiedKFold

default or tuning models