# **Project Machine Learning: Using methods of machine learning with ECG to classify cardiac diseases**

**Group:**\
Rithy SOCHET\
Marc KASPAR

In [None]:
# Import :
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns   # For data visualisation

# Import of the different models :
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Import of the preprocessing functions:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# Import the Pipeline and Cross validation functions:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Loading the data :
# The CSV file is read through a relative path, so "ECGCvdata.csv" must be
# located in the directory the notebook is run from.
data = pd.read_csv("ECGCvdata.csv")

Link of the dataset : https://www.kaggle.com/datasets/akki2703/ecg-of-cardiac-ailments-dataset

The file consists of 1200 records of cardiac ailments.

There are 300 records for each class of ailment, and the file has 56 columns:

Column 1 gives the record number (ID).

Column 56 gives the class of the record:
- Class 0 : NSR - Normal Sinus Rhythm (Normal situation)
- Class 1 : ARR - Arrhythmia
- Class 2 : AFF - Atrial Fibrillation
- Class 3 : CHF - Congestive heart failure

So we deal with a Multiclass classification problem.

The 54 remaining columns are features extracted using MODWPT method. The original signals are taken from MIT-BIH physionet database and are processed.

# Exploration of the data:

In [None]:
# Quick look at the first five records to check how the file was parsed:
data.head(n=5)

In [None]:
# Printing a summary of the data:
# `describe()` gives, for each numeric column, the count of non-missing values
# and the usual location/spread statistics (mean, std, min, quartiles, max).
data.describe()

In [None]:
# Count the missing entries of every column, then keep only the columns
# that actually contain at least one missing value:
missing_values = data.isna().sum()
missing_values = missing_values.loc[missing_values.gt(0)]
print(missing_values)

In [None]:
# Let's see the distribution of the 4 classes in the dataset:
# one bar per class, whose height is the number of records of that class.
class_count = data.groupby("ECG_signal").size()
classes = class_count.index.values
count = class_count.values

# Explicit figure/axes so that the chart can be labelled and stands alone.
fig, ax = plt.subplots()
ax.bar(classes, count, color=["blue", "orange", "green", "red"])
ax.set_xlabel("Class (ECG_signal)")
ax.set_ylabel("Number of records")
ax.set_title("Number of records per classes")
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap:
sns.set_style("whitegrid")
correlation_matrix = data.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    center=0,
    square=True,
    linewidths=0.1,
    linecolor="White",
)
ax.set_title("Heatmap of the dataset")
plt.show()

In [None]:
# A handful of features picked for visual inspection
# (the list is reused by the boxplot cell below):
features = ["hbpermin", "QRSarea", "QRSperi", "RRmean"]

# Scatter-plot matrix of these features, coloured by class:
pair_grid = sns.pairplot(data=data, vars=features, hue="ECG_signal")
plt.show()

In [None]:
# Another way to visualize data: one boxplot per selected feature, grouped by class.
for feature_name in features:
    ax = sns.boxplot(data=data, x="ECG_signal", y=feature_name, hue="ECG_signal")
    ax.set_title(f"Distribution of {feature_name}")
    plt.show()

In [None]:
# Let's now prepare the data for the training of our models.
# The DataFrame `data` is NOT rebound to a NumPy array: the cell can therefore be
# re-run safely, and the exploration cells above keep working afterwards.

# Features: every column except the first one (ID of the record) and the last one
# (class). Casting to float gives a numeric matrix in which missing values are NaN.
X_data = data.iloc[:, 1:-1].to_numpy(dtype=float)

# Labels: the last column is the class, encoded as 0, 1, 2 or 3.
class_codes = {"NSR": 0, "ARR": 1, "AFF": 2, "CHF": 3}
class_labels = data.iloc[:, -1]

# Fail early, with a clear message, if the file contains a label we do not know.
unknown_labels = set(class_labels.unique()) - set(class_codes)
if unknown_labels:
    raise ValueError(f"Unexpected class labels: {sorted(map(str, unknown_labels))}")

y_data = class_labels.map(class_codes).to_numpy(dtype=int)

# I. Let's test some models using only 2 classes (AFF and CHF) :

In [None]:
# Binary sub-problem restricted to two of the pathologies:
#   class 2 : AFF - Atrial Fibrillation
#   class 3 : CHF - Congestive heart failure
# The records of class 2 are stacked first, followed by the records of class 3.
y_class2, y_class3 = y_data[y_data == 2], y_data[y_data == 3]
y_1 = np.concatenate((y_class2, y_class3))

X_class0, X_class1 = X_data[y_data == 2, :], X_data[y_data == 3, :]
X_1 = np.vstack((X_class0, X_class1))

# Hold out 20% of the records (stratified over the two classes) as a test set:
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_1,
    y_1,
    test_size=0.2,
    stratify=y_1,
    random_state=42,
)

# Names of the classifiers tried and their accuracy on the test set:
classifier_list, accuracy_list = [], []


In [None]:
# Pipeline of LDA: imputation, then (optional) standardisation, then the LDA classifier.

lda_steps = [
    ("imputer", SimpleImputer()),                                    # Imputer
    ("scaler", StandardScaler()),                                    # Standardisation
    ("classifier_LDA", LinearDiscriminantAnalysis(solver="lsqr")),   # Classifier LDA
]
pipeline_LDA = Pipeline(lda_steps)

# Grid explored by the cross-validated search:
params = [
    dict(
        scaler=[StandardScaler(), "passthrough"],                    # Standardisation on/off
        imputer__strategy=["mean", "median"],                        # Mean vs Median imputer
        classifier_LDA__shrinkage=[None, "auto", 0.01, 0.1, 0.3],    # Shrinkage parameters
    )
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv = GridSearchCV(
    estimator=pipeline_LDA,
    param_grid=params,
    scoring="accuracy",
    cv=rskf,
    n_jobs=-1,
)
cv.fit(X_train, y_train)

print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')

# Scores of the refitted best model on the training set:
train_preds = cv.predict(X_train)
print("Train Scores:")
print(classification_report(y_train, train_preds))

# Scores of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(classification_report(y_holdout, preds), end="\n\n")
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("LDA")
accuracy_list.append(holdout_accuracy)

In [None]:
# Pipeline of Logistic Regression: imputation, (optional) standardisation, classifier.

logreg_steps = [
    ("imputer", SimpleImputer()),                        # Imputer
    ("scaler", StandardScaler()),                        # Standardisation
    ("classifier_LogReg", LogisticRegression()),         # Classifier Logistic Regression
]
pipeline_LogReg = Pipeline(logreg_steps)

# Grid explored by the cross-validated search:
params = [
    dict(
        scaler=[StandardScaler(), "passthrough"],        # Standardisation on/off
        imputer__strategy=["mean", "median"],            # Mean vs Median imputer
        classifier_LogReg__penalty=["l2", None],         # Penalty used for regularisation
    )
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv = GridSearchCV(
    estimator=pipeline_LogReg,
    param_grid=params,
    scoring="accuracy",
    cv=rskf,
    n_jobs=-1,
)
cv.fit(X_train, y_train)

print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')

# Scores of the refitted best model on the training set:
train_preds = cv.predict(X_train)
print("Train Scores:")
print(classification_report(y_train, train_preds))

# Scores of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(classification_report(y_holdout, preds), end="\n\n")
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("Logistic Regression")
accuracy_list.append(holdout_accuracy)

In [None]:
# Pipeline of Decisions Trees :
# imputation, (optional) standardisation, then a decision tree classifier.

pipeline_DecTree = Pipeline(
    [
        ( "imputer", SimpleImputer() ),                     # Imputer
        ( "scaler", StandardScaler() ),                     # Standardisation
        # The tree is seeded: tie-breaking between equally good splits and the
        # "random" splitter are stochastic, so without a seed the selected
        # parameters and the scores change from one run to the next.
        ( "classifier_DecisionTree", DecisionTreeClassifier(random_state=42) )   # Classifier Decision Trees
    ]
)

params = [
    {
        "scaler" : [StandardScaler(), "passthrough"],        # Standardisation on/off,
        "imputer__strategy" : ["mean" , "median"],           # Mean vs Median imputer
        "classifier_DecisionTree__criterion" : ["gini", "entropy", "log_loss"],     # Splitting criterion
        "classifier_DecisionTree__splitter" : ["best", "random"],                   # Strategy for splitting
        "classifier_DecisionTree__max_depth" : list(range(10, 31, 5))               # Maximum depth
    }
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_DecTree, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("Decisions Trees")
accuracy_list.append(holdout_accuracy)

In [None]:
# Summary:
# One line per classifier that was actually trained above. Iterating over the two
# lists (instead of a hard-coded count) avoids an IndexError when a model cell was
# skipped and does not silently drop entries when a model is added.
print("Performance with class 2 and 3")
print("Here are the differents accuracy score obtained after getting the best parameters for each classifier:")
for classifier_name, test_accuracy in zip(classifier_list, accuracy_list):
    print("Classifier: " + classifier_name + ", test accuracy = ", test_accuracy)

Performance with class 2 and 3
Here are the differents accuracy score obtained after getting the best parameters for each classifier:
Classifier: LDA, test accuracy =  0.9583333333333334
Classifier: Logistic Regression, test accuracy =  0.9583333333333334
Classifier: Decisions Trees, test accuracy =  0.95


# II. Model considering 2 classes : normal ECG and abnormal ECG

We will try different models and use an accuracy function to determine which model is the most appropriate for our problem.

We will try LDA, Logistic Regression and Decision Trees.

In [None]:
# Binary problem for this part:
#   class 0 : NSR - Normal Sinus Rhythm (Normal situation)
#   class 1 : every other class (Abnormal situation)
normal_mask = y_data == 0

# Normal records keep label 0; all the remaining records are relabelled 1.
y_class0 = y_data[normal_mask]
y_class1 = np.ones_like(y_data[~normal_mask])
y_2 = np.concatenate((y_class0, y_class1))

# Features stacked in the same order as the labels (normal first, abnormal second):
X_class0 = X_data[normal_mask, :]
X_class1 = X_data[~normal_mask, :]
X_2 = np.vstack((X_class0, X_class1))

# Hold out 20% of the records (stratified over the two classes) as a test set:
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_2,
    y_2,
    test_size=0.2,
    stratify=y_2,
    random_state=42,
)

# Names of the classifiers tried and their accuracy on the test set:
classifier_list, accuracy_list = [], []


In [None]:
# Pipeline of LDA: imputation, then (optional) standardisation, then the LDA classifier.

lda_steps = [
    ("imputer", SimpleImputer()),                                    # Imputer
    ("scaler", StandardScaler()),                                    # Standardisation
    ("classifier_LDA", LinearDiscriminantAnalysis(solver="lsqr")),   # Classifier LDA
]
pipeline_LDA = Pipeline(lda_steps)

# Grid explored by the cross-validated search:
params = [
    dict(
        scaler=[StandardScaler(), "passthrough"],                    # Standardisation on/off
        imputer__strategy=["mean", "median"],                        # Mean vs Median imputer
        classifier_LDA__shrinkage=[None, "auto", 0.01, 0.1, 0.3],    # Shrinkage parameter
    )
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv = GridSearchCV(
    estimator=pipeline_LDA,
    param_grid=params,
    scoring="accuracy",
    cv=rskf,
    n_jobs=-1,
)
cv.fit(X_train, y_train)

print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')

# Scores of the refitted best model on the training set:
train_preds = cv.predict(X_train)
print("Train Scores:")
print(classification_report(y_train, train_preds))

# Scores of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(classification_report(y_holdout, preds), end="\n\n")
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("LDA")
accuracy_list.append(holdout_accuracy)

In [None]:
# Pipeline of Logistic Regression: imputation, (optional) standardisation, classifier.

logreg_steps = [
    ("imputer", SimpleImputer()),                        # Imputer
    ("scaler", StandardScaler()),                        # Standardisation
    ("classifier_LogReg", LogisticRegression()),         # Classifier Logistic Regression
]
pipeline_LogReg = Pipeline(logreg_steps)

# Grid explored by the cross-validated search:
params = [
    dict(
        scaler=[StandardScaler(), "passthrough"],        # Standardisation on/off
        imputer__strategy=["mean", "median"],            # Mean vs Median imputer
        classifier_LogReg__penalty=["l2", None],         # Penalty for regularisation
    )
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv = GridSearchCV(
    estimator=pipeline_LogReg,
    param_grid=params,
    scoring="accuracy",
    cv=rskf,
    n_jobs=-1,
)
cv.fit(X_train, y_train)

print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')

# Scores of the refitted best model on the training set:
train_preds = cv.predict(X_train)
print("Train Scores:")
print(classification_report(y_train, train_preds))

# Scores of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(classification_report(y_holdout, preds), end="\n\n")
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("Logistic Regression")
accuracy_list.append(holdout_accuracy)

In [None]:
# Pipeline of Decisions Trees :
# imputation, (optional) standardisation, then a decision tree classifier.

pipeline_DecTree = Pipeline(
    [
        ( "imputer", SimpleImputer() ),                     # Imputer
        ( "scaler", StandardScaler() ),                     # Standardisation
        # The tree is seeded: tie-breaking between equally good splits and the
        # "random" splitter are stochastic, so without a seed the selected
        # parameters and the scores change from one run to the next.
        ( "classifier_DecisionTree", DecisionTreeClassifier(random_state=42) )   # Classifier Decision Trees
    ]
)

params = [
    {
        "scaler" : [StandardScaler(), "passthrough"],       # Standardisation on/off,
        "imputer__strategy" : ["mean" , "median"],          # Mean vs Median imputer
        "classifier_DecisionTree__criterion" : ["gini", "entropy", "log_loss"],     # Splitting criterion
        "classifier_DecisionTree__splitter" : ["best", "random"],                   # Strategy for splitting
        "classifier_DecisionTree__max_depth" : list(range(10, 31, 5))               # Maximum depth
    }
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_DecTree, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("Decision Trees")
accuracy_list.append(holdout_accuracy)

In [None]:
# Summary:
# One line per classifier that was actually trained above. Iterating over the two
# lists (instead of a hard-coded count) avoids an IndexError when a model cell was
# skipped and does not silently drop entries when a model is added.
print("Performance with class normal and anormal")
print("Here are the differents accuracy score obtained after getting the best parameters for each classifier:")
for classifier_name, test_accuracy in zip(classifier_list, accuracy_list):
    print("Classifier: " + classifier_name + ", test accuracy = ", test_accuracy)

# III. Model considering the 4 classes :

In [None]:
# Hold out 20% of the records (stratified over the 4 classes) as a test set:
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=42,
)

# Names of the classifiers tried and their accuracy on the test set:
classifier_list, accuracy_list = [], []


In [None]:
# Pipeline of LDA: imputation, then (optional) standardisation, then the LDA classifier.

lda_steps = [
    ("imputer", SimpleImputer()),                                    # Imputer
    ("scaler", StandardScaler()),                                    # Standardisation
    ("classifier_LDA", LinearDiscriminantAnalysis(solver="lsqr")),   # Classifier LDA
]
pipeline_LDA = Pipeline(lda_steps)

# Grid explored by the cross-validated search:
params = [
    dict(
        scaler=[StandardScaler(), "passthrough"],                    # Standardisation on/off
        imputer__strategy=["mean", "median"],                        # Mean vs Median imputer
        classifier_LDA__shrinkage=[None, "auto", 0.01, 0.1, 0.3],    # Shrinkage parameter
    )
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv = GridSearchCV(
    estimator=pipeline_LDA,
    param_grid=params,
    scoring="accuracy",
    cv=rskf,
    n_jobs=-1,
)
cv.fit(X_train, y_train)

print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')

# Scores of the refitted best model on the training set:
train_preds = cv.predict(X_train)
print("Train Scores:")
print(classification_report(y_train, train_preds))

# Scores of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(classification_report(y_holdout, preds), end="\n\n")
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("LDA")
accuracy_list.append(holdout_accuracy)

In [None]:
# Confusion matrix of the LDA on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted classes.
cm = confusion_matrix(y_holdout, preds)

print("Confusion Matrix for the LDA:")
print(cm)

In [None]:
# Pipeline of Logistic Regression: imputation, (optional) standardisation, classifier.

logreg_steps = [
    ("imputer", SimpleImputer()),                        # Imputer
    ("scaler", StandardScaler()),                        # Standardisation
    ("classifier_LogReg", LogisticRegression()),         # Classifier Logistic Regression
]
pipeline_LogReg = Pipeline(logreg_steps)

# Grid explored by the cross-validated search:
params = [
    dict(
        scaler=[StandardScaler(), "passthrough"],        # Standardisation on/off
        imputer__strategy=["mean", "median"],            # Mean vs Median imputer
        classifier_LogReg__penalty=["l2", None],         # Penalty used for regularisation
    )
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv = GridSearchCV(
    estimator=pipeline_LogReg,
    param_grid=params,
    scoring="accuracy",
    cv=rskf,
    n_jobs=-1,
)
cv.fit(X_train, y_train)

print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')

# Scores of the refitted best model on the training set:
train_preds = cv.predict(X_train)
print("Train Scores:")
print(classification_report(y_train, train_preds))

# Scores of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(classification_report(y_holdout, preds), end="\n\n")
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("Logistic Regression")
accuracy_list.append(holdout_accuracy)

In [None]:
# Confusion matrix of the Logistic Regression on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted classes.
cm = confusion_matrix(y_holdout, preds)

print("Confusion Matrix for the Logistic Regression:")
print(cm)
# Most errors are from the classes 2 and 3.

In [None]:
# Pipeline of Decisions Trees :
# imputation, (optional) standardisation, then a decision tree classifier.

pipeline_DecTree = Pipeline(
    [
        ( "imputer", SimpleImputer() ),                     # Imputer
        ( "scaler", StandardScaler() ),                     # Standardisation
        # The tree is seeded: tie-breaking between equally good splits and the
        # "random" splitter are stochastic, so without a seed the selected
        # parameters and the scores change from one run to the next.
        ( "classifier_DecisionTree", DecisionTreeClassifier(random_state=42) )   # Classifier Decision Trees
    ]
)

params = [
    {
        "scaler" : [StandardScaler(), "passthrough"],       # Standardisation on/off,
        "imputer__strategy" : ["mean" , "median"],          # Mean vs Median imputer
        "classifier_DecisionTree__criterion" : ["gini", "entropy", "log_loss"],     # Splitting criterion
        "classifier_DecisionTree__splitter" : ["best", "random"],                   # Strategy for splitting
        "classifier_DecisionTree__max_depth" : list(range(10, 31, 5))               # Maximum depth
    }
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_DecTree, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("Decision Trees")
accuracy_list.append(holdout_accuracy)

In [None]:
# Confusion matrix of the Decision Trees on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted classes.
cm = confusion_matrix(y_holdout, preds)

print("Confusion Matrix for the Decision Trees:")
print(cm)
# Once again, most errors are from the classes 2 and 3.

In [None]:
# Pipeline of Neural Networks :
# imputation, (optional) standardisation, then a multi-layer perceptron.

pipeline_NN = Pipeline(
    [
        ( "imputer", SimpleImputer() ),                     # Imputer
        ( "scaler", StandardScaler() ),                     # Standardisation
        # The network is seeded: weight initialisation and mini-batch shuffling are
        # random, so without a seed the scores change from one run to the next.
        ( "classifier_NN", MLPClassifier(random_state=42) ) # Classifier Neural Network
    ]
)

params = [
    {
        "scaler" : [StandardScaler(), "passthrough"],       # Standardisation on/off,
        "imputer__strategy" : ["mean" , "median"],          # Mean vs Median imputer
        "classifier_NN__activation" : ["identity", "logistic", "tanh", "relu"]   # Activation of the hidden layers
    }
]

# Only 5 folds here (10 for the other models):
rskf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_NN, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, preds)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {holdout_accuracy:.3f}')

classifier_list.append("Neural Network")
accuracy_list.append(holdout_accuracy)

In [None]:
# Confusion matrix of the neural network on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted classes.
cm = confusion_matrix(y_holdout, preds)

print("Confusion Matrix of the neural network:")
print(cm)
# The only errors are from the classes 2 and 3.

In [None]:
# Summary:
# One line per classifier that was actually trained above. Iterating over the two
# lists (instead of a hard-coded count) avoids an IndexError when a model cell was
# skipped and does not silently drop entries when a model is added.
print("Performance when we consider the 4 classes of the dataset")
print("Here are the differents accuracy score obtained after getting the best parameters for each classifier:")
for classifier_name, test_accuracy in zip(classifier_list, accuracy_list):
    print("Classifier: " + classifier_name + ", accuracy = ", test_accuracy)

# IV. Reduction of dimension :

In [None]:
# Hold out 20% of the records (stratified over the 4 classes) as a test set:
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=42,
)

In [None]:
# Applying PCA :
# imputation, standardisation, PCA, then a Logistic Regression classifier.
# The grid only varies the number of principal components that are kept.

pipeline_LogReg = Pipeline(
    [
        ( "imputer", SimpleImputer() ),                     # Imputer
        ( "scaler", StandardScaler() ),                     # Standardisation
        # PCA is seeded because, depending on the scikit-learn version and on the
        # number of components, the "auto" solver may pick a randomized SVD.
        ( "pca", PCA(random_state=42) ),                    # We apply PCA
        # The step is named after the estimator it really holds (it used to be
        # called "classifier_SVC" although it is a Logistic Regression).
        ( "classifier_LogReg", LogisticRegression() )       # Classifier Logistic Regression
    ]
)

params = [
    {
        "scaler" : [StandardScaler()],           # Standardisation
        "imputer__strategy" : ["mean"],          # Imputer = mean
        "pca__n_components" : [2*i for i in range(1,28)]     # Number of components to keep: 2, 4, ..., 54
    }
]

# 10 shuffled stratified folds, with a fixed seed:
rskf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_LogReg, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {accuracy_score(y_holdout, preds):.3f}')


In [None]:
# Let's plot our accuracy:
# mean cross-validated accuracy of every candidate of the grid search above.
# GridSearchCV already stores the mean over the folds for each candidate, so
# neither the number of folds nor the number of candidates is hard-coded here.
accuracy = np.asarray(cv.cv_results_["mean_test_score"], dtype=float)

# Number of PCA components of each candidate, in the same order as `accuracy`.
number_of_features = [int(k) for k in cv.cv_results_["param_pca__n_components"]]

best_index = int(np.argmax(accuracy))
best_number_of_features = number_of_features[best_index]

fig, ax = plt.subplots()
ax.plot(number_of_features, accuracy)
ax.set_xlabel("Number of features")
ax.set_ylabel("Accuracy")
# Red cross on the best candidate:
ax.scatter(best_number_of_features, accuracy[best_index], color="red", marker="x")
ax.grid(visible=True)
plt.show()
print("Maximum accuracy is reached with", best_number_of_features, "features.")

In [None]:
# We test the Neural Network to see if we can reduce the number of features:
# imputation, standardisation, PCA, then a multi-layer perceptron.
# The grid only varies the number of principal components that are kept.

pipeline_NN = Pipeline(
    [
        ( "imputer", SimpleImputer() ),               # Imputer
        ( "scaler", StandardScaler() ),               # Standardisation
        # PCA and the network are seeded so that the curve below, and the number
        # of components selected from it, are the same on every run.
        ( "pca", PCA(random_state=42) ),              # We apply PCA
        ( "classifier_NN", MLPClassifier(random_state=42) )   # Classifier Neural Network
    ]
)

params = [
    {
        "scaler" : [StandardScaler()],           # Standardisation
        "imputer__strategy" : ["mean"],          # Imputer = mean
        "pca__n_components" : [2*i for i in range(1,28)]     # Number of components to keep: 2, 4, ..., 54
    }
]

# Only 5 folds here (10 for the Logistic Regression):
rskf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_NN, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {accuracy_score(y_holdout, preds):.3f}')


In [None]:
# Let's plot our accuracy:
# mean cross-validated accuracy of every candidate of the grid search above.
# GridSearchCV already stores the mean over the folds for each candidate, so
# neither the number of folds nor the number of candidates is hard-coded here.
# NOTE: `accuracy` is read again by the next pipeline cell to choose the number
# of PCA components, so its name and its ordering (2, 4, ..., 54) must be kept.
accuracy = np.asarray(cv.cv_results_["mean_test_score"], dtype=float)

# Number of PCA components of each candidate, in the same order as `accuracy`.
number_of_features = [int(k) for k in cv.cv_results_["param_pca__n_components"]]

best_index = int(np.argmax(accuracy))
best_number_of_features = number_of_features[best_index]

fig, ax = plt.subplots()
ax.plot(number_of_features, accuracy)
ax.set_xlabel("Number of features")
ax.set_ylabel("Accuracy")
# Red cross on the best candidate:
ax.scatter(best_number_of_features, accuracy[best_index], color="red", marker="x")
ax.grid(visible=True)
plt.show()
print("La précision maximale est atteinte avec", best_number_of_features, "features")

In [None]:
# We now create a pipeline of Neural Networks with this number of features :
# the number of PCA components is the one that maximised the mean cross-validated
# accuracy in the previous search. It is read from the `accuracy` array computed
# by the plotting cell just above, which therefore has to be run first.
best_n_components = int(2 * (np.argmax(accuracy) + 1))

pipeline_NN = Pipeline(
    [
        ( "imputer", SimpleImputer() ),                     # Imputer
        ( "scaler", StandardScaler() ),                     # Standardisation
        # PCA and the network are seeded so that the results are reproducible.
        ( "pca", PCA(random_state=42) ),                    # PCA
        ( "classifier_NN", MLPClassifier(random_state=42) ) # Classifier Neural Network
    ]
)

params = [
    {
        "scaler" : [StandardScaler(), "passthrough"],        # Standardisation on/off,
        "imputer__strategy" : ["mean", "median"],            # Mean vs Median imputer
        "classifier_NN__activation" : ["relu"],
        "pca__n_components" : [best_n_components],           # Number of components that maximises the accuracy according to the previous pipeline.
        "classifier_NN__hidden_layer_sizes" : [(50,25),(75,),(100,50,25),(200,100,50,25)]   # Architectures to compare
    }
]

rskf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_NN, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {accuracy_score(y_holdout, preds):.3f}')

In [None]:
# Confusion matrix of the PCA + neural network model on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted classes.
cm = confusion_matrix(y_holdout, preds)

# The number of components is read from the fitted search instead of being
# hard-coded, so the message stays correct if the selected value changes.
n_components_used = cv.best_params_["pca__n_components"]

print("Confusion Matrix of the neural network classifier with " + str(n_components_used) + " features.")
print(cm)

In [None]:
# We do a quick analysis on the variance of the dataset:

# We preprocess the data using the best scaler and imputer according to the Pipeline.
# The two steps are chained in the same order as in the pipelines (imputation first,
# then standardisation). Previously the scaled matrix was overwritten by the imputed
# one, so the PCA was computed on data that had not been standardised.
imp = SimpleImputer(strategy = "mean")
scaler = StandardScaler()
X_PCA = scaler.fit_transform(imp.fit_transform(X_data))
# NOTE(review): this analysis is fitted on the whole dataset, test records included,
# and `nb_comp` is then used to build a model evaluated on those records. Consider
# fitting on X_train only.

# We apply the PCA:
pca = PCA()
pca.fit(X_PCA)

# The cumulative variance ratio will allow us to see how many features are needed to keep 99% of the information:
cumulative_variance_ratio = pca.explained_variance_ratio_.cumsum()
# Number of components needed to reach 99%: count of the components that are still
# below the threshold, plus one. Capped at the number of available components in
# case rounding keeps the cumulative sum just under the threshold.
nb_comp = min(int((cumulative_variance_ratio < 0.99).sum()) + 1, len(cumulative_variance_ratio))
print("We keep 99% of the information with", nb_comp , "features.")

In [None]:
# Test Neural Network with a PCA that keeps 99% of the information:
# `nb_comp` comes from the variance analysis cell above, which has to be run first.

pipeline_NN = Pipeline(
    [
        ( "imputer", SimpleImputer() ),                     # Imputer
        ( "scaler", StandardScaler() ),                     # Standardisation
        # PCA and the network are seeded so that the results are reproducible.
        ( "pca", PCA(n_components=nb_comp, random_state=42) ),   # PCA
        ( "classifier_NN", MLPClassifier(random_state=42) )      # Classifier Neural Network
    ]
)

params = [
    {
        "scaler" : [StandardScaler(),"passthrough"],        # Standardisation on/off,
        "imputer__strategy" : ["mean" , "median"],          # Mean vs Median imputer
        "classifier_NN__activation" : ["relu"],
        "classifier_NN__hidden_layer_sizes" : [(50,25),(75,),(100,50,25),(200,100,50,25)]   # Architectures to compare
    }
]

rskf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

cv = GridSearchCV(pipeline_NN, params, cv=rskf, scoring="accuracy", n_jobs=-1)

cv.fit(X_train, y_train)
print(f'Best accuracy score: {cv.best_score_:.3f}\n')
print(f'Best parameter set: {cv.best_params_}\n')
print("Train Scores:")
print(f'{classification_report(y_train, cv.predict(X_train))}')

# Evaluation of the refitted best model on the held-out test set:
preds = cv.predict(X_holdout)
print("Test Scores:")
print(f'{classification_report(y_holdout, preds)}\n')
print(f'Test accuracy score: {accuracy_score(y_holdout, preds):.3f}')


In [None]:
# We can see that the accuracy is lower:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted classes.
cm = confusion_matrix(y_holdout, preds)

print("Confusion Matrix of the neural network classifier with " + str(nb_comp) + " features.")
print(cm)