In [112]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [113]:
file_path="C:/Users/kleop/Documents/repos/Exercises/Machine_Learning/Coursework_2/heart.csv"
data=pd.read_csv(file_path, sep=',', decimal=".")
#data.head()
data.dtypes #to check. if there is any non-numeric variable, we should be omitting

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

We want to check, whether all the above parameters(age, sex,trastbps...) have an effect on the probability of heart attack appearing on a person/patient.

Notes on features

age(in years: discrete nominal),
sex: 1-male/0-female (binary-categorical),
cp: chest pain type (4 values: categorical),
trestbps (discrete nominal),
chol:discrete nominal,
fbs: (fasting blood sugar: 1 = true; 0 = false) binary categorical,
restecg: (3 values:0,1,2) categorical,
thalach: maximum heart rate achieved (discrete nominal),
exang: exercise induced angina (1 = yes; 0 = no) binary categorical,
oldpeak: continuous nominal,
slope: 3 values(0-upsloping,1-flat,2-downsloping): categorical,
ca:number of major vessels(0-3) discerete ordinal,
thal: 0 = normal; 1 = fixed defect; 2 = reversable defect (categorical)

target:0(less chance of heartt attack), 1(more chance of heart attack) (binary categorical)

In [114]:
#Separate data into Matrix of features X and target variable y
X=data.iloc[:,:-1].values
y=data.iloc[:,-1].values

print(X)
print(y) #While printing the data, we get the intuition that the 2 classes (0 and 1) are balanced.

[[63.  1.  3. ...  0.  0.  1.]
 [37.  1.  2. ...  0.  0.  2.]
 [41.  0.  1. ...  2.  0.  2.]
 ...
 [68.  1.  0. ...  1.  2.  3.]
 [57.  1.  0. ...  1.  1.  3.]
 [57.  0.  1. ...  1.  1.  2.]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]


# Feature Scaling


In [115]:
scale=StandardScaler()
X=scale.fit_transform(X)
print(X)

[[ 0.9521966   0.68100522  1.97312292 ... -2.27457861 -0.71442887
  -2.14887271]
 [-1.91531289  0.68100522  1.00257707 ... -2.27457861 -0.71442887
  -0.51292188]
 [-1.47415758 -1.46841752  0.03203122 ...  0.97635214 -0.71442887
  -0.51292188]
 ...
 [ 1.50364073  0.68100522 -0.93851463 ... -0.64911323  1.24459328
   1.12302895]
 [ 0.29046364  0.68100522 -0.93851463 ... -0.64911323  0.26508221
   1.12302895]
 [ 0.29046364 -1.46841752  0.03203122 ... -0.64911323  0.26508221
  -0.51292188]]


# Prepare Dataset

In [116]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

# Principal Component Analysis (PCA) to reduce the dimensionality of the data in both Training and Test Set

In [117]:
# Principal Component Analysis to select k features such that they explain as much variance as possible
pca =PCA(n_components =10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
print(pca.explained_variance_ratio_.cumsum())

#After trying with several number of variables starting from 2, I realize that for this specific dataset the dimensionality reduction
#does not help with the exlainability of a significant amount of variance, in fact the 2 dimensions were covering nearly the 30%
#of the variance of the dataset, so I chose to reduce the dimensions to 10, which does explain an over 90% of the variance of the
#dataset.So I sacrifice the visualisation part for the sake of keeping as much important information as possible.


[0.20879927 0.32995097 0.42369281 0.51514961 0.59630549 0.67101175
 0.73647104 0.79594704 0.85092807 0.90117182]


In [118]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_expanded = poly.fit_transform(X_train_pca)
X_test_expanded = poly.transform(X_test_pca)

# Logistic Regression

In [119]:
#Since the possible outcome is either 0 or 1,i.e we perform a binary classification task, we can perform binary logistic regression
classifier=LogisticRegression(fit_intercept=True,max_iter=1000)
classifier.fit(X_train_expanded,y_train)


In [120]:
# make predictions using the testing set
y_pred_train=classifier.predict(X_train_expanded)
y_pred_test=classifier.predict(X_test_expanded)


# Evaluation

In [122]:
# Getting the Training and Test Accuracy of the Logistic Regression Model
print('Training Accuracy of the Model: ', metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy of the Model: ', metrics.accuracy_score(y_test, y_pred_test))
print()

#Getting the confusion matrix for both training and test set
print("Confusion matrix for train: \n",metrics.confusion_matrix(y_train, y_pred_train))
print("Confusion matrix for test: \n",metrics.confusion_matrix(y_test,y_pred_test))
print()

# Getting the Training and Test Precision of the Logistic Regression Model
print('Training Precision of the Model: ', metrics.precision_score(y_train, y_pred_train))
print('Test Precision of the Model: ', metrics.precision_score(y_test, y_pred_test))
print()

# Getting the Training and Test Recall of the Logistic Regression Model
print('Training Recall of the Model: ', metrics.recall_score(y_train, y_pred_train))
print('Test Recall of the Model: ', metrics.recall_score(y_test, y_pred_test))
print()

# Getting the Training and Test F1-Score of the Logistic Regression Model
print('Training F1-Score of the Model: ', metrics.f1_score(y_train, y_pred_train))
print('Test F1-Score of the Model: ', metrics.f1_score(y_test, y_pred_test))
print()

Training Accuracy of the Model:  0.9380165289256198
Test Accuracy of the Model:  0.8360655737704918

Confusion matrix for train: 
 [[101  10]
 [  5 126]]
Confusion matrix for test: 
 [[21  6]
 [ 4 30]]

Training Precision of the Model:  0.9264705882352942
Test Precision of the Model:  0.8333333333333334

Training Recall of the Model:  0.9618320610687023
Test Recall of the Model:  0.8823529411764706

Training F1-Score of the Model:  0.9438202247191011
Test F1-Score of the Model:  0.8571428571428571



# Regularisation using C=[1,5,10]

In [123]:
C_values = [1, 5, 10]

# Train and evaluate logistic regression classifiers for each C value
#solver=liblinear, because it is well-suited for binary classification problems.
for C in C_values:
    classifier = LogisticRegression(C=C, solver='liblinear', random_state=42)
    classifier.fit(X_train_expanded, y_train)
    y_pred = classifier.predict(X_test_expanded)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f"Accuracy with C={C}: {accuracy}")

Accuracy with C=1: 0.8360655737704918
Accuracy with C=5: 0.8524590163934426
Accuracy with C=10: 0.8360655737704918


# Re-training the Logistic Regression Classifier with the best hyper-parameter, C = 5 (obtained above)

In [124]:
# re-training the Logistic Regression Classifier with the best hyper-parameter, C = 5
model = LogisticRegression(C = 5).fit(X_train_expanded, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Obtaining the Training Set and Test Set Predictions given by the model, trained in the last step

In [125]:
# getting the Training Set Predictions
y_train_pred_lr = model.predict(X_train_expanded)

# getting the Test Set Predictions
y_test_pred_lr = model.predict(X_test_expanded)

# Performance Analysis of the Logistic Regression Model (with feature expansion) in terms of Accuracy and Confusion Matrix

In [126]:
# Getting the Training and Test Accuracy of the Logistic Regression Model
print('Training Accuracy of the Model: ', metrics.accuracy_score(y_train, y_train_pred_lr))
print('Test Accuracy of the Model: ', metrics.accuracy_score(y_test, y_test_pred_lr))
print()

#Getting the confusion matrix for both training and test set
print("Confusion matrix for train: \n",metrics.confusion_matrix(y_train, y_train_pred_lr))
print("Confusion matrix for test: \n",metrics.confusion_matrix(y_test,y_test_pred_lr))
print()

Training Accuracy of the Model:  0.9669421487603306
Test Accuracy of the Model:  0.8524590163934426

Confusion matrix for train: 
 [[106   5]
 [  3 128]]
Confusion matrix for test: 
 [[21  6]
 [ 3 31]]



The Logistic Regression model, after being trained with feature expansion methods, inicated the best performing results, by achieving an accuracy of over 85% on the test set! This must be due to the fact that the 'PolynomialFeatures' techinque  generates additional features, which capture nonlinear relationships or interactions between the existing features and such relationships could be hiding behind our data. After performing Pearson correlation, we observed that there is no siginificant  correlation between the variables, so feature expansion helped to find deeper relationships between our variables.