In [27]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,KFold,cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns


In [21]:
data=pd.read_csv("Dataset_raw.csv",sep=",")


In [3]:
data.head()

Unnamed: 0,RANDID,TOTCHOL,AGE,SYSBP,DIABP,TIMEMI,CIGPDAY,TIME,STROKE,BMI
0,2448,196.0,52.0,100.0,62.0,6438.0,0.0,4628,2.0,
1,6238,223.0,58.0,122.0,75.0,8766.0,0.0,4344,2.0,28.5
2,11252,232.0,58.0,131.0,70.0,8766.0,0.0,4285,2.0,24.6
3,11263,216.0,55.0,140.0,70.0,8766.0,0.0,4351,1.0,31.2
4,12806,208.0,57.0,127.0,75.0,8766.0,0.0,4289,2.0,22.0


In [4]:
data.shape

(620, 10)

In [5]:
#remove row which contains NAN value
data = data.dropna()
data.head()

Unnamed: 0,RANDID,TOTCHOL,AGE,SYSBP,DIABP,TIMEMI,CIGPDAY,TIME,STROKE,BMI
1,6238,223.0,58.0,122.0,75.0,8766.0,0.0,4344,2.0,28.5
2,11252,232.0,58.0,131.0,70.0,8766.0,0.0,4285,2.0,24.6
3,11263,216.0,55.0,140.0,70.0,8766.0,0.0,4351,1.0,31.2
4,12806,208.0,57.0,127.0,75.0,8766.0,0.0,4289,2.0,22.0
5,14367,177.0,64.0,148.0,66.0,8766.0,18.0,4438,1.0,25.7


In [6]:
data.shape


(596, 10)

In [7]:
df=data.drop(['RANDID', 'TIMEMI','TIME'], axis=1)
# based on article following selection of features were made

In [8]:
df.head()

Unnamed: 0,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,STROKE,BMI
1,223.0,58.0,122.0,75.0,0.0,2.0,28.5
2,232.0,58.0,131.0,70.0,0.0,2.0,24.6
3,216.0,55.0,140.0,70.0,0.0,1.0,31.2
4,208.0,57.0,127.0,75.0,0.0,2.0,22.0
5,177.0,64.0,148.0,66.0,18.0,1.0,25.7


In [9]:
# Systolic Blood Pressure and Diastolic blood pressure is converted from continuous values to categorical ones as per the article


def classify_bp(row):
    # Define systolic and diastolic bins
    systolic_bins = [0, 120, 140, 160, float('inf')]
    systolic_labels = ['Normal', 'Normal', 'Borderline', 'Hypertension']
    
    diastolic_bins = [0, 80, 90, 100, float('inf')]
    diastolic_labels = ['Normal', 'Normal', 'Borderline', 'Hypertension']

    # Classify systolic and diastolic separately using pd.cut
    systolic_class = pd.cut([row['SYSBP']], bins=systolic_bins, labels=systolic_labels, right=False,ordered=False)[0]
    diastolic_class = pd.cut([row['DIABP']], bins=diastolic_bins, labels=diastolic_labels, right=False,ordered=False)[0]
    
    # Combine classifications for systolic and diastolic
    if systolic_class == 'Hypertension' or diastolic_class == 'Hypertension':
        return 'Hypertension'
    elif systolic_class == 'Borderline' or diastolic_class == 'Borderline':
        return 'Borderline'
    else:
        return 'Normal'

# Sample data




df = pd.DataFrame(df)

# Apply the classify_bp function to the DataFrame
df['BP Classification'] = df.apply(classify_bp, axis=1)

df


Unnamed: 0,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,STROKE,BMI,BP Classification
1,223.0,58.0,122.0,75.0,0.0,2.0,28.5,Normal
2,232.0,58.0,131.0,70.0,0.0,2.0,24.6,Normal
3,216.0,55.0,140.0,70.0,0.0,1.0,31.2,Borderline
4,208.0,57.0,127.0,75.0,0.0,2.0,22.0,Normal
5,177.0,64.0,148.0,66.0,18.0,1.0,25.7,Borderline
...,...,...,...,...,...,...,...,...
614,196.0,53.0,126.5,72.5,0.0,2.0,28.6,Normal
615,201.0,59.0,149.0,80.0,0.0,1.0,30.5,Borderline
616,212.0,61.0,187.0,105.0,0.0,1.0,22.7,Hypertension
617,306.0,70.0,170.0,85.0,0.0,1.0,24.1,Hypertension


In [10]:
#used describe method to check data distribution
df.describe()

Unnamed: 0,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,STROKE,BMI
count,596.0,596.0,596.0,596.0,596.0,596.0,596.0
mean,216.236577,60.231544,137.071309,77.594799,3.567114,1.498322,25.870638
std,39.478909,8.313202,25.93697,12.241642,8.415703,0.500417,4.330776
min,135.0,45.0,94.0,30.0,0.0,1.0,15.2
25%,190.0,53.0,119.0,70.0,0.0,1.0,22.9
50%,209.5,59.0,130.0,75.5,0.0,1.0,25.55
75%,232.0,67.0,152.0,83.625,0.0,2.0,28.0
max,390.0,79.0,254.0,119.0,40.0,2.0,48.6


In [11]:
# Total is converted from continuous values to categorical ones as per the article
df['CHOLTYPE']=pd.cut(x=df['TOTCHOL'],bins=[0,100,130,200,400],labels=['Healthy','Healthy','Healthy','Unhealthy'],ordered=False)
df

Unnamed: 0,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,STROKE,BMI,BP Classification,CHOLTYPE
1,223.0,58.0,122.0,75.0,0.0,2.0,28.5,Normal,Unhealthy
2,232.0,58.0,131.0,70.0,0.0,2.0,24.6,Normal,Unhealthy
3,216.0,55.0,140.0,70.0,0.0,1.0,31.2,Borderline,Unhealthy
4,208.0,57.0,127.0,75.0,0.0,2.0,22.0,Normal,Unhealthy
5,177.0,64.0,148.0,66.0,18.0,1.0,25.7,Borderline,Healthy
...,...,...,...,...,...,...,...,...,...
614,196.0,53.0,126.5,72.5,0.0,2.0,28.6,Normal,Healthy
615,201.0,59.0,149.0,80.0,0.0,1.0,30.5,Borderline,Unhealthy
616,212.0,61.0,187.0,105.0,0.0,1.0,22.7,Hypertension,Unhealthy
617,306.0,70.0,170.0,85.0,0.0,1.0,24.1,Hypertension,Unhealthy


In [12]:
df

Unnamed: 0,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,STROKE,BMI,BP Classification,CHOLTYPE
1,223.0,58.0,122.0,75.0,0.0,2.0,28.5,Normal,Unhealthy
2,232.0,58.0,131.0,70.0,0.0,2.0,24.6,Normal,Unhealthy
3,216.0,55.0,140.0,70.0,0.0,1.0,31.2,Borderline,Unhealthy
4,208.0,57.0,127.0,75.0,0.0,2.0,22.0,Normal,Unhealthy
5,177.0,64.0,148.0,66.0,18.0,1.0,25.7,Borderline,Healthy
...,...,...,...,...,...,...,...,...,...
614,196.0,53.0,126.5,72.5,0.0,2.0,28.6,Normal,Healthy
615,201.0,59.0,149.0,80.0,0.0,1.0,30.5,Borderline,Unhealthy
616,212.0,61.0,187.0,105.0,0.0,1.0,22.7,Hypertension,Unhealthy
617,306.0,70.0,170.0,85.0,0.0,1.0,24.1,Hypertension,Unhealthy


In [13]:
#Ensuring no row is without data 
print(df['CHOLTYPE'].isna().any().any())
print(df['BP Classification'].isna().any().any())  # Check for NaNs by column
#print(df['BP Classification'].isna().any(axis=1)

False
False


Use of Ordinal encoding instead of Label Encoding:
Label encoding can introduce a misleading ordinal relationship between categories that don't have any inherent ranking.
Used Ordinal Encoding since the categories have a meaningful order.In this case, data is normal, borderline, hypertension
in case of blood presssure and healthy and unhealthy in case of cholestrol, which gives it a partcular order
Thus ordinal encoding has been preferred.Although it has its drawbacks as well but they will be taken care of
when the choice of machine learning models has been made.

In [14]:
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load your dataset


# Identify numerical columns (excluding categorical ones)
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns


#changed the bp_classification column from obejct type to category type
df['BP Classification'] = df['BP Classification'].astype('category')




# Applied  Encoding for categorical columns
ordinal_encoder = OrdinalEncoder()
df[['BP Classification', 'CHOLTYPE']] = ordinal_encoder.fit_transform(df[['BP Classification', 'CHOLTYPE']])


  

# Check the final DataFrame
df.head()





Unnamed: 0,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,STROKE,BMI,BP Classification,CHOLTYPE
1,223.0,58.0,122.0,75.0,0.0,2.0,28.5,2.0,1.0
2,232.0,58.0,131.0,70.0,0.0,2.0,24.6,2.0,1.0
3,216.0,55.0,140.0,70.0,0.0,1.0,31.2,0.0,1.0
4,208.0,57.0,127.0,75.0,0.0,2.0,22.0,2.0,1.0
5,177.0,64.0,148.0,66.0,18.0,1.0,25.7,0.0,0.0


In [15]:
x=df.iloc[:,0:10]
y=df.iloc[:,5]

x.head()

Unnamed: 0,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,STROKE,BMI,BP Classification,CHOLTYPE
1,223.0,58.0,122.0,75.0,0.0,2.0,28.5,2.0,1.0
2,232.0,58.0,131.0,70.0,0.0,2.0,24.6,2.0,1.0
3,216.0,55.0,140.0,70.0,0.0,1.0,31.2,0.0,1.0
4,208.0,57.0,127.0,75.0,0.0,2.0,22.0,2.0,1.0
5,177.0,64.0,148.0,66.0,18.0,1.0,25.7,0.0,0.0


In [16]:
y.head()

1    2.0
2    2.0
3    1.0
4    2.0
5    1.0
Name: STROKE, dtype: float64

In [17]:
y.shape

(596,)

Deleted the original columns which have been encoded as it can lead to multicolinearity
High multicollinearity makes it hard to interpret the effect of individual variables because it becomes difficult to determine how each predictor affects the dependent variable.
For example, if two features are highly correlated, it becomes unclear whether the effect is due to one feature or the other,
leading to ambiguous conclusions

In [18]:

x=df.drop(["TOTCHOL",'SYSBP','DIABP','STROKE'],axis=1)
x.head()

Unnamed: 0,AGE,CIGPDAY,BMI,BP Classification,CHOLTYPE
1,58.0,0.0,28.5,2.0,1.0
2,58.0,0.0,24.6,2.0,1.0
3,55.0,0.0,31.2,0.0,1.0
4,57.0,0.0,22.0,2.0,1.0
5,64.0,18.0,25.7,0.0,0.0


Now since data has been encoded, and original columns have been deleted, I have checked whether data is balanced or imbalanced

In [25]:
import numpy as np



# Check the distribution of the target classes
class_counts = np.bincount(y)
print("Class Counts:", class_counts)

# Check if the dataset is imbalanced based on counts
if class_counts.max() > 0.75 * len(y):
    print("The dataset is imbalanced.")
else:
    print("The dataset is balanced.")


Class Counts: [  0 299 297]
The dataset is balanced.


Since both classes have similar distribution, and neither of classes dominate i.e. both at risk patients and healthy patients distribution is similar, I dont need to balance the data.
I will now proceed towards splitting the data using KFOLD strategy.
Justification for Choice of Kfold: In K-Fold Cross-Validation, the dataset is split into K folds, and each fold is used as the validation set exactly once, while the other K-1 folds are used for training. 
Characteristics of KFold Technique
Better Generalization: Provides a more robust estimate of model performance by testing it on different subsets of data.
Reduced Variance: The use of multiple folds reduces the impact of data variability and leads to a more stable performance estimate.
Efficient Use of Data: Every data point is used for both training and testing, making optimal use of the available data.
Mitigates Overfitting: By evaluating the model on multiple test sets, overfitting is less likely.
Model and Hyperparameter Selection: Can be used effectively in hyperparameter tuning, leading to better model selection.
Flexibility: Adjustable number of folds based on dataset size and computational constraints.


I have used nested CV , as I intend to tune the hyperparametres in order to get an unbiased performance of my model and prevent overfitting.
Characteristics of nested CV which led me to choose it 
It Prevents Data Leakage: By keeping the test set completely separate from the training and hyperparameter selection processes, nested CV ensures that you get an unbiased estimate of your model’s generalization ability.
Better Hyperparameter Selection: It allows for a more realistic selection of hyperparameters, ensuring that the hyperparameters are chosen based only on training data that the model has not seen before.
Reliable Performance : Nested CV provides a more reliable estimate of how the model will perform on unseen data because it evaluates performance after the model has been optimized with hyperparameters.
Model Comparison: When comparing multiple models or algorithms, nested CV can help ensure that the comparison is fair, as the hyperparameters for each model are selected in a way that avoids overfitting.

In [131]:
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def evaluate_model_nested_cv(model, param_grid, X, y, n_splits_outer=5, n_splits_inner=3):
  
    
    # Initialize the outer StratifiedKFold Cross-Validation
    outer_cv = KFold(n_splits=n_splits_outer, shuffle=True, random_state=42)
    
    # Initialize the inner KFold Cross-Validation for hyperparameter tuning
    inner_cv = KFold(n_splits=n_splits_inner, shuffle=True, random_state=42)
    #scaled data 
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Scaling step
        ('model', model)# The classifier model
    ])
    
    # Set up GridSearchCV for hyperparameter tuning (inner loop)
    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=inner_cv, scoring='accuracy')

    # Lists to store metrics for each outer fold
    # the performance of models will be assessed on precision, recall and F1 score since we need the model to perform
    # well on both counts i.e for high risk patients as well as healthy people 
    # as per need of doctors mentioned in the course brief
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []
   
    
    # Outer Cross-Validation Loop (Model Evaluation)
    for train_idx, test_idx in outer_cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Perform hyperparameter tuning on the training data using the inner CV
        grid_search.fit(X_train, y_train)

        # Get the best model (after hyperparameter tuning)
        best_model = grid_search.best_estimator_
        
        # Make predictions on the outer test fold
        y_pred = best_model.predict(X_test)
        
        # Calculate metrics for this fold
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='binary')  # For binary classification
        recall = recall_score(y_test, y_pred, average='binary')  # For binary classification
        f1 = f1_score(y_test, y_pred, average='binary')  # For binary classification
        
        # Append the results to the lists
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
    print(grid_search.best_params_)

    
    # Calculate the average of each metric across all outer folds
    avg_accuracy = np.mean(accuracies)
    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1_score = np.mean(f1_scores)

    # Return the results as a dictionary
    metrics = {
        'accuracy': avg_accuracy,
        'precision': avg_precision,
        'recall': avg_recall,
        'f1_score': avg_f1_score
    }
    
    return metrics




Checked classifier models like Naive Bayes, KNN, MLP 
Assesed their metrics f1-score, recall,precision in comparison to SVC, Logistic Regression.
Found SVC(with rbf kernel) and Logistic Regression(l2 regularization) to perform better on basis of metrics
Rejected MLP because nested cross-validation with MLP is computationally expensive due to the nature of neural network training 
and the double cross-validation loop.

Justification for use of recall 
High Importance of Catching Stroke Patients:
Stroke patients need immediate medical attention, and missing these patients can have severe consequences. For example, if a patient is at risk of a stroke and the model fails to detect this (false negative), the patient might not receive timely intervention, which can lead to irreversible damage or even death.
Therefore, the priority is to ensure that as many stroke patients as possible are correctly identified (true positives).
A high recall means that most stroke patients are identified by the model, even if this leads to some healthy individuals being incorrectly classified as at risk (false positives).
False negatives occur when the model misses a stroke patient, meaning a person who is at risk of a stroke is not flagged by the model for further medical intervention.
Recall ensures that we minimize these false negatives, which is crucial in high-stakes medical predictions like stroke risk.

Justification for use of precsion 
Avoiding Unnecessary Medical Interventions:
While it’s critical to catch all stroke patients (high recall), it's also important that those flagged by the model actually are at risk of having a stroke. If the model labels healthy individuals as stroke patients (false positives), this can lead to unnecessary tests, treatments, and medical procedures.
Precision ensures that predicted stroke patients are truly at risk—i.e., you minimize the number of healthy individuals falsely identified as at risk.

Justification for Choice of F1-score for judging a model's preformance
Stroke patients may represent a much smaller portion of the dataset than healthy individuals. This is a common issue in medical datasets, where positive cases (e.g., stroke patients or individuals at risk of stroke) are much less frequent than the negative cases (healthy individuals).In this case, metrics like accuracy can be misleading, because the model could simply predict the majority class (healthy individuals) most of the time and still achieve a high accuracy, despite failing to identify at-risk stroke patients correctly.
Importance of Correctly Identifying Stroke Patients:
False negatives (FN), where the model fails to identify a patient at risk of a stroke, are extremely costly in a medical context. Missing a stroke patient means the patient does not receive timely intervention or treatment, which could lead to severe consequences, including death or permanent disability.
On the other hand, false positives (FP), where healthy individuals are incorrectly classified as at risk, may result in unnecessary tests or treatments, but the impact is generally much less severe compared to a false negative.
The F1 score balances both precision and recall. It’s particularly useful when you need to strike a balance between identifying as many stroke patients as possible (high recall) and ensuring that the patients identified are actually at risk (high precision)


'''
Naive Bayes
p_grid = {
    'model__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]  # The var_smoothing parameter for GaussianNB
}
evaluate_model_nested_cv(GaussianNB(), p_grid, x, y, n_splits_outer=5, n_splits_inner=3)
KNN
p_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11], 
}
evaluate_model_nested_cv(KNeighborsClassifier(), p_grid, x, y, n_splits_outer=5, n_splits_inner=3)
MLP
p_grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1],  # Regularization strength
     early_stopping=True
}
evaluate_model_nested_cv(MLPClassifier(hidden_layer_sizes=(80,),max_iter=1000), p_grid, x, y, n_splits_outer=5, n_splits_inner=3)
'''
'''
p_grid = {
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100],           # Regularization strength
                            
   
}
evaluate_model_nested_cv(LogisticRegression(penalty='l2'), p_grid, x, y, n_splits_outer=5, n_splits_inner=3)
'''

In [119]:
#Grid of parametres to search
p_grid={
    "model__kernel":["poly","rbf"],
    "model__C":[1,10],
    "model__gamma":[0.01,0.1]
       }
evaluate_model_nested_cv(SVC(), p_grid, x, y, n_splits_outer=5, n_splits_inner=3)

{'model__C': 1, 'model__gamma': 0.1, 'model__kernel': 'rbf'}


{'accuracy': 0.8540336134453781,
 'precision': 0.9065990802675585,
 'recall': 0.7877830865651267,
 'f1_score': 0.8428452058029489}

Since ordinal encoding has been done, it has certain drawbacks:
Ordinal encoding treats the categories as though they represent equally spaced intervals. If the spacing between categories is not uniform, logistic regression may struggle because it assumes a constant relationship between the encoded values and the target.L1 (Lasso) or L2 (Ridge) regularization can help mitigate these issues by penalizing overly complex models and ensuring generalization. I have used l2 regularization but the metrics are not as good as the SVC
Drawbacks for SVC linear kernel:
Assumption of Equidistant Intervals
Problem: Ordinal encoding assigns integer values to ordered categories (e.g., "low" = 0, "medium" = 1, "high" = 2). This assumes that the difference between each successive category is equal (i.e., the distance between "low" and "medium" is the same as the distance between "medium" and "high"). However, in many cases, the differences between categories may not be equal or meaningful.How this affects SVC: SVC, especially with a linear kernel, interprets these encoded values as continuous and equally spaced. If the categories are not truly equidistant, the model might misinterpret the ordinal feature, leading to incorrect decision boundaries and poor model performance.
Non-linear Relationship with Target Variable
Problem: Ordinal encoding assumes that the relationship between the categories is linear, but in many cases, the relationship between the ordinal feature and the target variable might be non-linear.
How this affects SVC: SVC with a linear kernel may not capture the non-linear relationship well, as it tries to fit a straight-line decision boundary. SVC with an RBF kernel can handle non-linearities better, but it still relies on the assumption that the feature encoding reflects meaningful distance between categories, which is not always true with ordinal encoding.
Considering the drawbacks and performanace of the SVC model
I decided to choose SVC with rbf kernel as it was a better performing model in comparison to Logistic regression in terms of metrics and the drawbacks of using an rbf kernel are less than drawbacks of using a linear kernel.




References
Scikit_Learn Kfold. Available at: https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.KFold.html 
(Accessed: 1 January 2025).
Scikit_Learn Nested versus non-nested cross-validation. Available at:https://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html (Accessed: 12 January 2025).
Scikit_Learn Metrics and scoring: quantifying the quality of predictions. Available at:https://scikit-learn.org/1.5/modules/model_evaluation.html#metrics-and-scoring-quantifying-the-quality-of-predictions (Accessed: 11 January 2025).
Scikit_Learn SVC. Available at: https://scikit-learn.org/dev/modules/generated/sklearn.svm.SVC.html (Accessed: 7 January 2025).
Scikit_Learn OrdinalEncoder. Available at:https://scikit-learn.org/dev/modules/generated/sklearn.preprocessing.OrdinalEncoder.htmlScikit_Learn  (Accessed: 9 January 2025).
