# **Imports**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
import xgboost

from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc


# **Functions**

In [2]:
def read_file(file_path):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str or file-like
        Location of the CSV; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first CSV column used as the row index.
    """
    return pd.read_csv(file_path, index_col=0)

In [3]:
def label_encoding(data, column, mapping):
    """Replace the values of one column with their codes from ``mapping``.

    The column is overwritten in ``data`` (in place) and also returned, so
    both ``label_encoding(df, c, m)`` and ``df[c] = label_encoding(df, c, m)``
    leave ``df[c]`` encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode; modified in place.
    column : str
        Name of the column to encode.
    mapping : dict, Series or callable
        Anything accepted by ``Series.map``; typically ``{label: code}``.

    Returns
    -------
    pandas.Series
        The encoded column as stored in ``data``.

    Warns
    -----
    UserWarning
        When non-missing values have no entry in ``mapping``. ``Series.map``
        turns those into NaN silently, which also happens when an encoding
        cell is re-run on an already-encoded column.
    """
    import warnings

    original = data[column]
    encoded = original.map(mapping)

    # Present before mapping but NaN afterwards => the value had no mapping entry.
    unmapped = original[original.notna() & encoded.isna()]
    if not unmapped.empty:
        warnings.warn(
            f"label_encoding: {len(unmapped)} value(s) in column {column!r} "
            f"have no entry in the mapping and were set to NaN: "
            f"{sorted(map(str, unmapped.unique()))}",
            stacklevel=2,
        )

    data[column] = encoded

    return data[column]

# **Loading Data**

In [4]:
# NOTE(review): absolute, machine-specific path; the notebook only runs
# where this exact file exists.
file_path = r"C:\Users\Space\Documents\py\Projects\TuringCollege\Stroke\Stroke_New\Data\stroke.csv"

# Read the raw table and discard the 'ID' column in one chained expression.
data = read_file(file_path).drop(columns='ID')

# **BMI Missing Values**

In [5]:
# Missing BMI readings are stored as 0, and a companion 0/1 column records
# whether the stored BMI is non-zero.
bmi_filled = data['BMI'].fillna(0)
data['BMI'] = bmi_filled
data['BMI Available'] = (bmi_filled != 0).astype('int64')

# **Encoding**

In [6]:
# Codes are assigned 1..n in the order the labels are listed.
gender_mapping = {
    label: code
    for code, label in enumerate(['Male', 'Female'], start=1)
}
data['Gender'] = label_encoding(data, 'Gender', gender_mapping)

In [7]:
# Codes are assigned 1..n in the order the labels are listed.
marriage_mapping = {
    label: code
    for code, label in enumerate(['Yes', 'No'], start=1)
}
data['Ever Married'] = label_encoding(data, 'Ever Married', marriage_mapping)

In [8]:
# Codes are assigned 1..n in the order the labels are listed.
# NOTE(review): these integer codes impose an order on what look like
# unordered categories; confirm that is acceptable for the models used.
work_mapping = {
    label: code
    for code, label in enumerate(
        ['Private', 'Self-employed', 'Govt_job', 'Never_worked', 'children'],
        start=1,
    )
}
data['Work'] = label_encoding(data, 'Work', work_mapping)

In [9]:
# Codes are assigned 1..n in the order the labels are listed.
residence_mapping = {
    label: code
    for code, label in enumerate(['Urban', 'Rural'], start=1)
}
data['Residence'] = label_encoding(data, 'Residence', residence_mapping)

In [10]:
# Codes are assigned 1..n in the order the labels are listed; 'Unknown' is
# kept as its own category (code 4).
smoking_mapping = {
    label: code
    for code, label in enumerate(
        ['never smoked', 'smokes', 'formerly smoked', 'Unknown'],
        start=1,
    )
}
data['Smoking'] = label_encoding(data, 'Smoking', smoking_mapping)

# **Grouping by Age**

In [11]:
# Sum of the 'Stroke' column for each distinct 'Age' value.
# NOTE(review): the name says "gender" but the grouping key is Age; the name
# is kept in case other cells refer to it.
gender_stroke = data['Stroke'].groupby(data['Age']).sum()

In [12]:
# Keep only rows whose Age is strictly greater than 57.
# NOTE(review): 57 is an unexplained cut-off, presumably picked from the
# per-age stroke totals computed just before; confirm.
data = data.loc[data['Age'].gt(57)]

In [13]:
# Display (rows, columns) of `data` as it stands after the age filter.
data.shape

(1524, 12)

# **Machine Learning**

## **Scaling**

In [14]:
# target_column: name of the label column separated from the features.
# random_seed:   seed passed to the train/test split.
target_column, random_seed = 'Stroke', 42

In [15]:
# Label vector first, then the feature matrix (every column except the label).
y = data[target_column]
X = data.drop(target_column, axis=1)

In [16]:
# 80/20 split, stratified on the label so both parts keep the same class
# ratio; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_seed,
)

In [17]:
# Only these columns are rescaled; every other column passes through unchanged.
columns_to_scale = ['Age', 'AVG Glucose', 'BMI']

# Fit the scaler on the training rows only, so no statistics from the test
# rows leak into the transformation.
scaler = RobustScaler().fit(X_train[columns_to_scale])

# Work on copies so the unscaled splits stay available.
X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.transform(X_train[columns_to_scale])

X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

## **Balanced Models**    

In [18]:
# log = LogisticRegression(class_weight='balanced', random_state=random_seed)
# log.fit(X_train_scaled, y_train)

# y_predict = log.predict(X_test_scaled)

# y_predict_proba = log.predict_proba(X_test_scaled)[:, 1]  # Probability estimates for the positive class
# auc_score = roc_auc_score(y_test, y_predict_proba)
# precision, recall, _ = precision_recall_curve(y_test, y_predict_proba)
# auc_pr_score = auc(recall, precision)

# print(classification_report(y_test, y_predict, zero_division=0))
# print(f"AUC Score: {auc_score:.2f}")
# print(f"AUC-PR Score: {auc_pr_score:.2f}")

## **Logistic Regression with Elastic-Net Regularization**

In [19]:
# Logistic regression with an elastic-net penalty (equal L1/L2 mix, C=1),
# fitted with the SAGA solver.
# - `multi_class="ovr"` is no longer passed: the parameter is deprecated in
#   recent scikit-learn releases, and for a two-class target (classes 0/1 in
#   the report printed after fitting) the default behaves the same way.
# - `random_state` is set because SAGA is a stochastic solver; without a seed
#   the fitted coefficients can differ slightly from run to run.
en = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    max_iter=5000,
    l1_ratio=0.5,
    C=1,
    random_state=random_seed,
)

In [20]:
# Fit on the scaled training split, then predict hard labels for the test split.
y_predict = en.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Second column of predict_proba: estimated probability of the positive class.
y_predict_proba = en.predict_proba(X_test_scaled)[:, 1]

# Threshold-free metrics: area under the ROC curve and under the
# precision-recall curve.
precision, recall, _ = precision_recall_curve(y_test, y_predict_proba)
auc_pr_score = auc(recall, precision)
auc_score = roc_auc_score(y_test, y_predict_proba)

print(
    classification_report(y_test, y_predict, zero_division=0),
    f"AUC Score: {auc_score:.2f}",
    f"AUC-PR Score: {auc_pr_score:.2f}",
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.88      1.00      0.93       266
           1       0.67      0.05      0.10        39

    accuracy                           0.88       305
   macro avg       0.77      0.52      0.51       305
weighted avg       0.85      0.88      0.83       305

AUC Score: 0.77
AUC-PR Score: 0.32
