# **Imports**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import lightgbm as lgb
import xgboost

from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier

from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc


# **Functions**

In [2]:
def read_file(file_path):
    data = pd.read_csv(file_path, index_col=0)

    return data

In [3]:
def label_encoding(data, column, mapping):
    
    data[column] = data[column].map(mapping)

    return data[column]

# **Loading Data**

In [4]:
file_path = r"C:\Users\Space\Documents\py\Projects\TuringCollege\Stroke\Stroke_New\Data\stroke.csv"
data = read_file(file_path)
data = data.drop(columns=['ID'])

# **BMI Missing Values**

In [5]:
data['BMI'] = data['BMI'].fillna(0)
data['BMI Available'] = 0  
data.loc[data['BMI'] != 0, 'BMI Available'] = 1 

# **Encoding**

In [6]:
gender_mapping = {'Male': 1, 'Female': 2}
data ['Gender'] = label_encoding(data, 'Gender', gender_mapping)

In [7]:
marriage_mapping = {'Yes': 1, 'No': 2}
data ['Ever Married']= label_encoding(data, 'Ever Married', marriage_mapping)

In [8]:
work_mapping = {'Private': 1, 'Self-employed': 2, 'Govt_job': 3, 'Never_worked': 4, 'children': 5}
data['Work'] = label_encoding(data, 'Work', work_mapping)

In [9]:
residence_mapping = {'Urban': 1, 'Rural': 2}
data['Residence'] = label_encoding(data, 'Residence', residence_mapping)

In [10]:
smoking_mapping = {'never smoked': 1, 'smokes': 2, 'formerly smoked': 3, 'Unknown': 4}
data['Smoking'] = label_encoding(data, 'Smoking', smoking_mapping)

# **Grouping by Age**

In [11]:
gender_stroke = data.groupby('Age')['Stroke'].sum()

In [12]:
data = data[data['Age'] > 57]

In [13]:
data.shape

(1524, 12)

In [14]:
data.head()

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work,Residence,AVG Glucose,BMI,Smoking,Stroke,BMI Available
0,1,67.0,0,1,1,1,1,228.69,36.6,3,1,1
1,2,61.0,0,0,1,2,2,202.21,0.0,1,1,0
2,1,80.0,0,1,1,1,2,105.92,32.5,1,1,1
4,2,79.0,1,0,1,2,2,174.12,24.0,1,1,1
5,1,81.0,0,0,1,1,1,186.21,29.0,3,1,1


# **Machine Learning**

In [15]:
target_column = 'Stroke'
random_seed = 42

In [16]:
X = data.drop(columns=[target_column])  
y = data[target_column]

# **TPOT**

In [17]:
import warnings
from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score

# Suppress user warnings
warnings.filterwarnings('ignore', category=UserWarning)

# Load a dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit TPOT with F1 scoring
tpot = TPOTClassifier(generations=2, population_size=20, verbosity=2, scoring='f1_weighted')
tpot.fit(X_train, y_train)

# Extract the best pipeline
best_pipeline = tpot.fitted_pipeline_

# Make predictions
y_pred = best_pipeline.predict(X_test)
y_pred_proba = best_pipeline.predict_proba(X_test)[:, 1]

# Calculate metrics
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
auc = roc_auc_score(y_test, y_pred_proba, average='weighted')

# Display the results
print("Best Pipeline:", best_pipeline)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

# Export the best pipeline as a Python script
tpot.export('tpot_best_pipeline.py')




                                                                            
Generation 1 - Current best internal CV score: 0.824735968607342
                                                                            
Generation 2 - Current best internal CV score: 0.824735968607342
                                                                            
Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=True)
Best Pipeline: Pipeline(steps=[('bernoullinb', BernoulliNB())])
Precision: 0.8467199949445214
Recall: 0.8655737704918033
AUC: 0.6737108547209794
