# Table of contents

##
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
# Import Library

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import joblib
import warnings
# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# newer releases no longer ship the plain "seaborn" alias, so pick whichever exists.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

##
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
# Loading Data

In [None]:
# Load the raw stroke dataset; the path is relative to the notebook's working directory.
stroke = pd.read_csv('data/healthcare-dataset-stroke-data.csv')

##
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
# 1. Data Preprocessing

In [None]:
stroke.info()

In [None]:
stroke.describe(include="object")

In [None]:
stroke['gender'].value_counts()

 *The `age` column has dtype float64, which means some rows contain fractional (non-integer) ages.*

In [None]:
stroke[stroke['age']<2]

In [None]:
# Truncate fractional ages to whole years, then lift any resulting 0 up to 1.
stroke['age'] = stroke['age'].astype(int).replace([0], 1)
# Show the rows for children aged 9 or younger.
stroke[stroke['age'] <= 9]

  ##### *In the `smoking_status` column, children aged 1 to 9 have the value 'Unknown'. According to the statistics I found, the smoking rate among children under 10 is below 1%, so these rows are treated as 'never smoked'.*
   > [Children and young people use tobacco](https://www.blackpooljsna.org.uk/Home.aspx)



In [None]:
# Children (age <= 9) whose smoking status is 'Unknown'.
# .copy() gives an independent frame, so the assignment below is not a write
# into a slice of `stroke` (which triggers SettingWithCopyWarning and is
# unreliable under pandas copy-on-write).
df = stroke[(stroke['age'] <= 9) & (stroke['smoking_status'] == 'Unknown')].copy()
# Treat these children as never having smoked.
df['smoking_status'] = df['smoking_status'].replace(['Unknown'], 'never smoked')
df

In [None]:
# Everything that is NOT a child with an 'Unknown' smoking status:
# children with a known status, plus everyone older than 9.
is_child = stroke['age'] <= 9
has_known_status = stroke['smoking_status'] != 'Unknown'
is_older = stroke['age'] > 9
df1 = stroke[(is_child & has_known_status) | is_older]

In [None]:
# Stitch the corrected children rows (df) back together with the rest (df1).
# The original index labels are kept, but the row order changes: all rows of df come first.
stroke=pd.concat([df,df1])

In [None]:
stroke.shape

In [None]:
stroke[stroke['age']==1]

In [None]:
# One-hot encode smoking_status and work_type on `df` (the children-only subset built above).
# NOTE(review): this works on `df`, not on `stroke`, and `df` is reassigned from the CSV
# further down before any visible use of this result — looks like a leftover experiment; confirm.
df = pd.get_dummies(data=df, columns=['smoking_status'])
df = pd.get_dummies(data=df, columns=['work_type'])
df.head()

*The `bmi` column has missing values, so I need to handle them.*

In [None]:
stroke.isnull().sum()

In this dataset, 201 of the 5110 rows (approximately 4%) are missing a value in the `bmi` column, so I think the best approach is to fill the missing values with the column mean.

In [None]:
stroke[stroke['bmi'].isnull()]

In [None]:
# Average BMI over the rows that have a value.
mean = stroke['bmi'].mean(skipna=True)
# Fill every missing BMI with that average.
stroke['bmi'] = stroke['bmi'].fillna(mean)

In [None]:
stroke.isnull().sum()

#### Show all unique classes

In [None]:
# Print the value counts of every object-typed (categorical) column.
object_columns = [name for name in stroke.columns if stroke.dtypes[name] == 'O']
for col in object_columns:
    print('-----------------------\n', stroke[col].value_counts())

#### Drop the unusual values from the dataset

In [None]:
# The id column is dropped before any further processing.
stroke = stroke.drop(columns=['id'])

In [None]:
# Remove every row whose smoking status is still 'Unknown'.
unknown_rows = stroke.index[stroke['smoking_status'] == 'Unknown']
stroke.drop(index=unknown_rows, inplace=True)
stroke.shape

In [None]:
# Keep only the rows whose gender is not 'Other'.
stroke = stroke.loc[stroke['gender'] != 'Other']

In [None]:
stroke=stroke.reset_index(drop=True)

In [None]:
# Snapshot of the cleaned data taken BEFORE the categorical encoding below;
# the visualization section plots from this copy.
stroke_Final=stroke.copy()

#### Encoding Categorical Features

In [None]:
# Binary columns and the category that is encoded as 1 (everything else becomes 0).
positive_category = {
    "Residence_type": "Urban",
    "ever_married": "Yes",
    "gender": "Male",
}
for column, positive in positive_category.items():
    stroke[column] = stroke[column].apply(lambda value, positive=positive: 1 if value == positive else 0)

In [None]:
# One-hot encode the multi-class columns; the new indicator columns are appended
# at the end, smoking_status first and work_type second.
stroke = pd.get_dummies(data=stroke, columns=['smoking_status', 'work_type'])

In [None]:
stroke.head()

In [None]:
stroke_Final.head()

##
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
# 2. Visualization

## Count stroke

In [None]:
# Build a separate frame for the plots: reload the CSV, drop gender 'Other',
# and normalize the residence column name.
df = (
    pd.read_csv('data/healthcare-dataset-stroke-data.csv')
    .loc[lambda frame: frame['gender'] != 'Other']
    .rename(columns={'Residence_type': 'residence_type'})
)
# Mean-impute the missing BMI values.
mean = df['bmi'].mean(skipna=True)
df['bmi'] = df['bmi'].fillna(mean)
# Keep only the columns these plots use.
df = df.drop(columns=['id', 'ever_married', 'work_type', 'residence_type', 'smoking_status'])

In [None]:
# Bar chart of the target classes, with the count written above each bar.
ax = sns.countplot(x='stroke', data=df)
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.35, bar.get_height() + 100))
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.set_title('Target Class Count plot', fontsize=20)
plt.show()

## Age Distribution

In [None]:
# Density estimate of age, with a dashed vertical line at the mean.
ax = sns.kdeplot(df['age'], color='lightgray')
ax.lines[0].set_color('red')
ax.axvline(df['age'].mean(), linestyle='--', lw=4, zorder=1, color='blue')
# The annotation position is hard-coded in data coordinates.
ax.annotate(' Average Age', (44, 0.008), fontsize=18, color='blue')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.set_title('Age Distribution plot', fontsize=20)
ax.set_xlabel('Age')
plt.show()

### Gender

In [None]:
# Share of each gender in the cleaned dataset.
# value_counts() orders categories by frequency, so the labels are taken from its
# index instead of a hard-coded list that may not match that order.
gender_counts = stroke_Final["gender"].value_counts()
labels = list(gender_counts.index)
fig, ax = plt.subplots(figsize = (7,8))
plt.pie(gender_counts, autopct='%1.2f%%', labels=labels)
plt.title("Percentage of All Gender",fontsize=18,fontweight="bold")
fig.set_facecolor('#FFFFCC')
plt.legend()
plt.show()

In [None]:
# Count of persons per gender, split by stroke outcome, with the count above each bar.
ax = sns.countplot(data=stroke_Final, x='gender', hue='stroke', palette='hls')
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.15, bar.get_height() + 40))
ax.set_title("The Gender With Stroke", fontsize=18, fontweight="bold")
ax.set_ylabel('Number of persons', fontsize=15)
ax.legend(['No Stroke', 'Has Stroke'], title='Stroke')

### Boxplots of three features: age, BMI and average glucose level

In [None]:
# Independent copies of the stroke / non-stroke rows. A column is assigned to
# both frames in the next cell, so they must not be slices of stroke_Final
# (that would trigger SettingWithCopyWarning).
Have_stroke = stroke_Final[stroke_Final['stroke'] == 1].copy()
Non_stroke = stroke_Final[stroke_Final['stroke'] == 0].copy()

In [None]:
# Add a 1-based running row counter named "index" to each of the three frames.
# NOTE(review): no later visible cell reads this column, and on stroke_Final it becomes
# one more numeric column in any correlation computed from that frame — confirm it is needed.
stroke_Final["index"]= range(1, len(stroke_Final) + 1)
Have_stroke["index"]= range(1, len(Have_stroke) + 1)
Non_stroke["index"]= range(1, len(Non_stroke) + 1)

##### *The three features for patients who had a stroke*

In [None]:
# Boxplots of the three continuous features for patients who had a stroke.
col = ['age', 'avg_glucose_level', 'bmi']
plt.figure(figsize=(10, 6), facecolor='w')
stroke_rows = stroke_Final['stroke'] == 1
sns.boxplot(data=stroke_Final.loc[stroke_rows, col])
plt.show()

##### *The three features for patients who did not have a stroke*

In [None]:
# Boxplots (with mean markers) of the three continuous features for patients without a stroke.
col = ['age', 'avg_glucose_level', 'bmi']
plt.figure(figsize=(7, 5), facecolor='w')
no_stroke_rows = stroke_Final['stroke'] == 0
sns.boxplot(data=stroke_Final.loc[no_stroke_rows, col], showmeans=True)
plt.show()

### Hypertension

In [None]:
# Share of patients with / without hypertension.
# Labels are looked up from the actual values in the value_counts() index, so each
# slice is named correctly whichever class is more frequent.
hypertension_counts = stroke_Final["hypertension"].value_counts()
hypertension_names = {0: 'No Hypertension', 1: 'Has Hypertension'}
labels = [hypertension_names.get(value, value) for value in hypertension_counts.index]
fig, ax = plt.subplots(figsize = (7,8))
plt.pie(hypertension_counts, autopct='%1.2f%%', labels=labels)
plt.title("Percentage of All hypertension",fontsize=18,fontweight="bold")
fig.set_facecolor('#FFFFCC')
plt.legend()
plt.show()

In [None]:
# Readable hypertension labels stored in a helper column, then counts split by stroke outcome.
Object = {0: "No Hypertension", 1: "Has Hypertension"}
stroke_Final['Has_No_Hypertension'] = stroke_Final['hypertension'].map(Object)
ax = sns.countplot(data=stroke_Final, x='Has_No_Hypertension', hue='stroke', palette='hls')
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.15, bar.get_height() + 40))
ax.set_ylabel('Number of persons', fontsize=15)
ax.legend(['No Stroke', 'Has Stroke'], title='Stroke')
plt.show()

### Heart_disease

In [None]:
# Share of patients with / without heart disease.
# Labels are looked up from the value_counts() index so they always match the slices.
heart_disease_counts = stroke_Final["heart_disease"].value_counts()
heart_disease_names = {0: 'No Heart disease', 1: 'Has Heart disease'}
labels = [heart_disease_names.get(value, value) for value in heart_disease_counts.index]
fig, ax = plt.subplots(figsize = (7,8))
plt.pie(heart_disease_counts, autopct='%1.2f%%', labels=labels)
plt.title("Percentage of All Heart disease",fontsize=18,fontweight="bold")
fig.set_facecolor('#FFFFCC')
plt.legend()
plt.show()

In [None]:
# Readable heart-disease labels stored in a helper column, then counts split by stroke outcome.
Object = {0: "No Heart disease", 1: "Has Heart disease"}
stroke_Final['Has_No_Heart_disease'] = stroke_Final['heart_disease'].map(Object)
ax = sns.countplot(data=stroke_Final, x='Has_No_Heart_disease', hue='stroke', palette='hls')
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.15, bar.get_height() + 40))
ax.set_ylabel('Number of persons', fontsize=15)
ax.legend(['No Stroke', 'Has Stroke'], title='Stroke')
plt.show()

### Ever married

In [None]:
# Share of patients who were ever married.
# value_counts() sorts by frequency, so a fixed ['No', 'Yes'] list mislabels the
# slices whenever "Yes" is the larger group; take the labels from the counts index.
ever_married_counts = stroke_Final["ever_married"].value_counts()
labels = list(ever_married_counts.index)
fig, ax = plt.subplots(figsize = (7,8))
plt.pie(ever_married_counts, autopct='%1.2f%%', labels=labels)
plt.title("Percent of all marriages",fontsize=18,fontweight="bold")
fig.set_facecolor('#FFFFCC')
plt.legend()
plt.show()

In [None]:
# Count of persons per marital status, split by stroke outcome.
ax = sns.countplot(data=stroke_Final, x='ever_married', hue='stroke', palette='hls')
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.15, bar.get_height() + 40))
ax.set_ylabel('Number of persons', fontsize=15)
ax.legend(['No Stroke', 'Has Stroke'], title='Stroke')
plt.show()

### Work type

In [None]:
# Share of each work type.
# The slice order comes from value_counts() (most frequent first), so the display
# labels are looked up per category rather than assumed to be in a fixed order.
work_type_counts = stroke_Final["work_type"].value_counts()
work_type_names = {
    'Private': 'Private',
    'Self-employed': 'Self-employed',
    'Govt_job': 'Govt job',
    'children': 'Children',
    'Never_worked': 'Never Worked',
}
# Categories missing from the mapping keep their raw name.
labels = [work_type_names.get(value, value) for value in work_type_counts.index]
fig, ax = plt.subplots(figsize = (7,8))
plt.pie(work_type_counts, labels=labels, autopct='%1.1f%%')
plt.title("Percent of all Work type",fontsize=18,fontweight="bold")
fig.set_facecolor('#FFFFCC')
plt.legend()
plt.show()

In [None]:
# Count of persons per work type, split by stroke outcome.
ax = sns.countplot(data=stroke_Final, x='work_type', hue='stroke', palette='hls')
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.1, bar.get_height() + 40))
ax.set_ylabel('Number of persons', fontsize=15)
plt.show()

### Residence_type

In [None]:
# Share of urban vs rural residents.
# Labels come from the value_counts() index so they follow the real slice order.
residence_counts = stroke_Final["Residence_type"].value_counts()
labels = list(residence_counts.index)
fig, ax = plt.subplots(figsize = (7,8))
plt.pie(residence_counts, autopct='%1.2f%%', labels=labels)
plt.title("Percent of all Area",fontsize=18,fontweight="bold")
fig.set_facecolor('#FFFFCC')
plt.legend()
plt.show()

In [None]:
# Count of persons per residence type, split by stroke outcome.
ax = sns.countplot(data=stroke_Final, x='Residence_type', hue='stroke', palette='hls')
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.15, bar.get_height() + 40))
ax.set_ylabel('Number of persons', fontsize=15)
ax.legend(['No Stroke', 'Has Stroke'], title='Stroke')
plt.show()

### Smoking_status

In [None]:
# Share of each smoking status.
# Labels come from the value_counts() index: they always match the slice order and
# cannot fall out of sync with the number of categories present.
smoking_counts = stroke_Final["smoking_status"].value_counts()
labels = list(smoking_counts.index)
fig, ax = plt.subplots(figsize = (7,8))
plt.pie(smoking_counts, labels=labels, autopct='%1.1f%%')
plt.title("Percent of all Smoking status",fontsize=18,fontweight="bold")
fig.set_facecolor('#FFFFCC')
plt.legend()
plt.show()

In [None]:
# Count of persons per smoking status, split by stroke outcome.
ax = sns.countplot(data=stroke_Final, x='smoking_status', hue='stroke', palette='hls')
for bar in ax.patches:
    ax.annotate('{:.0f}'.format(bar.get_height()), (bar.get_x() + 0.15, bar.get_height() + 40))
ax.set_ylabel('Number of persons', fontsize=15)
ax.legend(['No Stroke', 'Has Stroke'], title='Stroke')
plt.show()

## Average Glucose Level Distribution

In [None]:
# Histogram + KDE of the average glucose level per stroke class, with the overall mean marked.
ax = sns.histplot(data=df, x='avg_glucose_level', hue='stroke', kde=True, alpha=0.2)
ax.axvline(df['avg_glucose_level'].mean(), linestyle='--', lw=2, zorder=1, color='red')
# The annotation position is hard-coded in data coordinates.
ax.annotate(' Mean Average Glucose Level', (108, 350), fontsize=15, color='red')
ax.set_title('Average Glucose Level Distribution')
ax.set_xlabel('Average Glucose Level')
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()

### Correlation Matrix

In [None]:
plt.figure(figsize=(10,7))
# stroke_Final still holds string columns (gender, work_type, ...). Since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so restrict the
# frame to numeric columns explicitly. The synthetic running "index" counter added
# for plotting is dropped too, because its correlation carries no meaning.
numeric_features = (
    stroke_Final
    .select_dtypes(include='number')
    .drop(columns=['index'], errors='ignore')
)
corrmat = numeric_features.corr()
sns.heatmap(data=corrmat,annot=True, cmap="Blues")

##
![green-divider](https://user-images.githubusercontent.com/7065401/52071924-c003ad80-2562-11e9-8297-1c6595f8a7ff.png)
# 3. Model Building

In [None]:
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
import xgboost as xgb

### Data Splitting

In [None]:
# Reload the raw data for modeling and normalize the residence column name.
df = pd.read_csv('data/healthcare-dataset-stroke-data.csv')
df.rename({'Residence_type': 'residence_type'}, axis=1, inplace=True)
# .copy() makes df_new an independent frame: the column assignment below would
# otherwise write into a slice of df (SettingWithCopyWarning, unreliable result).
df_new = df[df['gender'] != 'Other'].copy()
# Mean-impute the missing BMI values.
# NOTE(review): the mean is computed before the train/test split, so test rows
# contribute to it; move the imputation into the pipeline if that matters.
mean = df_new['bmi'].mean(skipna=True)
df_new['bmi'] = df_new['bmi'].fillna(mean)
# The id column is not used as a feature.
df_new = df_new.drop(columns=['id'])

In [None]:
# Features are every column except the target; the target is the stroke flag.
Y = df_new['stroke']
X = df_new.drop(columns=['stroke'])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size= 0.20, random_state =42, stratify=Y)

### Imbalance Data

In [None]:
# Split the feature names by dtype: object columns are one-hot encoded,
# every other column is standardized.
numerical_columns = X_train.select_dtypes(exclude=['object']).columns.to_list()
categorical_columns = X_train.select_dtypes(include=['object']).columns.to_list()

column_transformer = ColumnTransformer(
    [("sc", StandardScaler(), numerical_columns),
     # handle_unknown="ignore": a category that only occurs in the test data
     # (possible for rare classes) is encoded as all zeros instead of raising.
     ("ohe", OneHotEncoder(handle_unknown="ignore"), categorical_columns)],
    remainder="passthrough"
)
# Fit on the training split only, then apply the fitted transform to the test split.
# Note: X_train / X_test are overwritten with the transformed arrays.
X_train = column_transformer.fit_transform(X_train)
X_test = column_transformer.transform(X_test)

In [None]:
# Rebalance the training split with SMOTEENN; the test split is left untouched.
# NOTE(review): training_process later runs cross_val_predict on this already-resampled
# data, so each CV validation fold contains resampled rows — the CV scores are
# presumably optimistic; confirm whether resampling should happen inside each fold.
X_train, Y_train = SMOTEENN(random_state=42).fit_resample(X_train, Y_train)

#### Data Scaling

### Modeling

In [None]:
def stack_model():
    """Build an unfitted stacking ensemble.

    The base learners are an SVC, a decision tree, a k-nearest-neighbours
    classifier and Gaussian naive Bayes, all with default settings; a
    logistic regression combines their outputs.

    Returns:
        StackingClassifier: the configured, unfitted ensemble.
    """
    # Level-0 learners as (name, estimator) pairs.
    level0 = [
        ('svc', SVC()),
        ('dt', DecisionTreeClassifier()),
        ('knn', KNeighborsClassifier()),
        ('NaiveBayes', GaussianNB()),
    ]
    # Level-1 (meta) learner stacked on top of the base learners.
    return StackingClassifier(estimators=level0, final_estimator=LogisticRegression())

In [None]:
# Candidate classifiers, all with default hyper-parameters.
# The position in this list matters: get_model_params looks up a model's
# parameter grid in model_params by the model's index in `models`.
models = [LogisticRegression(), 
          DecisionTreeClassifier(),
          SVC(),
          KNeighborsClassifier(),
          RandomForestClassifier(), 
          GradientBoostingClassifier(), 
          xgb.XGBClassifier(verbosity=0),
          stack_model()]

In [None]:
# Change parameters here
lr_params = {
    "penalty": ["l1", "l2", "elasticnet", "none"],
    "C": [-1000, -100, -10, -0.1, -0.01, 0, 0.1, 0.01, 1, 10, 100]
}
dt_params = {
    "max_depth": list(range(6,16)),
    "max_features" : [2,3,4,5,6]
}
svm_params = {
    "C": [-1000, -100, -10, -0.1, -0.01, 0, 0.1, 0.01, 1, 10, 100],
    "kernel": ["linear", "poly", "rbf", "sigmoid"]
}
knn_params = {
    "n_neighbors": list(range(1,11)),
    "weights": ["uniform", "distance"],
}
rf_params = {
    "max_depth": list(range(6,16)), 
    "max_features" : [2,3,4,5,6]
}
gb_params = {
    "max_depth": list(range(6,16)),
    "max_features" : [2,3,4,5,6]
}
xgb_params = {
    "max_depth": list(range(6,16)),
    "max_features" : [2,3,4,5,6]
}
stack_params = {

}
model_params = []

In [None]:
def plot_multiple_roc_auc_curves(model, X, y):
    """Draw one ROC curve per fitted classifier on a two-column grid.

    Args:
        model: a list/tuple of fitted classifiers, or a single fitted
            classifier. The parameter keeps its singular name for backward
            compatibility; it was previously ignored in favour of the
            global ``models`` list.
        X: feature matrix the classifiers are evaluated on.
        y: true binary labels for ``X``.

    The curve is built from continuous scores (``predict_proba`` for the
    positive class, else ``decision_function``) so it has more than one
    operating point; hard predictions are only a fallback.
    """
    estimators = list(model) if isinstance(model, (list, tuple)) else [model]
    # Two panels per row; at least one row so subplots() stays valid.
    n_rows = max(1, -(-len(estimators) // 2))
    fig, axs = plt.subplots(n_rows, 2, figsize=(12, 8), squeeze=False)
    flat_axes = axs.flatten()
    for estimator, ax in zip(estimators, flat_axes):
        if hasattr(estimator, "predict_proba"):
            # Column 1 is the probability of the second (positive) class.
            y_score = estimator.predict_proba(X)[:, 1]
        elif hasattr(estimator, "decision_function"):
            y_score = estimator.decision_function(X)
        else:
            y_score = estimator.predict(X)
        fpr, tpr, _ = metrics.roc_curve(y, y_score)
        roc_auc = metrics.auc(fpr, tpr)
        metrics.RocCurveDisplay(
            fpr=fpr,
            tpr=tpr,
            roc_auc=roc_auc
        ).plot(ax=ax)
        ax.title.set_text(type(estimator).__name__)
    # Hide panels that have no model (odd number of models).
    for unused_ax in flat_axes[len(estimators):]:
        unused_ax.set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_multiple_precision_recall_curves(models, X, y):
    """Draw one precision-recall curve per fitted classifier on a two-column grid.

    Args:
        models: iterable of fitted classifiers.
        X: feature matrix the classifiers are evaluated on.
        y: true binary labels for ``X``.

    The curve is built from continuous scores (``predict_proba`` for the
    positive class, else ``decision_function``); hard predictions are only a
    fallback, because they collapse the curve to a single operating point.
    """
    estimators = list(models)
    # Two panels per row; at least one row so subplots() stays valid.
    n_rows = max(1, -(-len(estimators) // 2))
    fig, axs = plt.subplots(n_rows, 2, figsize=(20, 15), squeeze=False)
    flat_axes = axs.flatten()
    for model, ax in zip(estimators, flat_axes):
        if hasattr(model, "predict_proba"):
            # Column 1 is the probability of the second (positive) class.
            y_score = model.predict_proba(X)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X)
        else:
            y_score = model.predict(X)
        precision, recall, _ = metrics.precision_recall_curve(y, y_score)
        metrics.PrecisionRecallDisplay(
            precision=precision,
            recall=recall
        ).plot(ax=ax)
        ax.title.set_text(type(model).__name__)
    # Hide panels that have no model (odd number of models).
    for unused_ax in flat_axes[len(estimators):]:
        unused_ax.set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_multiple_confusion_matrixes(models, X, y):
    """Draw one annotated confusion-matrix heatmap per fitted classifier.

    Args:
        models: iterable of fitted classifiers.
        X: feature matrix the classifiers are evaluated on.
        y: true labels for ``X``; the 2x2 annotation layout requires a
            binary problem.

    Each cell shows its name (TN/FP/FN/TP), the raw count and the share of
    all samples.
    """
    estimators = list(models)
    # Two panels per row; at least one row so subplots() stays valid.
    n_rows = max(1, -(-len(estimators) // 2))
    fig, axs = plt.subplots(n_rows, 2, figsize=(10, 15), squeeze=False)
    flat_axes = axs.flatten()
    names = ['True Negative','False Positive','False Negative','True Positive']
    for model, ax in zip(estimators, flat_axes):
        # Switch the grid off explicitly; a bare grid() call only toggles it,
        # so the result would depend on the active style sheet.
        ax.grid(False)
        y_pred = model.predict(X)
        cm = metrics.confusion_matrix(y, y_pred)
        counts = list(cm.flatten())
        percentages = ['{0:.2%}'.format(value) for value in cm.flatten()/np.sum(cm)]
        labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in zip(names,counts,percentages)]
        labels = np.asarray(labels).reshape(2,2)
        sns.heatmap(cm, annot=labels, cmap="mako", fmt ='', ax=ax)
        ax.title.set_text(type(model).__name__)
    # Hide panels that have no model (odd number of models).
    for unused_ax in flat_axes[len(estimators):]:
        unused_ax.set_visible(False)
    plt.tight_layout()
    plt.show()

In [None]:
def get_model_params(model, model_params):
    """Return the parameter grid that belongs to ``model``.

    The grid is looked up by position: ``model`` is located in the
    module-level ``models`` list and the entry at the same index of
    ``model_params`` is returned.

    Raises:
        ValueError: if ``model`` is not in ``models``.
        IndexError: if ``model_params`` has no entry at that position.
    """
    position = models.index(model)
    params = model_params[position]
    return params

In [None]:
def training_process(X_train, Y_train, models, print_process=True, plot_metric=True):
    """Fit every candidate and return the one with the best cross-validated F1.

    Each model is fitted on the full training data (the fitted models are
    what the plots use) and scored separately with 10-fold
    ``cross_val_predict``; the score is the weighted F1 of those
    out-of-fold predictions.

    Args:
        X_train: training feature matrix.
        Y_train: training labels.
        models: list of estimators; fitted in place.
        print_process: when True, print score and classification report per model.
        plot_metric: when True, plot confusion matrices, ROC and
            precision-recall curves of the fitted models on the training data.

    Returns:
        The element of ``models`` with the highest weighted F1 (first one on ties).
    """
    scores = []
    for candidate in models:
        candidate.fit(X_train, Y_train)
        cv_predictions = cross_val_predict(candidate, X_train, Y_train, cv=10)
        # Index 2 of (precision, recall, fscore, support) is the F-score.
        weighted_f1 = metrics.precision_recall_fscore_support(
            Y_train, cv_predictions, average='weighted'
        )[2]
        scores.append(weighted_f1)
        if not print_process:
            continue
        print(candidate, " trained.")
        print("Weighted F1 score: %.2f" % weighted_f1)
        print("Classification report: \n", metrics.classification_report(Y_train, cv_predictions))
    if plot_metric:
        plotters = (
            plot_multiple_confusion_matrixes,
            plot_multiple_roc_auc_curves,
            plot_multiple_precision_recall_curves,
        )
        for plot_all in plotters:
            plot_all(models, X_train, Y_train)
    best_position = scores.index(max(scores))
    return models[best_position]

In [None]:
def grid_search_process(X_train, Y_train, model, model_param, print_process=True):
    """Tune ``model`` with an exhaustive grid search scored by weighted F1.

    Args:
        X_train: training feature matrix.
        Y_train: training labels.
        model: estimator to tune.
        model_param: parameter grid passed to ``GridSearchCV``.
        print_process: when True, print the search object, the best
            parameters and the best score.

    Returns:
        GridSearchCV: the fitted search object.
    """
    grid_search = GridSearchCV(model, model_param, scoring='f1_weighted')
    grid_search.fit(X_train, Y_train)
    if print_process:
        print("Model after grid search: ", grid_search)
        print("Best parameters: ", grid_search.best_params_)
        # Apply the format with %; passing the score as a second print()
        # argument printed the literal "%.2f" followed by the raw value.
        print("Best score: %.2f" % grid_search.best_score_)
    return grid_search

In [None]:
# Fit all candidates and keep the one with the best cross-validated weighted F1.
# plot_metric keeps its default (True), so the diagnostic plots are drawn.
best_model = training_process(X_train, Y_train, models, print_process=False)
# Hyper-parameter tuning of the winner is currently disabled:
# best_params = get_model_params(best_model, model_params)
# best_cv_model = grid_search_process(X_train, Y_train, best_model, best_params, print_process=True)

In [None]:
Y_pred_final = best_model.predict(X_test)

In [None]:
# Annotated confusion matrix of the best model on the held-out test split.
cm = metrics.confusion_matrix(Y_test, Y_pred_final)
names = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
counts = list(cm.flatten())
# Share of all test samples that falls into each cell.
percentages = ['{0:.2%}'.format(share) for share in cm.flatten() / np.sum(cm)]
labels = np.asarray(
    [f'{name}\n{count}\n{share}' for name, count, share in zip(names, counts, percentages)]
).reshape(2, 2)
sns.heatmap(cm, annot=labels, cmap="mako", fmt='')
plt.grid(False)
plt.title("Confusion matrix")

In [None]:
# ROC curve of the best model on the test split.
# Use continuous scores when the model provides them: a curve built from hard
# 0/1 predictions has only a single operating point.
if hasattr(best_model, "predict_proba"):
    Y_score_final = best_model.predict_proba(X_test)[:, 1]
elif hasattr(best_model, "decision_function"):
    Y_score_final = best_model.decision_function(X_test)
else:
    Y_score_final = Y_pred_final
fpr, tpr, _ = metrics.roc_curve(Y_test, Y_score_final)
roc_auc = metrics.auc(fpr, tpr)
metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc
).plot()
plt.title("ROC curve")

In [None]:
# Precision-recall curve of the best model on the test split.
# Use continuous scores when the model provides them: a curve built from hard
# 0/1 predictions has only a single operating point.
if hasattr(best_model, "predict_proba"):
    Y_score_final = best_model.predict_proba(X_test)[:, 1]
elif hasattr(best_model, "decision_function"):
    Y_score_final = best_model.decision_function(X_test)
else:
    Y_score_final = Y_pred_final
precision, recall, _ = metrics.precision_recall_curve(Y_test, Y_score_final)
metrics.PrecisionRecallDisplay(
    precision=precision,
    recall=recall
).plot()
plt.title("Precision-Recall curve")

# Build pipeline

## Create a pipeline

In [None]:
class CustomPipeline():
    """Preprocessing + resampling + classifier bundled behind fit/predict.

    ``fit`` standardizes the numerical columns, one-hot encodes the
    categorical ones, rebalances the transformed training data with SMOTEENN
    and fits the wrapped model. ``predict`` applies the fitted transformer
    only; no resampling happens at prediction time.

    Args:
        model: classifier exposing ``fit`` and ``predict``.
        numerical_columns: columns to standardize. When None they are inferred
            in ``fit`` as the non-object columns of ``X``.
        categorical_columns: columns to one-hot encode. When None they are
            inferred in ``fit`` as the object columns of ``X``.
    """

    def __init__(self, model, numerical_columns=None, categorical_columns=None):
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        # Built in fit(), once the column lists are known.
        self.column_transformer = None
        self.smoteenn = SMOTEENN(random_state=42)
        self.model = model

    def fit(self, X, y):
        """Fit the transformer and the model on ``X`` (a DataFrame) and ``y``.

        Returns:
            CustomPipeline: ``self``, to allow chaining.
        """
        numerical = self.numerical_columns
        if numerical is None:
            numerical = X.select_dtypes(exclude=['object']).columns.to_list()
        categorical = self.categorical_columns
        if categorical is None:
            categorical = X.select_dtypes(include=['object']).columns.to_list()
        self.column_transformer = ColumnTransformer(
            [("sc", StandardScaler(), numerical),
             # Categories unseen during fit are encoded as all zeros instead
             # of making predict() raise.
             ("ohe", OneHotEncoder(handle_unknown="ignore"), categorical)],
            remainder="passthrough"
        )
        X = self.column_transformer.fit_transform(X)
        # Rebalance the training data only.
        X, y = self.smoteenn.fit_resample(X, y)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Transform ``X`` with the fitted transformer and return the model's predictions.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.column_transformer is None:
            raise RuntimeError("CustomPipeline.predict called before fit")
        X = self.column_transformer.transform(X)
        return self.model.predict(X)

## Test pipeline

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

In [None]:
# Wrap the selected model in the pipeline and refit it on the raw (untransformed)
# training split created in the previous cell; the pipeline does its own preprocessing.
pipeline = CustomPipeline(best_model)
pipeline.fit(X_train, Y_train)
# Predictions of the refitted pipeline on the raw test split.
Y_pred = pipeline.predict(X_test)

In [None]:
# Confusion matrix of the PIPELINE's predictions (Y_pred). The previous version
# plotted Y_pred_final, i.e. the earlier stand-alone model, so it never tested the pipeline.
metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(Y_test, Y_pred)
).plot()
plt.grid(False)
plt.title("Confusion matrix")

In [None]:
# ROC curve of the PIPELINE's predictions (Y_pred), not of the earlier Y_pred_final.
# The pipeline only exposes predict(), so the curve is based on hard labels.
fpr, tpr, _ = metrics.roc_curve(Y_test, Y_pred)
roc_auc = metrics.auc(fpr, tpr)
metrics.RocCurveDisplay(
    fpr=fpr,
    tpr=tpr,
    roc_auc=roc_auc
).plot()
plt.title("ROC curve")

In [None]:
# Precision-recall curve of the PIPELINE's predictions (Y_pred), not of the earlier Y_pred_final.
# The pipeline only exposes predict(), so the curve is based on hard labels.
precision, recall, _ = metrics.precision_recall_curve(Y_test, Y_pred)
metrics.PrecisionRecallDisplay(
    precision=precision,
    recall=recall
).plot()
plt.title("Precision-Recall curve")

# Create a pickle file

In [None]:
# Persist the fitted pipeline to disk with joblib.
# NOTE(review): CustomPipeline is defined inside this notebook, so loading the file
# elsewhere presumably requires the same class to be defined/importable there — confirm.
file_name = 'pipeline.gz'
joblib.dump(pipeline, file_name)