## Imports

In [789]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.linear_model import LogisticRegression
import pickle

pd.options.mode.chained_assignment = None  # default='warn'
df_train=pd.read_csv("../data/healthcare-dataset-stroke-data.csv")

## Split and format dataframe

In [791]:
def split_test_train(df):
    
    X = df_train.iloc[:, :-1]
    Y = df_train.iloc[:, -1]
    xtrain, xtest, ytrain, ytest = train_test_split(X, 
                                                    Y, 
                                                    random_state=43, 
                                                    test_size=0.33)
    return (xtrain,
            xtest,
            ytrain,
            ytest)

In [792]:
def reformat_work_type(df):
    df["work_type"] = df["work_type"].replace("Self-employed","Self employed")
    df["work_type"] = df["work_type"].replace("Govt_job","Govt job")
    df["work_type"] = df["work_type"].replace("children","Children")
    df["work_type"] = df["work_type"].replace("Never_worked","Never worked")
    
    return df

In [793]:
def rename_columns(df):
    df = df.rename(columns={"heart_disease":"heart disease", 
                            "avg_glucose_level":"avg glucose level",
                            "ever_married":"ever married",
                            "Residence_type":"residence type",
                            "work_type":"work type",
                            "smoking_status":"smoking status"})
    return df

In [794]:
def format_df(df):
    xtrain, xtest, ytrain, ytest = split_test_train(df_train)
    xtrain = reformat_work_type(xtrain)
    xtrain = rename_columns(xtrain)
    return xtrain, xtest, ytrain, ytest

In [795]:
def format_inference_df(df):
    df = reformat_work_type(df)
    df = rename_columns(df)
    return df

## Preprocessing 

### 1- Outliers and missing values

In [796]:
def remove_outliers(df):
    upper_fence_age = df["age"].mean() + 3*df["age"].std()
    lower_fence_age = df["age"].mean() - 3*df["age"].std()
    
    age_index = df[(df["age"] > upper_fence_age) | (df["age"] < lower_fence_age)].index.tolist()
    
    upper_fence_glucose = df["avg_glucose_level"].mean() + 3*df["avg_glucose_level"].std()
    lower_fence_glucose = df["avg_glucose_level"].mean() - 3*df["avg_glucose_level"].std()

    glucose_index = df[(df["avg_glucose_level"] > upper_fence_glucose) | (df["avg_glucose_level"] < lower_fence_glucose)].index.tolist()

    upper_fence_bmi = df["bmi"].mean() + 3*df["bmi"].std()
    lower_fence_bmi = df["bmi"].mean() - 3*df["bmi"].std()

    bmi_index = df[(df["bmi"] > upper_fence_bmi) | (df["bmi"] < lower_fence_bmi)].index.tolist()
    index_to_be_removed=bmi_index +age_index +glucose_index
    
    df=df.drop(index=index_to_be_removed)
    return df

In [797]:
def fit_KNN_missing_values(df):
    
    file = "../models/imputer_KNN.pickle"
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(df[["bmi"]])
    pickle.dump(imputer,open(file, "wb"))
    
    return df

In [798]:
def transform_imputer(df):
    file = "../models/imputer_KNN.pickle"
    imputer = pickle.load(open(file, "rb"))
    df["bmi"] = imputer.transform(df[["bmi"]])
    return df

### 2- Feature Engineering

In [799]:
def preprocess_gender(df):
    df = df.replace("Male", 0)
    df = df.replace("Female", 1)
    other_inde = df[df["gender"]=="Other"].index
    df = df.drop(other_inde)
    return df,other_inde

In [800]:
def preprocess_ever_married(df):
    df = df.replace("no",0)
    df = df.replace("yes",1)
    return df

In [801]:
def fit_scaler(df):
    
    cols = ["age","avg glucose level","bmi"]
    file = "../models/Scaler.pickle"
    scaler = MinMaxScaler()
    scaler.fit(df[cols])
    pickle.dump(scaler,open(file, "wb"))

In [802]:
def transfom_scaler(df):
    
    cols = ["age","avg glucose level","bmi"]
    file = "../models/Scaler.pickle"
    scaler = pickle.load(open(file,"rb"))
    df[cols] = scaler.transform(df[cols])
    return df

In [803]:
def preprocess_residence(df):
    df = df.replace("Urban", 0)
    df = df.replace("Rural", 1)
    return df

In [804]:
def preprocess_ever_married(df):
    df = df.replace("Yes", 0)
    df = df.replace("No", 1)
    return df

In [805]:
def fit_encoder(df):
    
    file = "../models/OneHot.pickle"
    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    enc.fit(df[["work type", "smoking status"]])
    
    pickle.dump(enc,open(file, "wb"))

In [806]:
def transform_encoder(df):
    
    file = "../models/OneHot.pickle"
    enc = pickle.load(open(file, "rb"))
    list_name = enc.get_feature_names_out(["work type", "smoking status"])
    df[list_name] = enc.transform(df[["work type", "smoking status"]])
    df = df.drop(columns=["work type", "smoking status"])
    return df

### 3- Removing Ids and building model

In [807]:
def store_id(df):
    df_ids = df["id"]
    df = df.drop(columns=["id"])
    return df_ids,df

In [808]:
def build_model(xtrain,ytrain):
    
    file="../models/classifier.pickle"
    kfold = StratifiedKFold(n_splits=10)
    classifier=LogisticRegression()
    classifier.fit(xtrain,ytrain)
    pickle.dump(classifier,open(file, "wb"))

### 4- Creating piepline train

In [809]:
def pipeline_train(df):
    xtrain, xtest, ytrain, ytest = format_df(df)
    fit_KNN_missing_values(xtrain)
    xtrain = transform_imputer(xtrain)
    xtrain, ind = preprocess_gender(xtrain)
    ytrain = ytrain.drop(index=ind)
    xtrain = preprocess_ever_married(xtrain)
    xtrain = preprocess_residence(xtrain)
    fit_encoder(xtrain)
    xtrain = transform_encoder(xtrain) 
    fit_scaler(xtrain)
    xtrain = transfom_scaler(xtrain)
    ids, xtrain = store_id(xtrain)
    return xtrain,ytrain,xtest,ytest


In [810]:
xtrain,ytrain,xtest,ytest = pipeline_train(df_train)
xtrain

Unnamed: 0,gender,age,hypertension,heart disease,ever married,residence type,avg glucose level,bmi,work type_Children,work type_Govt job,work type_Never worked,work type_Private,work type_Self employed,smoking status_Unknown,smoking status_formerly smoked,smoking status_never smoked,smoking status_smokes
3774,1,0.133301,0,0,1,0,0.101746,0.100802,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3316,1,1.000000,0,0,0,0,0.139583,0.266896,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
27,0,0.707031,0,0,0,1,0.634006,0.212143,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2525,1,0.243164,0,0,1,1,0.195633,0.195876,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3091,1,0.914551,0,0,0,1,0.241000,0.200458,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307,0,0.621582,0,0,0,0,0.778248,0.375716,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2064,0,0.890137,0,0,0,1,0.127112,0.208477,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2325,1,0.377441,0,0,0,1,0.024895,0.238259,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2303,1,0.914551,0,0,0,1,0.773072,0.264605,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### 5- Create pipeline test

In [811]:
def pipeline_test(df):
    df=format_inference_df(df)
    if df["bmi"].isnull().sum() > 0:
        df = transform_imputer(df)
    df, ind = preprocess_gender(df)
    df = preprocess_ever_married(df)
    df = preprocess_residence(df)
    df = transform_encoder(df) 
    fit_scaler(df)
    df = transfom_scaler(df)
    ids, df = store_id(df)
    return df

### 6- Putting everything together

In [812]:
def make_model_train(df):
    xtrain,ytrain,xtest,ytest = pipeline_train(df)
    build_model(xtrain, ytrain)
    return xtest, ytest

In [813]:
def evaluate_model(xtest,ytest):
    xtest = pipeline_test(xtest)
    file = "../models/classifier.pickle"
    classifier = pickle.load(open(file, "rb"))
    return classifier.score(xtest, ytest)

In [814]:
xtest,ytest = make_model_train(df_train)
evaluate_model(xtest,ytest)

0.9549496147006521