## Imports

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.linear_model import LogisticRegression
import pickle

# Turn off pandas' chained-assignment warning (the option defaults to "warn").
pd.set_option("mode.chained_assignment", None)

# Load the raw stroke dataset (path is relative to the working directory).
df = pd.read_csv("../data/healthcare-dataset-stroke-data.csv")

## Split and format dataframe

In [56]:
def split_test_train(df):
    """Split ``df`` into train/test features and target.

    The last column of ``df`` is used as the target and all preceding
    columns as features. 33% of the rows are held out for testing and the
    split uses a fixed ``random_state`` of 43.

    Returns:
        A ``(xtrain, xtest, ytrain, ytest)`` tuple.
    """
    features, target = df.iloc[:, :-1], df.iloc[:, -1]
    return tuple(
        train_test_split(features, target, test_size=0.33, random_state=43)
    )

In [57]:
def reformat_work_type(df):
    """Rewrite the raw ``work_type`` labels into their spaced/capitalised form.

    The ``work_type`` column of ``df`` is overwritten on the passed-in
    dataframe, which is also returned. Labels not listed below are left
    untouched.
    """
    pretty_labels = {
        "Self-employed": "Self employed",
        "Govt_job": "Govt job",
        "children": "Children",
        "Never_worked": "Never worked",
    }
    df["work_type"] = df["work_type"].replace(pretty_labels)
    return df

In [58]:
def rename_columns(df):
    """Return ``df`` with underscore-style column names replaced by spaced ones.

    ``Residence_type`` is additionally lower-cased to ``residence type``.
    Columns not listed in the mapping keep their names; the input dataframe
    is not modified.
    """
    spaced_names = {
        "heart_disease": "heart disease",
        "avg_glucose_level": "avg glucose level",
        "ever_married": "ever married",
        "Residence_type": "residence type",
        "work_type": "work type",
        "smoking_status": "smoking status",
    }
    return df.rename(columns=spaced_names)

In [59]:
def format_df(df):
    """Split ``df`` into train/test parts and format the training features.

    Only ``xtrain`` goes through ``reformat_work_type`` and
    ``rename_columns`` here; ``xtest`` is returned exactly as produced by
    the split.

    Args:
        df: Raw dataframe whose last column is the target.

    Returns:
        A ``(xtrain, xtest, ytrain, ytest)`` tuple.
    """
    # Split the dataframe that was passed in; the previous code ignored the
    # argument and read the module-level ``df_train`` instead.
    xtrain, xtest, ytrain, ytest = split_test_train(df)
    xtrain = reformat_work_type(xtrain)
    xtrain = rename_columns(xtrain)
    return xtrain, xtest, ytrain, ytest

In [60]:
def format_inference_df(df):
    """Apply the label and column-name formatting to an inference dataframe.

    Runs ``reformat_work_type`` first (while the column is still called
    ``work_type``) and then ``rename_columns`` on its result.
    """
    return rename_columns(reformat_work_type(df))

## Preprocessing 

### 1- Outliers and missing values

In [61]:
def remove_outliers(df, columns=("age", "avg_glucose_level", "bmi"), n_std=3):
    """Drop rows that lie more than ``n_std`` standard deviations from the mean.

    A row is removed when its value in *any* of ``columns`` is strictly above
    ``mean + n_std * std`` or strictly below ``mean - n_std * std`` of that
    column. Missing values never compare as outliers, so rows with NaN in a
    checked column are kept.

    Args:
        df: Dataframe containing every column named in ``columns``.
        columns: Columns to check. Defaults to the three numeric columns
            checked by the original implementation (raw, un-renamed names).
        n_std: Width of the accepted band in standard deviations.

    Returns:
        A new dataframe without the outlier rows; ``df`` is not modified and
        the surviving rows keep their index labels.
    """
    # A positional mask (rather than dropping by label) removes only the
    # offending rows even when index labels are repeated.
    is_outlier = np.zeros(len(df), dtype=bool)
    for column in columns:
        values = df[column]
        center = values.mean()
        spread = n_std * values.std()
        beyond_fence = (values > center + spread) | (values < center - spread)
        is_outlier |= beyond_fence.to_numpy()
    return df.loc[~is_outlier]

In [62]:
def fit_KNN_missing_values(df):
    """Fit a ``KNNImputer`` on the ``bmi`` column and pickle it to disk.

    The fitted imputer is written to ``../models/imputer_KNN.pickle``.

    Args:
        df: Dataframe containing a ``bmi`` column.

    Returns:
        ``df`` unchanged (no imputation is applied here).
    """
    file = "../models/imputer_KNN.pickle"
    imputer = KNNImputer(n_neighbors=5)
    # NOTE(review): the imputer only ever sees ``bmi``, so there are no other
    # features to measure neighbour distance on — confirm this is intended.
    imputer.fit(df[["bmi"]])
    # Context manager guarantees the file is flushed and closed.
    with open(file, "wb") as handle:
        pickle.dump(imputer, handle)

    return df

In [63]:
def transform_imputer(df):
    """Fill missing ``bmi`` values using the pickled imputer.

    Loads the imputer from ``../models/imputer_KNN.pickle`` and overwrites
    the ``bmi`` column of ``df`` with the transformed values.

    Args:
        df: Dataframe containing a ``bmi`` column.

    Returns:
        The same dataframe with ``bmi`` replaced.
    """
    file = "../models/imputer_KNN.pickle"
    # Context manager closes the handle even if unpickling fails.
    with open(file, "rb") as handle:
        imputer = pickle.load(handle)
    df["bmi"] = imputer.transform(df[["bmi"]])
    return df

### 2- Feature Engineering

In [64]:
def preprocess_gender(df):
    """Encode ``gender`` as 0/1 and drop rows whose gender is ``"Other"``.

    ``"Male"`` becomes 0 and ``"Female"`` becomes 1. The replacement is
    restricted to the ``gender`` column so identical strings in other columns
    are left alone. The input dataframe is not modified.

    Args:
        df: Dataframe containing a ``gender`` column.

    Returns:
        A ``(encoded_df, dropped_index)`` tuple, where ``dropped_index`` holds
        the index labels of the removed ``"Other"`` rows so callers can drop
        the matching targets.
    """
    encoded = df.assign(gender=df["gender"].replace({"Male": 0, "Female": 1}))
    other_inde = encoded[encoded["gender"] == "Other"].index
    return encoded.drop(other_inde), other_inde

In [65]:
def preprocess_ever_married(df):
    """Encode the yes/no marital-status labels as integers.

    Every ``"Yes"`` in ``df`` becomes 0 and every ``"No"`` becomes 1; a new
    dataframe is returned and the input is left untouched.

    NOTE: a second definition of ``preprocess_ever_married`` further down in
    this file rebinds the name, so that later definition is the one the
    pipelines actually call. This body previously matched lowercase
    ``"no"``/``"yes"`` with the opposite 0/1 assignment; it now mirrors the
    effective definition so behaviour does not depend on which one is bound.
    """
    return df.replace("Yes", 0).replace("No", 1)

In [66]:
def fit_scaler(df):
    """Fit a ``MinMaxScaler`` on the numeric columns and pickle it to disk.

    The scaler is fitted on ``age``, ``avg glucose level`` and ``bmi``
    (renamed column names) and written to ``../models/Scaler.pickle``.

    Args:
        df: Dataframe containing the three columns above.
    """
    cols = ["age","avg glucose level","bmi"]
    file = "../models/Scaler.pickle"
    scaler = MinMaxScaler()
    scaler.fit(df[cols])
    # Context manager guarantees the file is flushed and closed.
    with open(file, "wb") as handle:
        pickle.dump(scaler, handle)

In [67]:
def transfom_scaler(df):
    """Scale the numeric columns with the pickled scaler.

    Loads the scaler from ``../models/Scaler.pickle`` and overwrites
    ``age``, ``avg glucose level`` and ``bmi`` on ``df`` with the scaled
    values.

    Args:
        df: Dataframe containing the three columns above.

    Returns:
        The same dataframe with the three columns replaced.
    """
    cols = ["age","avg glucose level","bmi"]
    file = "../models/Scaler.pickle"
    # Context manager closes the handle even if unpickling fails.
    with open(file, "rb") as handle:
        scaler = pickle.load(handle)
    df[cols] = scaler.transform(df[cols])
    return df

In [68]:
def preprocess_residence(df):
    """Encode the residence labels as integers.

    Every ``"Urban"`` in ``df`` becomes 0 and every ``"Rural"`` becomes 1.
    The replacement is applied to the whole dataframe and a new dataframe is
    returned.
    """
    return df.replace("Urban", 0).replace("Rural", 1)

In [69]:
def preprocess_ever_married(df):
    """Encode the marital-status labels as integers.

    Every ``"Yes"`` in ``df`` becomes 0 and every ``"No"`` becomes 1. The
    replacement is applied to the whole dataframe and a new dataframe is
    returned.
    """
    return df.replace("Yes", 0).replace("No", 1)

In [70]:
def fit_encoder(df):
    """Fit a one-hot encoder on the categorical columns and pickle it to disk.

    The encoder is fitted on ``work type`` and ``smoking status`` (renamed
    column names), ignores categories unseen at fit time, produces dense
    output, and is written to ``../models/OneHot.pickle``.

    Args:
        df: Dataframe containing the two columns above.
    """
    file = "../models/OneHot.pickle"
    # scikit-learn renamed ``sparse`` to ``sparse_output`` (1.2) and later
    # removed the old keyword; try the new name first and fall back so both
    # old and new releases work.
    try:
        enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
    enc.fit(df[["work type", "smoking status"]])

    # Context manager guarantees the file is flushed and closed.
    with open(file, "wb") as handle:
        pickle.dump(enc, handle)

In [71]:
def transform_encoder(df):
    """One-hot encode ``work type`` and ``smoking status`` with the pickled encoder.

    Loads the encoder from ``../models/OneHot.pickle``, adds one column per
    encoded feature (named by ``get_feature_names_out``) to ``df``, and then
    drops the two original categorical columns.

    Args:
        df: Dataframe containing ``work type`` and ``smoking status``.

    Returns:
        A dataframe with the encoded columns and without the two originals.
    """
    file = "../models/OneHot.pickle"
    # Context manager closes the handle even if unpickling fails.
    with open(file, "rb") as handle:
        enc = pickle.load(handle)
    list_name = enc.get_feature_names_out(["work type", "smoking status"])
    df[list_name] = enc.transform(df[["work type", "smoking status"]])
    df = df.drop(columns=["work type", "smoking status"])
    return df

In [72]:
def fit_scaler_encoder(df):
    """Fit and persist the imputer, the one-hot encoder and the scaler on ``df``.

    The three fitters are run in that order; each one pickles its fitted
    object. Nothing is returned.
    """
    for fit_step in (fit_KNN_missing_values, fit_encoder, fit_scaler):
        fit_step(df)

def transform_scaler_encoder(df):
    """Apply the persisted imputer, one-hot encoder and scaler, in that order.

    Returns the dataframe produced by the final (scaling) step.
    """
    imputed = transform_imputer(df)
    encoded = transform_encoder(imputed)
    return transfom_scaler(encoded)

### 3- Removing Ids and building model

In [73]:
def store_id(df):
    """Separate the ``id`` column from the rest of the dataframe.

    Returns:
        A ``(ids, remaining)`` tuple: the ``id`` column as a Series and a new
        dataframe without that column.
    """
    return df["id"], df.drop(columns="id")

In [74]:
def build_model(xtrain,ytrain):
    """Fit a logistic-regression classifier and pickle it to disk.

    The classifier uses scikit-learn's default settings and is written to
    ``../models/classifier.pickle``. Nothing is returned.

    Args:
        xtrain: Preprocessed training features.
        ytrain: Training targets aligned with ``xtrain``.
    """
    file="../models/classifier.pickle"
    # The original also built a StratifiedKFold splitter that was never used;
    # it has been removed.
    classifier = LogisticRegression()
    classifier.fit(xtrain,ytrain)
    # Context manager guarantees the file is flushed and closed.
    with open(file, "wb") as handle:
        pickle.dump(classifier, handle)

### 4- Creating pipeline train

In [75]:
def pipeline_train(df):
    """Run the full training-side preprocessing on a raw dataframe.

    Steps: split and format, fit and apply the imputer/encoder/scaler on the
    training features, encode gender (dropping the matching targets for any
    removed rows), encode marital status and residence, and strip the ``id``
    column. The test split is returned unprocessed.

    Returns:
        A ``(xtrain, ytrain, xtest, ytest)`` tuple — note the order.
    """
    xtrain, xtest, ytrain, ytest = format_df(df)

    fit_scaler_encoder(xtrain)
    xtrain = transform_scaler_encoder(xtrain)

    # Keep the targets aligned with the rows preprocess_gender removed.
    xtrain, dropped_rows = preprocess_gender(xtrain)
    ytrain = ytrain.drop(index=dropped_rows)

    xtrain = preprocess_residence(preprocess_ever_married(xtrain))
    _, xtrain = store_id(xtrain)
    return xtrain, ytrain, xtest, ytest

### 5- Create pipeline test

In [76]:
def pipeline_test(df):
    """Run the inference-side preprocessing on a raw feature dataframe.

    Steps: format labels and column names, impute ``bmi`` when any value is
    missing, encode gender / marital status / residence, apply the persisted
    imputer, encoder and scaler, and strip the ``id`` column.

    Returns:
        The preprocessed feature dataframe.
    """
    df = format_inference_df(df)

    has_missing_bmi = df["bmi"].isnull().any()
    if has_missing_bmi:
        df = transform_imputer(df)

    df, _ = preprocess_gender(df)
    df = preprocess_residence(preprocess_ever_married(df))
    df = transform_scaler_encoder(df)

    _, features = store_id(df)
    return features

### 6- Putting everything together

In [77]:
def make_model_train(df):
    """Preprocess ``df``, train and persist the classifier.

    Returns:
        The ``(xtest, ytest)`` hold-out split produced by ``pipeline_train``.
    """
    train_x, train_y, held_out_x, held_out_y = pipeline_train(df)
    build_model(train_x, train_y)
    return held_out_x, held_out_y

In [78]:
def evaluate_model(xtest,ytest):
    """Score the persisted classifier on a raw hold-out split.

    ``xtest`` is run through ``pipeline_test`` and the classifier is loaded
    from ``../models/classifier.pickle``.

    Args:
        xtest: Raw (unprocessed) test features.
        ytest: Targets for ``xtest``.

    Returns:
        The value of ``classifier.score`` on the preprocessed features.
    """
    xtest = pipeline_test(xtest)
    # pipeline_test can drop rows (gender == "Other") while keeping the
    # remaining index labels, so select the matching targets; otherwise the
    # feature and target lengths disagree. Assumes unique index labels.
    if hasattr(ytest, "loc"):
        ytest = ytest.loc[xtest.index]
    file = "../models/classifier.pickle"
    # Context manager closes the handle even if unpickling fails.
    with open(file, "rb") as handle:
        classifier = pickle.load(handle)
    return classifier.score(xtest, ytest)

In [79]:
# Train on the dataframe loaded at the top of the file. The original passed
# ``df_train``, a name that is never defined in this file.
# NOTE(review): if an outlier-filtered frame was intended, pass that instead.
xtest, ytest = make_model_train(df)
# Optional: evaluate_model(xtest, ytest) scores the held-out split.

In [80]:
def make_prediction(X):
    """Predict labels for raw feature rows with the persisted classifier.

    ``X`` is run through ``pipeline_test`` and the classifier is loaded from
    ``../models/classifier.pickle``.

    Args:
        X: Raw (unprocessed) feature dataframe.

    Returns:
        The output of ``classifier.predict`` on the preprocessed features.
    """
    X = pipeline_test(X)
    file = "../models/classifier.pickle"
    # Context manager closes the handle even if unpickling fails.
    with open(file, "rb") as handle:
        classifier = pickle.load(handle)
    return classifier.predict(X)

In [81]:
# Smoke test: predict on the raw hold-out features returned by make_model_train.
make_prediction(xtest)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)