In [None]:
import joblib
import json
import numpy as np
import os
import pandas as pd
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
def encode_categorical_features(dt: pd.DataFrame) -> pd.DataFrame:
    """Use sklearns OneHotEncoder to encode categorical features

    The "id" column is discarded (if present), every object-dtype column is
    one-hot encoded and the numerical columns are passed through unchanged.
    The input dataframe is NOT modified, so the function can safely be re-run
    on the same frame.

    Args:
        dt (pd.DataFrame): cleaned dataframe

    Returns:
        df_merged: dataframe with a fresh 0..n-1 index holding the encoded
            categorical features (named "<column>_<category>") followed by the
            unchanged numerical ones. All column names are strings.
    """
    # work on a copy without "id"; resetting the index here lets the encoded and
    # numeric parts line up row by row without any merge/drop-index bookkeeping
    features = dt.drop(columns="id", errors="ignore").reset_index(drop=True)
    text_columns = features.select_dtypes(include=object).columns.tolist() # lists categorical features
    numeric_df = features.select_dtypes(include=np.number) # select dataframe of numerical columns

    if text_columns:
        # a single encoder handles all categorical columns at once
        enc = OneHotEncoder(sparse_output=False)
        encoded_values = enc.fit_transform(features[text_columns])
        # get_feature_names_out prefixes each category with its source column, so two
        # columns that share a category value (e.g. "Yes"/"No") cannot collide
        encoded_df = pd.DataFrame(
            encoded_values,
            columns=enc.get_feature_names_out(),
            index=features.index,
        )
        df_merged = pd.concat([encoded_df, numeric_df], axis=1) # encoded categorical first, then numeric
    else:
        # nothing to encode: return only the numeric columns
        df_merged = numeric_df.copy()

    df_merged.columns = df_merged.columns.astype(str) # cast column names to string, SVM doesn't like them otherwise

    return df_merged # return merged dataframe now ready for training
 
 





In [None]:
def split_data(features, labels, test_size=0.2, random_state=42):
    """Generate a stratified train/test split.

    Args:
        features: feature matrix
        labels: target values; also used as the stratify key so each class is
            represented in the same proportion in the train and test set
        test_size: size of the test set, passed to train_test_split (default 0.2)
        random_state: seed passed to train_test_split so the split is
            reproducible (default 42)

    Returns:
        X_train, X_test, y_train, y_test
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return X_train, X_test, y_train, y_test

In [None]:
def train_svm(x_train, y_train):
    """Fit a support vector classifier with an rbf kernel.

    Args:
        x_train: training features
        y_train: training labels

    Returns:
        The fitted SVC instance (kernel="rbf", all other settings left at
        their sklearn defaults)
    """
    # SVC.fit returns the estimator itself, so build and fit in one expression
    return SVC(kernel="rbf").fit(x_train, y_train)

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: object exposing predict(X)
        X_test: test features handed to model.predict
        y_test: true labels the predictions are compared against

    Returns:
        model_metrics: dictionary with the keys "Accuracy", "Precision" and
            "Recall" (in that order) mapped to the corresponding sklearn score
    """
    predictions = model.predict(X_test)
    # (label, scoring function) pairs; order defines the key order of the result
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

In [None]:
def save_model(folder, model, metrics):
    """Saves model and associated metrics in the directory specified by folder.

    The directory is created when missing (including missing parents). The
    model is written to "model.joblib" via joblib and the metrics to
    "metrics.json". Files are addressed by explicit path, so the process
    working directory is never changed and no module-level variable is needed.

    Args:
        folder: path of the output directory
        model: object to serialise with joblib.dump
        metrics: JSON-serialisable object with the evaluation results
            (inside this function the name shadows the imported sklearn
            metrics module)
    """
    os.makedirs(folder, exist_ok=True) # no error if the folder already exists
    joblib.dump(model, os.path.join(folder, "model.joblib"))
    with open(os.path.join(folder, "metrics.json"), "w") as outfile:
        json.dump(metrics, outfile)

In [None]:
if __name__ == "__main__":
    root = os.getcwd() # starting working directory; module-level name kept for code that refers to it
    # NOTE: unpickling can execute arbitrary code - only load cleaned_data.pkl from a trusted source
    df = pd.read_pickle("cleaned_data.pkl") # import dataset
    df_merged = encode_categorical_features(df) # encode categorical features
    stroke = df_merged["stroke"] # separate label from features
    df_merged = df_merged.drop("stroke", axis = 1)
    X_train, X_test, y_train, y_test = split_data(df_merged, stroke) # split dataset and stratify on stroke column
    oversample = SMOTE(random_state=42) # init SMOTE instance; seeded so the synthetic samples are reproducible like the split
    X_train, y_train = oversample.fit_resample(X_train, y_train) # resample the training data (test set is left untouched)
    # verify that SMOTE has correctly oversampled the minority class: count the resampled labels per class
    pd.Series(y_train, name="stroke").value_counts().sort_index().plot(kind='bar', title='Verify resampling')
    model = train_svm(X_train, y_train) # train SVM model
    eval_dict = evaluate_model(model, X_test, y_test) # accuracy / precision / recall on the held-out test set
    save_model("smote_oversampling", model, eval_dict) # persist model and metrics
    