In [800]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [801]:
# Load the dataset from a pickle file located in the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source. Presumably this is the output of an
# earlier cleaning step -- confirm against the notebook that produced it.
df = pd.read_pickle("cleaned_data.pkl") # raw input frame used by the cells below

In [802]:
def encode_categorical_features(dt: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the object-dtype columns of ``dt`` and join them with its numeric columns.

    The input frame is NOT modified, so the cell calling this function can be
    re-run safely.

    Parameters
    ----------
    dt : pd.DataFrame
        Input data. An ``id`` column, if present, is discarded. Columns of
        object dtype are treated as categorical; columns of numeric dtype are
        passed through unchanged. Columns of any other dtype are left out.

    Returns
    -------
    pd.DataFrame
        A frame with a fresh 0..n-1 index. The one-hot columns come first,
        named ``<feature>_<category>`` so that equal category values in
        different features cannot collide, followed by the numeric columns.
        All column labels are strings.
    """
    # Non-mutating drop: the caller's frame keeps its "id" column, and a frame
    # that has no "id" column is accepted as-is.
    features = dt.drop(columns="id", errors="ignore")

    text_columns = features.select_dtypes(include=object).columns.tolist()
    # Drop the original index so both parts line up positionally.
    numeric_df = features.select_dtypes(include=np.number).reset_index(drop=True)

    if text_columns:
        # A single encoder handles every categorical column in one pass.
        enc = OneHotEncoder(sparse_output=False)
        encoded_df = pd.DataFrame(
            enc.fit_transform(features[text_columns]),
            columns=enc.get_feature_names_out(text_columns),
        )
    else:
        # Nothing to encode: keep the row count so the numeric data survives.
        encoded_df = pd.DataFrame(index=numeric_df.index)

    df_merged = pd.concat([encoded_df, numeric_df], axis=1)
    # scikit-learn estimators require string feature names.
    df_merged.columns = df_merged.columns.astype(str)
    return df_merged
 
 





In [803]:
def split_data(features, labels, test_size=0.2, random_state=42, stratify=None):
    """Split features and labels into train and test subsets.

    Parameters
    ----------
    features, labels
        Inputs forwarded to ``train_test_split`` in this order.
    test_size : float, default 0.2
        Forwarded as ``test_size``.
    random_state : default 42
        Forwarded as ``random_state``; a fixed value keeps the split
        reproducible between runs.
    stratify : default None
        Forwarded as ``stratify``. Pass the labels here to keep the class
        proportions equal in both subsets.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

In [804]:
def train_svm(x_train, y_train):
    """Fit a support vector classifier with an RBF kernel and return it.

    Parameters
    ----------
    x_train
        Training features passed to ``SVC.fit``.
    y_train
        Training labels passed to ``SVC.fit``.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    # ``fit`` returns the estimator itself, so construction and training chain.
    return SVC(kernel="rbf").fit(x_train, y_train)

In [805]:
if __name__ == "__main__":
    # Build the model-ready feature table from the loaded dataset.
    df_merged = encode_categorical_features(df)

    # Separate the target column from the predictors.
    stroke = df_merged["stroke"]
    df_merged = df_merged.drop(columns="stroke")

    # Hold out a test set, then fit the classifier on the training part only.
    X_train, X_test, y_train, y_test = split_data(df_merged, stroke)
    model = train_svm(X_train, y_train)