# Imports

In [13]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

from model_comparison import create_memory_cache, train_model, train_with_pca, evaluate
import sklearn
# Raise scikit-learn's budget for temporary arrays in chunked computations.
# `working_memory` is expressed in MiB, so this is roughly 12 GB.
sklearn.set_config(working_memory=12_000)

# Configuration

In [14]:
# Cross-validation splitter shared by every model below: three stratified
# folds, shuffled with a fixed seed so the fold assignment is repeatable.
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

# Cache object built by the project helper; it is handed to each Pipeline via
# `memory=` and to train_with_pca further down.
memory = create_memory_cache()

# Load Data

In [15]:
# Feature matrices, read from CSV files in the working directory
# (file names suggest they were scaled upstream -- TODO confirm).
x_train = pd.read_csv("x_train_scaled.csv")
x_test = pd.read_csv("x_test_scaled.csv")

# Label vectors. squeeze("columns") turns the single-column frame into a
# Series without ever collapsing the row axis; a bare .squeeze() would return
# a scalar if a file happened to contain exactly one row.
y_train = pd.read_csv("y_train.csv").squeeze("columns")
y_test = pd.read_csv("y_test.csv").squeeze("columns")

# Fail fast with a readable message instead of a shape error deep inside a
# model fit if features and labels are out of sync.
assert len(x_train) == len(y_train), "x_train / y_train row counts differ"
assert len(x_test) == len(y_test), "x_test / y_test row counts differ"

In [16]:
# Show how many training rows fall in each class.
y_train.value_counts()

label
1    217
0    217
Name: count, dtype: int64

In [17]:
# Show how many test rows fall in each class.
# NOTE(review): in the recorded output this series is named `stroke` while the
# training labels are named `label`, and the test classes are heavily
# imbalanced -- confirm both are intended before comparing metrics.
y_test.value_counts()

stroke
0    424
1     26
Name: count, dtype: int64

# MODELS

## SVC

In [ ]:
# Support-vector classifier wrapped in a one-step Pipeline, so the search grid
# addresses its parameters through the `svc__` prefix.
# probability=True enables predict_proba; those estimates come from an internal
# cross-validated calibration that shuffles the data, so random_state is pinned
# (42, like the other seeded components in this notebook) to keep them
# repeatable between runs.
svc_pipeline = Pipeline(
    [('svc', SVC(probability=True, random_state=42))],
    memory=memory,
)

# Candidate hyper-parameters. gamma is a kernel coefficient and has no effect
# with kernel='linear', so linear candidates that differ only in gamma are
# equivalent.
svc_params = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['rbf', 'linear'],
    'svc__gamma': ['scale', 'auto']
}

# train_model is a project helper; it returns a pair that is kept as the
# search object (its best_estimator_ is reused in the stacking cell) and the
# test-set predictions -- presumably from a grid search over svc_params,
# TODO confirm in model_comparison.
svc_grid, svc_preds = train_model(
    "SVC", svc_pipeline, svc_params, x_train, y_train, x_test, y_test, cv
)

## LDA

In [ ]:
# Linear discriminant analysis using the least-squares solver with automatic
# shrinkage, wrapped in a one-step Pipeline like the other models.
lda_pipeline = Pipeline(
    steps=[('lda', LDA(shrinkage='auto', solver='lsqr'))],
    memory=memory,
)

# No hyper-parameters are tuned for LDA, hence the empty parameter grid.
lda_grid, lda_preds = train_model(
    "LDA",
    lda_pipeline,
    {},
    x_train,
    y_train,
    x_test,
    y_test,
    cv,
)

## Random Forest

In [ ]:
# Hyper-parameter candidates for the forest; keys carry the `rf__` prefix of
# the pipeline step they target.
rf_params = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
}

# Seeded random forest in a one-step Pipeline that shares the notebook cache.
rf_pipeline = Pipeline(
    steps=[('rf', RandomForestClassifier(random_state=42))],
    memory=memory,
)

# Project helper call; the returned pair is kept as the search object and the
# test-set predictions.
rf_grid, rf_preds = train_model(
    "Random Forest",
    rf_pipeline,
    rf_params,
    x_train,
    y_train,
    x_test,
    y_test,
    cv,
)

## XGBoost

In [ ]:
# Gradient-boosted trees in a one-step Pipeline.
# The target is binary (classes 0 and 1 in the label counts above), so the
# evaluation metric is the binary 'logloss'; 'mlogloss' is the multiclass
# variant and does not match this objective.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases --
# confirm the installed version before removing it.
xgb_pipeline = Pipeline(
    [(
        'xgb',
        XGBClassifier(
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
        ),
    )],
    memory=memory,
)

# Hyper-parameter candidates, addressed through the `xgb__` step prefix.
xgb_params = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 6, 9],
    'xgb__learning_rate': [0.01, 0.1],
    'xgb__subsample': [0.8, 1],
    'xgb__colsample_bytree': [0.8, 1]
}

# Project helper call; the returned pair is kept as the search object (reused
# in the stacking cell) and the test-set predictions.
xgb_grid, xgb_preds = train_model(
    "XGBoost", xgb_pipeline, xgb_params, x_train, y_train, x_test, y_test, cv
)

## Stacking

In [ ]:
# Stack the best SVC, random-forest and XGBoost pipelines found above; a
# logistic regression is the final estimator and the shared splitter `cv`
# drives the internal cross-validation.
stacking = StackingClassifier(
    estimators=[
        (step_name, search.best_estimator_)
        for step_name, search in (
            ('svc', svc_grid),
            ('rf', rf_grid),
            ('xgb', xgb_grid),
        )
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=cv,
    n_jobs=-1,
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
stacking_preds = stacking.fit(x_train, y_train).predict(x_test)

# Report test-set performance through the project helper.
evaluate("Stacking", y_test, stacking_preds)

# PCA ANALYSIS

In [ ]:
# Re-run the three tuned model families through the project helper
# train_with_pca (presumably it adds a PCA step ahead of the estimator --
# TODO confirm in model_comparison), reusing the grids, splitter and cache
# defined earlier.

# SVC: random_state pins the shuffling used for the probability estimates so
# results are repeatable between runs.
train_with_pca(
    'svc',
    SVC(probability=True, random_state=42),
    svc_params, x_train, y_train, x_test, y_test, cv, memory,
)

# Random forest, seeded as in the non-PCA run.
train_with_pca(
    'rf',
    RandomForestClassifier(random_state=42),
    rf_params, x_train, y_train, x_test, y_test, cv, memory,
)

# XGBoost: the target is binary, so use the binary 'logloss' metric rather
# than the multiclass 'mlogloss'.
train_with_pca(
    'xgb',
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    xgb_params, x_train, y_train, x_test, y_test, cv, memory,
)