In [None]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.discrete.discrete_model import Logit
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the stroke dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded frame.
df.describe()

### Which is better: linear or logistic regression for predicting stroke?

# Linear Regression

In [None]:
# Keep only the numeric-dtype columns; every model below is fitted on frames derived from this one.
df_number = df.select_dtypes(include=["number"])

In [None]:
# Share of rows that have no missing value in any numeric column.
len(df_number.dropna()) / len(df_number)

In [None]:
# Complete-case subset: drop every row that has a missing value in any numeric column.
df_number_dropped = df_number.dropna(axis=0)

In [None]:
# Ordinary least squares of the stroke column on the raw (unscaled) numeric
# predictors plus an intercept column.
ols_target = df_number_dropped["stroke"]
ols_design = sm.add_constant(df_number_dropped.drop(columns=["stroke"]))
model = sm.OLS(ols_target, ols_design)
results = model.fit()
results.params

In [None]:
# Standard errors of the coefficients of the fit currently held in `results`.
results.bse

In [None]:
# t-statistics of the coefficients of the fit currently held in `results`.
results.tvalues

In [None]:
# Standardise the predictors with StandardScaler, then rebuild a DataFrame that keeps
# the original column names and row index, and prepend an intercept column.
predictors = df_number_dropped.drop(columns=["stroke"])
scaler = StandardScaler()
Scaled = scaler.fit_transform(predictors)
df_Scaled = sm.add_constant(
    pd.DataFrame(Scaled, columns=predictors.columns, index=predictors.index)
)

In [None]:
# Refit the OLS model, this time on the standardised design matrix.
model = sm.OLS(endog=df_number_dropped["stroke"], exog=df_Scaled)
results_scaled_linear = model.fit()
results_scaled_linear.params

In [None]:
# t-statistics of the OLS coefficients fitted on the standardised predictors.
results_scaled_linear.tvalues

In [None]:
# Standard errors of the OLS coefficients fitted on the standardised predictors.
results_scaled_linear.bse

In [None]:
# Pairwise correlations between all numeric columns.
# Note: this uses df_number, i.e. rows with missing values have NOT been dropped here,
# unlike the frame the models are fitted on.
df_number.corr()

# Logistic Regression

In [None]:
# Logistic regression of the stroke column on the standardised design matrix.
# Note: this rebinds `model` and `results`, which earlier cells used for the OLS fits.
model = sm.Logit(endog=df_number_dropped["stroke"], exog=df_Scaled)
results = model.fit()
results.params

In [None]:
# Standard errors of the coefficients in `results` (the Logit fit when cells are run in order).
results.bse

In [None]:
# Put the observed outcome next to the standardised design matrix; both share the same row index.
# The Series selected from the "stroke" column already carries that name, so the former
# `.rename(columns={0: "stroke"})` never matched a column and has been removed.
df_Scaled_combined = pd.concat([df_number_dropped["stroke"], df_Scaled], axis=1)
# Predictions of the model held in `results` (the Logit fit when cells are run in order).
df_Scaled_combined["prediction"] = results.predict(df_Scaled)

In [None]:
# Distribution of the logistic-model predictions, split by observed outcome.
# `data` is passed by keyword so the call does not depend on the position of that
# parameter in seaborn's signature; an explicit Axes and a title make the figure
# self-describing.
fig, ax = plt.subplots()
sns.violinplot(data=df_Scaled_combined, x="stroke", y="prediction", ax=ax)
ax.set_title("Logistic regression: prediction by observed stroke");

In [None]:
# Classification cut-off: midpoint between the median prediction among observed
# stroke == 0 rows and the median prediction among observed stroke == 1 rows.
observed = df_Scaled_combined["stroke"]
logit_pred = df_Scaled_combined["prediction"]
median_no_stroke = logit_pred[observed == 0].quantile(q=0.5)
median_stroke = logit_pred[observed == 1].quantile(q=0.5)
logit_cutoff = 0.5 * (median_no_stroke + median_stroke)
prediction_binary = logit_pred > logit_cutoff
# Observed outcome versus thresholded prediction.
confusion_matrix(observed, prediction_binary)

In [None]:
# Class balance: number of rows per observed stroke value.
df_Scaled_combined.stroke.value_counts()

In [None]:
# Root mean squared error between the observed stroke values and the `prediction` column.
root_mean_squared_error(df_Scaled_combined.stroke, df_Scaled_combined.prediction)

In [None]:
# Correlation between the observed outcome and the `prediction` column
# (the off-diagonal entry of the 2x2 correlation matrix).
df_Scaled_combined[["stroke", "prediction"]].corr().iloc[0, 1]

In [None]:
# Predictions of the OLS model fitted on the standardised predictors, stored alongside
# the existing columns so both models can be compared on the same rows.
df_Scaled_combined["linear_prediction"] = results_scaled_linear.predict(df_Scaled)

In [None]:
# Distribution of the linear-model predictions, split by observed outcome.
# Uses an explicit Axes instead of pyplot's implicit "current axes" state, passes
# `data` by keyword, and applies the y-limits to the axes that was actually drawn on.
fig, ax = plt.subplots()
sns.violinplot(data=df_Scaled_combined, x="stroke", y="linear_prediction", ax=ax)
ax.set_ylim(-0.1, 0.6)
ax.set_title("Linear regression: prediction by observed stroke");

In [None]:
# Same cut-off rule as for the logistic model: midpoint between the median linear
# prediction of the observed stroke == 0 rows and that of the stroke == 1 rows.
observed = df_Scaled_combined["stroke"]
linear_pred = df_Scaled_combined["linear_prediction"]
linear_median_no_stroke = linear_pred[observed == 0].quantile(q=0.5)
linear_median_stroke = linear_pred[observed == 1].quantile(q=0.5)
linear_cutoff = 0.5 * (linear_median_no_stroke + linear_median_stroke)
linear_prediction_binary = linear_pred > linear_cutoff
# Observed outcome versus thresholded prediction.
confusion_matrix(observed, linear_prediction_binary)

In [None]:
# Root mean squared error between the observed stroke values and the `linear_prediction` column.
root_mean_squared_error(df_Scaled_combined.stroke, df_Scaled_combined.linear_prediction)

In [None]:
# Correlation between the observed outcome and the `linear_prediction` column
# (the off-diagonal entry of the 2x2 correlation matrix).
df_Scaled_combined[["stroke", "linear_prediction"]].corr().iloc[0, 1]

# Decision Trees

In [None]:
# Decision tree with default (unconstrained) growth settings on the raw numeric predictors.
X = df_number_dropped.drop(columns="stroke")
# random_state fixes the tree's internal randomness so that reruns produce the same tree
# and the same cross-validation scores (same seed as the grid search later in this notebook).
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X, df_number_dropped.stroke)
# Confusion matrix on the TRAINING rows (predict is called on the same X used for fit),
# so it says nothing about out-of-sample performance.
confusion_matrix(df_number_dropped.stroke, clf.predict(X))
# Everything is classified perfectly, because we follow the tree to the end

In [None]:
# Names of the predictor columns passed to the tree.
X.columns

In [None]:
# Feature importances of the current `clf`, shown as a one-row table labelled by predictor.
pd.DataFrame([clf.feature_importances_], columns=X.columns)

In [None]:
# 5-fold cross-validation of the current `clf` using the estimator's default scorer.
scores = cross_val_score(estimator=clf, X=X, y=df_number_dropped["stroke"], cv=5)
print("Cross-validation scores:", scores)
print("Mean CV accuracy:", scores.mean())

In [None]:
# f1 score definition
# precision = TP / (TP + FP) and recall = TP / (TP + FN)
# f1 = 2 * precision * recall / (precision + recall)

In [None]:
# 5-fold cross-validation of the current `clf`, scored with F1 instead of the default scorer.
scores = cross_val_score(clf, X, df_number_dropped.stroke, scoring="f1", cv=5)
print("Cross-validation scores:", scores)
# The scorer is F1, so the mean is labelled as F1 (it was previously mislabelled "accuracy").
print("Mean CV F1:", scores.mean())

In [None]:
# Tree that only attempts to split nodes containing at least 10 samples.
# random_state fixes the tree's internal randomness so reruns give the same result.
clf = tree.DecisionTreeClassifier(min_samples_split=10, random_state=42)
clf = clf.fit(X, df_number_dropped.stroke)
# Confusion matrix on the training rows.
confusion_matrix(df_number_dropped.stroke, clf.predict(X))
# Most things get classified as 0, because with splitting stopped below 10 samples,
# the last split is unlikely to identify a 1 node

In [None]:
# Feature importances of the current `clf`, shown as a one-row table labelled by predictor.
pd.DataFrame([clf.feature_importances_], columns=X.columns)

In [None]:
# 5-fold cross-validation of the current `clf` using the estimator's default scorer.
scores = cross_val_score(estimator=clf, X=X, y=df_number_dropped["stroke"], cv=5)
print("Cross-validation scores:", scores)
print("Mean CV accuracy:", scores.mean())

In [None]:
# 5-fold cross-validation of the current `clf`, scored with F1 instead of the default scorer.
scores = cross_val_score(clf, X, df_number_dropped.stroke, scoring="f1", cv=5)
print("Cross-validation scores:", scores)
# The scorer is F1, so the mean is labelled as F1 (it was previously mislabelled "accuracy").
print("Mean CV F1:", scores.mean())

In [None]:
# Tree in which every leaf must contain at least 50 samples.
# random_state fixes the tree's internal randomness so reruns give the same result.
clf = tree.DecisionTreeClassifier(min_samples_leaf=50, random_state=42)
clf = clf.fit(X, df_number_dropped.stroke)
# Confusion matrix on the training rows.
confusion_matrix(df_number_dropped.stroke, clf.predict(X))
# Everything gets classified as 0, because no node of 50 samples is predominantly a 1

In [None]:
# Feature importances of the current `clf`, shown as a one-row table labelled by predictor.
pd.DataFrame([clf.feature_importances_], columns=X.columns)

In [None]:
# 5-fold cross-validation of the current `clf` using the estimator's default scorer.
scores = cross_val_score(estimator=clf, X=X, y=df_number_dropped["stroke"], cv=5)
print("Cross-validation scores:", scores)
print("Mean CV accuracy:", scores.mean())

In [None]:
# Tree that considers only 2 candidate features at each split.
# The candidate features are drawn at random, so without a fixed random_state the fitted
# tree, its feature importances and its CV scores change on every run.
clf = tree.DecisionTreeClassifier(max_features=2, random_state=42)
clf = clf.fit(X, df_number_dropped.stroke)
# Confusion matrix on the training rows.
confusion_matrix(df_number_dropped.stroke, clf.predict(X))
# Everything gets classified perfectly, because we follow the tree to the end

In [None]:
# Feature importances of the current `clf`, shown as a one-row table labelled by predictor.
pd.DataFrame([clf.feature_importances_], columns=X.columns)

In [None]:
# 5-fold cross-validation of the current `clf` using the estimator's default scorer.
scores = cross_val_score(estimator=clf, X=X, y=df_number_dropped["stroke"], cv=5)
print("Cross-validation scores:", scores)
print("Mean CV accuracy:", scores.mean())

In [None]:
# Hyper-parameter grid for the decision tree.
# An integer min_samples_split must be at least 2; the previous value of 1 is rejected by
# scikit-learn, so every fit using it failed and that part of the grid was never evaluated.
# NOTE(review): max_features=6 assumes X has at least 6 predictor columns — confirm.
param_grid = {
    'min_samples_split': [2, 3, 10, 30],
    'max_features': [1, 2, 4, 6],
}

# Set up the grid search: 5-fold CV, candidates ranked by F1, all CPU cores used.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

# Fit the model
grid_search.fit(X, df_number_dropped.stroke)

# Best parameters and score (the score is the mean cross-validated F1 of the best candidate)
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)