## Predict the demand for bike share using known tools (cont.)

### Recreating data from last class

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# sample data
# Download the sample of the bike-sharing data from the course repository.
bike_data = pd.read_csv("https://raw.githubusercontent.com/divenyijanos/ceu-ml/2023/data/bike_sharing_demand/bike_sample.csv")
# Predictors: every numeric column except "count" (the label below) and
# "registered"/"casual" (presumably the two components of count - TODO confirm).
features = bike_data.drop(columns=["count", "registered", "casual"]).select_dtypes(include=np.number)
label = bike_data["count"]

# Seeded random state so that the 80/20 split can be reproduced.
prng = np.random.RandomState(20240306)
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=prng)

# feature engineered data
def extract_dt_features(df_with_datetime):
    """Add calendar features derived from the ``datetime`` column, in place.

    The ``datetime`` column is overwritten with its UTC-aware parsed version,
    then the columns ``year``, ``day``, ``month``, ``hour`` and ``dayofweek``
    are added (in that order). Nothing is returned: the caller's DataFrame
    itself is modified.
    """
    timestamps = pd.to_datetime(df_with_datetime['datetime'], utc=True)
    df_with_datetime['datetime'] = timestamps
    # Each name is both the new column and the `.dt` accessor attribute.
    for part in ('year', 'day', 'month', 'hour', 'dayofweek'):
        df_with_datetime[part] = getattr(timestamps.dt, part)


# Adds the calendar columns to bike_data in place (the function returns nothing).
extract_dt_features(bike_data)

# Same selection as for `features`, applied after the calendar columns were added.
feature_matrix = bike_data.drop(columns=["count", "registered", "casual"]).select_dtypes(include=np.number)
label = bike_data["count"]
# Re-seed with the same value as the first split: later cells score X_test_fe
# predictions against y_test and use X_test.index to identify the test rows, so
# both splits are expected to pick the same rows.
# NOTE(review): re-check this if the seed, test_size or data ever changes.
prng = np.random.RandomState(20240306)
X_train_fe, X_test_fe, y_train, y_test = train_test_split(feature_matrix, label, test_size=0.2, random_state=prng)

# full data
bike_full = pd.read_csv("https://raw.githubusercontent.com/divenyijanos/ceu-ml/2023/data/bike_sharing_demand/train.csv")
extract_dt_features(bike_full)

# Drop every row of the full data whose timestamp belongs to the sample's test
# split, so the test rows are not part of the data the "large n" models train on.
full_data_without_original_test = bike_full.loc[~bike_full.datetime.isin(bike_data.filter(X_test.index, axis=0)['datetime'])]
# NOTE: this is not the last expression of the cell, so a notebook will not echo it.
full_data_without_original_test.shape

# Predictors and label for the full data; the raw timestamp is dropped as well.
X_full = full_data_without_original_test.drop(columns=["count", "registered", "casual", "datetime"])
y_full = full_data_without_original_test['count']

### Evaluation function

In [None]:
# define loss function
def calculateRMSLE(prediction, y_obs):
    """Return the root mean squared logarithmic error, rounded to 4 decimals.

    Negative predictions are replaced by 0 before the logarithm is taken, so
    that log(prediction + 1) is always defined.
    """
    non_negative_prediction = np.where(prediction < 0, 0, prediction)
    log_gap = np.log(non_negative_prediction + 1) - np.log(y_obs + 1)
    mean_squared_log_gap = np.mean(log_gap ** 2)
    return round(np.sqrt(mean_squared_log_gap), 4)

### Model #5: Tree

In [None]:
from sklearn.pipeline import Pipeline
from sklearn import tree

# A single shallow regression tree wrapped in a pipeline.
steps = [
    ("tree", tree.DecisionTreeRegressor(max_depth=5, random_state=prng))
]
pipe_tree = Pipeline(steps)

# Each entry is (training features, training label, test features);
# every model is scored against the same y_test.
models = ["Tree", "Feature engineered tree", "Feature engineered tree large n"]
datasets = [
    (X_train, y_train, X_test),
    (X_train_fe, y_train, X_test_fe),
    (X_full, y_full, X_test_fe)
]

tree_results = []

for model, data in zip(models, datasets):

    # The pipeline has to be (re)fitted on each training set before predicting;
    # without this call `predict` fails because the tree was never fitted.
    pipe_tree.fit(data[0], data[1])

    train_error = calculateRMSLE(pipe_tree.predict(data[0]), data[1])
    test_error = calculateRMSLE(pipe_tree.predict(data[2]), y_test)

    tree_result = [model, train_error, test_error]
    tree_results.append(tree_result)

pd.DataFrame(tree_results, columns = ["Model", "Train", "Test"])

In [None]:
import matplotlib.pyplot as plt
# Draw only the top levels (max_depth=2) of the tree held in pipe_tree;
# the node labels use the column names of X_full.
plt.figure(figsize=(20,10))
tree.plot_tree(pipe_tree["tree"], feature_names = X_full.columns.to_list(), max_depth=2)
plt.show()

In [None]:
# A depth-5 tree is very coarse for the full data, so grow a deeper one.
# The names `pipe_tree_deep` and "deep_tree" are the ones the later cells
# of this notebook refer to.
# NOTE(review): max_depth=10 is a starting value, not a tuned one.
steps = [
    ("deep_tree", tree.DecisionTreeRegressor(max_depth=10, random_state=prng))
]
pipe_tree_deep = Pipeline(steps)
pipe_tree_deep.fit(X_full, y_full)

train_error = calculateRMSLE(pipe_tree_deep.predict(X_full), y_full)
test_error = calculateRMSLE(pipe_tree_deep.predict(X_test_fe), y_test)

tree_results.append(["Deep tree large n", train_error, test_error])

pd.DataFrame(tree_results, columns = ["Model", "Train", "Test"])


### Model #6: Random forest

#### Default settings

In [None]:
# random forest
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters. The shared seeded random state
# is passed in, as for the other estimators in this notebook, so the results
# do not change from run to run.
steps = [
    ("random_forest", RandomForestRegressor(random_state=prng))
]
pipe_rf = Pipeline(steps)

models = ["RF", "Feature engineered RF", "Feature engineered RF large n"]

rf_results = []

# `datasets` holds (training features, training label, test features) triples.
for model, data in zip(models, datasets):

    pipe_rf.fit(data[0], data[1])

    train_error = calculateRMSLE(pipe_rf.predict(data[0]), data[1])
    test_error = calculateRMSLE(pipe_rf.predict(data[2]), y_test)

    rf_result = [model, train_error, test_error]
    rf_results.append(rf_result)

pd.DataFrame(rf_results, columns = ["Model", "Train", "Test"])

In [None]:
# Look at single trees
# estimators_ holds the individual fitted trees of the forest; take the second one.
chosen_tree = pipe_rf["random_forest"].estimators_[1]
plt.figure(figsize=(20,10))
# Only the top levels are drawn (max_depth=2 limits the plot, not the tree).
tree.plot_tree(chosen_tree, feature_names = X_full.columns.to_list(), max_depth=2)
plt.show()

In [None]:
# Depth and number of nodes of the chosen tree.
(chosen_tree.tree_.max_depth, chosen_tree.tree_.node_count)

#### Hyper-parameter tuning

In [None]:
# hyper-parameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Forest sizes: 10 evenly spaced values between 10 and 1000
n_estimators = np.linspace(start=10, stop=1000, num=10).astype(int).tolist()
# Share of features considered at every split
max_features = [0.05, 0.2, 0.5, 0.7]
# Depth limits: 5 evenly spaced values between 1 and 100, plus "no limit"
max_depth = np.linspace(1, 100, num=5).astype(int).tolist() + [None]
# Minimum number of samples a node needs in order to be split
min_samples_split = [2, 5, 10]
# Minimum number of samples every leaf must keep
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]
# Collect the candidate values under the estimator's parameter names
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
random_grid

In [None]:
# hyper-parameter tuning - fit the models on the full sample
rf = RandomForestRegressor(random_state=prng)

# Randomized search over `random_grid`:
#   - 50 sampled parameter combinations, each scored with 5-fold cross validation
#   - scored by (negated) RMSLE
#   - all available cores are used
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=5,
    scoring="neg_root_mean_squared_log_error",
    verbose=2,
    random_state=prng,
    n_jobs=-1,
)

# Run the search on the full training data
rf_random.fit(X_full, y_full)

In [None]:
# Parameter combination with the best cross-validated score.
rf_random.best_params_

In [None]:
# Full cross-validation log of the search as a table.
pd.DataFrame(rf_random.cv_results_)

In [None]:
# Score the search's best estimator on the data it was tuned on and on the test rows.
train_error = calculateRMSLE(rf_random.best_estimator_.predict(X_full), y_full)
test_error = calculateRMSLE(rf_random.best_estimator_.predict(X_test_fe), y_test)
cv_rf_result = ["CV RF large n", train_error, test_error]

# Appends to the shared list, so re-running this cell adds the row again.
rf_results.append(cv_rf_result)
pd.DataFrame(rf_results, columns = ["Model", "Train", "Test"])

### Model #7: XGBoost

#### Technical detour: category type

In [None]:
# Columns to be treated as categories rather than as numbers.
dummy_features = ["season", "holiday", "workingday", "weather", "year", "month", "day", "hour", "dayofweek"]
# Only X_full is converted here; X_test_fe keeps its numeric dtypes.
X_full[dummy_features] = X_full[dummy_features].astype("category")
X_full.dtypes

Why did we not deal with this so far? Because sklearn's implementation of trees [does not handle them differently](https://scikit-learn.org/stable/modules/tree.html#:~:text=and%20categorical%20data.-,However%2C%20the%20scikit%2Dlearn%20implementation%20does%20not%20support%20categorical%20variables%20for%20now.,-Other%20techniques%20are).

In [None]:
# Illustration
# Re-fit the deep tree on X_full, whose dummy_features columns are now category-typed.
pipe_tree_deep.fit(X_full, y_full)

train_error = calculateRMSLE(pipe_tree_deep.predict(X_full), y_full)
test_error = calculateRMSLE(pipe_tree_deep.predict(X_test_fe), y_test)

# Appends to the shared list, so re-running this cell adds the row again.
tree_results.append(["Deep tree large n categories", train_error, test_error])

pd.DataFrame(tree_results, columns = ["Model", "Train", "Test"])

In [None]:
# Draw only the top levels (max_depth=2) of the deep tree.
plt.figure(figsize=(20,10))
tree.plot_tree(pipe_tree_deep["deep_tree"], feature_names = X_full.columns.to_list(), max_depth=2)
plt.show()

In [None]:
# Boosted tree: xgboost
import xgboost as xgb
# enable_categorical=True is needed because X_full contains category-typed columns.
xgb_model = xgb.XGBRegressor(enable_categorical=True).fit(X_full, y_full)

In [None]:
# The model was trained on X_full, whose dummy_features columns are categorical,
# while X_test_fe still holds them as plain numbers. Cast the test features to
# exactly the training dtypes (same categories, same column order) so that a
# value such as hour=5 is encoded the same way at training and prediction time.
X_test_xgb = X_test_fe[X_full.columns].astype(
    {col: X_full[col].dtype for col in dummy_features}
)

train_error = calculateRMSLE(xgb_model.predict(X_full), y_full)
test_error = calculateRMSLE(xgb_model.predict(X_test_xgb), y_test)

["XGB", train_error, test_error]

In [None]:
# First rows of the table describing the nodes of the boosted trees.
xgb_model.get_booster().trees_to_dataframe().head()

### Submit to Kaggle

In [None]:
# TODO
# NOTE(review): this cell is an unfinished template and does not run as written:
#   - `bike_test` is not defined in the cells shown here; it needs a parsed
#     `datetime` column because `.dt` is used on it below
#   - the 'count' entry has no value yet (the predictions belong there)
#   - the file name passed to to_csv is an empty string

to_submit = pd.DataFrame({
    'datetime': bike_test.datetime.dt.strftime('%Y-%m-%d %H:%M:%S'),
    'count': 
})
to_submit.to_csv('', index=False)

## Predict heart failure

We are going to work with a heart disease data set collected from 5 different sources (for more detailed information consult [Kaggle](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)). It contains health condition data about 460 patients and whether they got a heart attack. We will use a subset of this dataset and preserve the rest for evaluating our final model on Kaggle.

Cardiovascular diseases (CVDs) are [the most common cause of death globally](https://ourworldindata.org/cardiovascular-diseases) making this task particularly relevant. As the competition states: _"People with cardiovascular disease or who are at high cardiovascular risk (due to the presence of one or more risk factors) need early detection and management wherein a machine learning model can be of great help."_

Attribute Information:
- `Age`: age of the patient [years]
- `Sex`: sex of the patient [M: Male, F: Female]
- `ChestPainType`: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
- `RestingBP`: resting blood pressure [mm Hg]
- `Cholesterol`: serum cholesterol [mg/dl]
- `FastingBS`: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
- `RestingECG`: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
- `MaxHR`: maximum heart rate achieved [Numeric value between 60 and 202]
- `ExerciseAngina`: exercise-induced angina [Y: Yes, N: No]
- `Oldpeak`: oldpeak = ST [Numeric value measured in depression]
- `ST_Slope`: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
- `HeartDisease`: output class [1: heart disease, 0: Normal]

### Know your data

In [None]:
import os

# NOTE(review): current_dir is not used in the cells shown here.
current_dir = os.getcwd()
# Relative path: it is resolved against the directory the notebook runs in.
heart_data = pd.read_csv("../data/heart_failure/train.csv")

In [None]:
# (number of rows, number of columns)
heart_data.shape

In [None]:
# Column types; the object-typed columns are the ones one-hot encoded later on.
heart_data.dtypes

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots with kernel density estimates on the diagonal;
# the trailing semicolon keeps the notebook from echoing the returned axes.
scatter_matrix(heart_data, alpha=0.2, figsize=(12, 12), diagonal="kde");

### Train-test split

In [None]:
# From here on features, label, prng and the X_/y_ splits refer to the heart
# data; the bike-share objects of the same names are overwritten.
features = heart_data.drop(columns=["HeartDisease"])
label = heart_data["HeartDisease"]

# Seeded random state so that the 80/20 split can be reproduced.
prng = np.random.RandomState(20240311)
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=prng)

### Benchmark evaluation

In [None]:
# benchmark accuracy
from sklearn.metrics import accuracy_score

def calculateAccuracy(observed, predicted):
    """Return the share of correctly predicted labels, rounded to 4 decimals."""
    share_correct = accuracy_score(observed, predicted)
    return round(share_correct, 4)

# Benchmark: always predict the most frequent class of the training labels.
benchmark_prediction = np.bincount(y_train).argmax()
# np.repeat builds a constant prediction vector of the right length.
train_accuracy = calculateAccuracy(y_train, np.repeat(benchmark_prediction, len(y_train)))
test_accuracy = calculateAccuracy(y_test, np.repeat(benchmark_prediction, len(y_test)))
[train_accuracy, test_accuracy]

In [None]:
# confusion matrix
from sklearn.metrics import confusion_matrix

# Observed test labels against the constant benchmark prediction.
cm = confusion_matrix(y_test, np.repeat(benchmark_prediction, len(y_test)))
cm

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Named cm_display rather than `display`, so that IPython's built-in
# display() function is not overwritten for the rest of the notebook.
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score

def calculateMetrics(observed, predicted):
    """Return accuracy, precision and recall as a dict, each rounded to 4 decimals."""
    metrics = {"accuracy": calculateAccuracy(observed, predicted)}
    scorers = (("precision", precision_score), ("recall", recall_score))
    for metric_name, scorer in scorers:
        metrics[metric_name] = round(scorer(observed, predicted), 4)
    return metrics

# Metrics of the constant benchmark prediction, one row per data split.
pd.DataFrame(
    [calculateMetrics(y, np.repeat(benchmark_prediction, len(y))) for y in [y_train, y_test]],
    index=["Train", "Test"]
)

### Benchmark #2: Logistic regression

#### Pipeline with preprocessing and estimation

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Dense dummy columns; the first level of every variable is dropped.
one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")
# The object-typed columns of the data are the categorical variables.
categorical_vars = heart_data.select_dtypes(include="object").columns.to_list()

# Encode the categorical columns and pass every other column through unchanged.
column_transformer = ColumnTransformer(
    [("create_dummies", one_hot_encoder, categorical_vars)],
    remainder="passthrough"
)

In [None]:
# logit
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Create dummies, scale every column to [0, 1], then estimate an unpenalized
# logistic regression. An empty step list cannot be fitted, so the three
# steps are spelled out.
pipe_logit = Pipeline([
    ("preprocess", column_transformer),
    ("scale", MinMaxScaler()),
    ("logit", LogisticRegression(penalty=None, random_state=prng))
])
pipe_logit.fit(X_train, y_train)

In [None]:
# Predicted class labels for the training data.
pipe_logit.predict(X_train)

In [None]:
# Metrics of the logit's class predictions, one row per data split.
pd.DataFrame(
    [calculateMetrics(y, pipe_logit.predict(x)) for x, y in [(X_train, y_train), (X_test, y_test)]],
    index=["Train", "Test"]
)

#### ROC curve

In [None]:
# To use a different cutoff we need to predict probabilities first
# predict_proba returns one column per class.
pipe_logit.predict_proba(X_train)[:5, :]  # look at first 5 results

In [None]:
def predictProbs(model, X):
    """Return the second column of ``model.predict_proba(X)``."""
    class_probabilities = model.predict_proba(X)
    return class_probabilities[:, 1]

def predictWithCutoff(model, X, cutoff):
    """Return 1 where the probability from predictProbs is at least ``cutoff``, else 0."""
    reaches_cutoff = predictProbs(model, X) >= cutoff
    return reaches_cutoff.astype(int)

# Classify as positive whenever the predicted probability reaches 0.3.
cutoff = 0.3
pd.DataFrame(
    [calculateMetrics(y, predictWithCutoff(pipe_logit, x, cutoff)) for x, y in [(X_train, y_train), (X_test, y_test)]],
    index=["Train", "Test"]
)

In [None]:
# ROC
from sklearn.metrics import roc_curve, roc_auc_score

# ROC inputs: predicted probabilities on the test data.
y_scores = predictProbs(pipe_logit, X_test)
# False positive rate and true positive rate at each threshold.
fpr, tpr, thresholds = roc_curve(y_test, y_scores) 
print(f"thresholds: {thresholds}\n")
print(f"FPR: {fpr}\n")
print(f"TPR: {tpr}\n")

In [None]:
from sklearn.metrics import RocCurveDisplay

# from_estimator already draws the curve on the current figure; keep the
# display object because it is re-plotted in a later cell.
roc_display = RocCurveDisplay.from_estimator(pipe_logit, X_test, y_test)
# plt.show() takes no figure/display argument.
plt.show()

#### Precision-recall plot

In [None]:
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve

# Precision-recall curve of the logit on the test data.
PrecisionRecallDisplay.from_estimator(pipe_logit, X_test, y_test)

#### Imputation

In [None]:
# Imputation
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer


def mark_zero_cholesterol_as_missing(X):
    """Return a copy of X in which Cholesterol readings of 0 are NaN."""
    return X.assign(Cholesterol=X["Cholesterol"].mask(X["Cholesterol"] == 0))


# A KNN imputer needs other features to measure distances on. Applied to the
# Cholesterol column alone it finds no neighbour with a defined distance and
# falls back to the column mean. It is therefore run on all numeric columns.
# Only Cholesterol zeros are flagged as missing, because a 0 is a legitimate
# value in other columns (e.g. FastingBS).
numeric_vars = [col for col in X_train.columns if col not in categorical_vars]

change_zero_cholesterol_to_closest_neighbors = Pipeline([
    ("zero_to_nan", FunctionTransformer(mark_zero_cholesterol_as_missing)),
    # Scale first so that no single feature dominates the distances.
    ("scale", MinMaxScaler()),
    ("knn", KNNImputer())
])

column_transformer_with_imputation = ColumnTransformer(
    [
        ("create_dummies", one_hot_encoder, categorical_vars),
        ("impute_cholesterol", change_zero_cholesterol_to_closest_neighbors, numeric_vars)
    ],
    remainder="passthrough"
)
# Same structure as the plain logit pipeline, with the imputing preprocessor.
pipe_logit_with_imputation = Pipeline([
    ("preprocess", column_transformer_with_imputation),
    ("scale", MinMaxScaler()),
    ("logit" , LogisticRegression(penalty=None, random_state=prng))
])
pipe_logit_with_imputation

In [None]:
# Fit the preprocessing steps and the logit on the training data.
pipe_logit_with_imputation.fit(X_train, y_train)

In [None]:
# ROC curve of the imputing pipeline on the test data; kept for the comparison plot.
roc_display_imputed = RocCurveDisplay.from_estimator(pipe_logit_with_imputation, X_test, y_test)

In [None]:
# Draw both ROC curves on the same axes for a direct comparison.
fig, ax = plt.subplots(figsize=(8, 6))
roc_display.plot(ax=ax, name="Logit")
roc_display_imputed.plot(ax=ax, name="Logit with imputation")
plt.title('ROC Curves for Two Estimators')
plt.show()

### autoML
Some of the ML processes are easy to automate (e.g. hyper-parameter tuning, model evaluation and selection). There are many tools that solve this automation task, e.g. the [auto-sklearn package](https://automl.github.io/auto-sklearn/master/).