In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score, brier_score_loss, make_scorer
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


## Have to convert our target variable "target" to predicted probabilities using Logistic Regression

In [2]:
df = pd.read_csv("../Data/cleanedHealthData.csv")

In [3]:
df['target'].value_counts(normalize=True)

target
healthy     0.70097
diseased    0.29903
Name: proportion, dtype: float64

In [4]:
## Set target as 1|0

df['target'] = df['target'].replace({'healthy': 1, 'diseased': 0})

y = df['target']
X = df.drop(columns='target')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  df['target'] = df['target'].replace({'healthy': 1, 'diseased': 0})


In [5]:
## Separate all the columns
numerical_cols = df.select_dtypes(include=['number']).columns
categorical_cols = list(df.select_dtypes(exclude=['number']).columns)
numerical_cols = [col for col in numerical_cols if col != 'target']

In [6]:
## Setup Numerical and Categorical Cols Transformers (KNNImputer, Scaler and OHE)

num_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler()),
])

cat_transformer = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ]
)

In [13]:
## Logistic Regression pipeline (0.21077409547404927)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

pipeline.fit(X_train, y_train)

## Predict

y_pred_prob = pipeline.predict_proba(X_test)[:,1]

## Brier Score Loss
brier = brier_score_loss(y_test, y_pred_prob)
print(brier)

0.21077409547404927


In [8]:
## Logistic Regression pipeline with balanced class weight (0.25024089874378147)

# balanced_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(class_weight='balanced'))
# ])

# balanced_pipeline.fit(X_train, y_train)

# ## Predict

# balanced_y_pred_prob = balanced_pipeline.predict_proba(X_test)[:,1]

# ## Brier Score Loss
# balanced_brier = brier_score_loss(y_test, balanced_y_pred_prob)
# print(balanced_brier)

In [9]:
## Logistic Regression pipeline with higher weight to minority class (0.2500069097637534)

# ratio_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(class_weight={1: 0.29903, 0: 0.70097}))
# ])

# ratio_pipeline.fit(X_train, y_train)

# ## Predict

# ratio_y_pred_prob = ratio_pipeline.predict_proba(X_test)[:,1]

# ## Brier Score Loss
# ratio_brier = brier_score_loss(y_test, ratio_y_pred_prob)
# print(ratio_brier)

In [14]:
## Hyperparameter Tuning

param_grid = {
    'preprocessor__num__imputer__n_neighbors': [3, 5, 7, 9, 11],
    'classifier__C': [0.01, 0.1, 1, 10],
    # 'classifier__solver': ['lbfgs', 'sag', 'saga'], 
    'classifier__penalty': [None, 'l1', 'l2', 'elasticnet']
}

In [15]:
## Set scoring function

brier_scorer = make_scorer(brier_score_loss, needs_proba=True, greater_is_better=False)

In [None]:
## KFold GridSearchCV (Parallel Computing)

cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator = pipeline,    ## Default pipeline had lowest brier score loss
    param_grid = param_grid,
    scoring = brier_scorer,
    cv = cv,
    verbose = 2,
    n_jobs = -1
)

grid_search.fit(X_train, y_train)

In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best Brier score:", -grid_search.best_score_) 