In [1]:
import sqlalchemy
from sqlalchemy import text
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import pandas as pd

import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


engine = create_engine('postgresql+psycopg2://breast_cancer_dataset_user:UnSNEeECgY7ky2i5KAPC2WtQn9XrRpvc@dpg-cnbvjf779t8c73epbb3g-a.oregon-postgres.render.com/breast_cancer_dataset')
with engine.connect() as connection:
    # Adjust the SQL query based on your database schema and structure
    query = text("""
    SELECT
    der_age_trunc, der_obesity, der_race_v2, der_smoking2,
    urban_rural, severity_of_covid_19_v2, der_cancertr_none, der_cancer_status_v4, der_dm2,
    der_card, der_pulm, der_renal
    FROM Raw_DataFrame
    """ )
    result = connection.execute(query)

    data = pd.DataFrame(result.fetchall(),
                                        columns=["der_age_trunc", "der_obesity","der_race_v2",
                                            "der_smoking2", "urban_rural", "severity_of_covid_19_v2",
                                            "der_cancertr_none", "der_cancer_status_v4", "der_dm2",
                                            "der_card", "der_pulm", "der_renal"])

data.dropna(inplace= True)
f"Number of rows in data: {len(data)}"

'Number of rows in data: 1044'

In [3]:
# training configuration
config = {
    'learning_rate': 1e-3,
    'hidden': 128,
    'epochs': 50,
    'monitor': 'val_auc', # 'val_accuracy', 'val_auc_score', 'val_loss'
    'verbose': 0,
}

# Preprocessing Objects

In [4]:
multi_categorical = ['der_race_v2', 'der_smoking2', 'urban_rural', 'der_cancer_status_v4']
binary = ['der_obesity', 'der_cancertr_none', 'der_dm2', 'der_card', 'der_pulm', 'der_renal']
continuous = ['der_age_trunc']
target = 'severity_of_covid_19_v2'

In [5]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc.fit(data[multi_categorical + binary].values)

with open('ohe.pkl', 'wb') as f:
  pickle.dump(enc, f)

In [6]:
scaler = StandardScaler()
scaler.fit(data[continuous].values)

with open('scaler.pkl', 'wb') as f:
  pickle.dump(scaler, f)

# Create Training dataset

In [7]:
import numpy as np

X_continous = scaler.transform(data[continuous].values)
X_categorical = enc.transform(data[multi_categorical + binary].values)
X = np.concatenate([X_continous, X_categorical], axis=-1)

y = data[target].map({'Mild': 0, 'Moderate': 1, 'Severe': 1}).values
y = np.expand_dims(y, axis=-1).astype('float32')

X.shape, y.shape

((1044, 29), (1044, 1))

In [8]:
_, counts = np.unique(y, return_counts=1)
counts / counts.sum()

array([0.72796935, 0.27203065])

# Model Setup

In [9]:
import tensorflow as tf


def point_wise_feed_forward_network(d_model, hidden):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(hidden, activation='relu'),
        tf.keras.layers.Dense(d_model, activation='sigmoid')
    ])


def create_model(hidden, learning_rate=1e-4):
  nn = point_wise_feed_forward_network(1, hidden)
  nn.compile(loss="binary_crossentropy",
             optimizer=tf.keras.optimizers.Adam(learning_rate),
             metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
  return nn


In [10]:
def train(model_filepath,
          x_train, y_train,
          x_test, y_test,
          config):
  model = create_model(config['hidden'], config['learning_rate'])

  verbose = config['verbose']
  monitor = config['monitor']
  callbacks = [
      tf.keras.callbacks.ReduceLROnPlateau(
          monitor=monitor, factor=0.1, patience=10,
          verbose=verbose, min_lr=1e-6),
      tf.keras.callbacks.ModelCheckpoint(
          model_filepath, save_best_only=True,
          monitor=monitor, verbose=verbose)
      ]
  hist = model.fit(x_train, y_train, validation_data=(x_test, y_test),
                   epochs=config['epochs'], callbacks=callbacks,
                   verbose=verbose).history

  # results of model saved
  model = tf.keras.models.load_model(model_filepath)
  model.predict(x_test, verbose=verbose) # build
  loss, acc, auc = model.evaluate(x_test, y_test, verbose=verbose)
  best_scores = {'val_loss': loss, 'val_accuracy': acc, 'val_auc': auc}

  return best_scores


# Train Single

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    stratify=y,
                                                    random_state=12345)
train('model.keras', X_train, y_train, X_test, y_test, config)

{'val_loss': 0.5485013723373413,
 'val_accuracy': 0.7333333492279053,
 'val_auc': 0.7191470265388489}

# Cross validation

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)


scores = []
for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    results = train('temp.keras', X_train, y_train, X_test, y_test, config)
    print(f"Fold {i}: {results}")
    scores.append(results[config['monitor']])

f"KFold results: {np.mean(scores)*100:.4f}"

Fold 0: {'val_loss': 1.465401291847229, 'val_accuracy': 0.6857143044471741, 'val_auc': 0.6331669688224792}
Fold 1: {'val_loss': 2.228259801864624, 'val_accuracy': 0.6571428775787354, 'val_auc': 0.6152449250221252}
Fold 2: {'val_loss': 1.6219918727874756, 'val_accuracy': 0.6666666865348816, 'val_auc': 0.5426497459411621}
Fold 3: {'val_loss': 1.1394413709640503, 'val_accuracy': 0.7047619223594666, 'val_auc': 0.6755898594856262}
Fold 4: {'val_loss': 1.79139244556427, 'val_accuracy': 0.682692289352417, 'val_auc': 0.5857612490653992}
Fold 5: {'val_loss': 1.460621953010559, 'val_accuracy': 0.6346153616905212, 'val_auc': 0.5733082294464111}
Fold 6: {'val_loss': 1.0493932962417603, 'val_accuracy': 0.692307710647583, 'val_auc': 0.6670582294464111}
Fold 7: {'val_loss': 0.5318979620933533, 'val_accuracy': 0.75, 'val_auc': 0.7685620188713074}
Fold 8: {'val_loss': 1.5121300220489502, 'val_accuracy': 0.7307692170143127, 'val_auc': 0.6905545592308044}
Fold 9: {'val_loss': 1.7942513227462769, 'val_acc

'KFold results: 63.6444'

In [None]:
config

{'learning_rate': 0.0001,
 'hidden': 4096,
 'epochs': 50,
 'monitor': 'val_auc',
 'verbose': 0}