In [1]:
!pip install tensorflow==2.9.1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.
tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.
tf-keras 2.15.1 requires tensorflow<2.16,>=2.15, but you have tensorflow 2.9.1 which is incompatible.[0m[31m
[0mSuccessfully installed flatbuffers-1.12 gast-0.4.0 google-auth-oauthlib-0.4.6 keras-2.9.0 keras-preprocessing-1.1.2 protobuf-3.19.6 tensorboard-2.9.1 tensorboard-data-server-0.6.1 tensorboard-plugin-wit-1.8.1 tensorflow-2.9.1 tensorflow-estimator-2.9.0


In [1]:
import sqlalchemy
from sqlalchemy import text
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
import pandas as pd

import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


engine = create_engine('postgresql+psycopg2://breast_cancer_dataset_user:UnSNEeECgY7ky2i5KAPC2WtQn9XrRpvc@dpg-cnbvjf779t8c73epbb3g-a.oregon-postgres.render.com/breast_cancer_dataset')
with engine.connect() as connection:
    # Adjust the SQL query based on your database schema and structure
    query = text("""
    SELECT
    der_age_trunc, der_obesity, der_race_v2, der_smoking2,
    urban_rural, severity_of_covid_19_v2, der_cancertr_none, der_cancer_status_v4, der_dm2,
    der_card, der_pulm, der_renal
    FROM Raw_DataFrame
    """ )
    result = connection.execute(query)

    data = pd.DataFrame(result.fetchall(),
                                        columns=["der_age_trunc", "der_obesity","der_race_v2",
                                            "der_smoking2", "urban_rural", "severity_of_covid_19_v2",
                                            "der_cancertr_none", "der_cancer_status_v4", "der_dm2",
                                            "der_card", "der_pulm", "der_renal"])

data.dropna(inplace= True)
f"Number of rows in data: {len(data)}"

'Number of rows in data: 1044'

In [None]:
data['der_age_trunc'].max(), data['der_age_trunc'].min()

In [None]:
#for c in data.columns:
#  print(data[c].value_counts())
#  print()

In [2]:
# training configuration
config = {
    'learning_rate': 1e-2,
    'hidden': 128,
    'epochs': 50,
    'batch_size': 16,
    'monitor': 'val_auc', # 'val_accuracy', 'val_auc_score', 'val_loss'
    'verbose': 0,
}

# Preprocessing Objects

In [3]:
multi_categorical = ['der_race_v2', 'der_smoking2', 'urban_rural', 'der_cancer_status_v4']
binary = ['der_obesity', 'der_cancertr_none', 'der_dm2', 'der_card', 'der_pulm', 'der_renal']
continuous = ['der_age_trunc']
target = 'severity_of_covid_19_v2'

In [4]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
enc.fit(data[multi_categorical + binary].values)

with open('ohe.pkl', 'wb') as f:
  pickle.dump(enc, f)

In [5]:
scaler = StandardScaler()
scaler.fit(data[continuous].values)

with open('scaler.pkl', 'wb') as f:
  pickle.dump(scaler, f)

# Create Training dataset

In [6]:
import numpy as np

X_continous = scaler.transform(data[continuous].values)
X_categorical = enc.transform(data[multi_categorical + binary].values)
X = np.concatenate([X_continous, X_categorical], axis=-1)

y = data['severity_of_covid_19_v2'].map({'Mild': 0, 'Moderate': 1, 'Severe': 1}).values
y = np.expand_dims(y, axis=-1).astype('float32')

X.shape, y.shape

((1044, 29), (1044, 1))

In [7]:
y = data['severity_of_covid_19_v2'].map({'Mild': 0, 'Moderate': 1, 'Severe': 1}).values
y = np.expand_dims(y, axis=-1).astype('float32')
y.shape

(1044, 1)

In [8]:
_, counts = np.unique(y, return_counts=1)
counts / counts.sum()

array([0.72796935, 0.27203065])

# Model Setup

In [9]:
import tensorflow as tf


def point_wise_feed_forward_network(d_model, hidden):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(hidden, activation='relu'),
        tf.keras.layers.Dense(hidden, activation='relu'),
        tf.keras.layers.Dense(d_model, activation='sigmoid')
    ])


def create_model(hidden, learning_rate=1e-4):
  nn = point_wise_feed_forward_network(1, hidden)
  nn.compile(loss="binary_crossentropy",
             optimizer=tf.keras.optimizers.Adam(learning_rate),
             metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
  return nn


In [10]:
def train(model_filepath,
          x_train, y_train,
          x_test, y_test,
          config):
  model = create_model(config['hidden'], config['learning_rate'])

  verbose = config['verbose']
  monitor = config['monitor']
  callbacks = [
      tf.keras.callbacks.ReduceLROnPlateau(
          monitor=monitor, factor=0.1, patience=10,
          verbose=verbose, min_lr=1e-6),
      tf.keras.callbacks.ModelCheckpoint(
          model_filepath, save_best_only=True,
          monitor=monitor, verbose=verbose)
      ]
  hist = model.fit(x_train, y_train, validation_data=(x_test, y_test),
                   epochs=config['epochs'], callbacks=callbacks,
                   verbose=verbose).history

  # results of model saved
  model = tf.keras.models.load_model(model_filepath)
  model.predict(x_test, verbose=verbose) # build
  loss, acc, auc = model.evaluate(x_test, y_test, verbose=verbose)
  best_scores = {'val_loss': loss, 'val_accuracy': acc, 'val_auc': auc}

  return best_scores


# Train Single

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    stratify=y,
                                                    random_state=12345)
train('model.h5', X_train, y_train, X_test, y_test, config)

{'val_loss': 0.8416507244110107,
 'val_accuracy': 0.7142857313156128,
 'val_auc': 0.7300363183021545}

# Cross validation

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)


scores = []
for i, (train_index, test_index) in enumerate(skf.split(X, y)):

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    results = train('temp.keras', X_train, y_train, X_test, y_test, config)
    print(f"Fold {i}: {results}")
    scores.append(results[config['monitor']])

f"KFold results: {np.mean(scores)*100:.4f}"

In [None]:
config