In [1]:
import pandas as pd
from tokenize_dataset import tokenize_dataset
from isolation_forest import isolation_forest

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

ModuleNotFoundError: ignored

Define the outcome-dependent columns (treatment, response, and follow-up fields) that are excluded from the features to prevent target leakage

In [None]:
# Columns removed from the feature matrix in the "Partition data" cell.
# The strings (including trailing spaces, double spaces and the
# 'assemssment' spelling) are kept exactly as written because they are used
# as column labels in df.drop(); presumably they mirror the CSV header.
duke_dependent = [
    'Surgery',
    'Days to Surgery (from the date of diagnosis)',
    'Definitive Surgery Type',
    'Clinical Response, Evaluated Through Imaging ',
    'Pathologic Response to Neoadjuvant Therapy',
    'Days to local recurrence (from the date of diagnosis) ',
    'Days to distant recurrence(from the date of diagnosis) ',
    'Days to death (from the date of diagnosis) ',
    'Days to last local recurrence free assessment (from the date of diagnosis) ',
    'Days to last distant recurrence free assemssment(from the date of diagnosis) ',
    'Neoadjuvant Chemotherapy',
    'Adjuvant Chemotherapy',
    'Neoadjuvant Endocrine Therapy Medications ',
    'Adjuvant Endocrine Therapy Medications ',
    'Therapeutic or Prophylactic Oophorectomy as part of Endocrine Therapy ',
    'Neoadjuvant Anti-Her2 Neu Therapy',
    'Adjuvant Anti-Her2 Neu Therapy ',
    'Received Neoadjuvant Therapy or Not',
    'Pathologic response to Neoadjuvant therapy: Pathologic stage (T) following neoadjuvant therapy ',
    'Pathologic response to Neoadjuvant therapy:  Pathologic stage (N) following neoadjuvant therapy',
    'Pathologic response to Neoadjuvant therapy:  Pathologic stage (M) following neoadjuvant therapy ',
    'Overall Near-complete Response:  Stricter Definition',
    'Overall Near-complete Response:  Looser Definition',
    'Near-complete Response (Graded Measure)',
]

Set target variable

In [None]:
# Label column: read as df[target] when the data is partitioned below.
target = "Surgery"

In [None]:
# Path of the clinical-features CSV.
clinical_csv_path = "/content/drive/MyDrive/Cancer ML Clinical Data/Clinical and Other Features (edited).csv"

# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(clinical_csv_path, low_memory=False)

In [None]:
# Keep the first column as its own Series before it is turned into the index
# (the following cell describes that column as the patient IDs).
id_column = df.columns[0]
clinical_ids = df[id_column]

Set patient IDs as the index of df

In [None]:
# Promote the first column to the row index.
# Caution: set_index removes that column from df.columns, so running this
# cell a second time would promote the NEXT column instead.
index_column = str(df.columns[0])
df = df.set_index(index_column)

In [None]:
# tokenize_dataset is a project helper imported at the top of the notebook.
# NOTE(review): presumably it encodes non-numeric columns as numbers so the
# frame can be scaled and fed to the network — confirm against its source.
df = tokenize_dataset(df)

Partition data

In [None]:
# Features: every column except the outcome-related ones in duke_dependent.
x = df.drop(columns=duke_dependent)
# Label: the target column (it is listed in duke_dependent, so it is not in x).
y = df[target]

In [None]:
# 80 % of the rows go to training, 20 % to a hold-out set (fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=84)

# Halve the hold-out set into test and validation. Features and labels are
# split in a single call so that each row keeps its own label by construction,
# rather than relying on two separate calls happening to share a seed.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=84)

Normalize Data

In [None]:
# Learn the per-feature min/max from the TRAINING split only, then apply that
# same mapping to the test and validation splits. Re-fitting on each split
# (fit_transform) would scale every split differently and let hold-out
# statistics influence preprocessing.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)
X_val = min_max_scaler.transform(X_val)

Use Isolation Forest to detect and remove outliers

In [None]:
# isolation_forest is a project helper; its result is iterated as one label
# per training row, and rows labelled -1 are treated as outliers.
# NOTE(review): presumably this wraps scikit-learn's IsolationForest — confirm.
predicted = isolation_forest(X_train, y_train)

# Positions of the rows that were NOT flagged as outliers.
non_outlier_indices = [
    position for position, label in enumerate(predicted) if label != -1
]

num_outliers = len(predicted) - len(non_outlier_indices)
print("Num Outliers:", num_outliers)

# X_train is a NumPy array after scaling (positional fancy indexing);
# y_train is still a pandas object, hence .iloc.
X_train = X_train[non_outlier_indices]
y_train = y_train.iloc[non_outlier_indices]

Define model

In [None]:
import torch.nn as nn

class duke_model(nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    The layer sizes are constructor arguments so the network can follow a
    change in the number of feature columns or classes; the defaults
    reproduce the original fixed 51 -> 32 -> 16 -> 3 architecture.

    Args:
        in_features: Number of input features per sample.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output logits (one per class).
    """

    def __init__(self, in_features=51, hidden1=32, hidden2=16, num_classes=3):
        super().__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Return raw class logits for a batch ``x`` of shape (batch, in_features).

        No softmax is applied; the training loop uses ``nn.CrossEntropyLoss``,
        which expects unnormalised logits.
        """
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [None]:
# Module-level network instance. train_model reads this global directly
# (it does not take the model as an argument), so re-run this cell to start
# again from freshly initialised weights.
model = duke_model()

In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from metrics import recall_m, precision_m, f1_m

def train_model(config, data, epochs=10, batch_size=32):
  """Train the module-level ``model`` and evaluate it once per epoch.

  Args:
    config: Hyperparameter dict, e.g. the one Ray Tune passes to a trial.
      The optional keys 'epochs', 'batch_size' and 'lr' override the
      defaults. Anything that is not a dict is ignored and the defaults
      are used.
    data: Sequence ``[X_train, y_train, X_val, y_val]``. The X entries must
      be NumPy arrays (they go through ``torch.from_numpy``); the y entries
      must expose ``to_numpy()`` and hold integer class labels.
    epochs: Number of passes over the training data when 'epochs' is not in
      ``config``.
    batch_size: Mini-batch size when 'batch_size' is not in ``config``.

  Returns:
    The module-level ``model`` after training.
  """
  # Local import so this cell does not depend on the Ray import cell that
  # appears further down the notebook.
  from ray import tune

  X_train, y_train, X_val, y_val = data[:4]

  # Apply the hyperparameters chosen by the search (previously ignored).
  hyperparams = config if isinstance(config, dict) else {}
  epochs = int(hyperparams.get('epochs', epochs))
  batch_size = int(hyperparams.get('batch_size', batch_size))
  lr = hyperparams.get('lr', 1e-3)  # 1e-3 is Adam's own default

  y_train = y_train.to_numpy()
  # Convert the validation split once, outside the loops. Converting y_val
  # inside the loop crashed on the second pass because it was already a tensor.
  X_val = torch.from_numpy(X_val).type(torch.float)
  y_val = torch.from_numpy(y_val.to_numpy()).long()

  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  for epoch in range(epochs):
    running_loss = 0.0
    for start_i in range(0, X_train.shape[0], batch_size):
      end_i = start_i + batch_size
      xb = torch.from_numpy(X_train[start_i:end_i]).type(torch.float)
      # CrossEntropyLoss needs int64 class indices.
      yb = torch.from_numpy(y_train[start_i:end_i]).long()

      optimizer.zero_grad()
      loss = criterion(model(xb), yb)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()

    # Validation once per epoch, without tracking gradients.
    with torch.no_grad():
      val_logits = model(X_val)
      val_loss = criterion(val_logits, y_val).item()
    pred = np.argmax(val_logits.numpy(), axis=1)

    # As before, the metric helpers receive the labels as a tensor and the
    # predictions as a NumPy array.
    accuracy = accuracy_score(y_val, pred)
    f1_score = f1_m(y_val, pred)
    recall = recall_m(y_val, pred)
    balanced_acc = balanced_accuracy_score(y_val, pred)
    print('Completed training epoch', epoch, 'Training Loss is: %.4f' %running_loss, 'Accuracy: %.4f' %accuracy, 'F1: %.4f' %f1_score, 'Recall: %.4f' %recall, 'Balanced Accuracy: %.4f' %balanced_acc)

    # Report the metrics the Tune search reads ("loss" and "accuracy"), but
    # only when running inside a Tune trial so a direct call stays silent.
    if tune.is_session_enabled():
      tune.report(loss=val_loss, accuracy=accuracy)

  return model

Use Ray Tune to determine hyperparameters

In [None]:
import ray
from ray import tune
from ray.tune.schedulers.async_hyperband import ASHAScheduler

# Search space: one value per key is sampled for every trial.
config = {
    'epochs':tune.choice([50, 100, 150]),
    'batch_size':tune.choice([8, 16, 32, 64]),
    'lr':tune.loguniform(1e-3, 1e-1),
}
# ASHA stops poorly performing trials early.
# NOTE(review): max_t=10 caps each trial at 10 reported iterations; if
# train_model reports once per epoch this is far below the 50-150 epochs in
# the search space — confirm this is intended.
scheduler = ASHAScheduler(
    max_t=10,
    grace_period=1,
    reduction_factor=3
)
# tune.with_parameters passes the datasets to every trial as the `data`
# keyword argument of train_model.
result = tune.run(
    tune.with_parameters(train_model, data=[X_train, y_train, X_val, y_val]),
    resources_per_trial={"cpu":2},
    config=config,
    metric="loss",
    mode="min",
    num_samples=10,
    scheduler=scheduler
)
# Fixed typo: the method is get_best_trial (was get_best_trail, which raises
# AttributeError).
best_trial = result.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result['accuracy']))

In [None]:
# Retrain with the winning hyperparameters. best_trial is a Trial object;
# the sampled hyperparameter dict is its .config attribute.
train_model(config=best_trial.config, data=[X_train, y_train, X_val, y_val])