## Data Initialization

In [6]:
import random
import pandas as pd
import numpy as np
import torch as T
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, mutual_info_classif, RFECV
from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, precision_score, recall_score, roc_curve, auc, roc_auc_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shap
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from hyperopt import hp
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

In [7]:
#Cancer gene list initialization
cancerdf = pd.read_csv("/gpfs/data/dgamsiz/kduru1/data/known_cancer.txt", sep ="\t")
cancer_genes = list(cancerdf['Symbol'])

In [8]:
#RNA Dataframe initialization
rnadf = pd.read_csv("/gpfs/data/dgamsiz/kduru1/data/modified_rnaseq_data.txt", sep ="\t")

rnadf = rnadf.transpose()
rnadf.drop('Entrez_Gene_Id',inplace=True)
rnadf = rnadf.reset_index() 
rnadf['index'] = rnadf['index'].str.replace('-01', '')
rnadf.columns = rnadf.iloc[0]
rnadf = rnadf.drop(0)
rnadf = rnadf.rename(columns={'Hugo_Symbol': 'PATIENT_ID'})
rnadf.columns = rnadf.columns.astype(str)
rnadf.set_index('PATIENT_ID', inplace = True)
#Removes missing RNA data
rnadf.dropna(axis=1, inplace=True)

#Averages duplicate RNA data
duplicates = rnadf.columns[rnadf.columns.duplicated()]

for duplicate in duplicates:
    avg = rnadf[duplicate].mean(axis=1)
    rnadf['avg_' + duplicate] = avg

rnadf.drop(duplicates, axis=1, inplace=True)

rnadf = rnadf.add_prefix('rna_')

  rnadf['avg_' + duplicate] = avg
  rnadf['avg_' + duplicate] = avg
  rnadf['avg_' + duplicate] = avg
  rnadf['avg_' + duplicate] = avg
  rnadf['avg_' + duplicate] = avg


In [9]:
#Mutation DataFrame Initialization
mutdf = pd.read_csv("/gpfs/data/dgamsiz/kduru1/data/data_mutations.txt", sep ="\t")

columns_to_keep = ['Hugo_Symbol', 'Variant_Classification', 'Tumor_Sample_Barcode']
mutdf = mutdf[columns_to_keep]
mutdf = mutdf.set_index('Tumor_Sample_Barcode')

mutdf = mutdf[~mutdf['Variant_Classification'].isin(["Silent","3'Flank","3'UTR","5'Flank","5'UTR","Intron"])]
mutdf.drop('Variant_Classification', axis=1, inplace=True)
mutdf.index = [s.strip('-01') for s in mutdf.index]
mutdf.index.names = ['PATIENT_ID']
mutdf = pd.get_dummies(mutdf, columns=['Hugo_Symbol'])

mutdf = mutdf.groupby("PATIENT_ID").max()

  mutdf = pd.read_csv("/gpfs/data/dgamsiz/kduru1/data/data_mutations.txt", sep ="\t")


In [10]:
#Initialize Clinical Dataframe and combine with Treatment Dataframe
clindf = pd.read_csv("/gpfs/data/dgamsiz/kduru1/data/data_clinical_patient.txt", sep ="\t")
treatmentdf = pd.read_csv("/gpfs/data/dgamsiz/kduru1/data/data_timeline_treatment.txt", sep ="\t")
statdf = pd.read_csv("/gpfs/data/dgamsiz/kduru1/data/data_timeline_status.txt", sep ="\t")

clindf.set_index('PATIENT_ID', inplace=True)
statdf.set_index('PATIENT_ID', inplace=True)
statusdf = statdf['PRIMARY_THERAPY_OUTCOME_SUCCESS'].dropna()


for col in treatmentdf.columns:
    if col != 'AGENT' and col != 'MEASURE_OF_RESPONSE' and col != 'PATIENT_ID' and col != 'START_DATE':
        treatmentdf.drop(col, axis=1, inplace=True)

#Transform data into binary format - 0.5
treatmentdf.replace(to_replace='clinical progressive disease', value=False, inplace=True)
treatmentdf.replace(to_replace='stable progressive disease', value=False, inplace=True)
treatmentdf.replace(to_replace='radiographic progressive disease', value=False, inplace=True)
treatmentdf.replace(to_replace='stable disease', value=False, inplace=True)
treatmentdf.replace(to_replace='partial response', value=True, inplace=True)
treatmentdf.replace(to_replace='complete response', value=True, inplace=True)
statusdf.replace(to_replace='Complete Remission/Response', value=True, inplace=True)
statusdf.replace(to_replace='Partial Remission/Response', value=True, inplace=True)
statusdf.replace(to_replace='Stable Disease', value=False, inplace=True)
statusdf.replace(to_replace='Progressive Disease', value=False, inplace=True)
clindf.replace(to_replace='Male', value= 1, inplace=True)
clindf.replace(to_replace='Female', value= 0, inplace=True)

#Merge treatmentdf with clin params of interest
paramsetup = {
    'SEX' : clindf['SEX'],
    'ANCES' : clindf['GENETIC_ANCESTRY_LABEL'],
    'AGE' : clindf['AGE']
}

clinparameters = pd.DataFrame(paramsetup)
idparamdf = treatmentdf.merge(clinparameters, on='PATIENT_ID')
idparamdf.set_index('PATIENT_ID', inplace=True)


#Augments treatment database with overall status database
nans = idparamdf['MEASURE_OF_RESPONSE'].isna()
for pt in idparamdf.index:
    if nans[pt].all() and pt in statusdf.index:
        last_med = np.max(idparamdf.loc[pt,'START_DATE'])
        idparamdf.loc[(idparamdf['START_DATE'] == last_med) & (idparamdf.index == pt), 'MEASURE_OF_RESPONSE'] = statusdf[pt]

idparamdf.replace(' ', np.nan, inplace=True)
idparamdf.dropna(axis=0, inplace=True)

#Remove pts missing clinical data
idparamdf.replace(' ', np.nan, inplace=True)
idparamdf.dropna(axis=0, inplace=True)

#One-hot encoding for categorical data
idparamdf = pd.get_dummies(idparamdf, columns=['AGENT'])


non_med = ['AGENT_Radiation 1', 'AGENT_Nos', 'AGENT_Radiation 2']

#if non_med in idparamdf.columns:
idparamdf.drop(non_med, axis=1, inplace=True)

  statusdf.replace(to_replace='Progressive Disease', value=False, inplace=True)
  clindf.replace(to_replace='Female', value= 0, inplace=True)


In [11]:
#Combines concurrent regiments into one entry based on start date
agents = list()
agents.append('MEASURE_OF_RESPONSE')

for item in list(idparamdf.columns):
    if 'AGENT' in item:
        agents.append(item)

clindf = pd.DataFrame()

for pt in idparamdf.index.unique():
    df = idparamdf.loc[pt]
    starts = np.sort(np.unique(df['START_DATE']))
    if isinstance(df, pd.DataFrame):
        startdf = df[(df['START_DATE'] == starts[0])]
        startdf = startdf[agents]
        boolsum = startdf.apply(lambda row: any(row), axis=0)
        new_row = pd.DataFrame(boolsum).transpose()
        new_row.index = [pt]
        clindf = pd.concat([clindf, new_row])
    else:
        df = df[agents]
        new_row = pd.DataFrame(df).transpose()
        clindf = pd.concat([clindf, new_row])


agents.remove('MEASURE_OF_RESPONSE')
valid_single_agents = [x.replace('AGENT_','') for x in agents if "+" not in x]
double_agents = [x for x in agents if "+" in x]

#Handle '+' medication entries
for row in range(len(clindf)):
    for col in clindf.columns:
        if '+' in col and clindf.iloc[row][col]:
            single_agents = col.replace('AGENT_','').split(' + ')
            for agent in single_agents:
                if agent in valid_single_agents:
                    addagent = 'AGENT_' + agent
                    clindf.iloc[row][addagent] = True

clindf.drop(columns=double_agents, inplace=True)

#Add back in ages, sex, MOR, ancestry
ages = idparamdf['AGE'].loc[~idparamdf['AGE'].index.duplicated(keep='first')]
sex = idparamdf['SEX'].loc[~idparamdf['SEX'].index.duplicated(keep='first')]
ances = idparamdf['ANCES'].loc[~idparamdf['ANCES'].index.duplicated(keep='first')]

clindf = clindf.join(ages)
clindf = clindf.join(sex)
clindf = clindf.join(ances)

clindf = pd.get_dummies(clindf, columns=['ANCES'])
clindf.index.name = 'PATIENT_ID'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  clindf.iloc[row][addagent] = True


In [12]:
#Merge dfs
halfmergedf = clindf.join(mutdf, how="inner")
mergedf = halfmergedf.join(rnadf, how="inner")

In [13]:
#Shuffles dataframe with reproducible seed
ids = mergedf.index.unique().tolist()
random.seed(23)
random.shuffle(ids)
shuffledf = mergedf.reset_index()
shuffledf = shuffledf.set_index('PATIENT_ID').loc[ids]

In [14]:
#Drop constant values
nonconstantdf = shuffledf.loc[:, shuffledf.nunique() != 1]

#Convert logical values to ints
nonconstantdf.replace(to_replace=False, value=0 , inplace=True)
nonconstantdf.replace(to_replace=True, value=1, inplace = True)
nonconstantdf= nonconstantdf.astype(float)

  nonconstantdf.replace(to_replace=False, value=0 , inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonconstantdf.replace(to_replace=False, value=0 , inplace=True)
  nonconstantdf.replace(to_replace=True, value=1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nonconstantdf.replace(to_replace=True, value=1, inplace = True)


## Feature Engineering

In [15]:
from sklearn.model_selection import train_test_split

# Split into train (70%) and test (30%) with stratification
train_df, test_df = train_test_split(nonconstantdf, test_size=0.3, random_state=41, stratify=nonconstantdf['MEASURE_OF_RESPONSE'])

# Split test into test (80%) and validation (20%) with stratification
test_df, valid_df = train_test_split(test_df, test_size=0.2, random_state=41, stratify=test_df['MEASURE_OF_RESPONSE'])

# Define target (y) and features (X)
y_train = train_df.iloc[:, 0]
df_train = train_df.iloc[:, 1:]

y_test = test_df.iloc[:, 0]
df_test = test_df.iloc[:, 1:]

y_valid = valid_df.iloc[:, 0]
df_valid = valid_df.iloc[:, 1:]

In [16]:
#Data Splitting
rnadf_train = df_train.filter(like='rna_')
mutdf_train = df_train.filter(like='Hugo_')
clindf_train = df_train.drop(columns=rnadf_train.columns.union(mutdf_train.columns))

rnadf_test = df_test.filter(like='rna_')
mutdf_test = df_test.filter(like='Hugo_')
clindf_test = df_test.drop(columns=rnadf_test.columns.union(mutdf_test.columns))

rnadf_valid = df_valid.filter(like='rna_')
mutdf_valid = df_valid.filter(like='Hugo_')
clindf_valid = df_valid.drop(columns=rnadf_valid.columns.union(mutdf_valid.columns))

In [17]:
#RNA Normalization
scaler = preprocessing.Normalizer()
scaler.fit(rnadf_train)

srnadf_train = pd.DataFrame(scaler.transform(rnadf_train), 
                        columns=rnadf_train.columns, index=rnadf_train.index)

srnadf_test = pd.DataFrame(scaler.transform(rnadf_test), 
                        columns=rnadf_test.columns, index=rnadf_test.index)

srnadf_valid = pd.DataFrame(scaler.transform(rnadf_valid), 
                        columns=rnadf_valid.columns, index=rnadf_valid.index)

srna_np = rnadf_train.to_numpy()
corr_matrix = np.corrcoef(srna_np, rowvar=False)
corr_triangle = np.triu(corr_matrix, k = 1)
feature_var = rnadf_train.var()

#Find associated features and remove
to_remove = set()
corr_threshold = 0.80
c = 0
v = 0

corr_ind1, corr_ind2 = np.where(corr_triangle >= corr_threshold)

#Which feature is known for cancer? Which feature has higher variance?
for index in range(len(corr_ind1)):
    corr_feature1 = srnadf_train.columns[corr_ind1[index]]
    corr_feature2 = srnadf_train.columns[corr_ind2[index]]

    if corr_feature1.strip('rna_') in cancer_genes and corr_feature2.strip('rna_') not in cancer_genes:
        to_remove.add(corr_feature1)
        c =  c+1

    elif corr_feature2.strip('rna_') in cancer_genes and corr_feature1.strip('rna_') not in cancer_genes:
        to_remove.add(corr_feature2)
        c= c+1

    elif feature_var[corr_feature1] > feature_var[corr_feature2]:
        to_remove.add(corr_feature2)
        v= v+1

    elif feature_var[corr_feature2] > feature_var[corr_feature1]:
        to_remove.add(corr_feature1)
        v = v+1


urnadf_train = rnadf_train.drop(columns = to_remove)
urnadf_test = rnadf_test.drop(columns = to_remove)
urnadf_valid = rnadf_valid.drop(columns = to_remove)


print(f"We adjudicated {c} features based on known cancer and {v} features by variance.")
print(f"We removed {len(to_remove)} total features.")

We adjudicated 866 features based on known cancer and 59334 features by variance and 0 features randomly.
We removed 3232 total features.


In [18]:
preselector = SelectPercentile(mutual_info_classif, percentile=2)
preselector.fit(urnadf_train, y_train)

selected_features = preselector.get_support(indices=True) 
psrnadf_train = urnadf_train.iloc[:, selected_features]
psrnadf_test = urnadf_test.iloc[:, selected_features]
psrnadf_valid = urnadf_valid.iloc[:, selected_features]

In [19]:
rfc = RandomForestClassifier(n_estimators=50, random_state=26, n_jobs=-1)
selector = RFECV(rfc, min_features_to_select=200, step=5, cv=5, n_jobs=-1)
selector.fit(psrnadf_train, y_train)

selected_features = selector.get_support(indices=True) 
rfrnadf_train = psrnadf_train.iloc[:, selected_features]
rfrnadf_test = psrnadf_test.iloc[:, selected_features]
rfrnadf_valid = psrnadf_valid.iloc[:, selected_features]

In [20]:
#Mutation Data Processing
constant_cols = mutdf_train.loc[:, mutdf_train.nunique() == 1]

vmutdf_train = mutdf_train.drop(columns=constant_cols)
vmutdf_test = mutdf_test.drop(columns=constant_cols)
vmutdf_valid = mutdf_valid.drop(columns=constant_cols)

In [21]:
mut_np = vmutdf_train.to_numpy()
corr_matrix = np.corrcoef(mut_np, rowvar=False)
corr_triangle = np.triu(corr_matrix, k = 1)
feature_var = vmutdf_train.var()

#Find associated features and remove
to_remove = set()
corr_threshold = 0.80
c = 0
v = 0

corr_ind1, corr_ind2 = np.where(corr_triangle >= corr_threshold)

#Which feature is known for cancer? Which feature has higher variance?
for index in range(len(corr_ind1)):
    corr_feature1 = vmutdf_train.columns[corr_ind1[index]]
    corr_feature2 = vmutdf_train.columns[corr_ind2[index]]

    if corr_feature1.strip('Hugo_Symbol_') in cancer_genes and corr_feature2.strip('Hugo_Symbol_') not in cancer_genes:
        to_remove.add(corr_feature1)
        c =  c+1

    elif corr_feature2.strip('Hugo_Symbol_') in cancer_genes and corr_feature1.strip('Hugo_Symbol_') not in cancer_genes:
        to_remove.add(corr_feature2)
        c= c+1

    elif feature_var[corr_feature1] > feature_var[corr_feature2]:
        to_remove.add(corr_feature2)
        v= v+1

    elif feature_var[corr_feature2] > feature_var[corr_feature1]:
        to_remove.add(corr_feature1)
        v = v+1


umutdf_train = vmutdf_train.drop(columns = to_remove)
umutdf_test = vmutdf_test.drop(columns = to_remove)
umutdf_valid = vmutdf_valid.drop(columns = to_remove)


print(f"We adjudicated {c} features based on known cancer and {v} features by variance.")
print(f"We removed {len(to_remove)} total features.")

We adjudicated 118767 features based on known cancer and 183881 features by variance and 0 features randomly.
We removed 3408 total features.


In [22]:
preselector = SelectPercentile(mutual_info_classif, percentile=3)
preselector.fit(umutdf_train, y_train)

selected_features = preselector.get_support(indices=True) 
psmutdf_train = umutdf_train.iloc[:, selected_features]
psmutdf_test = umutdf_test.iloc[:, selected_features]
psmutdf_valid = umutdf_valid.iloc[:, selected_features]

In [23]:
rfc = RandomForestClassifier(n_estimators=50, random_state=26, n_jobs=-1)
selector = RFECV(rfc, min_features_to_select=200, step=5, cv=5, n_jobs=-1)
selector.fit(psmutdf_train, y_train)

selected_features = selector.get_support(indices=True) 
rfmutdf_train = psmutdf_train.iloc[:, selected_features]
rfmutdf_test = psmutdf_test.iloc[:, selected_features]
rfmutdf_valid = psmutdf_valid.iloc[:, selected_features]

In [None]:
#Recombine categorical and numeric data
mclindf_train = clindf_train.reset_index().drop('PATIENT_ID', axis = 1)
mrnadf_train = rfrnadf_train.reset_index().drop('PATIENT_ID', axis = 1)
mmutdf_train = rfmutdf_train.reset_index().drop('PATIENT_ID', axis = 1)
halfmergedf_train = mclindf_train.join(mrnadf_train, how="inner")
mergedf_train = halfmergedf_train.join(mmutdf_train, how="inner")

mclindf_test = clindf_test.reset_index().drop('PATIENT_ID', axis = 1)
mrnadf_test = rfrnadf_test.reset_index().drop('PATIENT_ID', axis = 1)
mmutdf_test = rfmutdf_test.reset_index().drop('PATIENT_ID', axis = 1)
halfmergedf_test = mclindf_test.join(mrnadf_test, how="inner")
mergedf_test = halfmergedf_test.join(mmutdf_test, how="inner")

mclindf_valid = clindf_valid.reset_index().drop('PATIENT_ID', axis = 1)
mrnadf_valid = rfrnadf_valid.reset_index().drop('PATIENT_ID', axis = 1)
mmutdf_valid = rfmutdf_valid.reset_index().drop('PATIENT_ID', axis = 1)
halfmergedf_valid = mclindf_valid.join(mrnadf_valid, how="inner")
mergedf_valid = halfmergedf_valid.join(mmutdf_valid, how="inner")

mergedf_train.to_csv('<PATHNAME>', index=False)
mergedf_test.to_csv('<PATHNAME>', index=False)
mergedf_valid.to_csv('<PATHNAME>', index=False)
y_train.to_csv('<PATHNAME>', index=False)
y_test.to_csv('<PATHNAME>', index=False)
y_valid.to_csv('<PATHNAME>', index=False)

## Final Model Training

In [24]:
def reset_weights(m):
    for layer in m.children():
        if hasattr(layer, 'reset_parameters'):
            layer.reset_parameters()

In [26]:
#Loads saved data
trainXdf = pd.read_csv('nfinaltrainX.csv')
testXdf = pd.read_csv('nfinaltestX.csv')
validXdf = pd.read_csv('nfinalvalidX.csv')
trainYdf = pd.read_csv('ntrainY.csv')
testYdf = pd.read_csv('ntestY.csv')
validYdf = pd.read_csv('nvalidY.csv')

#Creates tensors
trainX = trainXdf.to_numpy()
testX = testXdf.to_numpy()
validX = validXdf.to_numpy()

trainY = trainYdf.to_numpy()
testY = testYdf.to_numpy()
validY = validYdf.to_numpy()

TrainX = T.tensor(trainX, dtype=T.float32)
TrainY = T.tensor(trainY, dtype=T.float32).reshape(-1, 1)

TestX = T.tensor(testX, dtype=T.float32)
TestY = T.tensor(testY, dtype=T.float32).reshape(-1, 1)

ValidX = T.tensor(validX, dtype=T.float32)
ValidY = T.tensor(validY, dtype=T.float32).reshape(-1, 1)

In [27]:
paramcount = len(TestX[0])

In [29]:
nr = sum(trainY == 0)
r = sum(trainY == 1)
bal = nr/r

In [30]:
weight = T.tensor([bal], dtype=T.float32)

  weight = T.tensor([bal], dtype=T.float32)


In [31]:
hparams = {
    "lr" : hp.uniform('lr', 0.0001, 0.0009),
    "batch": hp.randint('batch',15,30),
    "decay" : hp.uniform('decay',0.001, 0.009),
    "do" : hp.choice('do',[0.5,0.6,0.7, 0.8, 0.9]),
    
}

hyperopt_search = HyperOptSearch(hparams, metric="loss", mode="min")

def createNet(do):
    model = nn.Sequential(
    nn.Linear(paramcount, 1024),
    nn.LeakyReLU(),
    nn.Dropout(do),
    nn.Linear(1024, 512),
    nn.LeakyReLU(),
    nn.Dropout(do),
    nn.Linear(512, 256),
    nn.LeakyReLU(),
    nn.Linear(256,1)
)
    return model

def train(hparams):
    model = createNet(hparams["do"])
    reset_weights(model)
    lossfxn = nn.BCEWithLogitsLoss(pos_weight = weight)
    optimizer = optim.Adam(model.parameters(), lr=hparams["lr"], weight_decay = hparams["decay"])

    numepochs = 30
    batchsize = hparams["batch"]
    for epoch in range(numepochs):
        for i in range (0, len(TrainX), batchsize):
            Xbatch = TrainX[i:i+batchsize]
            ybatch = TrainY[i:i+batchsize]
            ypredicted = model(Xbatch)
            loss = lossfxn(ypredicted, ybatch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Calculate average validation loss
        model.eval() 
        val_loss = 0.0
        with T.no_grad():
            y_prob = model(TestX).numpy()
            for i in range(0, len(validX), batchsize):
                Xbatch_val = ValidX[i:i+batchsize]
                ybatch_val = ValidY[i:i+batchsize]
                ypredicted_val = model(Xbatch_val)
                val_loss_batch = lossfxn(ypredicted_val, ybatch_val)
                val_loss += val_loss_batch.item()
        val_loss /= (len(ValidX) / batchsize)
        
        # Calculate FPR, TPR, and thresholds
        fpr, tpr, thresholds = roc_curve(TestY, y_prob)
    
        # Calculate the AUC (Area Under the Curve)
        roc_auc = auc(fpr, tpr)
    
        model.train()

        ray.train.report(
            {"loss" : val_loss,
            "AUC" : roc_auc})

In [32]:
#Hyperparameter training
analysis = tune.run(
    train,
    search_alg=hyperopt_search,
    num_samples = 500,
    max_concurrent_trials = 30,
    scheduler=ASHAScheduler(
        metric = "loss",
        mode = "min",
        max_t = 30,
        grace_period = 10,
        reduction_factor = 2),
    resources_per_trial={"cpu": 10, "gpu": 0},
    verbose = 1)

0,1
Current time:,2025-04-16 01:34:51
Running for:,00:00:21.39
Memory:,55.6/375.5 GiB

Trial name,status,loc,batch,decay,do,lr,iter,total time (s),loss,AUC
train_04c55282,PENDING,,24,0.00733347,0.9,0.000696433,,,,
train_519159d7,TERMINATED,172.20.216.7:3897442,25,0.00377207,0.5,0.000622922,30.0,6.82183,1.21271,0.714286
train_ecfbbbf4,TERMINATED,172.20.216.7:3954512,26,0.00756158,0.8,0.000599886,30.0,3.49858,1.05859,0.705882
train_2c4ec3df,TERMINATED,172.20.216.7:3998997,18,0.0039877,0.7,0.000718294,30.0,2.47073,0.710554,0.789916


[36m(pid=3897442)[0m   return torch.load(io.BytesIO(b))
[36m(pid=3897442)[0m   return torch.load(io.BytesIO(b))
[36m(pid=3954512)[0m   return torch.load(io.BytesIO(b))
[36m(pid=3954512)[0m   return torch.load(io.BytesIO(b))
[36m(pid=3998997)[0m   return torch.load(io.BytesIO(b))
[36m(pid=3998997)[0m   return torch.load(io.BytesIO(b))
2025-04-16 01:34:51,780	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/users/kduru1/ray_results/train_2025-04-16_01-34-30' in 0.0363s.
[36m(pid=4054209)[0m   return torch.load(io.BytesIO(b))[32m [repeated 2x across cluster][0m
2025-04-16 01:34:53,379	INFO tune.py:1041 -- Total run time: 23.18 seconds (21.36 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)
- train_04c55282: FileNotFoundError('Could not fetch metrics for train_04c55282: both result.json and progress.csv were not found at /users/kduru1/ray_results/train_2025-04-16_01-34-30/train_04c55282_4_batch=24,de

## Comparative Models

In [33]:
#Loads saved data
trainXdf = pd.read_csv('nfinaltrainX.csv')
testXdf = pd.read_csv('nfinaltestX.csv')
validXdf = pd.read_csv('nfinalvalidX.csv')
trainYdf = pd.read_csv('ntrainY.csv')
testYdf = pd.read_csv('ntestY.csv')
validYdf = pd.read_csv('nvalidY.csv')

#Creates tensors
trainX = trainXdf.to_numpy()
testX = testXdf.to_numpy()

trainY = trainYdf.to_numpy()
testY = testYdf.to_numpy()

validX = validXdf.to_numpy()
validY = validYdf.to_numpy()

TrainX = T.tensor(trainX, dtype=T.float32)
TrainY = T.tensor(trainY, dtype=T.float32).reshape(-1, 1)

TestX = T.tensor(testX, dtype=T.float32)
TestY = T.tensor(testY, dtype=T.float32).reshape(-1, 1)

ValidX = T.tensor(validX, dtype=T.float32)
ValidY = T.tensor(validY, dtype=T.float32).reshape(-1, 1)

In [34]:
#SVC
svc_model = SVC(kernel='rbf', C=20, gamma='scale', random_state=41, probability = True) 
svc_model.fit(trainX, trainY)

y_pred = svc_model.predict(testX) 
y_probs = svc_model.predict_proba(testX)[:, 1] 


accuracy = accuracy_score(testY, y_pred)
precision = precision_score(testY, y_pred)
recall = recall_score(testY, y_pred)
f1 = f1_score(testY, y_pred)

print("SVC")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

aucreport1 = roc_auc_score(testY, y_probs)
fpr1, tpr1, _ = roc_curve(testY, y_probs)


#MLP
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', 
                          alpha=0.0001, max_iter=500, random_state=42)

mlp_model.fit(trainX, trainY)

y_pred = mlp_model.predict(testX) 
y_probs = mlp_model.predict_proba(testX)[:, 1] 

accuracy = accuracy_score(testY, y_pred)
precision = precision_score(testY, y_pred)
recall = recall_score(testY, y_pred)
f1 = f1_score(testY, y_pred)

print("MLP")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

aucreport2 = roc_auc_score(testY, y_probs) 

fpr2, tpr2, _ = roc_curve(testY, y_probs)


#AdaBoost
adaboost_model = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

adaboost_model.fit(trainX, trainY)

y_pred = adaboost_model.predict(testX)
y_probs = adaboost_model.predict_proba(testX)[:, 1]

accuracy = accuracy_score(testY, y_pred)
precision = precision_score(testY, y_pred)
recall = recall_score(testY, y_pred)
f1 = f1_score(testY, y_pred)

print("AdaBoost")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

aucreport3 = roc_auc_score(testY, y_probs)

fpr3, tpr3, _ = roc_curve(testY, y_probs)




#K-Neighbors Classifier
model = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=20)


model.fit(trainX, trainY)

y_pred = model.predict(testX)
y_probs = model.predict_proba(testX)[:, 1]

accuracy = accuracy_score(testY, y_pred)
precision = precision_score(testY, y_pred)
recall = recall_score(testY, y_pred)
f1 = f1_score(testY, y_pred)

print("K-Neigh")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

aucreport4 = roc_auc_score(testY, y_probs)
fpr4, tpr4, _ = roc_curve(testY, y_probs)



#BNB
model = BernoulliNB()
model.fit(trainX, trainY)

y_pred = model.predict(testX)
y_probs = model.predict_proba(testX)[:, 1]

accuracy = accuracy_score(testY, y_pred)
precision = precision_score(testY, y_pred)
recall = recall_score(testY, y_pred)
f1 = f1_score(testY, y_pred)

print("BNB")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

aucreport5 = roc_auc_score(testY, y_probs)

fpr5, tpr5, _ = roc_curve(testY, y_probs)


#Logistic Regression
model = LogisticRegression()
model.fit(trainX, trainY)

y_pred = model.predict(testX)
y_probs = model.predict_proba(testX)[:, 1]

accuracy = accuracy_score(testY, y_pred)
precision = precision_score(testY, y_pred)
recall = recall_score(testY, y_pred)
f1 = f1_score(testY, y_pred)

print("Logistic R")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

aucreport6 = roc_auc_score(testY, y_probs)

fpr6, tpr6, _ = roc_curve(testY, y_probs)



#XGBoost
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
)

model.fit(trainX, trainY)

y_pred = model.predict(testX)
y_probs = model.predict_proba(testX)[:, 1]

accuracy = accuracy_score(testY, y_pred)
precision = precision_score(testY, y_pred)
recall = recall_score(testY, y_pred)
f1 = f1_score(testY, y_pred)

print("XGBoost")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

aucreport7 = roc_auc_score(testY, y_probs)

fpr7, tpr7, _ = roc_curve(testY, y_probs)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


SVC
Accuracy: 0.708
Precision: 0.708
Recall: 1.000
F1-score: 0.829
MLP
Accuracy: 0.750
Precision: 0.789
Recall: 0.882
F1-score: 0.833
AdaBoost
Accuracy: 0.667
Precision: 0.737
Recall: 0.824
F1-score: 0.778


  return self._fit(X, y)


K-Neigh
Accuracy: 0.625
Precision: 0.900
Recall: 0.529
F1-score: 0.667
BNB
Accuracy: 0.708
Precision: 0.727
Recall: 0.941
F1-score: 0.821
Logistic R
Accuracy: 0.750
Precision: 0.762
Recall: 0.941
F1-score: 0.842


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBoost
Accuracy: 0.583
Precision: 0.667
Recall: 0.824
F1-score: 0.737


In [None]:
#Optimal therapy tool

import itertools

# Generate all binary combinations of length 5
binary_combinations = list(itertools.product([0, 1], repeat=9))

# Print all combinations
prob_responses = []
combos = []
max_prob = 0
max_combo = []
for combo in binary_combinations:
    with T.no_grad():
        testX[14][0:9] = list(combo)
        testX[14][8] = 0
        TestX = T.tensor(testX, dtype=T.float32)
        outputs = analysis_model(TestX[14])
        prob = T.sigmoid(outputs).cpu().numpy().flatten()
        if prob > max_prob:
            max_prob = prob
            max_combo = combo
        