In [None]:
import pandas as pd
import numpy as np
import json

import tensorflow as tf
from tensorflow.keras import datasets, layers, models
import matplotlib.pyplot as plt
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split

In [None]:
# Load the MC3 mutation calls and pivot them into a sample-by-gene count matrix.
mc3_columns = ["Gene", "Tumor_Sample_Barcode"]
df_mc3 = pd.read_csv(
    "mc3.v0.2.8.PUBLIC.maf",
    sep="\t",
    usecols=mc3_columns,
    low_memory=True,
)

# Count the rows for every (sample barcode, gene) pair, then spread the genes
# out as columns; pairs that never occur are filled with 0.
pair_counts = df_mc3.groupby(['Tumor_Sample_Barcode', "Gene"]).size()
df_mc3 = pair_counts.unstack(fill_value=0)

# Append each row's sample barcode (the frame index) as the last column.
mc3 = np.column_stack((df_mc3.to_numpy(), df_mc3.index.values))

In [None]:
# Load the c96 table, transposed so that every column of the CSV becomes a row.
df_96 = pd.read_csv("WES_TCGA.96.csv").T

# Append each row's index label as a trailing column, then drop the first two rows.
c96 = np.column_stack((df_96.to_numpy(), df_96.index.values))[2:]

# Trim every label to the text that begins two characters after its first ":".
for row_idx in range(len(c96)):
    raw_label = c96[row_idx, -1]
    c96[row_idx, -1] = raw_label[raw_label.index(":") + 2:]

# Independent copy of the trimmed label column.
c96_last = c96[:, -1].copy()

# Map each mc3 row's last column (its sample barcode) to the rest of that row.
mc3_dict = {sample_row[-1]: sample_row[:-1] for sample_row in mc3}

In [None]:
# Label the mc3 and c96 datasets
# Read the case metadata once; the context manager closes the file handle
# (the bare open() call previously left it open).
with open("cases.2020-02-28.json", "r") as cases_file:
    jsonFile = json.load(cases_file)

# Index the cases by submitter_id so each lookup is a dict access instead of a
# scan over the whole list. setdefault keeps the FIRST case seen for an id,
# which is the one a front-to-back scan would have returned.
_case_by_submitter = {}
for _case in jsonFile:
    _case_by_submitter.setdefault(_case["submitter_id"], _case)


def runJson(submitter_id):
    """Return the project id of the case whose ``submitter_id`` matches.

    Parameters
    ----------
    submitter_id : str
        Identifier compared against the ``"submitter_id"`` field of each case.

    Returns
    -------
    The matching case's ``["project"]["project_id"]`` value, or ``False`` when
    no case matches (an ``ERROR: <id>`` line is printed in that situation).
    """
    case = _case_by_submitter.get(submitter_id)
    if case is not None:
        return case["project"]["project_id"]
    print("ERROR: " + str(submitter_id))
    return False


# Work on copies so the unlabeled arrays stay available.
mc3_labeled = np.copy(mc3)
c96_labeled = np.copy(c96)

# Replace the last column of every row with the project id looked up from the
# first 12 characters of the identifier stored there.
for labeled_rows in (mc3_labeled, c96_labeled):
    for row in labeled_rows:
        row[-1] = runJson(row[-1][:12])

In [None]:
# get the top ten cancers from the mc3 dataset
# Tally how many mc3 rows carry each label (the last column of a row).
counts = {}
for i in mc3_labeled:
    tumorName = i[-1]
    counts[tumorName] = counts.get(tumorName, 0) + 1

# Show the tallies in ascending order, followed by the total number of rows.
print(sorted(counts.items(), key=lambda x: x[1]))
sumvar = sum(counts.values())
print(sumvar)

# Derive the ten most frequent labels from the tallies instead of typing them
# by hand: the hand-written list named "TCGA-UCEC" twice, which left only nine
# distinct classes and one class index that no sample could ever receive.
# Non-string labels are skipped because runJson stores False for identifiers
# it could not resolve. Ties keep the order in which labels were first seen.
_ranked = sorted(
    ((name, n_rows) for name, n_rows in counts.items() if isinstance(name, str)),
    key=lambda item: item[1],
    reverse=True,
)
topTen = [name for name, _ in _ranked[:10]]

In [None]:
# Restrict both labeled datasets to the rows whose label is listed in topTen.

# mc3: keep a row when its last column (the label) appears in topTen.
mc3_labeled_top = np.array([row for row in mc3_labeled if row[-1] in topTen])

# Sum of the mc3 tallies over every entry of topTen (a name listed twice is
# added twice).
sumagain = sum(counts[name] for name in topTen)

# c96: same membership test on the last column.
c96_labeled_top = np.array([row for row in c96_labeled if row[-1] in topTen])

In [None]:
# Split each filtered dataset into inputs (x) and integer class labels (y).

# Class index of a tumor type = its position in topTen; if a name occurs more
# than once, the later position wins.
typeToNumber = {tumor_type: position for position, tumor_type in enumerate(topTen)}

# mc3: every column except the last is an input, the last one is the tumor type.
x_mc3 = mc3_labeled_top[:, :-1]
y_mc3 = np.array([typeToNumber[tumor_type] for tumor_type in mc3_labeled_top[:, -1]])

# c96: same layout.
x_c96 = c96_labeled_top[:, :-1]
newYList = [typeToNumber[tumor_type] for tumor_type in c96_labeled_top[:, -1]]
y_c96 = np.array(newYList)

In [None]:
# Print the shape of each input matrix: mc3 first, then c96.
for feature_matrix in (x_mc3, x_c96):
    print(feature_matrix.shape)

In [None]:
# Assign which type of data to use
x = x_mc3
y = y_mc3


def build_classifier(n_inputs, dropout_rate, n_classes):
    """Build and compile the dense softmax classifier used for both datasets.

    Layer order: Input -> Dropout -> Dense(128) -> Dropout -> Dense(64) ->
    Dropout -> Dense(64) -> Dropout -> Dense(n_classes, softmax). The hidden
    layers use ReLU and L2 kernel regularization (0.025 on the first, 0.02 on
    the other two).

    Parameters
    ----------
    n_inputs : int
        Number of input features (columns of the feature matrix).
    dropout_rate : float
        Rate shared by all four dropout layers.
    n_classes : int
        Number of units in the softmax output layer.

    Returns
    -------
    tf.keras.Model
        Compiled with the Adam optimizer, sparse categorical cross-entropy
        loss and an accuracy metric.
    """
    net_input = tf.keras.layers.Input(shape=(n_inputs,))
    net = tf.keras.layers.Dropout(dropout_rate)(net_input)
    # (units, L2 strength) of each hidden layer; a dropout layer follows each.
    for units, l2_strength in ((128, 0.025), (64, 0.02), (64, 0.02)):
        net = tf.keras.layers.Dense(units, activation='relu',
                                    kernel_regularizer=regularizers.l2(l2_strength))(net)
        net = tf.keras.layers.Dropout(dropout_rate)(net)
    net_output = tf.keras.layers.Dense(n_classes, activation='softmax')(net)

    classifier = tf.keras.Model(inputs=[net_input], outputs=[net_output])
    classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return classifier


# Labels are positions in topTen, so the output layer needs one unit per entry.
n_classes = len(topTen)

# determine the number of input features
n_features = x.shape[1]
n_features_c96 = x_c96.shape[1]

# model for the selected data (x, y): dropout 0.5 for mc3
model = build_classifier(n_features, 0.5, n_classes)
# model for the c96 data: dropout 0.2
model_c96 = build_classifier(n_features_c96, 0.2, n_classes)

# Stop when val_loss has not improved for 20 epochs and restore the best weights.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)]


In [None]:
def data_split(x, y, test_size, val_size, random_state=None):
    """Split ``x``/``y`` into stratified train, validation and test sets.

    The test set is carved out first; the validation set is then carved out of
    what remains, so ``val_size`` is a fraction of the non-test data, not of
    the full dataset.

    Parameters
    ----------
    x, y : array-like
        Inputs and labels; ``y`` is also used as the stratification key.
    test_size : float or int
        Passed to ``train_test_split`` for the train/test split.
    val_size : float or int
        Passed to ``train_test_split`` for the train/validation split.
    random_state : int, RandomState instance or None, default None
        Passed to both ``train_test_split`` calls. ``None`` (the default) keeps
        the previous behavior of a different random split on every call; pass
        an int to make the split reproducible.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_valid, Y_valid, X_test, Y_test)``, each converted
        to a ``float32`` numpy array.
    """
    # Create train/test split
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        x, y, test_size=test_size, stratify=y, random_state=random_state)
    # Create validation split from train split
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        X_rest, Y_rest, test_size=val_size, stratify=Y_rest, random_state=random_state)

    def _as_float32(values):
        # Object-dtype inputs are converted to a plain float32 array.
        return np.asarray(values).astype(np.float32)

    return (_as_float32(X_train), _as_float32(Y_train),
            _as_float32(X_valid), _as_float32(Y_valid),
            _as_float32(X_test), _as_float32(Y_test))

test_accuracy = []
test_loss = []
val_accuracy = []
val_loss = []
con_mats = []
# NOTE: this rebinds `models`, hiding the tensorflow.keras `models` module
# imported at the top of the notebook.
models = []
n_repeat = 1
y_pred_cross_vals = []


def _compile_like_model(net):
    """Compile ``net`` with the optimizer, loss and metric `model` is compiled with."""
    net.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return net


# Untrained weights, captured once so that every repeat starts from the same
# state instead of continuing from the previous repeat's fit (which would let
# samples from an earlier test split influence a later run).
initial_weights = model.get_weights()
# Width of the output layer; fixes the confusion-matrix size for every repeat.
n_classes = int(model.outputs[0].shape[-1])

# Repeated stratified hold-out (not k-fold): each repeat draws a new random split.
for k in range(n_repeat):
    # Reset the weights and recompile, which also gives a fresh optimizer state.
    model.set_weights(initial_weights)
    _compile_like_model(model)

    # get data sets for training and testing
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = data_split(x, y, 0.25, 0.25)

    # fit the model, larger batch size, validate
    history = model.fit(X_train, Y_train, validation_data=(X_valid, Y_valid),
                        epochs=150, callbacks=callbacks, batch_size=120, verbose=1)
    history = history.history

    # evaluate the fitted model on the held-out test split
    loss, acc = model.evaluate(X_test, Y_test, verbose=0)
    test_accuracy.append(acc)
    test_loss.append(loss)

    # Score the validation split with the weights the model actually holds.
    # The callback uses restore_best_weights=True, so the last entry of the
    # training history can describe a later epoch than the restored weights.
    valid_loss, valid_acc = model.evaluate(X_valid, Y_valid, verbose=0)
    val_accuracy.append(valid_acc)
    val_loss.append(valid_loss)

    # confusion matrix; num_classes keeps the shape fixed even when the highest
    # class index is absent from this split's labels and predictions
    Y_pred_all_labels = model.predict(X_test)
    Y_pred = np.argmax(Y_pred_all_labels, axis=1)
    con_mat = tf.math.confusion_matrix(labels=Y_test, predictions=Y_pred,
                                       num_classes=n_classes).numpy()
    con_mats.append(con_mat)

    # Store an independent copy of the trained network; appending `model`
    # itself would make every list entry the same object.
    trained_copy = _compile_like_model(tf.keras.models.clone_model(model))
    trained_copy.set_weights(model.get_weights())
    models.append(trained_copy)

    # storing X_test, Y_test, Y_pred_all_labels, Y_pred
    y_pred_cross_vals.append((X_test, Y_test, Y_pred_all_labels, Y_pred))


In [None]:
# Average of each metric over all repeats.
print("test accuracy: " + str(np.average(np.array(test_accuracy))))
print("test loss: " + str(np.average(np.array(test_loss))))
print("val accuracy: " + str(np.average(np.array(val_accuracy))))
print("val loss: " + str(np.average(np.array(val_loss))))

# Element-wise sum of the per-repeat confusion matrices. The accumulator is
# sized from the matrices themselves rather than a fixed 10x10, and a smaller
# matrix is added into the top-left corner (class indices start at 0), so
# matrices of different sizes neither raise nor get truncated.
if con_mats:
    _n_rows = max(np.asarray(mat).shape[0] for mat in con_mats)
    _n_cols = max(np.asarray(mat).shape[1] for mat in con_mats)
    _total = np.zeros((_n_rows, _n_cols), dtype=np.int64)
    for mat in con_mats:
        mat = np.asarray(mat)
        _total[:mat.shape[0], :mat.shape[1]] += mat
    con_mats_sum = _total.tolist()
else:
    # Nothing recorded: keep the all-zero 10x10 table that was shown before.
    con_mats_sum = [[0] * 10 for _ in range(10)]

print("Confusion matrix sum: ")
print('\n'.join([''.join(['{:4}'.format(item) for item in row])
      for row in con_mats_sum]))