In [None]:
# imports

import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# uncomment to set GPU
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
"""
# ClassificationReport
# Class to calculate accuracy, precision, recall and F1 score.
# An object of this class is constructed from a prediction file, a label file and an output file.
"""

class ClassificationReport:
    """Score a saved prediction file and write a per-class report as CSV.

    Parameters
    ----------
    model_res_file : str
        Tab-separated prediction file without a header row. Only the last
        three non-empty columns are used: predicted class index, ground-truth
        class index and a True/False correctness flag.
    lfile : str
        Text file with one label per line.
    outfile : str
        Path the classification report is written to (CSV).
    """

    def __init__(self, model_res_file, lfile, outfile):
        # file containing model predictions
        self.model_res_file = model_res_file

        # file holding the unique list of labels, one per line
        self.lfile = lfile

        # output file
        self.outfile = outfile

        # Read the labels inside a context manager so the handle is closed
        # as soon as the list has been built.
        with open(self.lfile, 'r') as label_handle:
            self.labels = [label.rstrip() for label in label_handle]

    def parse_results(self):
        """Print accuracy and the classification report, then save the report.

        Columns containing any NaN are dropped before indexing from the end of
        the frame, so an empty separator column in the prediction file does
        not shift the positions of the predicted / ground-truth columns.
        """
        df = pd.read_csv(self.model_res_file, sep='\t', header=None).dropna(axis=1)
        ypred = np.array(df.iloc[:, -3])
        yreal = np.array(df.iloc[:, -2])

        # overall accuracy
        acc = accuracy_score(yreal, ypred)
        print("Accuracy:", acc)

        # per-class precision, recall and f1-score, one row per class
        report = classification_report(yreal, ypred, output_dict=True, zero_division=0)
        report = pd.DataFrame(report).transpose()

        print("Classification: ", report)

        report.to_csv(self.outfile)

        print('Done')



In [None]:
# Train Test Validate

# model_trainer = ModelTrainer(inputfile, testfile, label_file, mname, nfeatures ,resdir)
# model, res = model_trainer.run_mlp_model(e=50, test=True)

class ModelTrainer:
    """Train the MLP classifier and optionally score it on the test file.

    Parameters
    ----------
    train_data_file, test_data_file : str
        Files handed to ``Preprocessor`` to build the train / test arrays.
    labels_file : str
        Text file with one label per line; its line count is used as the
        number of output classes.
    mname : str
        Prefix used for the prediction and log file names.
    nfeatures : int
        Number of input features passed to ``DLmodel``.
    outdir : str
        Directory that receives the prediction file and the training log.
    """

    def __init__(self, train_data_file, test_data_file, labels_file, mname, nfeatures, outdir):
        self.train_data_file = train_data_file
        self.test_data_file = test_data_file
        self.labels_file = labels_file
        self.mname = mname
        self.outdir = outdir
        self.nfeatures = nfeatures
        self.nclasses = self.get_number_of_classes()

    def get_number_of_classes(self):
        """Return the number of lines in the labels file.

        Every line is counted, blank ones included, which matches how
        ``Preprocessor`` builds its label list from the same file.
        """
        with open(self.labels_file, 'r') as label_handle:
            return sum(1 for _ in label_handle)

    def run_mlp_model(self, e=20, test=False):
        """Train the MLP for up to ``e`` epochs and return it with its final metrics.

        Parameters
        ----------
        e : int
            Maximum number of epochs (early stopping may end training sooner).
        test : bool
            When True, predict on the test set and write the predictions to
            ``<outdir>/<mname>_MLP_predict.txt``.

        Returns
        -------
        tuple
            ``(model, metrics)`` where ``metrics`` is the dict produced by
            ``get_model_accuracy_and_loss``.
        """
        self.predict_txt = self.mname + "_MLP_predict.txt"
        self.result_log = os.path.join(self.outdir, self.mname + "_MLP.log")

        print('==> Loading train and test dataset...')
        trainX, testX, trainY, testY = Preprocessor(self.train_data_file, self.test_data_file,
                                                    self.labels_file).get_mlp_data()

        print('==> Creating  DNN model')
        mlp_model = DLmodel(self.nfeatures, self.nclasses).get_mlp_model()

        print('==> Training model ...')
        # To train the model on 70% of the data, validate on 10% and test on 20%:
        # the training file already holds only 80% of the data, so the
        # validation split is 12.5% of it.
        train_results = mlp_model.fit(trainX, trainY, epochs=e, validation_split=0.125,
                                      callbacks=[self.get_callbacks()])

        # put the per-epoch training history into a tab-separated log file
        pd.DataFrame(train_results.history).to_csv(self.result_log, sep='\t')

        if test:
            print('==> Testing model ....')
            ypred = mlp_model.predict(testX)
            self.print_ypred_test_labels(ypred, testY)
            print("Results dir: ", self.outdir)

        return mlp_model, self.get_model_accuracy_and_loss(train_results)

    def get_model_accuracy_and_loss(self, fit_results):
        """Return the last-epoch accuracy / loss values from a fit history."""
        history = fit_results.history
        metric_names = ('accuracy', 'val_accuracy', 'loss', 'val_loss')
        return {name: history[name][-1] for name in metric_names}

    def print_ypred_test_labels(self, ypred, labels):
        """Write the model predictions to ``<outdir>/<predict_txt>``.

        One tab-separated row is written per sample:

        * for every class, the softmax value followed by the one-hot
          ground-truth value for that class,
        * an empty field,
        * the predicted class index (argmax of the softmax values),
        * the ground-truth class index (argmax of the one-hot row),
        * ``True`` / ``False`` depending on whether the two indices agree.

        Requires ``self.predict_txt``, which ``run_mlp_model`` sets.
        """
        ypred_argmax = np.argmax(ypred, 1)
        lab_argmax = np.argmax(labels, 1)
        outfile = os.path.join(self.outdir, self.predict_txt)

        n_records, n_classes = ypred.shape[0], ypred.shape[1]

        with open(outfile, 'w') as out:
            for rec in range(n_records):
                fields = []
                for i in range(n_classes):
                    # str() (not format()) keeps the short repr of float32 values
                    fields.append(str(ypred[rec, i]))
                    fields.append(str(labels[rec, i]))

                # the empty field reproduces the double tab that separates the
                # per-class columns from the summary columns
                fields.append('')
                fields.append(str(ypred_argmax[rec]))
                fields.append(str(lab_argmax[rec]))
                fields.append(str(ypred_argmax[rec] == lab_argmax[rec]))
                out.write('\t'.join(fields) + '\n')

    def print_model_training_progress(self, res):
        """Save ``res`` as a tab-separated table named ``training_progress.tsv``
        in the current working directory."""
        df = pd.DataFrame(res)
        df.to_csv('training_progress.tsv', sep='\t')

    # Early Stopping
    def get_callbacks(self):
        """Return the early-stopping callback (training loss, patience of 3)."""
        return tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)



In [None]:
"""
# This class takes the training data file, the test data file and the label file.
# The data are separated into features and labels.
# It has a method to one-hot encode the labels.
#
"""

class Preprocessor:
    """Load the train / test tables and turn them into model-ready arrays.

    Parameters
    ----------
    inputfile : str
        File containing the training data.
    testfile : str
        File containing the test data.
    label_file : str
        Text file with one label per line; it defines the categories used for
        one-hot encoding.
    """

    def __init__(self, inputfile, testfile, label_file):
        # file containing training data
        self.input_files = inputfile
        # file containing test data
        self.testfile = testfile
        # file containing labels
        self.labels_file = label_file

    def split_the_data(self):
        """Read both comma-separated files and split them into features and labels.

        The first column of each file is used as the index and the last
        column as the label; everything in between is a feature.

        Returns
        -------
        tuple
            ``(trainX, testX, trainY, testY)`` - features as DataFrames,
            labels as one-hot encoded arrays.
        """
        data = pd.read_csv(self.input_files, sep=',', index_col=0)
        test = pd.read_csv(self.testfile, sep=',', index_col=0)
        print(data.shape)

        # features: every column except the last; labels: the last column
        trainX, testX = data.iloc[:, :-1], test.iloc[:, :-1]
        trainY = self.get_one_encoded_labels(data.iloc[:, -1])
        testY = self.get_one_encoded_labels(test.iloc[:, -1])

        print(trainX.shape, trainY.shape, testX.shape, testY.shape)

        return trainX, testX, trainY, testY

    def get_one_encoded_labels(self, y):
        """One-hot encode ``y`` using the categories listed in the labels file.

        The encoder is fitted on the labels file rather than on ``y`` so that
        the train and test encodings share the same columns even when one of
        them lacks a class.
        """
        y = y.to_numpy().reshape(-1, 1)
        # read inside a context manager so the file handle is released
        with open(self.labels_file) as label_handle:
            known_labels = [label.strip() for label in label_handle]
        labels = np.array(known_labels).reshape(-1, 1)
        ohe = OneHotEncoder()
        ohe.fit(labels)
        return ohe.transform(y).toarray()

    def get_mlp_data(self):
        """Return ``(trainX, testX, trainY, testY)`` with features as float64 arrays."""
        trainX, testX, trainY, testY = self.split_the_data()
        trainX = trainX.to_numpy('float64')
        testX = testX.to_numpy('float64')

        return trainX, testX, trainY, testY

    def get_cnn_data(self):
        """Return ``(trainX, testX, trainY, testY)`` with features reshaped by ``reshape_data``."""
        trainX, testX, trainY, testY = self.split_the_data()

        trainX = self.reshape_data(trainX)
        testX = self.reshape_data(testX)

        # Log transformation
        #trainX = self.logtranformthe_data(self.reshape_data(trainX), add_to_zeros=1)
        #testX  = self.logtranformthe_data(self.reshape_data(testX), add_to_zeros=1)

        print("dataX shape :", trainX.shape)
        return trainX, testX, trainY, testY

    def reshape_data(self, x):
        """Convert a DataFrame to a float64 array of shape (rows, columns, 1)."""
        print('Reshape data')
        x = x.to_numpy('float64')

        # 1 column matrix
        x = x.reshape(x.shape[0], x.shape[1], 1)

        # 1 row  matrix
        # x = x.reshape(x.shape[0], 1, x.shape[1])

        return x

    def logtranformthe_data(self, x, base=10, add_to_zeros=1):
        """Return ``log_base(x + add_to_zeros)``; the offset keeps zeros finite."""
        print('log transformed')
        x = np.log(x + add_to_zeros) / np.log(base)
        return x

    def get_test_data(self, isCNN=False, sep='\t'):
        """Read ``self.input_files`` and return only its feature columns.

        Parameters
        ----------
        isCNN : bool
            When True the features are reshaped by ``reshape_data``;
            otherwise a 2-D float64 array is returned.
        sep : str
            Field separator of the file. Defaults to a tab, unlike
            ``split_the_data`` which reads comma-separated files.
        """
        # NOTE(review): this reads the file given as `inputfile`, not `testfile` - confirm intended.
        data = pd.read_csv(self.input_files, sep=sep, index_col=0)
        dataX = data.iloc[:, :-1]  # drop the last (label) column

        if isCNN:
            dataX = self.reshape_data(dataX)
        else:
            dataX = dataX.to_numpy('float64')

        return dataX




In [None]:
import tensorflow as tf
from numpy.random import seed

# Model

class DLmodel:
    """Builder for the fully connected (MLP) Keras classifier.

    Change the layer stack in ``get_mlp_model`` to alter the architecture.

    Parameters
    ----------
    nfeatures : int
        Width of the input layer.
    nclasses : int
        Number of units in the softmax output layer.
    """

    def __init__(self, nfeatures, nclasses):
        self.nfeatures = nfeatures
        self.nclasses = nclasses

    # method for MLP model
    def get_mlp_model(self, learning_rate=0.00001, seed=7):
        """Create and compile a fresh MLP.

        Parameters
        ----------
        learning_rate : float
            Learning rate handed to the Adam optimizer.
        seed : int
            Value passed to ``tf.random.set_seed`` before the layers are built.

        Returns
        -------
        A compiled ``tf.keras`` Sequential model with five ReLU hidden layers
        (500, 250, 125, 100, 75 units) and a softmax layer named
        ``output_layer``.
        """
        # No "delete a previous model" step is needed: `model` is a local
        # name here, so nothing survives from an earlier call. Checking
        # globals() and then running `del model` would raise
        # UnboundLocalError whenever a global called `model` exists.

        # set the random seed to a constant number before initializing the model
        tf.random.set_seed(seed)

        print('Creating the model ... ')

        model = tf.keras.models.Sequential()

        # input layer and hidden layers
        model.add(tf.keras.layers.Dense(500, activation='relu', input_shape=(self.nfeatures,)))
        for units in (250, 125, 100, 75):
            model.add(tf.keras.layers.Dense(units, activation='relu'))

        # output layer
        model.add(tf.keras.layers.Dense(self.nclasses, activation='softmax', name='output_layer'))

        # optimizer and learning rate; `learning_rate` is the supported
        # keyword, `lr` is a deprecated alias
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        # Model compile
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

        # summary() prints the table itself and returns None, so it is not
        # wrapped in print()
        model.summary()
        return model




In [None]:
def train_Test_DNN(dname, root_dir, datafile,
                   label_file='/home/n10337547/gpu/extra/Projects/2_GPU_Parallel/TCGA_classes_new.txt',
                   nfeatures=12350, epochs=50):
    """Split the data, train one MLP, save it and write its classification report.

    Parameters
    ----------
    dname : str
        Name of the sub-directory (under ``root_dir``) that receives all
        outputs of this run.
    root_dir : str
        Main output directory.
    datafile : str
        Input data file handed to ``DataSplitter``.
    label_file : str
        File containing the unique labels, one per line.
    nfeatures : int
        Number of genes used to train the model.
    epochs : int
        Maximum number of training epochs.
    """
    # One directory per run holds the split data, the model and the results.
    resdir = os.path.join(root_dir, dname)
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(resdir, exist_ok=True)

    # create an object of the DataSplitter class
    # NOTE(review): DataSplitter is not defined in the visible part of this
    # notebook; presumably it writes the two CSV files named below - confirm.
    dsplit = DataSplitter(datafile, resdir)

    # create Train and Test datasets
    dsplit.split_test_train()
    inputfile = os.path.join(resdir, 'TCGA_FPKM_5_log_train.csv')
    testfile = os.path.join(resdir, 'TCGA_FPKM_5_log_test.csv')

    dsplit.split_test_data()

    # These are hard coded, don't change: ModelTrainer derives its own file
    # names from `mname` plus the same "_MLP" suffix.
    mname = 'FPKM_5'
    sfix = '_MLP'

    # This will train and test the model.
    # Saves predictions to file as well as the model to an h5 file.
    model_trainer = ModelTrainer(inputfile, testfile, label_file, mname, nfeatures, resdir)
    model, res = model_trainer.run_mlp_model(e=epochs, test=True)
    model_file = os.path.join(resdir, mname + sfix + '.h5')
    model.save(model_file)

    model_res_file = os.path.join(resdir, mname + sfix + "_predict.txt")
    report_file = os.path.join(resdir, mname + sfix + "_confusion_matrix.txt")
    ClassificationReport(model_res_file, label_file, report_file).parse_results()

    print(res)
    

In [None]:
# Input: TCGA gene expression table
#   rows    = samples
#   columns = genes, with the cancer name (label) in the last column
#   values  = log-transformed FPKM
dfile = '/path/to/input_data_file.csv'

# Main output directory; each run writes into its own sub-directory
outdir = '/path/to/folder'

# Train ten models, one per pass, in sub-directories DNN_1 ... DNN_10
i = 1
while i <= 10:
    dname = "DNN_{}".format(i)
    print(dname)

    # split, train, test and report for this run
    train_Test_DNN(dname, outdir, dfile)
    i += 1


print('--- DONE ---')
