In [None]:
# NOTE(review): %pylab star-imports numpy and matplotlib.pyplot names into the
# global namespace, so any unqualified numpy/pyplot function used elsewhere in
# this notebook resolves only because this cell has been run first.
%pylab inline

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets, metrics, model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

In [None]:
# Read the TCGA table and keep an untouched copy of exactly what was on disk.
raw_dataset = pd.read_csv('tcga_skcm.csv')
dataset_orig = raw_dataset.copy()

# Build the modelling table as one chain of non-mutating steps so that
# `dataset_orig` is never modified and re-running the cell is idempotent:
#   1. keep the cohort label, the target and the 16 `*_log` HLA / B2M columns
#   2. turn +/-inf into NaN so that dropna() removes those rows as well
#   3. drop every row with a missing value
#   4. store the immune-subtype label as a small unsigned integer
dataset = (
    dataset_orig[[
        'Study', 'immune_subtype',
        'HLA_A_log', 'HLA_B_log', 'HLA_C_log', 'HLA_E_log', 'HLA_G_log', 'B2M_log',
        'HLA_DRA_log', 'HLA_DRB1_log', 'HLA_DQA1_log', 'HLA_DQB1_log',
        'HLA_DPA1_log', 'HLA_DPB1_log', 'HLA_DMA_log', 'HLA_DMB_log',
        'HLA_DOA_log', 'HLA_DOB_log',
    ]]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .astype({'immune_subtype': 'uint8'})
)

# Cross Validation

In [None]:
# Repeated random 60/40 hold-out validation of the random forest on the SKCM
# cohort. Each split contributes one AUC per immune subtype plus one
# micro-averaged AUC; the lists are summarised in the following cells.
SUBTYPE_CLASSES = [1, 2, 3, 4, 6]  # labels tracked in c1, c2, c3, c4 and c6
N_CV_ITERATIONS = 10000
CV_SEED = 0  # base seed; split k uses CV_SEED + k so a re-run gives the same numbers


def _subtype_roc_auc(y_true, y_score, fitted_classes, all_classes=SUBTYPE_CLASSES):
    """Return per-class and micro-averaged ROC AUC for one train/test split.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels of the held-out samples.
    y_score : ndarray of shape (n_samples, len(fitted_classes))
        Output of ``predict_proba``; column ``j`` holds the probability of
        ``fitted_classes[j]``.
    fitted_classes : array-like
        ``classes_`` of the fitted classifier, i.e. the labels that were
        present in the training split, in column order.
    all_classes : sequence
        Labels for which an AUC is reported.

    Returns
    -------
    per_class : dict
        Maps every label in ``all_classes`` to its one-vs-rest AUC. The value
        is NaN when the classifier never saw the label (no score column) or
        when the held-out labels do not contain both positives and negatives
        for it, because the ROC curve is undefined in those cases.
    micro_auc : float
        AUC over all (sample, fitted class) pairs pooled together, or NaN if
        the pooled indicator matrix is constant.
    """
    y_true = np.asarray(y_true)
    fitted = np.asarray(fitted_classes)

    per_class = {}
    for label in all_classes:
        # Locate the score column by label instead of by position: the column
        # order depends on which classes happened to land in the training split.
        column = np.flatnonzero(fitted == label)
        is_positive = y_true == label
        if column.size == 0 or not is_positive.any() or is_positive.all():
            per_class[label] = np.nan
            continue
        fpr, tpr, _ = roc_curve(is_positive.astype(int), y_score[:, column[0]])
        per_class[label] = auc(fpr, tpr)

    # One-hot truth restricted to the classes the forest can actually predict,
    # so that it lines up column-for-column with y_score.
    one_hot = y_true[:, None] == fitted[None, :]
    if one_hot.any() and not one_hot.all():
        fpr, tpr, _ = roc_curve(one_hot.astype(int).ravel(), y_score.ravel())
        micro_auc = auc(fpr, tpr)
    else:
        micro_auc = np.nan
    return per_class, micro_auc


c1, c2, c3, c4, c6, micro = [], [], [], [], [], []
auc_lists = {1: c1, 2: c2, 3: c3, 4: c4, 6: c6}

# The cohort filter does not depend on the iteration, so do it once.
SKCM_df = dataset[dataset['Study'] == 'SKCM']
# 'Study' is categorical and constant after filtering, so it is not a feature.
cv_frame = SKCM_df.drop(columns='Study')

for iteration in range(N_CV_ITERATIONS):
    seed = CV_SEED + iteration

    train_set = cv_frame.sample(frac=0.6, random_state=seed)
    test_set = cv_frame.drop(train_set.index)

    X_train = train_set.drop(columns='immune_subtype').to_numpy()
    y_train = train_set['immune_subtype'].to_numpy()
    X_test = test_set.drop(columns='immune_subtype').to_numpy()
    y_test = test_set['immune_subtype'].to_numpy()

    clf = RandomForestClassifier(n_estimators=200, max_features=10, random_state=seed)
    y_score = clf.fit(X_train, y_train).predict_proba(X_test)

    per_class, micro_auc = _subtype_roc_auc(y_test, y_score, clf.classes_)
    for label, scores in auc_lists.items():
        scores.append(per_class[label])
    micro.append(micro_auc)



In [None]:
# Mean of the per-split AUCs for each subtype and for the micro-average.
# The NaN-aware reducers skip NaN entries, and they are called through the
# explicitly imported `np` so the cell does not depend on star-imported names.
mean_C1 = np.nanmean(c1)
mean_C2 = np.nanmean(c2)
mean_C3 = np.nanmean(c3)
mean_C4 = np.nanmean(c4)
mean_C6 = np.nanmean(c6)
mean_micro = np.nanmean(micro)

# Standard deviation of the same per-split AUCs.
sd_C1 = np.nanstd(c1)
sd_C2 = np.nanstd(c2)
sd_C3 = np.nanstd(c3)
sd_C4 = np.nanstd(c4)
sd_C6 = np.nanstd(c6)
sd_micro = np.nanstd(micro)

In [None]:
# Collect the summaries into lookups keyed by subtype label (1, 2, 3, 4, 6)
# plus 'micro' for the micro-averaged AUC.
mean_roc_auc = dict(zip(
    (1, 2, 3, 4, 6, 'micro'),
    (mean_C1, mean_C2, mean_C3, mean_C4, mean_C6, mean_micro),
))
sd_roc_auc = dict(zip(
    (1, 2, 3, 4, 6, 'micro'),
    (sd_C1, sd_C2, sd_C3, sd_C4, sd_C6, sd_micro),
))

# Immune Subtype Prediction with the Random Forest Classifier

In [None]:
# Training data for the final model: every SKCM row of `dataset`, without the
# categorical 'Study' column.
SKCM_df = dataset.loc[dataset['Study'] == 'SKCM'].drop(columns='Study')

# Split the target column off from the feature columns.
SKCM_train_labels = SKCM_df['immune_subtype'].copy()
SKCM_train_features = SKCM_df.drop(columns='immune_subtype')

# Plain NumPy arrays for scikit-learn.
X_train_skcm = SKCM_train_features.to_numpy()
y_train_skcm = SKCM_train_labels.to_numpy()

In [None]:
# Final model: same forest settings as in the validation loop, with a fixed
# random_state, fitted on the full SKCM training matrices.
clf = RandomForestClassifier(
    n_estimators=200,
    max_features=10,
    random_state=1,
)
clf.fit(X_train_skcm, y_train_skcm)

In [None]:
# Predict the immune subtypes of melanoma patients receiving ICI treatment.
# Load the cohort, keep the same 16 `*_log` feature columns (in the same
# order) that the classifier was trained on, and drop rows that contain a
# missing or infinite value. The chain is non-mutating, so re-running the
# cell always starts again from the file contents.
ici_dataset = (
    pd.read_csv('ici.csv')[[
        'HLA_A_log', 'HLA_B_log', 'HLA_C_log', 'HLA_E_log', 'HLA_G_log', 'B2M_log',
        'HLA_DRA_log', 'HLA_DRB1_log', 'HLA_DQA1_log', 'HLA_DQB1_log',
        'HLA_DPA1_log', 'HLA_DPB1_log', 'HLA_DMA_log', 'HLA_DMB_log',
        'HLA_DOA_log', 'HLA_DOB_log',
    ]]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [None]:
# Feature matrix of the ICI cohort as a plain NumPy array.
features = ici_dataset.to_numpy()

In [None]:
# One predicted immune-subtype label per row of `features`.
subtypes = clf.predict(features)

In [None]:
import csv

# Write all predicted labels as a single CSV row.
# newline='' is what the csv module requires for files it writes to: the
# writer emits its own line terminator, and without it platforms that
# translate '\n' end up with an extra blank line after the row.
with open('ImmuneSubtypes_Prediction.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(subtypes)