In [76]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
pd.options.display.float_format = '{:.2f}'.format

filename = 'mutations.csv'
data = pd.read_csv(filename, index_col=0)
samples = data.shape[0]

In [77]:
def logby2(x):
    return np.log2(x) if x != 0 else 0


def find_tL_tR(mutation, data):
    tL = data[data[mutation] == 1]
    tR = data[data[mutation] == 0]
    return tL, tR


def find_NC_C(data):
    NC = data[data.index.str.startswith('NC')]
    C = data[data.index.str.startswith('C')]
    return NC, C


def HT(PC, PNC):
    return -PC * logby2(PC) - PNC * logby2(PNC)


def find_gain(mutation, data):
        
        noncancerous, cancerous = find_NC_C(data)
        PC = len(cancerous) / samples
        PNC = len(noncancerous) / samples
        #H(t) = -[pC,t log2(pC,t) + pNC,t log2(pNC,t)]
        #H(t) = -[(probability of cancerous samples) * log2(probability of cancerous samples) + (probability of non-cancerous samples) * log2(probability of non-cancerous samples)]
        HT_value = -PC * logby2(PC) - PNC * logby2(PNC)
        L, R = find_tL_tR(mutation, data)
        PL = len(L) / samples
        NCL, CL = find_NC_C(L)
        if len(L) > 0:
            HTL = -((len(CL) / len(L) + 0.000001) * logby2((len(CL) / len(L)) + 0.000001) + (len(NCL) / len(L) + 0.000001) * logby2((len(NCL) / len(L)) + 0.000001))
        else:
            HTL = 0

        PR = len(R) / samples
        NCR, CR = find_NC_C(R)
        if len(R) > 0:
            HTR = -((len(CR) / len(R) + 0.000001) * logby2((len(CR) / len(R)) + 0.000001) + (len(NCR) / len(R) + 0.00001) * logby2((len(NCR) / len(R)) + 0.00001))
        else:
            HTR = 0
        HST_value = (PL * HTL) + (PR * HTR)
        gain = HT_value - HST_value
        return gain


In [78]:
#get bootstrap samples
def getBootstrapSamples(data):
    bootstrapData = pd.DataFrame()
    for i in range(samples):
        randomNum = np.random.randint(0, samples) 
        bootstrapData = pd.concat([bootstrapData, data.loc[[data.index[randomNum]]]])

    outOfBagData = data[~data.index.isin(bootstrapData.index)]
    
    return bootstrapData, outOfBagData

In [79]:
def build_tree_for_forest(bootstrapData, outOfBagData):
    selectedFeatures = pd.DataFrame()
    selectedFeaturesL = pd.DataFrame()
    selectedFeaturesR = pd.DataFrame()
    gainList = []
    gainListL = []
    gainListR = []

    # sqrt(num of mutation from og data)
    num = math.sqrt(data.shape[1])
    num = int(num)
    for i in range(num):
        randomNum = np.random.randint(0, num)
        selectedFeatures = pd.concat([selectedFeatures, bootstrapData.iloc[:, randomNum]], axis=1)
        data.drop(data.columns[randomNum], axis=1, inplace=True)
        gainList.append(find_gain(selectedFeatures.columns[i], bootstrapData))
    
    maxGain = max(gainList)
    maxGainIndex = gainList.index(maxGain)
    selectedMutation = selectedFeatures.columns[maxGainIndex]


    L, R = find_tL_tR(selectedMutation, bootstrapData)
    
    for i in range(num):
        randomNum1 = np.random.randint(0, num)
        selectedFeaturesL = pd.concat([selectedFeaturesL, L.iloc[:, randomNum1]], axis=1)
        data.drop(data.columns[randomNum1], axis=1, inplace=True)
        gainListL.append(find_gain(selectedFeaturesL.columns[i], L))
    
    maxGainL = max(gainListL)
    maxGainIndexL = gainListL.index(maxGainL)
    selectedMutationL = selectedFeaturesL.columns[maxGainIndexL]


    A1, A2 = find_tL_tR(selectedMutationL, L)
    CA1 = len(A1[A1.index.str.startswith('C')])
    NCA1 = len(A1[A1.index.str.startswith('NC')])
    CA2 = len(A2[A2.index.str.startswith('C')])
    NCA2 = len(A2[A2.index.str.startswith('NC')])

    if CA1 > NCA1:
        classified_A1 = 'C'
    else:
        classified_A1 = 'NC'
    
    if CA2 > NCA2:
        classified_A2 = 'C'
    else:
        classified_A2 = 'NC'

    for i in range(num):
        randomNum2 = np.random.randint(0, num)
        selectedFeaturesR = pd.concat([selectedFeaturesR, R.iloc[:, randomNum2]], axis=1)
        data.drop(data.columns[randomNum2], axis=1, inplace=True)
        gainListR.append(find_gain(selectedFeaturesR.columns[i], R))
    
    maxGainR = max(gainListR)
    maxGainIndexR = gainListR.index(maxGainR)
    selectedMutationR = selectedFeaturesR.columns[maxGainIndexR]


    B1, B2 = find_tL_tR(selectedMutationR, R)
    CB1 = len(B1[B1.index.str.startswith('C')])
    NCB1 = len(B1[B1.index.str.startswith('NC')])
    CB2 = len(B2[B2.index.str.startswith('C')])
    NCB2 = len(B2[B2.index.str.startswith('NC')])

    if CB1 > NCB1:
        classified_B1 = 'C'
    else:
        classified_B1 = 'NC'
    
    if CB2 > NCB2:
        classified_B2 = 'C'
    else:
        classified_B2 = 'NC'

    return selectedMutation, selectedMutationL, selectedMutationR, classified_A1, classified_A2, classified_B1, classified_B2, len(outOfBagData)


In [81]:
treeChart = pd.DataFrame(columns=['Top Mutation', 'Top Left Mutation', 'Top Right Mutation', 'A1', 'A2', 'B1', 'B2', 'Out of Bag Data Length'])

for i in range(11):
    bootstrapData, outOfBagData = getBootstrapSamples(data)
    treeChart.loc[i] = build_tree_for_forest(bootstrapData, outOfBagData)
    data = pd.read_csv(filename, index_col=0)

treeChart.head(11)

Unnamed: 0,Top Mutation,Top Left Mutation,Top Right Mutation,A1,A2,B1,B2,Out of Bag Data Length
0,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift...,DAZAP1_GRCh37_19:1430254-1430254_Frame-Shift-D...,RAPGEF5_GRCh37_7:22197474-22197474_Frame-Shift...,C,C,C,NC,38
1,MIR1303_GRCh37_5:154065380-154065383_RNA_DEL_T...,MIR1303_GRCh37_5:154065380-154065383_RNA_DEL_T...,PIK3CA_GRCh37_3:178936082-178936082_Missense-M...,C,NC,C,NC,46
2,HTR1F_GRCh37_3:88040823-88040823_Frame-Shift-D...,HTR1F_GRCh37_3:88040823-88040823_Frame-Shift-D...,COL7A1_GRCh37_3:48612871-48612871_Frame-Shift-...,C,NC,C,NC,42
3,SERPINI1_GRCh37_3:167507159-167507159_Frame-Sh...,SERPINI1_GRCh37_3:167507159-167507159_Frame-Sh...,LRRC43_GRCh37_12:122685346-122685346_Frame-Shi...,C,NC,C,NC,47
4,MIR1303_GRCh37_5:154065380-154065383_RNA_DEL_T...,COL7A1_GRCh37_3:48612871-48612871_Frame-Shift-...,HTR1F_GRCh37_3:88040823-88040823_Frame-Shift-D...,C,C,C,NC,37
5,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift...,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift...,HTR1F_GRCh37_3:88040823-88040823_Frame-Shift-D...,C,NC,C,NC,36
6,FARP1_GRCh37_13:99092237-99092237_Frame-Shift-...,PDZD8_GRCh37_10:119042898-119042898_Frame-Shif...,SERPINI1_GRCh37_3:167507159-167507159_Frame-Sh...,C,C,C,NC,43
7,SREK1IP1_GRCh37_5:64020298-64020298_Frame-Shif...,SERPINI1_GRCh37_3:167507159-167507159_Frame-Sh...,SPEG_GRCh37_2:220349122-220349122_Frame-Shift-...,C,C,C,NC,45
8,HTR1F_GRCh37_3:88040823-88040823_Frame-Shift-D...,HTR1F_GRCh37_3:88040823-88040823_Frame-Shift-D...,PDZD8_GRCh37_10:119042898-119042898_Frame-Shif...,C,NC,C,NC,40
9,MIR1303_GRCh37_5:154065380-154065383_RNA_DEL_T...,DAZAP1_GRCh37_19:1430254-1430254_Frame-Shift-D...,CENPF_GRCh37_1:214830653-214830653_Frame-Shift...,C,C,C,NC,41
