In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Render floats with two decimals in every displayed DataFrame.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the mutation matrix; the first CSV column becomes the row index
# (the functions below read the class label from the start of that index).
filename = 'mutations.csv'
data = pd.read_csv(filename, index_col=0)

# Number of rows (samples) in the full data set.
samples = len(data)


In [14]:
def logby2(x):
    """Return log2(x), with the entropy convention that log2(0) counts as 0.

    Parameters
    ----------
    x : scalar number

    Returns
    -------
    The integer 0 when ``x`` equals zero, otherwise ``np.log2(x)``.
    """
    if x == 0:
        # Lets callers write p * logby2(p) without hitting log2(0) = -inf.
        return 0
    return np.log2(x)


def find_tL_tR(mutation, data):
    """Split ``data`` on one mutation column.

    Parameters
    ----------
    mutation : column label in ``data``
    data : pandas.DataFrame

    Returns
    -------
    (tL, tR) : tuple of DataFrames
        ``tL`` holds the rows whose ``mutation`` value equals 1 and ``tR`` the
        rows whose value equals 0. Rows with any other value land in neither.
    """
    column = data[mutation]
    carriers = data[column == 1]
    non_carriers = data[column == 0]
    return carriers, non_carriers


def find_NC_C(data):
    """Separate rows by the class prefix of their index label.

    Parameters
    ----------
    data : pandas.DataFrame whose index supports the ``.str`` accessor

    Returns
    -------
    (NC, C) : tuple of DataFrames
        ``NC`` holds the rows whose index label starts with 'NC', ``C`` the
        rows whose label starts with 'C'. Note the order: non-cancer first.
    """
    labels = data.index.str
    non_cancer_rows = data[labels.startswith('NC')]
    cancer_rows = data[labels.startswith('C')]
    return non_cancer_rows, cancer_rows


def HT(PC, PNC):
    """Two-class entropy: H = -PC*log2(PC) - PNC*log2(PNC).

    Parameters
    ----------
    PC, PNC : the two class probabilities

    A probability of zero contributes nothing, because ``logby2`` returns 0
    for a zero argument.
    """
    cancer_term = PC * logby2(PC)
    non_cancer_term = PNC * logby2(PNC)
    return -cancer_term - non_cancer_term

gain_chart = pd.DataFrame(columns=['Gain', 'n(tL)', 'n(tR)', 'n(tL, C)', 'n(tL, NC)', 'n(tR, C)', 'n(tR, NC)', 'PL', 'PR', 'H(s,t)', 'H(t)'])

# Parent entropy H(t) = -[pC * log2(pC) + pNC * log2(pNC)].
# The class balance of the full data set does not depend on the feature being
# scored, so it is computed once, outside the loop.
noncancerous, cancerous = find_NC_C(data)
PC = len(cancerous) / samples
PNC = len(noncancerous) / samples
HT_value = HT(PC, PNC)

# Calculate and store gain for each feature in the data
for feature in data.columns:
    # tL: samples carrying the mutation, tR: samples without it.
    L, R = find_tL_tR(feature, data)
    NCL, CL = find_NC_C(L)
    NCR, CR = find_NC_C(R)
    PL = len(L) / samples
    PR = len(R) / samples

    # An empty side has no class distribution. Give it zero entropy instead of
    # dividing by zero; its weight (PL or PR) is 0, so it adds nothing to H(s,t).
    HTL = HT(len(CL) / len(L), len(NCL) / len(L)) if len(L) > 0 else 0
    HTR = HT(len(CR) / len(R), len(NCR) / len(R)) if len(R) > 0 else 0

    # H(s,t): entropy of the two children weighted by their share of samples.
    HST_value = (PL * HTL) + (PR * HTR)
    gain = HT_value - HST_value
    gain_chart.loc[feature] = [gain, len(L), len(R), len(CL), len(NCL), len(CR), len(NCR), PL, PR, HST_value, HT_value]


# Sort gain_chart by 'Gain' in descending order and display top 10 features
gain_chart.sort_values(by='Gain', ascending=False).head(10)


Unnamed: 0,Gain,n(tL),n(tR),"n(tL, C)","n(tL, NC)","n(tR, C)","n(tR, NC)",PL,PR,"H(s,t)",H(t)
DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--,0.14,12.0,98.0,12.0,0.0,37.0,61.0,0.11,0.89,0.85,0.99
PGM5_GRCh37_9:70993145-70993145_Missense-Mutation_SNP_A-A-G,0.11,10.0,100.0,10.0,0.0,39.0,61.0,0.09,0.91,0.88,0.99
LARP4B_GRCh37_10:890939-890939_Frame-Shift-Del_DEL_T-T--,0.09,8.0,102.0,8.0,0.0,41.0,61.0,0.07,0.93,0.9,0.99
ZBTB20_GRCh37_3:114058003-114058003_Frame-Shift-Del_DEL_G-G--,0.09,12.0,98.0,11.0,1.0,38.0,60.0,0.11,0.89,0.9,0.99
ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A--,0.09,12.0,98.0,11.0,1.0,38.0,60.0,0.11,0.89,0.9,0.99
RNF43_GRCh37_17:56435161-56435161_Frame-Shift-Del_DEL_C-C--,0.09,12.0,98.0,11.0,1.0,38.0,60.0,0.11,0.89,0.9,0.99
KIAA0195_GRCh37_17:73491063-73491063_Frame-Shift-Del_DEL_C-C--,0.08,7.0,103.0,7.0,0.0,42.0,61.0,0.06,0.94,0.91,0.99
KDM1B_GRCh37_6:18222307-18222307_3'UTR_DEL_A-A--,0.08,7.0,103.0,7.0,0.0,42.0,61.0,0.06,0.94,0.91,0.99
TVP23C_GRCh37_17:15441469-15441469_Intron_SNP_C-C-T,0.08,7.0,103.0,7.0,0.0,42.0,61.0,0.06,0.94,0.91,0.99
UPF3A_GRCh37_13:115057211-115057211_Frame-Shift-Del_DEL_A-A--,0.08,7.0,103.0,7.0,0.0,42.0,61.0,0.06,0.94,0.91,0.99


In [15]:
def _smoothed_entropy(n_cancer, n_noncancer, n_total, eps_cancer, eps_noncancer):
    """Two-class entropy of a node, with a small constant added to each class share.

    Returns 0 for an empty node (``n_total`` of 0), which has no distribution.
    """
    if n_total <= 0:
        return 0
    p_cancer = n_cancer / n_total + eps_cancer
    p_noncancer = n_noncancer / n_total + eps_noncancer
    return -(p_cancer * np.log2(p_cancer) + p_noncancer * np.log2(p_noncancer))


def find_gain(data, gain_chart):
    """Score every column of ``data`` by information gain and record it in ``gain_chart``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample, one column per mutation. Rows are split on
        ``== 1`` (left, tL) and ``== 0`` (right, tR). The class is read from
        the index label: it starts with 'C' or with 'NC'.
    gain_chart : pandas.DataFrame
        Chart with the eleven columns 'Gain', 'n(tL)', 'n(tR)', 'n(tL, C)',
        'n(tL, NC)', 'n(tR, C)', 'n(tR, NC)', 'PL', 'PR', 'H(s,t)', 'H(t)'.
        It is modified in place: the row labelled with each feature is
        written or overwritten.

    Returns
    -------
    None

    Notes
    -----
    Every probability is taken relative to ``len(data)``, the node actually
    passed in, so H(t), PL, PR and H(s,t) are proper values when the function
    is called on a training fold or on a child node rather than on the full
    data set.
    """
    n_node = len(data)

    def share(count):
        # Fraction of this node's samples; an empty node yields 0 rather than 0/0.
        return count / n_node if n_node else 0

    # Class membership by index prefix, computed once for the whole node.
    is_cancer = data.index.str.startswith('C')
    is_noncancer = data.index.str.startswith('NC')

    # Parent entropy H(t) = -[pC * log2(pC) + pNC * log2(pNC)], taking
    # 0 * log2(0) as 0. It is the same for every feature.
    PC = share(int(np.count_nonzero(is_cancer)))
    PNC = share(int(np.count_nonzero(is_noncancer)))
    HT_value = -(PC * np.log2(PC) if PC != 0 else 0) - (PNC * np.log2(PNC) if PNC != 0 else 0)

    # Calculate and store gain for each feature in the data
    for feature in data.columns:
        values = data[feature]
        in_left = (values == 1).to_numpy()
        in_right = (values == 0).to_numpy()

        # Counting on boolean masks avoids copying the whole (wide) frame
        # four times per feature.
        n_left = int(np.count_nonzero(in_left))
        n_right = int(np.count_nonzero(in_right))
        n_left_c = int(np.count_nonzero(in_left & is_cancer))
        n_left_nc = int(np.count_nonzero(in_left & is_noncancer))
        n_right_c = int(np.count_nonzero(in_right & is_cancer))
        n_right_nc = int(np.count_nonzero(in_right & is_noncancer))

        PL = share(n_left)
        PR = share(n_right)

        # NOTE(review): the right non-cancer term uses 1e-5 while the other
        # three use 1e-6. This looks like a typo, but on pure nodes this
        # asymmetry is the only thing that orders otherwise tied features,
        # so the constants are kept as they were. Decide on an explicit
        # tie-break before unifying or removing them.
        HTL = _smoothed_entropy(n_left_c, n_left_nc, n_left, 0.000001, 0.000001)
        HTR = _smoothed_entropy(n_right_c, n_right_nc, n_right, 0.000001, 0.00001)

        # H(s,t): child entropies weighted by their share of the node.
        HST_value = (PL * HTL) + (PR * HTR)
        gain = HT_value - HST_value
        gain_chart.loc[feature] = [gain, n_left, n_right, n_left_c, n_left_nc, n_right_c, n_right_nc, PL, PR, HST_value, HT_value]


In [16]:
def build_tree(training_data):
    """Fit a depth-2 decision tree by greedy information gain.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Samples (rows) by mutations (columns), in the form expected by
        ``find_gain`` and ``find_tL_tR``.

    Returns
    -------
    tuple
        ``(top_mutation, top_left_mutation, top_right_mutation, A1, A2, B1, B2)``:
        the root feature, the features used to split the root's left branch
        (root value 1) and right branch (root value 0), and the four leaf
        DataFrames. ``A1``/``A2`` are the left branch's rows with/without
        ``top_left_mutation``; ``B1``/``B2`` are the right branch's rows
        with/without ``top_right_mutation``.
    """
    gain_chart = pd.DataFrame(columns=['Gain', 'n(tL)', 'n(tR)', 'n(tL, C)', 'n(tL, NC)', 'n(tR, C)', 'n(tR, NC)', 'PL', 'PR', 'H(s,t)', 'H(t)'])

    def ranked_features(node_data):
        # find_gain (re)writes one row per column of node_data into the shared
        # chart; sort once and reuse the ranking.
        find_gain(node_data, gain_chart)
        return gain_chart.sort_values(by='Gain', ascending=False).index

    def best_feature_excluding(node_data, excluded):
        # The root feature is still a column of each branch, so skip it if it
        # happens to rank first there and take the runner-up instead.
        ranking = ranked_features(node_data)
        return ranking[1] if ranking[0] == excluded else ranking[0]

    top_mutation = ranked_features(training_data)[0]
    L, R = find_tL_tR(top_mutation, training_data)

    top_left_mutation = best_feature_excluding(L, top_mutation)
    A1, A2 = find_tL_tR(top_left_mutation, L)

    top_right_mutation = best_feature_excluding(R, top_mutation)
    B1, B2 = find_tL_tR(top_right_mutation, R)

    return top_mutation, top_left_mutation, top_right_mutation, A1, A2, B1, B2
    


In [17]:
def classify_nodes(A1, A2, B1, B2):
    """Label each of the four leaves by majority vote of its samples.

    Parameters
    ----------
    A1, A2, B1, B2 : pandas.DataFrame
        Leaf subsets whose index labels start with 'C' or 'NC'.

    Returns
    -------
    tuple of str
        ``(classified_A1, classified_A2, classified_B1, classified_B2)``.
        A leaf is 'C' only when it holds strictly more 'C' rows than 'NC'
        rows; ties, and therefore empty leaves, are labelled 'NC'.
    """
    def majority_label(leaf):
        prefixes = leaf.index.str
        n_cancer = len(leaf[prefixes.startswith('C')])
        n_noncancer = len(leaf[prefixes.startswith('NC')])
        return 'C' if n_cancer > n_noncancer else 'NC'

    return tuple(majority_label(leaf) for leaf in (A1, A2, B1, B2))
    

In [None]:
def classify_tree(testing_data, top_mutation, top_left_mutation, top_right_mutation, classified_A1, classified_A2, classified_B1, classified_B2):
    """Run every test sample through the depth-2 tree and tally the confusion matrix.

    Parameters
    ----------
    testing_data : pandas.DataFrame
        Samples by mutations; the true class is read from the index label
        ('C...' is the positive class, 'NC...' the negative class).
    top_mutation, top_left_mutation, top_right_mutation : column labels
        Root feature, and the features that split the root == 1 and
        root == 0 branches.
    classified_A1, classified_A2, classified_B1, classified_B2 : str
        Leaf labels ('C' or 'NC'). A1/A2 apply when the root value is 1 and
        the left feature is / is not 1; B1/B2 apply when the root value is 0
        and the right feature is / is not 1.

    Returns
    -------
    (TP, TN, FP, FN) : tuple of int
        Samples whose root value is neither 1 nor 0 are not counted.
    """
    TP, TN, FP, FN = 0, 0, 0, 0
    for sample in testing_data.index:
        root_value = testing_data.loc[sample, top_mutation]

        # Walk the tree down to a leaf label.
        if root_value == 1:
            goes_first = testing_data.loc[sample, top_left_mutation] == 1
            predicted = classified_A1 if goes_first else classified_A2
        elif root_value == 0:
            goes_first = testing_data.loc[sample, top_right_mutation] == 1
            predicted = classified_B1 if goes_first else classified_B2
        else:
            continue

        # Compare the prediction with the class encoded in the sample name.
        truly_negative = sample.startswith("NC")
        if truly_negative and predicted == 'NC':
            TN += 1
        elif sample.startswith("C") and predicted == 'C':
            TP += 1
        elif truly_negative and predicted == 'C':
            FP += 1
        else:
            FN += 1

    return TP, TN, FP, FN


In [27]:
def calc_advanced_metrics(TP, FP, TN, FN):
    """Derive percentage metrics from confusion-matrix counts.

    Parameters
    ----------
    TP, FP, TN, FN : int
        True positives, false positives, true negatives, false negatives.
        Note the order: it differs from the ``(TP, TN, FP, FN)`` tuple
        returned by ``classify_tree``.

    Returns
    -------
    list
        A single-row list ``[[accuracy, sensitivity, specificity, precision,
        miss_rate, fdr, forr]]``, each value a percentage in [0, 100]. The
        nesting lets the result be passed straight to ``pd.DataFrame``.

    Notes
    -----
    A metric whose denominator is zero is undefined (for example precision
    when nothing was predicted positive). Such metrics are reported as 0.0
    instead of raising ZeroDivisionError.
    """
    def percent(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return numerator / denominator * 100 if denominator else 0.0

    accuracy = percent(TP + TN, TP + FP + TN + FN)
    sensitivity = percent(TP, TP + FN)   # true positive rate
    specificity = percent(TN, TN + FP)   # true negative rate
    precision = percent(TP, TP + FP)     # positive predictive value
    miss_rate = percent(FN, FN + TP)     # false negative rate
    fdr = percent(FP, FP + TP)           # false discovery rate
    forr = percent(FN, FN + TN)          # false omission rate

    return [[accuracy, sensitivity, specificity, precision, miss_rate, fdr, forr]]

In [19]:
# Fit the depth-2 tree on the complete data set and label its four leaves.
(top_mutation, top_left_mutation, top_right_mutation,
 A1, A2, B1, B2) = build_tree(data)
(classified_A1, classified_A2,
 classified_B1, classified_B2) = classify_nodes(A1, A2, B1, B2)

print('Tree for ALL Data: ')

for caption, value in (
    ('Top Mutation: ', top_mutation),
    ('Top Left Mutation: ', top_left_mutation),
    ('A1: ', classified_A1),
    ('A2: ', classified_A2),
    ('Top Right Mutation: ', top_right_mutation),
    ('B1: ', classified_B1),
    ('B2: ', classified_B2),
):
    print(caption, value)

# Confusion counts on the same samples the tree was fitted on, so these are
# resubstitution figures, not an estimate of performance on unseen data.
TP, TN, FP, FN = classify_tree(
    data, top_mutation, top_left_mutation, top_right_mutation,
    classified_A1, classified_A2, classified_B1, classified_B2)
for caption, value in (('TP: ', TP), ('TN: ', TN), ('FP: ', FP), ('FN: ', FN)):
    print(caption, value)


#https://docs.google.com/drawings/d/1sKoUrVoSA9odh0GtuRCxhZ39PNI7JRLHeGkEATh0Duo/edit?usp=sharing

Tree for ALL Data: 
Top Mutation:  DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--
Top Left Mutation:  UBR5_GRCh37_8:103289349-103289349_Frame-Shift-Del_DEL_T-T--
A1:  C
A2:  C
Top Right Mutation:  KDM1B_GRCh37_6:18222307-18222307_3'UTR_DEL_A-A--
B1:  C
B2:  NC
TP:  16
TN:  61
FP:  0
FN:  33


In [20]:
# Shuffle the sample labels and cut them into three folds of near-equal size.
# In each cross-validation round two folds (2/3) train the tree and the
# remaining fold (1/3) tests it.

np.random.seed(69)  # fixed seed so the folds are reproducible
shuffled_data = np.random.permutation(data.index)
split_data = np.array_split(shuffled_data, 3)

one_third_one, one_third_two, one_third_three = (
    data.loc[fold_labels] for fold_labels in split_data
)

# Show which samples ended up in each fold.
for fold in (one_third_one, one_third_two, one_third_three):
    print('1/3 of Data = ', fold.index)

1/3 of Data =  Index(['C3', 'NC8', 'C29', 'NC12', 'NC36', 'C31', 'NC29', 'C15', 'NC6', 'C37',
       'C41', 'NC40', 'NC50', 'C16', 'C27', 'NC17', 'NC15', 'C45', 'C11',
       'C34', 'NC28', 'C13', 'NC37', 'C39', 'C43', 'NC58', 'NC23', 'NC10',
       'C10', 'NC2', 'NC19', 'NC54', 'NC33', 'NC3', 'C40', 'NC52', 'C1'],
      dtype='object')
1/3 of Data =  Index(['C47', 'NC46', 'C25', 'C28', 'NC48', 'C24', 'C44', 'NC1', 'NC4', 'C46',
       'NC22', 'C14', 'C48', 'NC34', 'C33', 'NC16', 'NC13', 'NC45', 'C20',
       'NC27', 'C18', 'NC43', 'C0', 'C12', 'NC41', 'NC51', 'C30', 'C32', 'C5',
       'NC7', 'NC35', 'NC11', 'C19', 'C22', 'C26', 'NC59', 'NC47'],
      dtype='object')
1/3 of Data =  Index(['NC38', 'NC39', 'NC57', 'C6', 'NC9', 'NC24', 'NC18', 'C8', 'NC25',
       'C38', 'C21', 'C42', 'C35', 'NC0', 'C9', 'NC21', 'NC49', 'C2', 'NC30',
       'NC60', 'NC20', 'C4', 'NC53', 'NC55', 'C17', 'NC56', 'NC32', 'NC5',
       'C7', 'NC26', 'NC14', 'NC31', 'C36', 'NC42', 'NC44', 'C23'],
      dtype='

In [None]:
# Cross-validation round 1: train on folds one + two, test on fold three.
training_data = pd.concat([one_third_one, one_third_two])
testing_data = one_third_three

(top_mutation, top_left_mutation, top_right_mutation,
 A1, A2, B1, B2) = build_tree(training_data)
(classified_A1, classified_A2,
 classified_B1, classified_B2) = classify_nodes(A1, A2, B1, B2)

# Print the fitted tree as nested decision rules.
for rule_parts in (
    ('If the sample has mutation', top_mutation, 'then:'),
    ('     If the sample has mutation', top_left_mutation, 'then:'),
    ('         Classify as', classified_A1),
    ('         Else classify as', classified_A2),
    ('     Else if the sample has mutation', top_right_mutation, 'then:'),
    ('         Classify as', classified_B1),
    ('         Else classify as', classified_B2),
):
    print(*rule_parts)

# Score the held-out fold. calc_advanced_metrics takes (TP, FP, TN, FN),
# which is a different order from the tuple classify_tree returns.
TP, TN, FP, FN = classify_tree(
    testing_data, top_mutation, top_left_mutation, top_right_mutation,
    classified_A1, classified_A2, classified_B1, classified_B2)
advanced_metrics_1 = pd.DataFrame(
    calc_advanced_metrics(TP, FP, TN, FN),
    columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_1



If the sample has mutation ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A-- then:
     If the sample has mutation RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del_DEL_T-T-- then:
         Classify as C
         Else classify as C
     Else if the sample has mutation DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C-- then:
         Classify as C
         Else classify as NC


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,69.44,23.08,95.65,75.0,76.92,25.0,31.25


In [34]:
# Cross-validation round 2: train on folds three + two, test on fold one.
training_data = pd.concat([one_third_three, one_third_two])
testing_data = one_third_one

(top_mutation, top_left_mutation, top_right_mutation,
 A1, A2, B1, B2) = build_tree(training_data)
(classified_A1, classified_A2,
 classified_B1, classified_B2) = classify_nodes(A1, A2, B1, B2)

# Print the fitted tree as nested decision rules.
for rule_parts in (
    ('If the sample has mutation', top_mutation, 'then:'),
    ('     If the sample has mutation', top_left_mutation, 'then:'),
    ('         Classify as', classified_A1),
    ('         Else classify as', classified_A2),
    ('     Else if the sample has mutation', top_right_mutation, 'then:'),
    ('         Classify as', classified_B1),
    ('         Else classify as', classified_B2),
):
    print(*rule_parts)

# Score the held-out fold. calc_advanced_metrics takes (TP, FP, TN, FN),
# which is a different order from the tuple classify_tree returns.
TP, TN, FP, FN = classify_tree(
    testing_data, top_mutation, top_left_mutation, top_right_mutation,
    classified_A1, classified_A2, classified_B1, classified_B2)
advanced_metrics_2 = pd.DataFrame(
    calc_advanced_metrics(TP, FP, TN, FN),
    columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_2



If the sample has mutation PGM5_GRCh37_9:70993145-70993145_Missense-Mutation_SNP_A-A-G then:
     If the sample has mutation LARP4B_GRCh37_10:890939-890939_Frame-Shift-Del_DEL_T-T-- then:
         Classify as C
         Else classify as C
     Else if the sample has mutation PUM2_GRCh37_2:20451242-20451242_3'UTR_DEL_A-A-- then:
         Classify as C
         Else classify as NC


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,56.76,11.76,95.0,66.67,88.24,33.33,44.12


In [35]:
# Cross-validation round 3: train on folds three + one, test on fold two.
training_data = pd.concat([one_third_three, one_third_one])
testing_data = one_third_two

(top_mutation, top_left_mutation, top_right_mutation,
 A1, A2, B1, B2) = build_tree(training_data)
(classified_A1, classified_A2,
 classified_B1, classified_B2) = classify_nodes(A1, A2, B1, B2)

# Print the fitted tree as nested decision rules.
for rule_parts in (
    ('If the sample has mutation', top_mutation, 'then:'),
    ('     If the sample has mutation', top_left_mutation, 'then:'),
    ('         Classify as', classified_A1),
    ('         Else classify as', classified_A2),
    ('     Else if the sample has mutation', top_right_mutation, 'then:'),
    ('         Classify as', classified_B1),
    ('         Else classify as', classified_B2),
):
    print(*rule_parts)

# Score the held-out fold. calc_advanced_metrics takes (TP, FP, TN, FN),
# which is a different order from the tuple classify_tree returns.
TP, TN, FP, FN = classify_tree(
    testing_data, top_mutation, top_left_mutation, top_right_mutation,
    classified_A1, classified_A2, classified_B1, classified_B2)
advanced_metrics_3 = pd.DataFrame(
    calc_advanced_metrics(TP, FP, TN, FN),
    columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_3



If the sample has mutation DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C-- then:
     If the sample has mutation UBR5_GRCh37_8:103289349-103289349_Frame-Shift-Del_DEL_T-T-- then:
         Classify as C
         Else classify as C
     Else if the sample has mutation ARL4D_GRCh37_17:41477713-41477713_3'UTR_DEL_C-C-- then:
         Classify as C
         Else classify as NC


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,59.46,21.05,100.0,100.0,78.95,0.0,45.45


In [36]:
def fold_average(metric):
    """Mean of one metric column over the three cross-validation rounds."""
    per_fold = (advanced_metrics_1, advanced_metrics_2, advanced_metrics_3)
    return sum(fold[metric] for fold in per_fold) / 3


average_accuracy = fold_average('Accuracy')
average_sensitive = fold_average('Sensitivity')
average_specificity = fold_average('Specificity')
average_precision = fold_average('Precision')
average_miss_rate = fold_average('Miss Rate')
average_fdr = fold_average('FDR')
average_forr = fold_average('FOR')


# Positive class = cancer ('C' samples), negative class = non-cancer ('NC').
# accuracy = (TP + TN) / all predictions: share of samples classified correctly
print(f'Average Accuracy: {average_accuracy.values[0]:.2f}%')
# sensitivity (true positive rate) = TP / (TP + FN): share of actual positives that were detected
print(f'Average Sensitivity: {average_sensitive.values[0]:.2f}%')
# specificity (true negative rate) = TN / (TN + FP): share of actual negatives that were recognised
print(f'Average Specificity: {average_specificity.values[0]:.2f}%')
# precision = TP / (TP + FP): share of predicted positives that are truly positive
print(f'Average Precision: {average_precision.values[0]:.2f}%')
# miss rate (false negative rate) = FN / (FN + TP): share of actual positives that were missed
print(f'Average Miss Rate: {average_miss_rate.values[0]:.2f}%')
# FDR (false discovery rate) = FP / (FP + TP): share of predicted positives that are actually negative
print(f'Average FDR: {average_fdr.values[0]:.2f}%')
# FOR (false omission rate) = FN / (FN + TN): share of predicted negatives that are actually positive
print(f'Average FOR: {average_forr.values[0]:.2f}%')

Average Accuracy: 61.89%
Average Sensitivity: 18.63%
Average Specificity: 96.88%
Average Precision: 80.56%
Average Miss Rate: 81.37%
Average FDR: 19.44%
Average FOR: 40.27%
