In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.display.float_format = '{:.2f}'.format

filename = 'mutations.csv'
data = pd.read_csv(filename, index_col=0)
samples = data.shape[0]
data.head()

Unnamed: 0,ANKRD26_GRCh37_10:27322259-27322259_Frame-Shift-Del_DEL_T-T--,ARID5B_GRCh37_10:63850705-63850705_Frame-Shift-Del_DEL_A-A--,PTEN_GRCh37_10:89717770-89717770_Frame-Shift-Del_DEL_A-A--,C11orf70_GRCh37_11:101937275-101937275_Frame-Shift-Del_DEL_T-T--,LRRC43_GRCh37_12:122685346-122685346_Frame-Shift-Del_DEL_C-C--,FARP1_GRCh37_13:99092237-99092237_Frame-Shift-Del_DEL_G-G--,SNAPC1_GRCh37_14:62242911-62242911_Frame-Shift-Del_DEL_T-T--,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift-Del_DEL_C-C--,KIF2B_GRCh37_17:51901904-51901904_Missense-Mutation_SNP_C-C-T,KIF2B_GRCh37_17:51902014-51902014_Frame-Shift-Del_DEL_A-A--,...,PWWP2B_GRCh37_10:134230688-134230688_3'UTR_DEL_C-C--,DPYSL2_GRCh37_8:26513311-26513311_3'UTR_SNP_T-T-G,NME5_GRCh37_5:137451362-137451362_3'UTR_DEL_T-T--,MGP_GRCh37_12:15035053-15035053_3'UTR_DEL_T-T--,NAP1L1_GRCh37_12:76442093-76442093_3'UTR_DEL_A-A--,SYTL1_GRCh37_1:27680355-27680356_3'UTR_DEL_CT-CT--,LRIT1_GRCh37_10:85991648-85991648_3'UTR_SNP_G-G-T,PLK4_GRCh37_4:128819735-128819735_3'UTR_DEL_T-T--,ZBED6CL_GRCh37_7:150028250-150028250_3'UTR_SNP_C-C-T,TERF2IP_GRCh37_16:75690558-75690558_3'UTR_DEL_A-A--
C0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# find (tL) and (tR) for a given mutation

def find_tL_tR(data, mutation):
    tL = []
    tR = []
    for sample in data.index:
        if data[mutation][sample] == 1:
            tL.append(sample)
        else:
            tR.append(sample)
    return tL, tR

In [22]:
# seperate sample into 2 groups based on NC or C

def find_NC_C(samples):
    NC = []
    C = []
    # go through array of samples and find if they are NC or C
    for sample in samples:
        if sample.startswith('NC'):
            NC.append(sample)
        else:
            C.append(sample)
    return NC, C
    
        

In [23]:
# create a dataframe with...
# (tL) n(tR) n(tL, C) n(tL,NC) PL PR P(C | tL) P(NC | tL) P(C | tR) P(NC | tR) 2PLPR Q phi(s,t)

#make this a function so i can resue it later
def create_phi_table(data):
    phi_value_table = pd.DataFrame(columns=['Mutation', '(tL)', 'n(tR)', 'n(tL, C)', 'n(tL, NC)', 'PL', 'PR', 'P(C | tL)', 'P(NC | tL)', 'P(C | tR)', 'P(NC | tR)', '2PLPR', 'Q', 'phi(s,t)'])

    for mutation in data.columns:
        nTL, nTR = find_tL_tR(data, mutation)
        nTL_NC, nTL_C = find_NC_C(nTL)
        nTR_NC, nTR_C = find_NC_C(nTR)  
        PL = len(nTL) / samples
        PR = len(nTR) / samples
        P_C_tL = len(nTL_C) / (len(nTL) + 0.00000001)
        P_NC_tL = len(nTL_NC) / (len(nTL) + 0.00000001)
        P_C_tR = len(nTR_C) / (len(nTR) + 0.0000001)
        P_NC_tR = len(nTR_NC) / (len(nTR) + 0.0000001)
        PLPR = 2 * PL * PR
        Q = (abs(P_C_tL - P_C_tR)) + (abs(P_NC_tL - P_NC_tR))
        phi = PLPR * Q


        #add all the values to the table
        phi_value_table.loc[len(phi_value_table)] = [
            mutation,
            len(nTL),
            len(nTR),
            len(nTL_C),
            len(nTL_NC),
            PL ,
            PR ,
            P_C_tL ,
            P_NC_tL,
            P_C_tR ,
            P_NC_tR,
            PLPR,
            Q,
            phi
        ]

    phi_value_table.sort_values('phi(s,t)', ascending=False, inplace=True)
    return phi_value_table


In [24]:
phi_value_table = create_phi_table(data)
phi_value_table


Unnamed: 0,Mutation,(tL),n(tR),"n(tL, C)","n(tL, NC)",PL,PR,P(C | tL),P(NC | tL),P(C | tR),P(NC | tR),2PLPR,Q,"phi(s,t)"
169,DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-D...,12,98,12,0,0.11,0.89,1.00,0.00,0.38,0.62,0.19,1.24,0.24
160,ZBTB20_GRCh37_3:114058003-114058003_Frame-Shif...,12,98,11,1,0.11,0.89,0.92,0.08,0.39,0.61,0.19,1.06,0.21
448,RNF43_GRCh37_17:56435161-56435161_Frame-Shift-...,12,98,11,1,0.11,0.89,0.92,0.08,0.39,0.61,0.19,1.06,0.21
285,ACVR2A_GRCh37_2:148683686-148683686_Frame-Shif...,12,98,11,1,0.11,0.89,0.92,0.08,0.39,0.61,0.19,1.06,0.21
478,PGM5_GRCh37_9:70993145-70993145_Missense-Mutat...,10,100,10,0,0.09,0.91,1.00,0.00,0.39,0.61,0.17,1.22,0.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,DFNB31_GRCh37_9:117266506-117266506_Silent_SNP...,2,108,1,1,0.02,0.98,0.50,0.50,0.44,0.56,0.04,0.11,0.00
66,MGA_GRCh37_15:42034971-42034971_Frame-Shift-De...,2,108,1,1,0.02,0.98,0.50,0.50,0.44,0.56,0.04,0.11,0.00
504,MED1_GRCh37_17:37564657-37564659_In-Frame-Del_...,2,108,1,1,0.02,0.98,0.50,0.50,0.44,0.56,0.04,0.11,0.00
44,ZDHHC5_GRCh37_11:57466855-57466857_In-Frame-De...,2,108,1,1,0.02,0.98,0.50,0.50,0.44,0.56,0.04,0.11,0.00


In [25]:
def classify(data):

    A1_C = 0
    A1_NC = 0
    A2_C = 0
    A2_NC = 0
    B1_C = 0
    B1_NC = 0
    B2_C = 0
    B2_NC = 0

    A1, A2, B1, B2 = '', '', '', ''

    for sample in data.index:
        if data[best_mutation][sample]:
            if data[best_mutation_A][sample]:
                if sample.startswith('NC'):
                    A1_NC += 1
                else:
                    A1_C += 1
            if not data[best_mutation_A][sample]:
                if sample.startswith('NC'):
                    A2_NC += 1
                else:
                    A2_C += 1

            else: 
                if data[best_mutation_B][sample]:
                    if sample.startswith('NC'):
                        B1_NC += 1
                    else:
                        B1_C += 1
                if not data[best_mutation_B][sample]:
                    if sample.startswith('NC'):
                        B2_NC += 1
                    else:
                        B2_C += 1

    if A1_C > A1_NC:
        A1 = 'C'
    else:
        A1 = 'NC'
    if A2_C > A2_NC:
        A2 = 'C'
    else:
        A2 = 'NC'
    if B1_C > B1_NC:
        B1 = 'C'
    else:
        B1 = 'NC'
    if B2_C > B2_NC:
        B2 = 'C'
    else:
        B2 = 'NC'

    return A1, A2, B1, B2      
            

In [26]:
# pick the best mutation
best_mutation = phi_value_table.iloc[0]['Mutation']
best_mutation

'DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--'

In [27]:
A, B = find_tL_tR(data, best_mutation)
print(A)
print(len(A))
print(B)
print(len(B))


['C1', 'C10', 'C21', 'C29', 'C32', 'C34', 'C38', 'C41', 'C43', 'C44', 'C46', 'C47']
12
['C0', 'NC0', 'C2', 'NC1', 'NC2', 'NC3', 'NC4', 'C3', 'NC5', 'NC6', 'NC7', 'C4', 'NC8', 'NC9', 'NC10', 'NC11', 'NC12', 'C5', 'NC13', 'NC14', 'C6', 'C7', 'C8', 'C9', 'NC15', 'NC16', 'C11', 'C12', 'NC17', 'C13', 'C14', 'C15', 'C16', 'NC18', 'NC19', 'NC20', 'C17', 'NC21', 'NC22', 'C18', 'NC23', 'NC24', 'NC25', 'C19', 'C20', 'C22', 'NC26', 'NC27', 'NC28', 'NC29', 'NC30', 'C23', 'NC31', 'NC32', 'C24', 'NC33', 'NC34', 'C25', 'C26', 'C27', 'NC35', 'NC36', 'C28', 'C30', 'NC37', 'NC38', 'NC39', 'NC40', 'NC41', 'NC42', 'NC43', 'NC44', 'NC45', 'NC46', 'NC47', 'NC48', 'NC49', 'C31', 'C33', 'NC50', 'NC51', 'NC52', 'NC53', 'C35', 'C36', 'C37', 'NC54', 'C39', 'C40', 'NC55', 'NC56', 'C42', 'C45', 'NC57', 'NC58', 'NC59', 'NC60', 'C48']
98


In [28]:
phi_table_A = create_phi_table(data.loc[A])
phi_table_A

best_mutation_A = phi_table_A.iloc[0]['Mutation']
print(best_mutation_A)

A1, A2 = find_tL_tR(data.loc[A], best_mutation_A)
print(A1)
print(len(A1))
print(A2)
print(len(A2))

UBR5_GRCh37_8:103289349-103289349_Frame-Shift-Del_DEL_T-T--
['C1', 'C21', 'C32', 'C34', 'C38', 'C41', 'C47']
7
['C10', 'C29', 'C43', 'C44', 'C46']
5


In [29]:
phi_table_B = create_phi_table(data.loc[B])
phi_table_B

best_mutation_B = phi_table_B.iloc[0]['Mutation']
print(best_mutation_B)

B1, B2 = find_tL_tR(data.loc[B], best_mutation_B)
print(B1)
print(len(B1))
print(B2)
print(len(B2))


ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A--
['C4', 'C11', 'C13', 'C27', 'C28', 'C40', 'NC57']
7
['C0', 'NC0', 'C2', 'NC1', 'NC2', 'NC3', 'NC4', 'C3', 'NC5', 'NC6', 'NC7', 'NC8', 'NC9', 'NC10', 'NC11', 'NC12', 'C5', 'NC13', 'NC14', 'C6', 'C7', 'C8', 'C9', 'NC15', 'NC16', 'C12', 'NC17', 'C14', 'C15', 'C16', 'NC18', 'NC19', 'NC20', 'C17', 'NC21', 'NC22', 'C18', 'NC23', 'NC24', 'NC25', 'C19', 'C20', 'C22', 'NC26', 'NC27', 'NC28', 'NC29', 'NC30', 'C23', 'NC31', 'NC32', 'C24', 'NC33', 'NC34', 'C25', 'C26', 'NC35', 'NC36', 'C30', 'NC37', 'NC38', 'NC39', 'NC40', 'NC41', 'NC42', 'NC43', 'NC44', 'NC45', 'NC46', 'NC47', 'NC48', 'NC49', 'C31', 'C33', 'NC50', 'NC51', 'NC52', 'NC53', 'C35', 'C36', 'C37', 'NC54', 'C39', 'NC55', 'NC56', 'C42', 'C45', 'NC58', 'NC59', 'NC60', 'C48']
91


In [30]:
print("best_mutation: ", best_mutation)
print("best_mutation_A: ", best_mutation_A)
print("best_mutation_B: ", best_mutation_B)


A1, A2, B1, B2 = classify(data)

print("A1: ", A1)
print("A2: ", A2)
print("B1: ", B1)
print("B2: ", B2)



best_mutation:  DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--
best_mutation_A:  UBR5_GRCh37_8:103289349-103289349_Frame-Shift-Del_DEL_T-T--
best_mutation_B:  ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A--
A1:  C
A2:  C
B1:  C
B2:  C


In [31]:
def build_tree(data):
    phi_value_table = create_phi_table(data)
    mutation = phi_value_table.iloc[0]['Mutation']
    A, B = find_tL_tR(data, mutation)
    phi_table_A = create_phi_table(data.loc[A])
    if(phi_table_A.iloc[0]['Mutation'] == mutation):
        best_mutation_A = phi_table_A.iloc[1]['Mutation']
    else:
        best_mutation_A = phi_table_A.iloc[0]['Mutation']
    A1, A2 = find_tL_tR(data.loc[A], best_mutation_A)

    if(phi_table_B.iloc[0]['Mutation'] == mutation or phi_table_B.iloc[0]['Mutation'] == best_mutation_A):
        best_mutation_B = phi_table_B.iloc[2]['Mutation']
    else:
        if(phi_table_B.iloc[0]['Mutation'] == mutation):
            best_mutation_B = phi_table_B.iloc[1]['Mutation']
        else:
            best_mutation_B = phi_table_B.iloc[0]['Mutation']
    B1, B2 = find_tL_tR(data.loc[B], best_mutation_B)
    
    return mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2

In [32]:
def calculate_mutation_data(data, best_mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2):
    TN, TP, FN, FP = 0, 0, 0, 0
    for sample in data.index:
        if data[best_mutation][sample] == 1:
            if data[best_mutation_A][sample] == 1:
                if sample.startswith("NC") and A1 == 'NC':
                    TN += 1
                elif sample.startswith("C") and A1 == 'C':
                    TP += 1
                elif sample.startswith("NC") and A1 == 'C':
                    FP += 1
                else:
                    # sample.startswith("C") and A1 == 'NC'
                    FN += 1
            else:
                if sample.startswith("NC") and A2 == 'NC':
                    TN += 1
                elif sample.startswith("C") and A2 == 'C':
                    TP += 1
                elif sample.startswith("NC") and A2 == 'C':
                    FP += 1
                else:
                    # sample.startswith("C") and A2 == 'NC'
                    FN += 1
        else:
            if data[best_mutation_B][sample] == 1:
                if sample.startswith("NC") and B1 == 'NC':
                    TN += 1
                elif sample.startswith("C") and B1 == 'C':
                    TP += 1
                elif sample.startswith("NC") and B1 == 'C':
                    FP += 1
                else:
                    # sample.startswith("C") and B1 == 'NC'
                    FN += 1
            else:
                if sample.startswith("NC") and B2 == 'NC':
                    TN += 1
                elif sample.startswith("C") and B2 == 'C':
                    TP += 1
                elif sample.startswith("NC") and B2 == 'C':
                    FP += 1
                else:
                    # sample.startswith("C") and B2 == 'NC'
                    FN += 1
    return TN, TP, FN, FP
    

In [33]:
# Advanced Metrics 
def calc_advanced_metrics(TP, FP, TN , FN ):
    advanced_metrics = []
    accuracy = (TP + TN) / (TP + FP + TN + FN) * 100
    sensitivity = TP / (TP + FN) * 100
    specificity = TN / (TN + FP) * 100
    precision = TP / (TP + FP) * 100
    miss_rate = FN / (FN + TP) * 100
    fdr = FP / (FP + TP) * 100
    forr = FN / (FN + TN + 0.00001) * 100

    advanced_metrics.append([accuracy, sensitivity, specificity, precision, miss_rate, fdr, forr])
    return advanced_metrics

In [34]:
# Sort Data into 3 groups 
# 2/3 = training data
# 1/3 = testing data

np.random.seed(69)
shuffled_data = np.random.permutation(data.index)
split_data = np.array_split(shuffled_data, 3)

one_third_one = data.loc[split_data[0]]
one_third_two = data.loc[split_data[1]]
one_thrid_three = data.loc[split_data[2]]

# print out data
print('1/3 of Data = ', one_third_one.index)
print('1/3 of Data = ', one_third_two.index)
print('1/3 of Data = ', one_thrid_three.index)

1/3 of Data =  Index(['C3', 'NC8', 'C29', 'NC12', 'NC36', 'C31', 'NC29', 'C15', 'NC6', 'C37',
       'C41', 'NC40', 'NC50', 'C16', 'C27', 'NC17', 'NC15', 'C45', 'C11',
       'C34', 'NC28', 'C13', 'NC37', 'C39', 'C43', 'NC58', 'NC23', 'NC10',
       'C10', 'NC2', 'NC19', 'NC54', 'NC33', 'NC3', 'C40', 'NC52', 'C1'],
      dtype='object')
1/3 of Data =  Index(['C47', 'NC46', 'C25', 'C28', 'NC48', 'C24', 'C44', 'NC1', 'NC4', 'C46',
       'NC22', 'C14', 'C48', 'NC34', 'C33', 'NC16', 'NC13', 'NC45', 'C20',
       'NC27', 'C18', 'NC43', 'C0', 'C12', 'NC41', 'NC51', 'C30', 'C32', 'C5',
       'NC7', 'NC35', 'NC11', 'C19', 'C22', 'C26', 'NC59', 'NC47'],
      dtype='object')
1/3 of Data =  Index(['NC38', 'NC39', 'NC57', 'C6', 'NC9', 'NC24', 'NC18', 'C8', 'NC25',
       'C38', 'C21', 'C42', 'C35', 'NC0', 'C9', 'NC21', 'NC49', 'C2', 'NC30',
       'NC60', 'NC20', 'C4', 'NC53', 'NC55', 'C17', 'NC56', 'NC32', 'NC5',
       'C7', 'NC26', 'NC14', 'NC31', 'C36', 'NC42', 'NC44', 'C23'],
      dtype='

In [35]:
# Create Training Data
# Training Data = 1/3_1 + 1/3_2
# Testing Data = 1/3_3
training_data = pd.concat([one_third_one, one_third_two])
#print('Training Data = ', training_data.index)
test_data = one_thrid_three
#print('Test Data = ', test_data.index)

# Build Tree
mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2 = build_tree(training_data)
# print('Mutation: ', mutation)
# print('Best Mutation A: ', best_mutation_A)
# print('Best Mutation B: ', best_mutation_B)



# Classify Test Data
A1, A2, B1, B2 = classify(training_data)


print("-------------------------------------------------------------------------------")
print('If the sample has mutation', best_mutation, 'then:')
print('     If the sample has mutation', best_mutation_A, 'then:')
print('         Classify as', A1)
print('         Else classify as', A2)
print('     Else if the sample has mutation', best_mutation_B, 'then:')
print('         Classify as', B1)
print('         Else classify as', B2)

print("-------------------------------------------------------------------------------")
TN, TP, FN, FP = calculate_mutation_data(test_data, best_mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2)
print('TN:', TN)
print('TP:', TP)
print('FN:', FN)
print('FP:', FP)


print("-------------------------------------------------------------------------------")
advanced_metrics_1 = calc_advanced_metrics(TP, FP, TN, FN)
advanced_metrics_1 = pd.DataFrame(advanced_metrics_1, columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_1



-------------------------------------------------------------------------------
If the sample has mutation DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C-- then:
     If the sample has mutation ZBTB20_GRCh37_3:114058003-114058003_Frame-Shift-Del_DEL_G-G-- then:
         Classify as C
         Else classify as C
     Else if the sample has mutation UPF3A_GRCh37_13:115057211-115057211_Frame-Shift-Del_DEL_A-A-- then:
         Classify as C
         Else classify as C
-------------------------------------------------------------------------------
TN: 0
TP: 13
FN: 0
FP: 23
-------------------------------------------------------------------------------


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,36.11,100.0,0.0,36.11,0.0,63.89,0.0


In [36]:
# Create Training Data
# training data = 1/3_1 + 1/3_3
# testing data = 1/3_2
training_data = pd.concat([one_third_one, one_thrid_three])
# print('Training Data = ', training_data.index)
test_data = one_third_two
# print('Test Data = ', test_data.index)

# Build Tree
mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2 = build_tree(training_data)
# print('Mutation: ', mutation)
# print('Best Mutation A: ', best_mutation_A)
# print('Best Mutation B: ', best_mutation_B)


# Classify Test Data
A1, A2, B1, B2 = classify(training_data)


print("-------------------------------------------------------------------------------")
print('If the sample has mutation', best_mutation, 'then:')
print('     If the sample has mutation', best_mutation_A, 'then:')
print('         Classify as', A1)
print('         Else classify as', A2)
print('     Else if the sample has mutation', best_mutation_B, 'then:')
print('         Classify as', B1)
print('         Else classify as', B2)

print("-------------------------------------------------------------------------------")
TN, TP, FN, FP = calculate_mutation_data(test_data, best_mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2)
print('TN:', TN)
print('TP:', TP)
print('FN:', FN)
print('FP:', FP)


print("-------------------------------------------------------------------------------")
advanced_metrics_2 = calc_advanced_metrics(TP, FP, TN, FN)
advanced_metrics_2 = pd.DataFrame(advanced_metrics_2, columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_2



-------------------------------------------------------------------------------
If the sample has mutation DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C-- then:
     If the sample has mutation KDM3B_GRCh37_5:137756625-137756625_Frame-Shift-Del_DEL_C-C-- then:
         Classify as C
         Else classify as C
     Else if the sample has mutation ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A-- then:
         Classify as C
         Else classify as NC
-------------------------------------------------------------------------------
TN: 18
TP: 5
FN: 14
FP: 0
-------------------------------------------------------------------------------


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,62.16,26.32,100.0,100.0,73.68,0.0,43.75


In [37]:
# Create Training Data
# training data = 1/3_2 + 1/3_3
# testing data = 1/3_1
training_data = pd.concat([one_third_two, one_thrid_three])
#print('Training Data = ', training_data.index)
test_data = one_third_one
#print('Test Data = ', test_data.index)

# Build Tree
mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2 = build_tree(training_data)
# print('Mutation: ', mutation)
# print('Best Mutation A: ', best_mutation_A)
# print('Best Mutation B: ', best_mutation_B)


# Classify Test Data
A1, A2, B1, B2 = classify(training_data)

print("-------------------------------------------------------------------------------")
print('If the sample has mutation', best_mutation, 'then:')
print('     If the sample has mutation', best_mutation_A, 'then:')
print('         Classify as', A1)
print('         Else classify as', A2)
print('     Else if the sample has mutation', best_mutation_B, 'then:')
print('         Classify as', B1)
print('         Else classify as', B2)

print("-------------------------------------------------------------------------------")
TN, TP, FN, FP = calculate_mutation_data(test_data, best_mutation, best_mutation_A, best_mutation_B, A1, A2, B1, B2)
print('TN:', TN)
print('TP:', TP)
print('FN:', FN)
print('FP:', FP)


print("-------------------------------------------------------------------------------")
advanced_metrics_3 = calc_advanced_metrics( TP, FP, TN, FN)
advanced_metrics_3 = pd.DataFrame(advanced_metrics_3, columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_3


-------------------------------------------------------------------------------
If the sample has mutation DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C-- then:
     If the sample has mutation LARP4B_GRCh37_10:890939-890939_Frame-Shift-Del_DEL_T-T-- then:
         Classify as C
         Else classify as C
     Else if the sample has mutation ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A-- then:
         Classify as C
         Else classify as C
-------------------------------------------------------------------------------
TN: 0
TP: 17
FN: 0
FP: 20
-------------------------------------------------------------------------------


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,45.95,100.0,0.0,45.95,0.0,54.05,0.0


In [38]:
average_accuracy = (advanced_metrics_1['Accuracy'] + advanced_metrics_2['Accuracy'] + advanced_metrics_3['Accuracy']) / 3
average_sensitive = (advanced_metrics_1['Sensitivity'] + advanced_metrics_2['Sensitivity'] + advanced_metrics_3['Sensitivity']) / 3
average_specificity = (advanced_metrics_1['Specificity'] + advanced_metrics_2['Specificity'] + advanced_metrics_3['Specificity']) / 3
average_precision = (advanced_metrics_1['Precision'] + advanced_metrics_2['Precision'] + advanced_metrics_3['Precision']) / 3
average_miss_rate = (advanced_metrics_1['Miss Rate'] + advanced_metrics_2['Miss Rate'] + advanced_metrics_3['Miss Rate']) / 3
average_fdr = (advanced_metrics_1['FDR'] + advanced_metrics_2['FDR'] + advanced_metrics_3['FDR']) / 3
average_forr = (advanced_metrics_1['FOR'] + advanced_metrics_2['FOR'] + advanced_metrics_3['FOR']) / 3

print("(Set 1)")

# accuracy = total number of correct predictions / total number of predictions
print(f'Average Accuracy: {average_accuracy.values[0]:.2f}%')
# sensitivity = true positive rate = TP / (TP + FN) = ability to classify true positives from all predicted positives
print(f'Average Sensitivity: {average_sensitive.values[0]:.2f}%')
# specificity = true negative rate = TN / (TN + FP) = ability to classify true negatives from all predicted negatives
print(f'Average Specificity: {average_specificity.values[0]:.2f}%')
# precision = TP / (TP + FP) = ability to classify true positives from all predicted positives
print(f'Average Precision: {average_precision.values[0]:.2f}%')
# miss rate = FN / (FN + TP) = ability to classify true negatives from all predicted negatives
print(f'Average Miss Rate: {average_miss_rate.values[0]:.2f}%')
# FDR = FP / (FP + TP) = ability to classify false postivies from all predicted positives
print(f'Average FDR: {average_fdr.values[0]:.2f}%')
# FOR = FN / (FN + TN) = ability to classify false negatives from all predicted negatives
print(f'Average FOR: {average_forr.values[0]:.2f}%')

(Set 1)
Average Accuracy: 48.07%
Average Sensitivity: 75.44%
Average Specificity: 33.33%
Average Precision: 60.69%
Average Miss Rate: 24.56%
Average FDR: 39.31%
Average FOR: 14.58%


# Metrics from TP-FP (Set 2)

- Average Accuracy: 61.84%
- Average Sensitivity: 17.63%
- Average Specificity: 96.88%
- Average Precision: 77.78%
- Average Miss Rate: 82.37%
- Average FDR: 22.22%
- Average FOR: 40.35%