In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Show every float in displayed frames with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the mutation matrix; the first CSV column supplies the row labels.
filename = 'mutations.csv'
data = pd.read_csv(filename, index_col=0)
samples = len(data)  # number of rows (samples) in the matrix
data.head()

Unnamed: 0,ANKRD26_GRCh37_10:27322259-27322259_Frame-Shift-Del_DEL_T-T--,ARID5B_GRCh37_10:63850705-63850705_Frame-Shift-Del_DEL_A-A--,PTEN_GRCh37_10:89717770-89717770_Frame-Shift-Del_DEL_A-A--,C11orf70_GRCh37_11:101937275-101937275_Frame-Shift-Del_DEL_T-T--,LRRC43_GRCh37_12:122685346-122685346_Frame-Shift-Del_DEL_C-C--,FARP1_GRCh37_13:99092237-99092237_Frame-Shift-Del_DEL_G-G--,SNAPC1_GRCh37_14:62242911-62242911_Frame-Shift-Del_DEL_T-T--,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift-Del_DEL_C-C--,KIF2B_GRCh37_17:51901904-51901904_Missense-Mutation_SNP_C-C-T,KIF2B_GRCh37_17:51902014-51902014_Frame-Shift-Del_DEL_A-A--,...,PWWP2B_GRCh37_10:134230688-134230688_3'UTR_DEL_C-C--,DPYSL2_GRCh37_8:26513311-26513311_3'UTR_SNP_T-T-G,NME5_GRCh37_5:137451362-137451362_3'UTR_DEL_T-T--,MGP_GRCh37_12:15035053-15035053_3'UTR_DEL_T-T--,NAP1L1_GRCh37_12:76442093-76442093_3'UTR_DEL_A-A--,SYTL1_GRCh37_1:27680355-27680356_3'UTR_DEL_CT-CT--,LRIT1_GRCh37_10:85991648-85991648_3'UTR_SNP_G-G-T,PLK4_GRCh37_4:128819735-128819735_3'UTR_DEL_T-T--,ZBED6CL_GRCh37_7:150028250-150028250_3'UTR_SNP_C-C-T,TERF2IP_GRCh37_16:75690558-75690558_3'UTR_DEL_A-A--
C0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Count TP, FP, TN and FN for every mutation column

def calculate_mutation_data(data):
    """Tally a confusion matrix for each mutation column of ``data``.

    A sample counts as mutated when its cell equals 1. A sample counts as a
    positive (label 'C...') when its index label starts with 'C'; every other
    label is treated as a negative.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample and one column per mutation. Index labels must be
        strings, because ``str.startswith`` is called on each of them.

    Returns
    -------
    list of list
        One ``[mutation, tp, fp, tn, fn]`` entry per column, in column order,
        with the four counts as plain Python ints.
    """
    # Row mask: True for samples whose label starts with 'C'.
    is_positive = np.array([label.startswith('C') for label in data.index], dtype=bool)
    # Cell mask: True where the sample carries the mutation.
    mutated = data.eq(1).to_numpy(dtype=bool)

    # Count whole columns at once instead of looking up one cell at a time.
    positive_rows = is_positive[:, None]
    tp_counts = (mutated & positive_rows).sum(axis=0)
    fp_counts = (mutated & ~positive_rows).sum(axis=0)

    # Non-mutated counts follow from the class totals.
    total_positive = int(is_positive.sum())
    total_negative = len(is_positive) - total_positive

    return [
        [mutation, int(tp), int(fp), total_negative - int(fp), total_positive - int(tp)]
        for mutation, tp, fp in zip(data.columns, tp_counts, fp_counts)
    ]


In [3]:
# Advanced Metrics
def calc_advanced_metrics(mutation_data, TP, FP, TN, FN):
    """Derive percentage metrics from confusion-matrix counts.

    Parameters
    ----------
    mutation_data
        Not used by the calculation; kept so existing calls keep working.
    TP, FP, TN, FN : int
        True-positive, false-positive, true-negative and false-negative counts.

    Returns
    -------
    list of list
        A single row ``[accuracy, sensitivity, specificity, precision,
        miss_rate, fdr, forr]`` with each value as a percentage. A metric
        whose denominator is zero is undefined and is returned as NaN
        instead of raising ``ZeroDivisionError``.
    """
    def _percent(numerator, denominator):
        # An empty denominator (e.g. no predicted positives) leaves the
        # metric undefined rather than being an error.
        if denominator == 0:
            return float('nan')
        return numerator / denominator * 100

    accuracy = _percent(TP + TN, TP + FP + TN + FN)
    sensitivity = _percent(TP, TP + FN)
    specificity = _percent(TN, TN + FP)
    precision = _percent(TP, TP + FP)
    miss_rate = _percent(FN, FN + TP)
    fdr = _percent(FP, FP + TP)
    forr = _percent(FN, FN + TN)

    return [[accuracy, sensitivity, specificity, precision, miss_rate, fdr, forr]]

In [4]:
# Build Tree
def build_tree(data):
    """Choose the three mutations of a two-level decision tree.

    Every mutation is scored as TP - FP from calculate_mutation_data. The
    best-scoring mutation over all samples becomes the root. The samples are
    then split by whether their root-mutation cell equals 1, and inside each
    half the best-scoring mutation other than the root is chosen. Ties go to
    the earliest column because the sort is stable.

    Each chosen mutation is printed as it is found.

    Returns (first_mutation, mutation_group1, mutation_group2); group 1 is
    the samples carrying the root mutation, group 2 is the rest.
    """
    def ranked_names(frame):
        # Mutation names ordered by TP - FP, best first.
        stats = calculate_mutation_data(frame)
        ordered = sorted(stats, key=lambda row: row[1] - row[2], reverse=True)
        return [row[0] for row in ordered]

    def top_excluding_root(frame, root):
        # Skip the root when it also ranks first inside this group.
        names = ranked_names(frame)
        return names[1] if names[0] == root else names[0]

    # find the first most important mutation from the data
    first_mutation = ranked_names(data)[0]
    print(f'The first most important mutation is {first_mutation}')

    # split the samples on whether they carry the first mutation
    carries_root = data[first_mutation] == 1

    mutation_group1 = top_excluding_root(data.loc[carries_root], first_mutation)
    print(f'The most important mutation in group 1 is {mutation_group1}')

    mutation_group2 = top_excluding_root(data.loc[~carries_root], first_mutation)
    print(f'The most important mutation in group 2 is {mutation_group2}')

    return first_mutation, mutation_group1, mutation_group2



In [5]:
# Test Tree

# bring the first mutation, group1 mutation, group2 mutation
# bring in the testing data
# sort data based on tree that we build

def test_tree(data, first_mutation, mutation_group1, mutation_group2):
    TP, FP, TN, FN = 0, 0, 0, 0
    for sample in data.index:
        if data[first_mutation][sample] == 1:
            if data[mutation_group1][sample] == 1:
                if sample.startswith('C'):
                    TP += 1
                else :
                    FP += 1
            else:
                if sample.startswith('NC'):
                    TN += 1
                else: 
                    FN += 1
                
        else:
            if data[mutation_group2][sample] == 1:
                if sample.startswith('C'):
                    TP += 1
                else:
                    FP += 1
            else :
                if sample.startswith('NC'):
                    TN += 1
                else:
                    FN += 1

    #print out the results
    print(f'TP: {TP}, FP: {FP}, TN: {TN}, FN: {FN}')
    return TP, FP, TN, FN

In [6]:
# Split the samples into three folds for 3-fold cross-validation:
# each run trains on two folds (2/3) and tests on the remaining one (1/3).

np.random.seed(69)  # fixed seed so the shuffle is reproducible
shuffled_data = np.random.permutation(data.index)
split_data = np.array_split(shuffled_data, 3)

# `one_thrid_three` (sic) keeps its spelling because later cells refer to it.
one_third_one, one_third_two, one_thrid_three = (data.loc[fold] for fold in split_data)

# print out data
for fold_frame in (one_third_one, one_third_two, one_thrid_three):
    print('1/3 of Data = ', fold_frame.index)



1/3 of Data =  Index(['C3', 'NC8', 'C29', 'NC12', 'NC36', 'C31', 'NC29', 'C15', 'NC6', 'C37',
       'C41', 'NC40', 'NC50', 'C16', 'C27', 'NC17', 'NC15', 'C45', 'C11',
       'C34', 'NC28', 'C13', 'NC37', 'C39', 'C43', 'NC58', 'NC23', 'NC10',
       'C10', 'NC2', 'NC19', 'NC54', 'NC33', 'NC3', 'C40', 'NC52', 'C1'],
      dtype='object')
1/3 of Data =  Index(['C47', 'NC46', 'C25', 'C28', 'NC48', 'C24', 'C44', 'NC1', 'NC4', 'C46',
       'NC22', 'C14', 'C48', 'NC34', 'C33', 'NC16', 'NC13', 'NC45', 'C20',
       'NC27', 'C18', 'NC43', 'C0', 'C12', 'NC41', 'NC51', 'C30', 'C32', 'C5',
       'NC7', 'NC35', 'NC11', 'C19', 'C22', 'C26', 'NC59', 'NC47'],
      dtype='object')
1/3 of Data =  Index(['NC38', 'NC39', 'NC57', 'C6', 'NC9', 'NC24', 'NC18', 'C8', 'NC25',
       'C38', 'C21', 'C42', 'C35', 'NC0', 'C9', 'NC21', 'NC49', 'C2', 'NC30',
       'NC60', 'NC20', 'C4', 'NC53', 'NC55', 'C17', 'NC56', 'NC32', 'NC5',
       'C7', 'NC26', 'NC14', 'NC31', 'C36', 'NC42', 'NC44', 'C23'],
      dtype='

In [7]:
# Cross-validation run 1
# Training Data = 1/3_1 + 1/3_2
# Testing Data = 1/3_3
training_data = pd.concat([one_third_one, one_third_two])
test_data = one_thrid_three
print('Training Data = ', training_data.index)
print('Test Data = ', test_data.index)

# Build Tree
first_mutation, mutation_group1, mutation_group2 = build_tree(training_data)

# Test Tree
TP, FP, TN, FN = test_tree(test_data, first_mutation, mutation_group1, mutation_group2)

# Calculate Advanced Metrics
advanced_metrics_1 = pd.DataFrame(
    calc_advanced_metrics(test_data, TP, FP, TN, FN),
    columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'],
)
advanced_metrics_1





Training Data =  Index(['C3', 'NC8', 'C29', 'NC12', 'NC36', 'C31', 'NC29', 'C15', 'NC6', 'C37',
       'C41', 'NC40', 'NC50', 'C16', 'C27', 'NC17', 'NC15', 'C45', 'C11',
       'C34', 'NC28', 'C13', 'NC37', 'C39', 'C43', 'NC58', 'NC23', 'NC10',
       'C10', 'NC2', 'NC19', 'NC54', 'NC33', 'NC3', 'C40', 'NC52', 'C1', 'C47',
       'NC46', 'C25', 'C28', 'NC48', 'C24', 'C44', 'NC1', 'NC4', 'C46', 'NC22',
       'C14', 'C48', 'NC34', 'C33', 'NC16', 'NC13', 'NC45', 'C20', 'NC27',
       'C18', 'NC43', 'C0', 'C12', 'NC41', 'NC51', 'C30', 'C32', 'C5', 'NC7',
       'NC35', 'NC11', 'C19', 'C22', 'C26', 'NC59', 'NC47'],
      dtype='object')
Test Data =  Index(['NC38', 'NC39', 'NC57', 'C6', 'NC9', 'NC24', 'NC18', 'C8', 'NC25',
       'C38', 'C21', 'C42', 'C35', 'NC0', 'C9', 'NC21', 'NC49', 'C2', 'NC30',
       'NC60', 'NC20', 'C4', 'NC53', 'NC55', 'C17', 'NC56', 'NC32', 'NC5',
       'C7', 'NC26', 'NC14', 'NC31', 'C36', 'NC42', 'NC44', 'C23'],
      dtype='object')
The first most important muta

Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,63.89,7.69,95.65,50.0,92.31,50.0,35.29


In [8]:
# Cross-validation run 2
# training data = 1/3_1 + 1/3_3
# testing data = 1/3_2
training_data = pd.concat([one_third_one, one_thrid_three])
print('Training Data = ', training_data.index)
test_data = one_third_two
# Report the held-out fold too, as the other two runs do.
print('Test Data = ', test_data.index)

# Build Tree
first_mutation, mutation_group1, mutation_group2 = build_tree(training_data)

# Test Tree
TP, FP, TN, FN = test_tree(test_data, first_mutation, mutation_group1, mutation_group2)

# Calculate Advanced Metrics
# Pass the held-out fold (not the full data set) so the call matches run 1;
# calc_advanced_metrics does not read this argument, so the numbers are unchanged.
advanced_metrics_2 = calc_advanced_metrics(test_data, TP, FP, TN, FN)
advanced_metrics_2 = pd.DataFrame(advanced_metrics_2, columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_2

Training Data =  Index(['C3', 'NC8', 'C29', 'NC12', 'NC36', 'C31', 'NC29', 'C15', 'NC6', 'C37',
       'C41', 'NC40', 'NC50', 'C16', 'C27', 'NC17', 'NC15', 'C45', 'C11',
       'C34', 'NC28', 'C13', 'NC37', 'C39', 'C43', 'NC58', 'NC23', 'NC10',
       'C10', 'NC2', 'NC19', 'NC54', 'NC33', 'NC3', 'C40', 'NC52', 'C1',
       'NC38', 'NC39', 'NC57', 'C6', 'NC9', 'NC24', 'NC18', 'C8', 'NC25',
       'C38', 'C21', 'C42', 'C35', 'NC0', 'C9', 'NC21', 'NC49', 'C2', 'NC30',
       'NC60', 'NC20', 'C4', 'NC53', 'NC55', 'C17', 'NC56', 'NC32', 'NC5',
       'C7', 'NC26', 'NC14', 'NC31', 'C36', 'NC42', 'NC44', 'C23'],
      dtype='object')
The first most important mutation is ZBTB20_GRCh37_3:114058003-114058003_Frame-Shift-Del_DEL_G-G--
The most important mutation in group 1 is DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--
The most important mutation in group 2 is RNF43_GRCh37_17:56435161-56435161_Frame-Shift-Del_DEL_C-C--
TP: 3, FP: 0, TN: 18, FN: 16


Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,56.76,15.79,100.0,100.0,84.21,0.0,47.06


In [9]:
# Cross-validation run 3
# training data = 1/3_2 + 1/3_3
# testing data = 1/3_1
training_data = pd.concat([one_third_two, one_thrid_three])
print('Training Data = ', training_data.index)
test_data = one_third_one
print('Test Data = ', test_data.index)

# Build Tree
first_mutation, mutation_group1, mutation_group2 = build_tree(training_data)

# Test Tree
TP, FP, TN, FN = test_tree(test_data, first_mutation, mutation_group1, mutation_group2)

# Calculate Advanced Metrics
# Pass the held-out fold (not the full data set) so the call matches run 1;
# calc_advanced_metrics does not read this argument, so the numbers are unchanged.
advanced_metrics_3 = calc_advanced_metrics(test_data, TP, FP, TN, FN)
advanced_metrics_3 = pd.DataFrame(advanced_metrics_3, columns=['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Miss Rate', 'FDR', 'FOR'])
advanced_metrics_3

Training Data =  Index(['C47', 'NC46', 'C25', 'C28', 'NC48', 'C24', 'C44', 'NC1', 'NC4', 'C46',
       'NC22', 'C14', 'C48', 'NC34', 'C33', 'NC16', 'NC13', 'NC45', 'C20',
       'NC27', 'C18', 'NC43', 'C0', 'C12', 'NC41', 'NC51', 'C30', 'C32', 'C5',
       'NC7', 'NC35', 'NC11', 'C19', 'C22', 'C26', 'NC59', 'NC47', 'NC38',
       'NC39', 'NC57', 'C6', 'NC9', 'NC24', 'NC18', 'C8', 'NC25', 'C38', 'C21',
       'C42', 'C35', 'NC0', 'C9', 'NC21', 'NC49', 'C2', 'NC30', 'NC60', 'NC20',
       'C4', 'NC53', 'NC55', 'C17', 'NC56', 'NC32', 'NC5', 'C7', 'NC26',
       'NC14', 'NC31', 'C36', 'NC42', 'NC44', 'C23'],
      dtype='object')
Test Data =  Index(['C3', 'NC8', 'C29', 'NC12', 'NC36', 'C31', 'NC29', 'C15', 'NC6', 'C37',
       'C41', 'NC40', 'NC50', 'C16', 'C27', 'NC17', 'NC15', 'C45', 'C11',
       'C34', 'NC28', 'C13', 'NC37', 'C39', 'C43', 'NC58', 'NC23', 'NC10',
       'C10', 'NC2', 'NC19', 'NC54', 'NC33', 'NC3', 'C40', 'NC52', 'C1'],
      dtype='object')
The first most important muta

Unnamed: 0,Accuracy,Sensitivity,Specificity,Precision,Miss Rate,FDR,FOR
0,64.86,29.41,95.0,83.33,70.59,16.67,38.71


In [10]:
# Average each metric over the three cross-validation runs

def _fold_average(column):
    # Mean of one metric column across the three per-run result frames.
    return (advanced_metrics_1[column] + advanced_metrics_2[column] + advanced_metrics_3[column]) / 3

average_accuracy = _fold_average('Accuracy')
average_sensitive = _fold_average('Sensitivity')
average_specificity = _fold_average('Specificity')
average_precision = _fold_average('Precision')
average_miss_rate = _fold_average('Miss Rate')
average_fdr = _fold_average('FDR')
average_forr = _fold_average('FOR')

# Each average is a one-row Series, so .values[0] extracts the number.
for metric_label, metric_average in [
    ('Accuracy', average_accuracy),
    ('Sensitivity', average_sensitive),
    ('Specificity', average_specificity),
    ('Precision', average_precision),
    ('Miss Rate', average_miss_rate),
    ('FDR', average_fdr),
    ('FOR', average_forr),
]:
    print(f'Average {metric_label}: {metric_average.values[0]:.2f}%')

Average Accuracy: 61.84%
Average Sensitivity: 17.63%
Average Specificity: 96.88%
Average Precision: 77.78%
Average Miss Rate: 82.37%
Average FDR: 22.22%
Average FOR: 40.35%


# Discussion and interpretation of the 3-fold cross-validation results (including any confluence among the three trees)

- Model is NOT good at finding true positive cases
    - Average Sensitivity = 17.63%. Proportion of actual positives correctly identified by the model (TP / ALL POSITIVE)
    - Average Miss Rate = 82.37%. Proportion of actual positives that are missed by the model (FN / ALL POSITIVE)
    - Average FOR = 40.35%. Proportion of predicted negatives that are actually positive, i.e. how many positives are hidden among the predicted negatives (FN / ALL PREDICTED NEGATIVE)
    - Average Specificity = 96.88%. Proportion of actual negatives correctly identified (TN / ALL NEGATIVE). By contrast with the figures above, the model rarely flags a negative sample even though it misses most positives

- Confluence 
    - Tree One Mutations Selected : DOCK3, RNF43, ACVR2A
    - Tree Two Mutations Selected : ZBTB20, DOCK3, RNF43
    - Tree Three Mutations Selected : PGM5, LARP4B, RNF43

    - All three trees have RNF43
    - Two trees (one and two) have DOCK3