In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Render floats with two decimals in every DataFrame shown below.
pd.set_option('display.float_format', '{:.2f}'.format)

# Load the sample-by-mutation matrix; the first CSV column holds the sample
# labels and becomes the row index.
filename = 'mutations.csv'
data = pd.read_csv(filename, index_col=0)
samples = len(data.index)  # number of rows (samples)
data.head()

Unnamed: 0,ANKRD26_GRCh37_10:27322259-27322259_Frame-Shift-Del_DEL_T-T--,ARID5B_GRCh37_10:63850705-63850705_Frame-Shift-Del_DEL_A-A--,PTEN_GRCh37_10:89717770-89717770_Frame-Shift-Del_DEL_A-A--,C11orf70_GRCh37_11:101937275-101937275_Frame-Shift-Del_DEL_T-T--,LRRC43_GRCh37_12:122685346-122685346_Frame-Shift-Del_DEL_C-C--,FARP1_GRCh37_13:99092237-99092237_Frame-Shift-Del_DEL_G-G--,SNAPC1_GRCh37_14:62242911-62242911_Frame-Shift-Del_DEL_T-T--,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift-Del_DEL_C-C--,KIF2B_GRCh37_17:51901904-51901904_Missense-Mutation_SNP_C-C-T,KIF2B_GRCh37_17:51902014-51902014_Frame-Shift-Del_DEL_A-A--,...,PWWP2B_GRCh37_10:134230688-134230688_3'UTR_DEL_C-C--,DPYSL2_GRCh37_8:26513311-26513311_3'UTR_SNP_T-T-G,NME5_GRCh37_5:137451362-137451362_3'UTR_DEL_T-T--,MGP_GRCh37_12:15035053-15035053_3'UTR_DEL_T-T--,NAP1L1_GRCh37_12:76442093-76442093_3'UTR_DEL_A-A--,SYTL1_GRCh37_1:27680355-27680356_3'UTR_DEL_CT-CT--,LRIT1_GRCh37_10:85991648-85991648_3'UTR_SNP_G-G-T,PLK4_GRCh37_4:128819735-128819735_3'UTR_DEL_T-T--,ZBED6CL_GRCh37_7:150028250-150028250_3'UTR_SNP_C-C-T,TERF2IP_GRCh37_16:75690558-75690558_3'UTR_DEL_A-A--
C0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Top 10 features (ranked by “TP – FP”) and their corresponding “TP – FP” values

In [2]:
# Rank every mutation (column) by "TP - FP":
#   TP = cancer samples (label starts with 'C') that carry the mutation
#   FP = all other samples that carry the mutation
# The counts are computed with boolean masks over the whole frame instead of a
# Python loop doing one chained scalar lookup per (sample, mutation) cell.
is_cancer = data.index.str.startswith('C')
has_mutation = data.eq(1)

tp_counts = has_mutation.loc[is_cancer].sum()
fp_counts = has_mutation.loc[~is_cancer].sum()

# Keep the [name, tp, fp] list-of-lists shape: later cells read mutation_data[0][0].
mutation_data = [
    [mutation, int(tp), int(fp)]
    for mutation, tp, fp in zip(data.columns, tp_counts, fp_counts)
]

# list.sort is stable, so mutations with equal scores keep their column order.
mutation_data.sort(key=lambda row: row[1] - row[2], reverse=True)

tp_fp_chart = pd.DataFrame({
    'Mutation': [row[0] for row in mutation_data],
    'TP - FP': [row[1] - row[2] for row in mutation_data],
})
tp_fp_chart.head(10)


Unnamed: 0,Mutation,TP - FP
0,DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-D...,12
1,ZBTB20_GRCh37_3:114058003-114058003_Frame-Shif...,10
2,ACVR2A_GRCh37_2:148683686-148683686_Frame-Shif...,10
3,RNF43_GRCh37_17:56435161-56435161_Frame-Shift-...,10
4,PGM5_GRCh37_9:70993145-70993145_Missense-Mutat...,10
5,UBR5_GRCh37_8:103289349-103289349_Frame-Shift-...,9
6,LARP4B_GRCh37_10:890939-890939_Frame-Shift-Del...,8
7,TVP23C_GRCh37_17:15441469-15441469_Intron_SNP_...,7
8,RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del...,7
9,UPF3A_GRCh37_13:115057211-115057211_Frame-Shif...,7


# Pick the most useful mutation (the one with the highest TP - FP)

In [3]:
# mutation_data is ordered best-first, so its first row is the top-ranked
# mutation; element 0 of that row is the mutation's name.
top_row = mutation_data[0]
most_useful_mutation = top_row[0]
print(f"The most useful mutation is: {most_useful_mutation}")

The most useful mutation is: DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


# Confusion matrix for classifying all samples using the most useful mutation (F) as the predictor

In [4]:
# One row per sample:
#   Actual    - 1 when the sample label starts with 'C' (cancer), else 0
#   Predicted - the sample's value in the selected mutation's column
actual_predicted = pd.DataFrame(index=data.index).assign(
    Actual=data.index.str.startswith('C').astype(int),
    Predicted=data[most_useful_mutation],
)
sum_of_pred = actual_predicted['Predicted'].sum()

# Cross-tabulate actual labels (rows) against predictions (columns).
confusion_matrix = pd.crosstab(
    index=actual_predicted['Actual'],
    columns=actual_predicted['Predicted'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
confusion_matrix.name = f"Confusion Matrix for {most_useful_mutation}"

print(confusion_matrix.name)
confusion_matrix

Confusion Matrix for DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,61,0
1,37,12


# Divide the samples into two groups, using feature F to classify each sample as either:
- **Group A**: samples that have mutation F
- **Group B**: samples that do not have mutation F

In [5]:
# Split the samples on the selected mutation:
#   Group A - samples whose value for the mutation equals 1
#   Group B - every other sample
# A boolean mask replaces the per-sample chained lookup data[col][sample].
carries_mutation = data[most_useful_mutation].eq(1).to_numpy()
group_a_patients = data.index[carries_mutation].tolist()
group_b_patients = data.index[~carries_mutation].tolist()

# The two groups differ in size, so the side-by-side table needs padding.
# Pad only the display columns: the membership lists above stay free of ''
# placeholders, so len() and label lookups on them remain correct.
max_len = max(len(group_a_patients), len(group_b_patients))
group_a_and_b_list = pd.DataFrame({
    'Group A': pd.Series(group_a_patients, dtype=object).reindex(range(max_len), fill_value=''),
    'Group B': pd.Series(group_b_patients, dtype=object).reindex(range(max_len), fill_value=''),
})
group_a_and_b_list.name = f"Group A and B for {most_useful_mutation}"
print(group_a_and_b_list.name)
# Raise the row limit so the listing below is not truncated at pandas' default.
pd.options.display.max_rows = 100
group_a_and_b_list

Group A and B for DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


Unnamed: 0,Group A,Group B
0,C1,C0
1,C10,NC0
2,C21,C2
3,C29,NC1
4,C32,NC2
5,C34,NC3
6,C38,NC4
7,C41,C3
8,C43,NC5
9,C44,NC6
