In [1]:
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:.2f}'.format

filename = 'mutations.csv'
data = pd.read_csv(filename, index_col=0)
samples = data.shape[0]
data.head()

Unnamed: 0,ANKRD26_GRCh37_10:27322259-27322259_Frame-Shift-Del_DEL_T-T--,ARID5B_GRCh37_10:63850705-63850705_Frame-Shift-Del_DEL_A-A--,PTEN_GRCh37_10:89717770-89717770_Frame-Shift-Del_DEL_A-A--,C11orf70_GRCh37_11:101937275-101937275_Frame-Shift-Del_DEL_T-T--,LRRC43_GRCh37_12:122685346-122685346_Frame-Shift-Del_DEL_C-C--,FARP1_GRCh37_13:99092237-99092237_Frame-Shift-Del_DEL_G-G--,SNAPC1_GRCh37_14:62242911-62242911_Frame-Shift-Del_DEL_T-T--,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift-Del_DEL_C-C--,KIF2B_GRCh37_17:51901904-51901904_Missense-Mutation_SNP_C-C-T,KIF2B_GRCh37_17:51902014-51902014_Frame-Shift-Del_DEL_A-A--,...,PWWP2B_GRCh37_10:134230688-134230688_3'UTR_DEL_C-C--,DPYSL2_GRCh37_8:26513311-26513311_3'UTR_SNP_T-T-G,NME5_GRCh37_5:137451362-137451362_3'UTR_DEL_T-T--,MGP_GRCh37_12:15035053-15035053_3'UTR_DEL_T-T--,NAP1L1_GRCh37_12:76442093-76442093_3'UTR_DEL_A-A--,SYTL1_GRCh37_1:27680355-27680356_3'UTR_DEL_CT-CT--,LRIT1_GRCh37_10:85991648-85991648_3'UTR_SNP_G-G-T,PLK4_GRCh37_4:128819735-128819735_3'UTR_DEL_T-T--,ZBED6CL_GRCh37_7:150028250-150028250_3'UTR_SNP_C-C-T,TERF2IP_GRCh37_16:75690558-75690558_3'UTR_DEL_A-A--
C0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NC1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Top 10 features (ranked by “TP – FP”) and their corresponding “TP – FP” values

In [2]:
mutation_data = []

for mutation in data:
    curr_tp, curr_fp = 0, 0
    for index in data.index:
        if data[mutation][index] == 1:
            if index.startswith('C'):
                curr_tp += 1
            else:
                curr_fp += 1
    # print(f"Mutation: {mutation}")
    # print(f"True Positive: {curr_tp}")
    # print(f"False Positive: {curr_fp}")
    mutation_data.append([mutation, curr_tp, curr_fp])

mutation_data.sort(key=lambda x: x[1] - x[2], reverse=True)
mutation_data 

tp_fp_chart = pd.DataFrame(columns=['Mutation', 'TP - FP'])
tp_fp_chart['Mutation'] = [x[0] for x in mutation_data]
tp_fp_chart['TP - FP'] = [x[1] - x[2] for x in mutation_data]
tp_fp_chart.head(10)


Unnamed: 0,Mutation,TP - FP
0,DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-D...,12
1,ZBTB20_GRCh37_3:114058003-114058003_Frame-Shif...,10
2,ACVR2A_GRCh37_2:148683686-148683686_Frame-Shif...,10
3,RNF43_GRCh37_17:56435161-56435161_Frame-Shift-...,10
4,PGM5_GRCh37_9:70993145-70993145_Missense-Mutat...,10
5,UBR5_GRCh37_8:103289349-103289349_Frame-Shift-...,9
6,LARP4B_GRCh37_10:890939-890939_Frame-Shift-Del...,8
7,TVP23C_GRCh37_17:15441469-15441469_Intron_SNP_...,7
8,RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del...,7
9,UPF3A_GRCh37_13:115057211-115057211_Frame-Shif...,7


# Pick the most useful mutation (the one with the highest TP - FP)

In [3]:
most_useful_mutation = mutation_data[0][0]
print(f"The most useful mutation is: {most_useful_mutation}")

The most useful mutation is: DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


# Divide the samples into two groups by using the feature F to classify each sample as either
 ### Group-A: samples that have mutation F
 ### Group-B: samples that do not have mutation F

In [4]:
group_a_patients = []
group_b_patients = []

for sample in data.index:
    if data[most_useful_mutation][sample] == 1:
        group_a_patients.append(sample)
    else:
        group_b_patients.append(sample)

max_len = max(len(group_a_patients), len(group_b_patients))
for i in range(max_len - len(group_a_patients)):
    group_a_patients.append('')
for i in range(max_len - len(group_b_patients)):
    group_b_patients.append('')

group_a_and_b_list = pd.DataFrame(columns=['Group A', 'Group B'])
group_a_and_b_list['Group A'] = group_a_patients
group_a_and_b_list['Group B'] = group_b_patients
group_a_and_b_list.name = f"Group A and B for {most_useful_mutation}"
print(group_a_and_b_list.name)
pd.options.display.max_rows = 100
group_a_and_b_list

Group A and B for DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


Unnamed: 0,Group A,Group B
0,C1,C0
1,C10,NC0
2,C21,C2
3,C29,NC1
4,C32,NC2
5,C34,NC3
6,C38,NC4
7,C41,C3
8,C43,NC5
9,C44,NC6


In [23]:
group_a_mutation_data = []
# find the true positive and false positive for group A

#go into data and pull data for group A patients
# drop '' from group_a_patients
group_a_patients = [x for x in group_a_patients if x != '']

group_a_data = data.loc[group_a_patients]

for mutation in group_a_data:
    curr_tp, curr_fp = 0, 0
    for index in group_a_data.index:
        if group_a_data[mutation][index] == 1:
            if index.startswith('C'):
                curr_tp += 1
            else:
                curr_fp += 1
    group_a_mutation_data.append([mutation, curr_tp, curr_fp])


group_a_mutation_data.sort(key=lambda x: x[1] - x[2], reverse=True)

#drop the most useful mutation from the list
group_a_mutation_data = [x for x in group_a_mutation_data if x[0] != most_useful_mutation]

group_a_tp_fp_chart = pd.DataFrame(columns=['Mutation', 'TP - FP'])
group_a_tp_fp_chart['Mutation'] = [x[0] for x in group_a_mutation_data]
group_a_tp_fp_chart['TP - FP'] = [x[1] - x[2] for x in group_a_mutation_data]
group_a_tp_fp_chart.name = f"Group A TP - FP for {most_useful_mutation}"

print(group_a_tp_fp_chart.name)
group_a_tp_fp_chart.head(10)


Group A TP - FP for DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


Unnamed: 0,Mutation,TP - FP
0,UBR5_GRCh37_8:103289349-103289349_Frame-Shift-...,7
1,RNF43_GRCh37_17:56435161-56435161_Frame-Shift-...,7
2,ZBTB20_GRCh37_3:114058003-114058003_Frame-Shif...,6
3,LARP4B_GRCh37_10:890939-890939_Frame-Shift-Del...,6
4,PGM5_GRCh37_9:70993145-70993145_Missense-Mutat...,6
5,TVP23C_GRCh37_17:15441469-15441469_Intron_SNP_...,5
6,CCAR2_GRCh37_8:22472975-22472975_Frame-Shift-D...,5
7,ACVR2A_GRCh37_2:148683686-148683686_Frame-Shif...,5
8,PHF2_GRCh37_9:96422612-96422612_Frame-Shift-De...,4
9,RPL22_GRCh37_1:6257785-6257785_Frame-Shift-Del...,4


In [24]:
#grab top mutation for group A

group_a_most_useful_mutation = group_a_mutation_data[0][0]
print(f"The most useful mutation for group A is: {group_a_most_useful_mutation}")

The most useful mutation for group A is: UBR5_GRCh37_8:103289349-103289349_Frame-Shift-Del_DEL_T-T--


In [28]:
# make a confustion matrix for group A
actual_predicted_A = pd.DataFrame(index=group_a_data.index, columns=['Actual', 'Predicted'])


# Values where the sample starts with 'C', indicating the sample has cancer
actual_predicted_A['Actual'] = group_a_data.index.str.startswith('C').astype(int)

# Values where the patient has the cancer mutation
actual_predicted_A['Predicted'] = group_a_data[group_a_most_useful_mutation]

sum_of_pred = actual_predicted_A['Predicted'].sum()

confusion_matrix_A = pd.crosstab(actual_predicted_A['Actual'], actual_predicted_A['Predicted'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix_A.name = f"Confusion Matrix for {group_a_most_useful_mutation}"


print(confusion_matrix_A.name)
confusion_matrix_A


Confusion Matrix for UBR5_GRCh37_8:103289349-103289349_Frame-Shift-Del_DEL_T-T--


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5,7


In [40]:
group_a1_patients = []
group_a0_patients = []

for sample in group_a_data.index:
    if group_a_data[group_a_most_useful_mutation][sample] == 1:
        group_a1_patients.append(sample)
    else:
        group_a0_patients.append(sample)

max_len = max(len(group_a1_patients), len(group_a0_patients))
for i in range(max_len - len(group_a1_patients)):
    group_a1_patients.append('')
for i in range(max_len - len(group_a0_patients)):
    group_a0_patients.append('')

group_a1_and_a0_list = pd.DataFrame(columns=['Group A1(C)', 'Group A0(NC)'])
group_a1_and_a0_list['Group A1(C)'] = group_a1_patients
group_a1_and_a0_list['Group A0(NC)'] = group_a0_patients
group_a1_and_a0_list.name = f"Group A1(C) and A0(NC) for {group_a_most_useful_mutation}"
print(group_a1_and_a0_list.name)
group_a1_and_a0_list

Group A1(C) and A0(NC) for UBR5_GRCh37_8:103289349-103289349_Frame-Shift-Del_DEL_T-T--


Unnamed: 0,Group A1(C),Group A0(NC)
0,C1,C10
1,C21,C29
2,C32,C43
3,C34,C44
4,C38,C46
5,C41,
6,C47,


In [25]:
group_b_mutation_data = []
# find the true positive and false positive for group A

#go into data and pull data for group A patients
# drop '' from group_a_patients
group_b_patients = [x for x in group_b_patients if x != '']

group_b_data = data.loc[group_b_patients]

for mutation in group_b_data:
    curr_tp, curr_fp = 0, 0
    for index in group_b_data.index:
        if group_b_data[mutation][index] == 1:
            if index.startswith('C'):
                curr_tp += 1
            else:
                curr_fp += 1
    group_b_mutation_data.append([mutation, curr_tp, curr_fp])


group_b_mutation_data.sort(key=lambda x: x[1] - x[2], reverse=True)

#drop the most useful mutation from the list
group_b_mutation_data = [x for x in group_b_mutation_data if x[0] != most_useful_mutation]

group_b_tp_fp_chart = pd.DataFrame(columns=['Mutation', 'TP - FP'])
group_b_tp_fp_chart['Mutation'] = [x[0] for x in group_b_mutation_data]
group_b_tp_fp_chart['TP - FP'] = [x[1] - x[2] for x in group_b_mutation_data]
group_b_tp_fp_chart.name = f"Group A TP - FP for {most_useful_mutation}"

print(group_b_tp_fp_chart.name)
group_b_tp_fp_chart.head(10)

Group A TP - FP for DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--


Unnamed: 0,Mutation,TP - FP
0,ACVR2A_GRCh37_2:148683686-148683686_Frame-Shif...,5
1,ZC3H18_GRCh37_16:88691141-88691141_Frame-Shift...,4
2,ZBTB20_GRCh37_3:114058003-114058003_Frame-Shif...,4
3,BMPR2_GRCh37_2:203420130-203420130_Frame-Shift...,4
4,ZNF330_GRCh37_4:142143532-142143532_Frame-Shif...,4
5,PGM5_GRCh37_9:70993145-70993145_Missense-Mutat...,4
6,GLI3_GRCh37_7:42005573-42005573_Frame-Shift-De...,4
7,UPF3A_GRCh37_13:115057211-115057211_Frame-Shif...,4
8,IWS1_GRCh37_2:128238676-128238676_Frame-Shift-...,4
9,KDM1B_GRCh37_6:18222307-18222307_3'UTR_DEL_A-A--,4


In [26]:
#grab most useful mutation for group B

group_b_most_useful_mutation = group_b_tp_fp_chart.head(1)['Mutation'].values[0]
print(f"The most useful mutation for group B is: {group_b_most_useful_mutation}")

The most useful mutation for group B is: ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A--


In [30]:
# make a confustion matrix for group A
actual_predicted_B = pd.DataFrame(index=group_b_data.index, columns=['Actual', 'Predicted'])


# Values where the sample starts with 'C', indicating the sample has cancer
actual_predicted_B['Actual'] = group_b_data.index.str.startswith('C').astype(int)

# Values where the patient has the cancer mutation
actual_predicted_B['Predicted'] = group_b_data[group_b_most_useful_mutation]

sum_of_pred = actual_predicted_B['Predicted'].sum()

confusion_matrix_B = pd.crosstab(actual_predicted_B['Actual'], actual_predicted_B['Predicted'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix_B.name = f"Confusion Matrix for {group_b_most_useful_mutation}"


print(confusion_matrix_B.name)
confusion_matrix_B

Confusion Matrix for ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A--


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,60,1
1,31,6


In [39]:
group_b1_patients = []
group_b0_patients = []

for sample in group_b_data.index:
    if group_b_data[group_b_most_useful_mutation][sample] == 1:
        group_b1_patients.append(sample)
    else:
        group_b0_patients.append(sample)

max_len = max(len(group_b1_patients), len(group_b0_patients))
for i in range(max_len - len(group_b1_patients)):
    group_b1_patients.append('')
for i in range(max_len - len(group_b0_patients)):
    group_b0_patients.append('')

group_b1_and_b0_list = pd.DataFrame(columns=['Group B1(C)', 'Group B0(NC)'])
group_b1_and_b0_list['Group B1(C)'] = group_b1_patients
group_b1_and_b0_list['Group B0(NC)'] = group_b0_patients
group_b1_and_b0_list.name = f"Group B1(C) and B1(NC) for {group_b_most_useful_mutation}"
print(group_b1_and_b0_list.name)
group_b1_and_b0_list

Group B1(C) and B1(NC) for ACVR2A_GRCh37_2:148683686-148683686_Frame-Shift-Del_DEL_A-A--


Unnamed: 0,Group B1(C),Group B0(NC)
0,C4,C0
1,C11,NC0
2,C13,C2
3,C27,NC1
4,C28,NC2
5,C40,NC3
6,NC57,NC4
7,,C3
8,,NC5
9,,NC6


In [41]:
# Decision Tree Classifier

# If S has mutation F then
# if S has mutation A
# then classify S as C
# else classify S as NC
# else
# if S has mutation B
# then classify S as C
# else classify S as NC

list_of_samples = ["C1", "C10", "C30", "NC5","NC15"]

#get sample data
sample_data = data.loc[list_of_samples]

#make a prediction
for sample in list_of_samples:
    if sample_data[most_useful_mutation][sample] == 1:
        if sample_data[group_a_most_useful_mutation][sample] == 1:
            print(f"{sample} is classified as C")
        else:
            print(f"{sample} is classified as NC")
    else:
        if sample_data[group_b_most_useful_mutation][sample] == 1:
            print(f"{sample} is classified as C")
        else:
            print(f"{sample} is classified as NC")

C1 is classified as C
C10 is classified as NC
C30 is classified as NC
NC5 is classified as NC
NC15 is classified as NC
