In [51]:
import pandas as pd
import numpy as np

# Import input data

In [52]:
# Cleaned clinical metadata, one CSV per repository (paths are relative to
# the directory this notebook is run from).
(
    r1_cleaned_clinical_metadata,
    r2_cleaned_clinical_metadata,
    r3_cleaned_clinical_metadata,
) = (
    pd.read_csv(metadata_csv)
    for metadata_csv in (
        './../../data_preprocessing/clinical_metadata_preprocessing/Repository1_clinical_metadata_cleaned.csv',
        './../../data_preprocessing/clinical_metadata_preprocessing/Repository2_clinical_metadata_cleaned.csv',
        './../../data_preprocessing/clinical_metadata_preprocessing/Repository3_clinical_metadata_cleaned.csv',
    )
)

# HRDetect scores are read from an Excel workbook, written out as CSV and
# re-read from that CSV. to_csv() is called with its defaults, so the frame's
# index is written as an unnamed first column, which read_csv() then loads
# as 'Unnamed: 0'.
hrd_output = pd.read_excel('./../../original_input_data/genomic_features/hrd_score_hrdetect_all_repositories.xlsx')
hrd_output.to_csv('hrd_output.csv')
hrd_status = pd.read_csv('hrd_output.csv')

# Display every row of a DataFrame instead of a truncated preview.
pd.set_option('display.max_rows', None)

## Labels extraction

In [53]:
# Repository 1: keep only the sample identifier and its response label.
r1_new_labels = r1_cleaned_clinical_metadata.loc[:, ['SAMPLE_ID', 'label']]
r1_new_labels

Unnamed: 0,SAMPLE_ID,label
0,Sample_0001,
1,Sample_0002,
2,Sample_0003,true_responder
3,Sample_0004,responder_based_on_PFS
4,Sample_0005,other_use_as_test
5,Sample_0006,nonresponder_based_on_PFS
6,Sample_0007,possible_reponder
7,Sample_0008,
8,Sample_0009,responder_based_on_SD
9,Sample_0010,


In [54]:
# Repository 2: keep only the sample identifier and its response label.
r2_new_labels = r2_cleaned_clinical_metadata.loc[:, ['SAMPLE_ID', 'label']]
r2_new_labels

Unnamed: 0,SAMPLE_ID,label
0,Sample_0135,possible_nonresponder
1,Sample_0136,possible_nonresponder
2,Sample_0137,possible_nonresponder
3,Sample_0138,possible_responder
4,Sample_0139,possible_responder
5,Sample_0140,possible_nonresponder
6,Sample_0141,possible_nonresponder
7,Sample_0142,possible_nonresponder
8,Sample_0143,possible_nonresponder
9,Sample_0144,possible_responder


In [55]:
# Repository 3 keeps its label in 'final_label'; rename it to 'label' so the
# frame has the same two columns as the label frames of the other repositories.
r3_new_labels = (
    r3_cleaned_clinical_metadata
    .loc[:, ['SAMPLE_ID', 'final_label']]
    .rename(columns={"final_label": 'label'})
)
r3_new_labels

Unnamed: 0,SAMPLE_ID,label
0,Sample_0214,true_responder
1,Sample_0215,true_responder
2,Sample_0216,responder_based_on_PFS
3,Sample_0217,true_responder
4,Sample_0218,true_responder
5,Sample_0219,true_responder
6,Sample_0220,true_responder
7,Sample_0221,responder_based_on_PFS
8,Sample_0222,true_responder
9,Sample_0223,true_nonresponder


In [56]:
# Labels that are kept for the HRD-based correction. Every other value
# (e.g. missing labels, 'other_use_as_test') is filtered out.
# 'possible_reponder' / 'possible_nonreponder' (without the "s") are the
# spellings that the label-matching code in this notebook compares against.
usable_labels = [
    'responder_based_on_PFS', 'possible_reponder', 'responder_based_on_SD',
    'nonresponder_based_on_PFS', 'possible_nonreponder', 'nonresponder_based_on_SD',
    'true_responder', 'true_nonresponder',
]

# Repository 2 spells the same two classes correctly ('possible_responder',
# 'possible_nonresponder'). Those rows never matched the list above and were
# silently dropped by isin(); map them onto the spelling used by the list so
# they are kept and handled like their misspelled counterparts.
label_spelling_aliases = {
    'possible_responder': 'possible_reponder',
    'possible_nonresponder': 'possible_nonreponder',
}

all_labels = pd.concat([r1_new_labels, r2_new_labels, r3_new_labels], ignore_index=True)
all_labels = all_labels.assign(label=all_labels['label'].replace(label_spelling_aliases))
all_labels = all_labels[all_labels['label'].isin(usable_labels)]

# Inner join on SAMPLE_ID: samples that have no row in hrd_status are dropped.
all_labels_with_hrd = pd.merge(all_labels, hrd_status, on='SAMPLE_ID', how='inner')
# 'Unnamed: 0' is the index column picked up when hrd_status was re-read from CSV.
all_labels_with_hrd = all_labels_with_hrd.drop(columns='Unnamed: 0')
all_labels_with_hrd

Unnamed: 0,SAMPLE_ID,label,HRDetect_output_probablity
0,Sample_0003,true_responder,0.066028
1,Sample_0004,responder_based_on_PFS,0.999997
2,Sample_0006,nonresponder_based_on_PFS,0.271552
3,Sample_0007,possible_reponder,0.747943
4,Sample_0009,responder_based_on_SD,0.999933
5,Sample_0011,responder_based_on_PFS,0.026399
6,Sample_0012,nonresponder_based_on_SD,0.999933
7,Sample_0013,true_responder,0.999973
8,Sample_0014,nonresponder_based_on_SD,0.940979
9,Sample_0015,true_responder,0.995239


In [58]:
def hrd_status_calculation(x, threshold=0.7):
    """Turn a sample's HRDetect probability into a binary HRD status flag.

    Parameters
    ----------
    x : mapping or pandas.Series
        One row; must expose the key 'HRDetect_output_probablity'
        (spelled exactly as the input column is).
    threshold : float, default 0.7
        Inclusive cut-off. The default reproduces the cut-off that was
        previously hard-coded in this function.

    Returns
    -------
    int
        1 when the probability is greater than or equal to ``threshold``,
        otherwise 0. A NaN probability compares False against any threshold
        and therefore yields 0.
    """
    probability = x['HRDetect_output_probablity']
    return 1 if probability >= threshold else 0


# Row-wise (axis=1): each row is handed to hrd_status_calculation and the
# returned 0/1 flag is stored in the new 'hrd_status' column.
all_labels_with_hrd['hrd_status'] = all_labels_with_hrd.apply(hrd_status_calculation, axis=1)
all_labels_with_hrd

Unnamed: 0,SAMPLE_ID,label,HRDetect_output_probablity,hrd_status
0,Sample_0003,true_responder,0.066028,0
1,Sample_0004,responder_based_on_PFS,0.999997,1
2,Sample_0006,nonresponder_based_on_PFS,0.271552,0
3,Sample_0007,possible_reponder,0.747943,1
4,Sample_0009,responder_based_on_SD,0.999933,1
5,Sample_0011,responder_based_on_PFS,0.026399,0
6,Sample_0012,nonresponder_based_on_SD,0.999933,1
7,Sample_0013,true_responder,0.999973,1
8,Sample_0014,nonresponder_based_on_SD,0.940979,1
9,Sample_0015,true_responder,0.995239,1


In [63]:
def hrd_corrected_labels(x):
    """Derive the binary 'improved' label for one sample row.

    Parameters
    ----------
    x : mapping or pandas.Series
        One row exposing 'label' and, for the PFS/SD-based and 'possible_*'
        labels, 'hrd_status' (0 or 1).

    Returns
    -------
    int
        1 for 'true_responder' / 'true_nonresponder' regardless of HRD status;
        1 for a PFS/SD-based or 'possible_*' label when ``hrd_status == 1``;
        0 in every other case (including labels not listed below).
    """
    # NOTE(review): 'true_nonresponder' and HRD-positive non-responder labels
    # both map to 1 here, exactly as before -- confirm this is the intended
    # meaning of 'improved_label'.
    confirmed_labels = ('true_responder', 'true_nonresponder')
    hrd_dependent_labels = (
        'responder_based_on_PFS', 'responder_based_on_SD',
        'nonresponder_based_on_PFS', 'nonresponder_based_on_SD',
        # The 'possible_*' labels occur both with and without the "s" in the
        # input data; previously only the misspelled forms were recognised,
        # so correctly spelled rows always fell through to 0.
        'possible_reponder', 'possible_responder',
        'possible_nonreponder', 'possible_nonresponder',
    )

    label = x['label']
    if label in confirmed_labels:
        return 1
    if label in hrd_dependent_labels and x['hrd_status'] == 1:
        return 1
    return 0


# Score every row (axis=1) with hrd_corrected_labels and store the result.
all_labels_with_hrd['improved_label'] = all_labels_with_hrd.apply(hrd_corrected_labels, axis=1)

# Persist only the sample identifier and its improved label. to_csv() is used
# with its defaults, so the DataFrame index is written as the first column.
new_labels = all_labels_with_hrd.loc[:, ['SAMPLE_ID', 'improved_label']]
new_labels.to_csv('improved_labels_corrected_by_HRD_status.csv')
            