In [12]:
import pandas as pd
import numpy as np

### Import input data

In [13]:
r3 = pd.read_excel("./../../original_input_data/clinical_metadata/Repository3_sample_clinical_metadata.xlsx")
r3.head

r3 = r3.replace(r'N/A', np.nan, regex=True)

r3

Unnamed: 0,SAMPLE_ID,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,treatment,first_response_measure,disease_status_last_followup
0,Sample_0214,78.0,458,1336,1,0,0,"Carboplatin, Paclitaxel, Velcade",CR,
1,Sample_0215,81.0,1913,1919,1,0,0,"Carboplatin, Paclitaxel, Docetaxel",CR,complete remission
2,Sample_0216,69.0,428,656,1,0,0,"Cisplatin, Paclitaxel, Cyclophosphamide, Doxor...",PD,
3,Sample_0217,50.0,2168,2174,1,0,1,"Carboplatin, Paclitaxel",CR,complete remission
4,Sample_0218,45.0,1218,1720,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,
5,Sample_0219,52.0,1032,2561,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,
6,Sample_0220,,479,1752,1,0,0,"Oregovomab, Carboplatin, Paclitaxel, Docetaxel...",CR,progression
7,Sample_0221,50.0,629,1069,1,0,0,"Carboplatin, Paclitaxel, Gemcitabine, Docetaxe...",SD,
8,Sample_0222,65.0,2157,2163,1,0,1,"Carboplatin, Docetaxel, Doxorubicin",CR,complete remission
9,Sample_0223,50.0,110,204,1,0,0,"Caroboplatin, Paclitaxel",PD,


### Identification of patients treated with platinum-based chemotherapy or PARPi

In [14]:
platinum_drugs = ['Carboplatin', 'Cisplatin', 'Oxaliplatin', 'Olaparib', 'Niraparib', 'Rucaparib', 'Talazoparib']

def platinum_treatment(treatment):
    if isinstance(treatment, str):
        if any(drug in treatment for drug in platinum_drugs):
            return 1  
        else:
            return 0

r3['platinum-based_chemotherapy'] = r3['treatment'].apply(lambda x: platinum_treatment(x))
r3

Unnamed: 0,SAMPLE_ID,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,treatment,first_response_measure,disease_status_last_followup,platinum-based_chemotherapy
0,Sample_0214,78.0,458,1336,1,0,0,"Carboplatin, Paclitaxel, Velcade",CR,,1
1,Sample_0215,81.0,1913,1919,1,0,0,"Carboplatin, Paclitaxel, Docetaxel",CR,complete remission,1
2,Sample_0216,69.0,428,656,1,0,0,"Cisplatin, Paclitaxel, Cyclophosphamide, Doxor...",PD,,1
3,Sample_0217,50.0,2168,2174,1,0,1,"Carboplatin, Paclitaxel",CR,complete remission,1
4,Sample_0218,45.0,1218,1720,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1
5,Sample_0219,52.0,1032,2561,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1
6,Sample_0220,,479,1752,1,0,0,"Oregovomab, Carboplatin, Paclitaxel, Docetaxel...",CR,progression,1
7,Sample_0221,50.0,629,1069,1,0,0,"Carboplatin, Paclitaxel, Gemcitabine, Docetaxe...",SD,,1
8,Sample_0222,65.0,2157,2163,1,0,1,"Carboplatin, Docetaxel, Doxorubicin",CR,complete remission,1
9,Sample_0223,50.0,110,204,1,0,0,"Caroboplatin, Paclitaxel",PD,,0


### Calculation of progression-free survival period (months)

In [15]:
r3["progression_free_survival_months"] = round(r3["donor_relapse_interval_days"] / 30.41)
r3

Unnamed: 0,SAMPLE_ID,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,treatment,first_response_measure,disease_status_last_followup,platinum-based_chemotherapy,progression_free_survival_months
0,Sample_0214,78.0,458,1336,1,0,0,"Carboplatin, Paclitaxel, Velcade",CR,,1,15.0
1,Sample_0215,81.0,1913,1919,1,0,0,"Carboplatin, Paclitaxel, Docetaxel",CR,complete remission,1,63.0
2,Sample_0216,69.0,428,656,1,0,0,"Cisplatin, Paclitaxel, Cyclophosphamide, Doxor...",PD,,1,14.0
3,Sample_0217,50.0,2168,2174,1,0,1,"Carboplatin, Paclitaxel",CR,complete remission,1,71.0
4,Sample_0218,45.0,1218,1720,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1,40.0
5,Sample_0219,52.0,1032,2561,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1,34.0
6,Sample_0220,,479,1752,1,0,0,"Oregovomab, Carboplatin, Paclitaxel, Docetaxel...",CR,progression,1,16.0
7,Sample_0221,50.0,629,1069,1,0,0,"Carboplatin, Paclitaxel, Gemcitabine, Docetaxe...",SD,,1,21.0
8,Sample_0222,65.0,2157,2163,1,0,1,"Carboplatin, Docetaxel, Doxorubicin",CR,complete remission,1,71.0
9,Sample_0223,50.0,110,204,1,0,0,"Caroboplatin, Paclitaxel",PD,,0,4.0


### Calculation of overal survival period (months)

In [16]:
r3["overal_survival_months"] = round(r3["donor_survival_time_days"] / 30.41)
r3

Unnamed: 0,SAMPLE_ID,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,treatment,first_response_measure,disease_status_last_followup,platinum-based_chemotherapy,progression_free_survival_months,overal_survival_months
0,Sample_0214,78.0,458,1336,1,0,0,"Carboplatin, Paclitaxel, Velcade",CR,,1,15.0,44.0
1,Sample_0215,81.0,1913,1919,1,0,0,"Carboplatin, Paclitaxel, Docetaxel",CR,complete remission,1,63.0,63.0
2,Sample_0216,69.0,428,656,1,0,0,"Cisplatin, Paclitaxel, Cyclophosphamide, Doxor...",PD,,1,14.0,22.0
3,Sample_0217,50.0,2168,2174,1,0,1,"Carboplatin, Paclitaxel",CR,complete remission,1,71.0,71.0
4,Sample_0218,45.0,1218,1720,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1,40.0,57.0
5,Sample_0219,52.0,1032,2561,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1,34.0,84.0
6,Sample_0220,,479,1752,1,0,0,"Oregovomab, Carboplatin, Paclitaxel, Docetaxel...",CR,progression,1,16.0,58.0
7,Sample_0221,50.0,629,1069,1,0,0,"Carboplatin, Paclitaxel, Gemcitabine, Docetaxe...",SD,,1,21.0,35.0
8,Sample_0222,65.0,2157,2163,1,0,1,"Carboplatin, Docetaxel, Doxorubicin",CR,complete remission,1,71.0,71.0
9,Sample_0223,50.0,110,204,1,0,0,"Caroboplatin, Paclitaxel",PD,,0,4.0,7.0


### Final labels preparation

In [17]:
def sample_labeling(x):
    if x['first_response_measure'] in ['CR', 'PR']:
        return 1
    elif x['first_response_measure'] == 'PD':
            return 0
    else:
        if x['first_response_measure'] == 'SD' and x['progression_free_survival_months'] >= 6:
                return 1
        else:
            return 1

x_test = {
    'first_response_measure': 'PD',
    'progression_free_survival_months': 10
}

result = sample_labeling(x_test)
result

r3['final_label'] = r3.apply(sample_labeling, axis=1)
r3

Unnamed: 0,SAMPLE_ID,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,treatment,first_response_measure,disease_status_last_followup,platinum-based_chemotherapy,progression_free_survival_months,overal_survival_months,final_label
0,Sample_0214,78.0,458,1336,1,0,0,"Carboplatin, Paclitaxel, Velcade",CR,,1,15.0,44.0,1
1,Sample_0215,81.0,1913,1919,1,0,0,"Carboplatin, Paclitaxel, Docetaxel",CR,complete remission,1,63.0,63.0,1
2,Sample_0216,69.0,428,656,1,0,0,"Cisplatin, Paclitaxel, Cyclophosphamide, Doxor...",PD,,1,14.0,22.0,0
3,Sample_0217,50.0,2168,2174,1,0,1,"Carboplatin, Paclitaxel",CR,complete remission,1,71.0,71.0,1
4,Sample_0218,45.0,1218,1720,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1,40.0,57.0,1
5,Sample_0219,52.0,1032,2561,1,0,0,"Carboplatin, Paclitaxel, Cisplatin, Docetaxel,...",CR,,1,34.0,84.0,1
6,Sample_0220,,479,1752,1,0,0,"Oregovomab, Carboplatin, Paclitaxel, Docetaxel...",CR,progression,1,16.0,58.0,1
7,Sample_0221,50.0,629,1069,1,0,0,"Carboplatin, Paclitaxel, Gemcitabine, Docetaxe...",SD,,1,21.0,35.0,1
8,Sample_0222,65.0,2157,2163,1,0,1,"Carboplatin, Docetaxel, Doxorubicin",CR,complete remission,1,71.0,71.0,1
9,Sample_0223,50.0,110,204,1,0,0,"Caroboplatin, Paclitaxel",PD,,0,4.0,7.0,0


### Save cleaned data

In [18]:
r3.to_excel("Repository3_clinical_metadata_cleaned.xlsx")

### Clinical features extraction

In [19]:
r3_clinical_features = r3[['SAMPLE_ID', 'donor_age_at_diagnosis', 'primary_tumor', 'metastatic_tumor', 'family_history_of_cancer', 'platinum-based_chemotherapy']]
r3_clinical_features.to_excel('Repository3_clinical_features.xlsx')

### Labels extraction

In [20]:
r3_final_labels = r3[['SAMPLE_ID', 'final_label']]
r3_final_labels.to_csv('Repository3_final_labels_v1.csv')