In [32]:
import pandas as pd
import numpy as np

### Import input data

In [33]:
# Load the Repository2 per-sample clinical metadata sheet into a DataFrame.
r2 = pd.read_excel(
    io='./../../original_input_data/clinical_metadata/Repository2_sample_clinical_metadata.xlsx',
)
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer
0,Sample_0135,relapse,54,21.0,477,1,0,0
1,Sample_0136,progression,54,,184,1,0,0
2,Sample_0137,relapse,52,167.0,972,1,0,0
3,Sample_0138,relapse,52,381.0,1699,1,1,1
4,Sample_0139,relapse,60,175.0,684,1,0,0
...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0
75,Sample_0210,relapse,60,21.0,393,1,0,0
76,Sample_0211,relapse,74,21.0,205,1,0,0
77,Sample_0212,relapse,73,293.0,804,1,0,0


In [34]:
# Turn textual 'N/A' entries (pattern matched as a regular expression) into real NaN values.
r2 = r2.replace(to_replace=r'N/A', value=np.nan, regex=True)

### Calculation of progression-free survival period (months)

In [35]:
# Convert the relapse interval from days to whole months (30.41 days per month);
# rows with a missing interval stay NaN.
# NOTE(review): the value is rounded before sample_labeling applies its ">= 6" months
# cut-off, so e.g. 175 days (5.75 months) becomes 6.0 - confirm this is intended.
r2["progression_free_survival_months"] = r2["donor_relapse_interval_days"].div(30.41).round()
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,progression_free_survival_months
0,Sample_0135,relapse,54,21.0,477,1,0,0,1.0
1,Sample_0136,progression,54,,184,1,0,0,
2,Sample_0137,relapse,52,167.0,972,1,0,0,5.0
3,Sample_0138,relapse,52,381.0,1699,1,1,1,13.0
4,Sample_0139,relapse,60,175.0,684,1,0,0,6.0
...,...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0,1.0
75,Sample_0210,relapse,60,21.0,393,1,0,0,1.0
76,Sample_0211,relapse,74,21.0,205,1,0,0,1.0
77,Sample_0212,relapse,73,293.0,804,1,0,0,10.0


### Calculation of overall survival period (months)

In [36]:
# Convert the survival time from days to whole months (30.41 days per month).
# The column name keeps its original spelling because it is written to the output files.
r2["overal_survival_months"] = r2["donor_survival_time_days"].div(30.41).round()
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,progression_free_survival_months,overal_survival_months
0,Sample_0135,relapse,54,21.0,477,1,0,0,1.0,16.0
1,Sample_0136,progression,54,,184,1,0,0,,6.0
2,Sample_0137,relapse,52,167.0,972,1,0,0,5.0,32.0
3,Sample_0138,relapse,52,381.0,1699,1,1,1,13.0,56.0
4,Sample_0139,relapse,60,175.0,684,1,0,0,6.0,22.0
...,...,...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0,1.0,6.0
75,Sample_0210,relapse,60,21.0,393,1,0,0,1.0,13.0
76,Sample_0211,relapse,74,21.0,205,1,0,0,1.0,7.0
77,Sample_0212,relapse,73,293.0,804,1,0,0,10.0,26.0


### Final labels preparation

In [37]:
def sample_labeling(x):
    """Assign the final label for a single sample.

    Parameters
    ----------
    x : mapping or DataFrame row
        Must provide 'disease_status_last_followup'; for relapse/progression
        samples 'progression_free_survival_months' is read as well.

    Returns
    -------
    int or str
        1 for "complete remission".
        For "relapse" or "progression": 1 when the progression-free survival
        is at least 6 months, otherwise 0. A NaN value compares as False,
        so a missing progression-free survival also yields 0.
        'OTHER' for every other status.
    """
    status = x['disease_status_last_followup']

    if status == "complete remission":
        return 1

    # Anything that is neither a relapse nor a progression is left unlabelled.
    if not (status == "relapse" or status == "progression"):
        return 'OTHER'

    return 1 if x["progression_free_survival_months"] >= 6 else 0


# Smoke test: a relapse with a 2-month progression-free survival is below the
# 6-month cut-off used by sample_labeling and must therefore be labelled 0.
x_test = {
    'disease_status_last_followup': "relapse",
    'progression_free_survival_months': 2,
}

result = sample_labeling(x_test)
# Check the result instead of computing it and discarding it.
assert result == 0, "sample_labeling should return 0 for a relapse with PFS < 6 months"

# Label every sample row; the column holds 1, 0 or the string 'OTHER'.
r2['final_label'] = r2.apply(sample_labeling, axis=1)
# Display via the notebook's rich repr, like the other cells, rather than print().
r2

      SAMPLE_ID disease_status_last_followup  donor_age_at_diagnosis  \
0   Sample_0135                      relapse                      54   
1   Sample_0136                  progression                      54   
2   Sample_0137                      relapse                      52   
3   Sample_0138                      relapse                      52   
4   Sample_0139                      relapse                      60   
..          ...                          ...                     ...   
74  Sample_0209                      relapse                      58   
75  Sample_0210                      relapse                      60   
76  Sample_0211                      relapse                      74   
77  Sample_0212                      relapse                      73   
78  Sample_0213                      relapse                      52   

    donor_relapse_interval_days  donor_survival_time_days  primary_tumor  \
0                          21.0                       477  

### Final labels extraction

In [38]:
# Keep only the samples that received a binary label (1 or 0), dropping 'OTHER',
# and export the sample id together with its label.
r2_labels = r2.loc[r2['final_label'].isin([1, 0]), ['SAMPLE_ID', 'final_label']]

r2_labels.to_csv(path_or_buf="Repository2_final_labels_v1.csv")

### Clinical-based features extraction

In [39]:
def age_at_diagnosis_range(x):
    """Bin the donor's age at diagnosis into one of two categories.

    Parameters
    ----------
    x : mapping or DataFrame row
        Must provide 'donor_age_at_diagnosis'.

    Returns
    -------
    str
        'age_equalto_or_higher_than_60' when the age is 60 or more,
        otherwise 'age_lower_than_60'. A NaN age compares as False and
        therefore falls into 'age_lower_than_60'.
    """
    sixty_or_older = x['donor_age_at_diagnosis'] >= 60
    return 'age_equalto_or_higher_than_60' if sixty_or_older else 'age_lower_than_60'


# Derive the age-range category for every sample row.
r2['donor_age_at_diagnosis_range'] = r2.apply(age_at_diagnosis_range, axis='columns')
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,progression_free_survival_months,overal_survival_months,final_label,donor_age_at_diagnosis_range
0,Sample_0135,relapse,54,21.0,477,1,0,0,1.0,16.0,0,age_lower_than_60
1,Sample_0136,progression,54,,184,1,0,0,,6.0,0,age_lower_than_60
2,Sample_0137,relapse,52,167.0,972,1,0,0,5.0,32.0,0,age_lower_than_60
3,Sample_0138,relapse,52,381.0,1699,1,1,1,13.0,56.0,1,age_lower_than_60
4,Sample_0139,relapse,60,175.0,684,1,0,0,6.0,22.0,1,age_equalto_or_higher_than_60
...,...,...,...,...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0,1.0,6.0,0,age_lower_than_60
75,Sample_0210,relapse,60,21.0,393,1,0,0,1.0,13.0,0,age_equalto_or_higher_than_60
76,Sample_0211,relapse,74,21.0,205,1,0,0,1.0,7.0,0,age_equalto_or_higher_than_60
77,Sample_0212,relapse,73,293.0,804,1,0,0,10.0,26.0,1,age_equalto_or_higher_than_60


In [40]:
# Select the sample id plus the clinical feature columns and export them.
r2_clinical_features = r2.loc[
    :,
    [
        'SAMPLE_ID',
        'donor_age_at_diagnosis_range',
        'primary_tumor',
        'metastatic_tumor',
        'family_history_of_cancer',
    ],
]

r2_clinical_features.to_excel(excel_writer='Repository2_clinical_features.xlsx')

### Save preprocessed clinical metadata

In [41]:
# Persist the full table, including the derived month, label and age-range columns.
r2.to_excel(excel_writer='Repository2_clinical_metadata_cleaned.xlsx')