In [2]:
import pandas as pd
import numpy as np

# Import input data

In [3]:
# Load the Repository 2 sample-level clinical metadata workbook into a DataFrame.
r2 = pd.read_excel(
    io='./../../original_input_data/clinical_metadata/Repository2_sample_clinical_metadata.xlsx',
)
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer
0,Sample_0135,relapse,54,21.0,477,1,0,0
1,Sample_0136,progression,54,,184,1,0,0
2,Sample_0137,relapse,52,167.0,972,1,0,0
3,Sample_0138,relapse,52,381.0,1699,1,1,1
4,Sample_0139,relapse,60,175.0,684,1,0,0
...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0
75,Sample_0210,relapse,60,21.0,393,1,0,0
76,Sample_0211,relapse,74,21.0,205,1,0,0
77,Sample_0212,relapse,73,293.0,804,1,0,0


## Replace 'N/A' strings with NaN values

# Turn cells matching the pattern 'N/A' into NaN so pandas treats them as missing.
# regex=True means the pattern is interpreted as a regular expression.
r2 = r2.replace(to_replace=r'N/A', value=np.nan, regex=True)

## Calculation of progression-free survival period (months)

In [4]:
# Convert the relapse interval from days to months (divisor 30.41 days per month)
# and round to whole months. Rows with no relapse interval remain NaN.
r2["progression_free_survival_months"] = (
    r2["donor_relapse_interval_days"].div(30.41).round()
)
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,progression_free_survival_months
0,Sample_0135,relapse,54,21.0,477,1,0,0,1.0
1,Sample_0136,progression,54,,184,1,0,0,
2,Sample_0137,relapse,52,167.0,972,1,0,0,5.0
3,Sample_0138,relapse,52,381.0,1699,1,1,1,13.0
4,Sample_0139,relapse,60,175.0,684,1,0,0,6.0
...,...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0,1.0
75,Sample_0210,relapse,60,21.0,393,1,0,0,1.0
76,Sample_0211,relapse,74,21.0,205,1,0,0,1.0
77,Sample_0212,relapse,73,293.0,804,1,0,0,10.0


## Calculation of overall survival period (months)

In [5]:
# Convert the survival time from days to months (divisor 30.41 days per month)
# and round to whole months. The column name keeps its original spelling.
r2["overal_survival_months"] = (
    r2["donor_survival_time_days"].div(30.41).round()
)
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,progression_free_survival_months,overal_survival_months
0,Sample_0135,relapse,54,21.0,477,1,0,0,1.0,16.0
1,Sample_0136,progression,54,,184,1,0,0,,6.0
2,Sample_0137,relapse,52,167.0,972,1,0,0,5.0,32.0
3,Sample_0138,relapse,52,381.0,1699,1,1,1,13.0,56.0
4,Sample_0139,relapse,60,175.0,684,1,0,0,6.0,22.0
...,...,...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0,1.0,6.0
75,Sample_0210,relapse,60,21.0,393,1,0,0,1.0,13.0
76,Sample_0211,relapse,74,21.0,205,1,0,0,1.0,7.0
77,Sample_0212,relapse,73,293.0,804,1,0,0,10.0,26.0


## Labels preparation

In [6]:
def sample_labeling(x, pfs_threshold_months=6):
    """Assign a response label to a single sample.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row or a dict)
        Must provide 'disease_status_last_followup'. The key
        'progression_free_survival_months' is only read when the status is
        "relapse" or "progression".
    pfs_threshold_months : number, default 6
        Minimum progression-free survival (in months) for a relapsed or
        progressed sample to be labelled as a possible responder.

    Returns
    -------
    str
        'possible_responder' for complete remission, or for
        relapse/progression with progression-free survival at or above the
        threshold; 'possible_nonresponder' for relapse/progression below the
        threshold; 'other_use_as_test' for every other status.
    """
    status = x['disease_status_last_followup']

    if status == "complete remission":
        return 'possible_responder'

    if status == "relapse" or status == "progression":
        # A missing (NaN) interval compares False against the threshold,
        # so such samples are labelled as non-responders.
        if x["progression_free_survival_months"] >= pfs_threshold_months:
            return 'possible_responder'
        return 'possible_nonresponder'

    return 'other_use_as_test'

# Test for sample_labeling function

#x_test = {
    #'disease_status_last_followup': "relapse",
    #'progression_free_survival_months': 2,
#}

#result = sample_labeling(x_test)

# Label every sample by running sample_labeling on each row of the table.
r2['label'] = r2.apply(func=sample_labeling, axis='columns')
r2

Unnamed: 0,SAMPLE_ID,disease_status_last_followup,donor_age_at_diagnosis,donor_relapse_interval_days,donor_survival_time_days,primary_tumor,metastatic_tumor,family_history_of_cancer,progression_free_survival_months,overal_survival_months,label
0,Sample_0135,relapse,54,21.0,477,1,0,0,1.0,16.0,possible_nonresponder
1,Sample_0136,progression,54,,184,1,0,0,,6.0,possible_nonresponder
2,Sample_0137,relapse,52,167.0,972,1,0,0,5.0,32.0,possible_nonresponder
3,Sample_0138,relapse,52,381.0,1699,1,1,1,13.0,56.0,possible_responder
4,Sample_0139,relapse,60,175.0,684,1,0,0,6.0,22.0,possible_responder
...,...,...,...,...,...,...,...,...,...,...,...
74,Sample_0209,relapse,58,37.0,195,1,0,0,1.0,6.0,possible_nonresponder
75,Sample_0210,relapse,60,21.0,393,1,0,0,1.0,13.0,possible_nonresponder
76,Sample_0211,relapse,74,21.0,205,1,0,0,1.0,7.0,possible_nonresponder
77,Sample_0212,relapse,73,293.0,804,1,0,0,10.0,26.0,possible_responder


## Save preprocessed clinical metadata for repository 2

In [7]:
# Write the annotated metadata table to a CSV file (relative path, so it lands in the working directory).
r2.to_csv(path_or_buf='Repository2_clinical_metadata_cleaned.csv')

## Extraction of samples with labels

In [9]:
# Keep only the samples that received a responder / non-responder label,
# restricted to the sample identifier and the label itself.
r2_labels = r2.loc[
    r2['label'].isin(['possible_responder', 'possible_nonresponder']),
    ['SAMPLE_ID', 'label'],
]
r2_labels.to_csv(path_or_buf='Repository2_samples_with_labels.csv')
r2_labels

Unnamed: 0,SAMPLE_ID,label
0,Sample_0135,possible_nonresponder
1,Sample_0136,possible_nonresponder
2,Sample_0137,possible_nonresponder
3,Sample_0138,possible_responder
4,Sample_0139,possible_responder
...,...,...
74,Sample_0209,possible_nonresponder
75,Sample_0210,possible_nonresponder
76,Sample_0211,possible_nonresponder
77,Sample_0212,possible_responder


## Extraction of clinical features (used later for correlation with genetic features)

In [10]:
# Select the sample identifier plus the clinical covariates and save them to CSV.
r2_clinical_features = r2.loc[
    :,
    [
        'SAMPLE_ID',
        'donor_age_at_diagnosis',
        'primary_tumor',
        'metastatic_tumor',
        'family_history_of_cancer',
    ],
]

r2_clinical_features.to_csv(path_or_buf='./../../clinical_features/Repository2_clinical_features.csv')