In [1]:
import ast
import json

import numpy as np
import pandas as pd
import pandas_profiling as pp
from sklearn.model_selection import train_test_split

There are 877 samples chosen by Marzena for analysis. We want to predict the IHC HER2 status based on the following features:
- ERBB2_copy_number
- CEP17_copy_number
- ERBB2_CEP17_ratio
- ERBB2_minus_ploidy
- Purity
- Ploidy

In [2]:
# Load the run configuration. The file is named *.json, so it is parsed as JSON
# first: ast.literal_eval only understands Python literals and rejects valid
# JSON tokens such as true / false / null.
with open('../../her2-status_config.json', 'r') as f:
    config_text = f.read()

try:
    config = json.loads(config_text)
except json.JSONDecodeError:
    # Fall back to the previous behaviour for files written as a Python dict
    # literal (e.g. single-quoted strings), which are not strict JSON.
    config = ast.literal_eval(config_text)

In [3]:
# Echo the parsed configuration so the settings used for this run are recorded in the cell output.
config

{'CLF_NAME': 'her2-status',
 'SEED': 23,
 'HOLDOUT_SIZE': 0.3,
 'FOLDS': 3,
 'REPEATS': 10,
 'FEATURES_FILE': '../../data/processed/features/all_her2-status.csv',
 'CLASSES_FILE': '../../data/processed/targets/her2-status.csv',
 'CLASS_LABEL': 'Class',
 'TARGET_CLASSES': [1, 0],
 'ALL_CLASSES': [1, 0],
 'Y_TRAIN_FILE': '../../data/processed/ready_for_analysis/her2-status/y_train.csv',
 'Y_HOLDOUT_FILE': '../../data/processed/ready_for_analysis/her2-status/y_holdout.csv',
 'TRAIN_DATA_FILE': '../../data/processed/ready_for_analysis/her2-status/data_training.csv',
 'HOLDOUT_DATA_FILE': '../../data/processed/ready_for_analysis/her2-status/data_holdout.csv'}

In [4]:
# Read the label spreadsheet and the configured feature CSV; both tables are
# re-indexed by their 'sample_id' column.
no_sanger_labels = pd.read_excel('data/RAFAL-.final.classifier.no.sanger.xlsx')
no_sanger_labels = no_sanger_labels.set_index('sample_id')

no_sanger_features = pd.read_csv(config['FEATURES_FILE'])
no_sanger_features = no_sanger_features.set_index('sample_id')

# Index-on-index merge using pandas' default (inner) join: only sample ids
# present in both tables are kept.
no_sanger_data = no_sanger_labels.merge(
    no_sanger_features,
    left_index=True,
    right_index=True,
)

In [5]:
# Tag every sample with the project it comes from, based on its sample_id prefix.
#
# The previous patterns ('^CPCT*', '^WIDE*', '^TCGA*', '^DO*') were written
# glob-style, but in a regular expression the trailing '*' applies to the last
# letter only and makes it optional. As a result '^DO*' matched every id that
# starts with 'D' and '^TCGA*' matched ids starting with 'TCG'. Literal prefix
# tests avoid that.
project_by_prefix = {
    'CPCT': 'HARTWIG',
    'WIDE': 'HARTWIG',
    'TCGA': 'TCGA',
    'DO': 'ICGC',
}

# One row per sample; .copy() so the assignments below act on an independent frame.
project = no_sanger_data.reset_index()[['sample_id']].copy()
project['project'] = ''  # ids matching no known prefix keep an empty label
for prefix, project_name in project_by_prefix.items():
    has_prefix = project['sample_id'].str.startswith(prefix, na=False)
    project.loc[has_prefix, 'project'] = project_name
project = project.set_index('sample_id')

In [6]:
# Attach the project label to each sample (index-on-index merge on sample_id).
# Any 'project' column left over from an earlier run of this cell is dropped
# first so the cell is idempotent; without that, a re-run would produce
# 'project_x' / 'project_y' and later cells that drop 'project' would fail.
no_sanger_data = (
    no_sanger_data
    .drop(columns='project', errors='ignore')
    .merge(project, left_index=True, right_index=True)
)

In [7]:
# Build a pandas-profiling report over the merged table (labels, features and
# project tag) and render it inline in the notebook.
eda = pp.ProfileReport(no_sanger_data)
display(eda)

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))






In [8]:
# Keep only the rows that have an ERBB2_minus_ploidy value, then swap every
# +/-infinity remaining anywhere in the table for 0.
# NOTE(review): replacing infinities with 0 changes their meaning for whichever
# columns contain them -- confirm that 0 is the intended substitute.
has_erbb2_minus_ploidy = no_sanger_data['ERBB2_minus_ploidy'].notna()
no_sanger_data = no_sanger_data.loc[has_erbb2_minus_ploidy].replace([np.inf, -np.inf], 0)

In [9]:
# Variance of every column except 'normal_sample_id' and 'project'.
# Returning the Series as the cell result keeps each variance next to its
# column name; the earlier loop printed bare numbers that could not be matched
# back to their columns.
no_sanger_data.drop(columns=['normal_sample_id', 'project']).var()

0.11248401826483893
256.0488871493778
2.8722700587083843
51.17943422468569
250.36058764618082
0.03425580626223096
0.7555664965397516
18.311493735681687
28.942946628571427


In [10]:
# Split the table into model inputs and target.
# X keeps every column except the 'IHC HER2' label; y is a single-column frame
# holding that label, renamed to the class-label name taken from the
# configuration (config['CLASS_LABEL']) rather than a second hard-coded copy
# of it, so the two cannot drift apart.
target_column = 'IHC HER2'
X = no_sanger_data.drop(columns=[target_column])
y = no_sanger_data[[target_column]].rename(columns={target_column: config['CLASS_LABEL']})

In [11]:
# Stratified train / hold-out split; the hold-out fraction and the random seed
# both come from the configuration.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=config['HOLDOUT_SIZE'],
    random_state=config['SEED'],
    stratify=y,
)

In [12]:
# Write the four splits to the CSV locations named in the configuration.
split_outputs = (
    (X_train, 'TRAIN_DATA_FILE'),
    (X_holdout, 'HOLDOUT_DATA_FILE'),
    (y_train, 'Y_TRAIN_FILE'),
    (y_holdout, 'Y_HOLDOUT_FILE'),
)
for frame, path_key in split_outputs:
    frame.to_csv(config[path_key])