In [1]:
import ast
import json

import numpy as np
import pandas as pd
import pandas_profiling as pp
from sklearn.model_selection import train_test_split

A total of 877 samples were selected for the analysis. We want to predict IHC HER2 status based on the following features:
- ERBB2_copy_number
- CEP17_copy_number
- ERBB2_CEP17_ratio
- ERBB2_minus_ploidy
- Purity
- Ploidy

In [2]:
# Load the notebook configuration (file paths, holdout size, seed).
# The file is JSON, so parse it with the JSON parser: ast.literal_eval only
# understands Python literals and raises on valid JSON such as
# true / false / null.
with open('../her2-status_config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

In [3]:
# Read the class labels and the feature table, each indexed by sample_id.
labels = pd.read_csv(config['CLASSES_FILE']).set_index('sample_id')
features = pd.read_csv(config['FEATURES_FILE']).set_index('sample_id')

# Build the modelling table in one pass:
#   1. inner join on the index keeps only samples present in both tables,
#   2. the normal_sample_id column is dropped,
#   3. rows containing any missing value are discarded,
#   4. remaining +inf values are replaced with 0.
# NOTE(review): only positive infinity is replaced; -inf would pass through
# unchanged — confirm that is intended.
dataset = (
    features
    .join(labels, how='inner')
    .drop(columns=['normal_sample_id'])
    .dropna()
    .replace(np.inf, 0)
)

In [4]:
# Display the assembled dataset as the cell output (pandas truncates the
# rendering to the leading and trailing rows).
dataset

Unnamed: 0_level_0,ERBB2_copy_number,CEP17_copy_number,ERBB2_CEP17_ratio,ERBB2_minus_ploidy,Purity,Ploidy,Tumour Average Depth,Normal Average Depth,Class
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CPCT02010003T,2,2,1.000000,-0.0142,0.40,2.0142,50.54,34.73,0
CPCT02010260T,29,4,7.250000,26.3275,0.34,2.6725,49.39,29.09,1
CPCT02010267TII,5,2,2.500000,1.1522,0.32,3.8478,50.10,46.02,1
CPCT02010276TII,2,2,1.000000,-0.1678,0.74,2.1678,49.95,44.72,0
CPCT02010276T,2,2,1.000000,-0.1111,0.62,2.1111,48.78,44.72,0
...,...,...,...,...,...,...,...,...,...
TCGA-EW-A1PCT,6,8,0.750000,1.8396,0.81,4.1604,48.54,30.40,0
TCGA-EW-A1PHT,4,3,1.333333,0.6566,0.76,3.3434,48.44,30.71,0
TCGA-EW-A3U0T,4,4,1.000000,-0.1433,0.26,4.1433,50.41,31.53,0
TCGA-GM-A2DFT,3,2,1.500000,-0.7837,0.21,3.7837,50.11,31.16,0


In [5]:
# Build a pandas-profiling report over the whole dataset and show it in the
# cell output.
eda = pp.ProfileReport(dataset)
display(eda)

Summarize dataset: 100%|██████████| 87/87 [00:19<00:00,  4.49it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:08<00:00,  8.20s/it]
Render HTML: 100%|██████████| 1/1 [00:05<00:00,  5.71s/it]




In [6]:
# Separate the predictors from the target.
# NOTE(review): X keeps every column except Class; per the displayed frame
# that includes the two average-depth columns — confirm they are meant to be
# carried into the saved feature files.
X = dataset.drop(columns=['Class'])
# Double brackets keep y as a one-column DataFrame rather than a Series.
y = dataset.loc[:, ['Class']]

In [7]:
# Stratified train/holdout split; the holdout fraction and the random seed
# both come from the config so the split is reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=config['HOLDOUT_SIZE'],
    random_state=config['SEED'],
    stratify=y,
)

In [8]:
# Write each split to the CSV path configured for it, in the same order as
# before: train features, holdout features, train labels, holdout labels.
split_outputs = (
    (X_train, 'TRAIN_DATA_FILE'),
    (X_holdout, 'HOLDOUT_DATA_FILE'),
    (y_train, 'Y_TRAIN_FILE'),
    (y_holdout, 'Y_HOLDOUT_FILE'),
)
for frame, path_key in split_outputs:
    frame.to_csv(config[path_key])