In [24]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Five-colour cycle applied to every seaborn plot drawn after this cell.
palette = sns.color_palette(
    [
        "#2176AB",
        "#F97662",
        "#FFBF00",
        "#50C878",
        "#B284BE",
    ]
)
sns.set_palette(palette)

# Load the raw table; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/archive/ST001937_AN003150.csv')

Data preprocessing and cleaning

In [25]:
# Drop the identifier columns so only the label and the feature columns remain.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: on a second run the columns are already gone, which previously
# raised a KeyError.
df = df.drop(columns=['Sample ID', 'RAW_FILE_NAME'], errors='ignore')

# Healthy-vs-nodule frame: 0 = healthy control, 1 = nodule (benign or malignant).
# The 'Maligant SPNS' key is kept exactly as written because it must match the
# label string stored in the data.
df_healthy_cancer = df.copy()
df_healthy_cancer['Phenotypes'] = df_healthy_cancer['Phenotypes'].map(
    {'Healthy Controls': 0, 'Benign SPNS': 1, 'Maligant SPNS': 1}
)

# Series.map yields NaN for any value missing from the dict; fail loudly here
# rather than letting unlabeled rows flow into scaling and resampling.
assert df_healthy_cancer['Phenotypes'].notna().all(), "unmapped Phenotypes label found"

df_healthy_cancer.value_counts('Phenotypes')

Phenotypes
1    880
0    280
dtype: int64

In [26]:

# Benign-vs-malignant frame: keep every row that is not a healthy control,
# then encode benign as 0 and malignant as 1. Original index labels are kept.
is_nodule = df['Phenotypes'] != 'Healthy Controls'
df_benign_malignant = df.loc[is_nodule].copy()
df_benign_malignant['Phenotypes'] = df_benign_malignant['Phenotypes'].map({'Benign SPNS': 0, 'Maligant SPNS': 1})
df_benign_malignant.value_counts('Phenotypes')

Phenotypes
1    723
0    157
dtype: int64

Standardizing the data

In [27]:
from sklearn import preprocessing

# Standardize every feature column; the 'Phenotypes' label column is set aside
# first so that it is not scaled.
standard_scaler = preprocessing.StandardScaler()

hca_phenotypes = df_healthy_cancer['Phenotypes']
bm_phenotypes = df_benign_malignant['Phenotypes']

df_healthy_cancer_no_diagnosis = df_healthy_cancer.drop('Phenotypes', axis=1)
df_benign_malignant_no_diagnosis = df_benign_malignant.drop('Phenotypes', axis=1)

# The same scaler object is refit for each dataset, so after this cell it holds
# the benign/malignant fit. Each rebuilt frame gets a fresh 0..n-1 index.
hca_df = pd.DataFrame(standard_scaler.fit_transform(df_healthy_cancer_no_diagnosis), columns=df_healthy_cancer_no_diagnosis.columns)
bm_df = pd.DataFrame(standard_scaler.fit_transform(df_benign_malignant_no_diagnosis), columns=df_benign_malignant_no_diagnosis.columns)

# Re-attach the labels by POSITION, not by index label. Assigning a Series
# aligns on the index, and df_benign_malignant keeps the original row labels
# (with gaps where healthy controls were removed) while bm_df is numbered
# 0..n-1. Without .to_numpy() the labels could land on the wrong rows or
# become NaN. Row order is unchanged by fit_transform, so position is correct.
hca_df['Phenotypes'] = hca_phenotypes.to_numpy()
bm_df['Phenotypes'] = bm_phenotypes.to_numpy()

hca_df.describe()

Unnamed: 0,"1,3,5(10)-estratrien-3,6- beta-17-beta-triol","1,5-anhydroglucitol",17-alpha-20-alpha-dihydroxy-4-pregnen-3-one-1,17-alpha-20-alpha-dihydroxy-4-pregnen-3-one-2,17-alpha-20-alpha-dihydroxy-4-pregnen-3-one-3,17-alpha-20-alpha-dihydroxy-4-pregnen-3-one-4,17-alpha-20-alpha-dihydroxy-4-pregnen-3-one-5,17-alpha-20-alpha-dihydroxy-4-pregnen-3-one-6,1-hexadecanol,1-hydroxyanthraquinone prod,...,tyrosine-2,urea-1,urea-2,urea-3,urea-4,uridine,valine,xanthine,xanthosine,Phenotypes
count,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,...,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0,1160.0
mean,-1.2250740000000001e-17,0.0,-1.607909e-16,-4.9002950000000006e-17,-7.503576000000001e-17,-9.188053e-18,-1.2250740000000001e-17,6.125368e-17,2.388894e-16,2.450147e-17,...,1.2250740000000001e-17,1.684476e-16,-1.500715e-16,1.531342e-16,1.531342e-18,-1.2250740000000001e-17,1.2250740000000001e-17,1.2250740000000001e-17,-4.9002950000000006e-17,0.758621
std,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,...,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,1.000431,0.428104
min,-19.42518,-9.940976,-1.236017,-10.29234,-0.8241675,-18.77206,-1.069124,-1.05343,-1.895002,-1.189001,...,-1.581639,-1.614048,-1.355858,-1.76596,-0.3189689,-1.470885,-6.725485,-0.3204565,-20.99787,0.0
25%,-0.05704542,-0.187383,-0.5046493,-0.3383285,-0.7294057,-0.02809717,-0.527182,-0.5141986,-0.3879349,-0.5199062,...,-0.4687686,-0.5177106,-0.627544,-0.5467868,-0.2960466,-0.1133873,-0.2420243,-0.2384552,-0.2995882,1.0
50%,-0.04631734,-0.03135,-0.1984306,-0.2399466,-0.1796837,0.00729253,-0.1703977,-0.1943191,-0.08700994,-0.2484115,...,-0.09492164,-0.10624,-0.1183819,-0.117676,-0.2768648,-0.1075314,-0.1902063,-0.1568511,-0.1567816,1.0
75%,-0.006019838,0.134891,0.2421417,0.02934771,0.4299368,0.04657662,0.2657046,0.2554157,0.1912782,0.2528499,...,0.3678364,0.3786152,0.4831423,0.3583828,-0.2222636,-0.08608978,-0.05532306,-0.01482252,0.1799523,1.0
max,13.10926,24.667658,13.39818,11.76364,12.30419,20.76315,20.56656,14.94297,14.07417,8.887802,...,17.21807,15.00379,16.56874,16.464,13.89422,26.04879,17.38537,24.41393,11.88097,1.0


Class balancing

In [28]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
import pandas as pd

# Separate input features and target
y = hca_df['Phenotypes']
X = hca_df.drop('Phenotypes', axis=1)

# 1 = 880, 0 = 280 ->  1 = 500, 0 = 500

# Oversample the minority class (0) up to 500 rows with SMOTE
smote = SMOTE(sampling_strategy={0: 500}, random_state=42)
X_res_over, y_res_over = smote.fit_resample(X, y)

# Undersample the majority class (1) down to 500 rows with ClusterCentroids
cc = ClusterCentroids(sampling_strategy={1: 500}, random_state=42)
X_res_under, y_res_under = cc.fit_resample(X_res_over, y_res_over)

# Put the resampled features and labels back into one frame
res_df = pd.concat([X_res_under, y_res_under], axis=1)

# Persist the balanced dataset first ...
res_df.to_csv('../../data/ST001937_hca.csv', index=False)

# ... then end the cell with the class counts. Only a cell's last expression is
# displayed, so the check was silently discarded when to_csv came after it.
res_df['Phenotypes'].value_counts()



In [29]:
# Separate input features and target for the benign-vs-malignant dataset
y = bm_df['Phenotypes']
X = bm_df.drop('Phenotypes', axis=1)

# 1 = 723, 0 = 157 ->  1 = 300, 0 = 300

# Oversample the minority class (0) up to 300 rows with SMOTE
smote = SMOTE(sampling_strategy={0: 300}, random_state=42)
X_res_over, y_res_over = smote.fit_resample(X, y)

# Undersample the majority class (1) down to 300 rows with ClusterCentroids
cc = ClusterCentroids(sampling_strategy={1: 300}, random_state=42)
X_res_under, y_res_under = cc.fit_resample(X_res_over, y_res_over)

# Put the resampled features and labels back into one frame
res_df = pd.concat([X_res_under, y_res_under], axis=1)

# Persist the balanced dataset first ...
res_df.to_csv('../../data/ST001937_bm.csv', index=False)

# ... then end the cell with the class counts. Only a cell's last expression is
# displayed, so the check was silently discarded when to_csv came after it.
res_df['Phenotypes'].value_counts()

