In [151]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Folder intended for this project's visualisation output.
# NOTE(review): absolute, machine-specific path; it is not referenced by the
# cells visible in this part of the notebook.
output_directory = '/home/marco/DataspellProjects/thesis/output/visualization/'

# Register the five-colour scheme as seaborn's default palette.
palette_hex_codes = ["#2176AB", "#F97662", "#FFBF00", "#50C878", "#B284BE"]
palette = sns.color_palette(palette_hex_codes)
sns.set_palette(palette)

# Load the source table; the relative path is resolved against the kernel's
# current working directory.
data = pd.read_csv('../../data/ST002498_AN004654.csv')

Data preprocessing and cleaning

In [152]:

# Assemble the working table in one step: the label and clinical covariates
# followed by the trailing block of measurement columns.
# PreBioPSA is intentionally not carried over.
clinical_columns = ['Diagnosis', 'Age', 'Height', 'Weight']

# Label-based slice: every column from this one through the last column of
# `data`, in file order (the metabolite measurements).
first_metabolite_column = "S-1-pyrroline-5-carboxylate"

# Building a new frame (instead of filling an empty one and dropping a column
# in place) keeps this cell idempotent when it is re-run.
df = pd.concat(
    [data[clinical_columns], data.loc[:, first_metabolite_column:]],
    axis=1,
)

df.describe()

Unnamed: 0,Diagnosis,Age,Height,Weight,S-1-pyrroline-5-carboxylate,spermidine,1-methylnicotinamide,3-phosphoglycerate,"5,6-dihydrothymine",hypoxanthine,...,X-25790,X-25810,X-25828,X-25855,X-25948,X-25951,X-25952,X-25957,X-26008,X-26097
count,580.0,580.0,576.0,576.0,580.0,568.0,580.0,579.0,579.0,580.0,...,580.0,580.0,580.0,571.0,576.0,10.0,1.0,539.0,346.0,579.0
mean,0.460345,64.656897,69.375,184.880903,4383292.0,210516.2,6991586.0,381240.3,2225900.0,8298182.0,...,4449906.0,7932991.0,3239607.0,396956.7,290452.4,2387924.0,9318836.0,2276696.0,913910.5,442763.1
std,0.498855,8.615839,2.959877,30.373415,1743890.0,384477.7,30483020.0,294736.8,501135.4,5513231.0,...,10222000.0,19593030.0,719931.7,543797.9,539132.8,2928182.0,,1095406.0,2123511.0,174018.2
min,0.0,33.0,60.0,56.0,901322.2,34387.49,319324.1,57274.94,587530.4,288928.4,...,99732.23,923691.4,869877.8,133202.9,71983.65,85494.42,9318836.0,496155.5,37100.0,60306.52
25%,0.0,59.0,67.0,165.0,3146307.0,124892.7,1985982.0,240978.7,1892882.0,4424133.0,...,2343096.0,2434568.0,2751395.0,272121.4,176287.3,448371.8,9318836.0,1766566.0,293099.9,361063.9
50%,0.0,65.0,69.0,180.0,4119512.0,165810.5,2738064.0,314604.0,2193780.0,6882586.0,...,3295280.0,3629346.0,3308020.0,337131.0,218617.5,1725588.0,9318836.0,2173977.0,492160.0,416296.0
75%,1.0,70.0,71.0,200.0,5301167.0,210997.5,4024268.0,415166.5,2506647.0,11007390.0,...,4483314.0,6840499.0,3741641.0,401630.0,279815.2,3002805.0,9318836.0,2549987.0,799155.6,492169.1
max,1.0,85.0,79.0,315.0,12030980.0,8060037.0,427176600.0,2878343.0,4105726.0,37252590.0,...,198294700.0,398431100.0,6716725.0,9114612.0,9386813.0,9989038.0,9318836.0,16303000.0,26266980.0,2191476.0


Dropping variables with too many missing values

In [153]:
import math

# Drop sparsely populated variables, then impute what is left.
#
# `DataFrame.dropna(thresh=...)` counts NON-missing entries: a column is kept
# only when it has at least `thresh` observed values. With a fraction of 0.7,
# a column survives when at most 30% of its rows are missing.
# Raise the fraction to be stricter (fewer missing values tolerated).
min_observed_fraction = 0.7

# `thresh` is documented as an integer. Rounding up selects exactly the same
# columns as comparing the (integer) counts against the fractional product.
min_observed_values = math.ceil(len(df) * min_observed_fraction)

# Previous name, kept so that any other cell still reading it keeps working.
# NOTE: despite the name, it holds the minimum number of observed values.
max_missing_values = min_observed_values

# Remove the columns that have fewer than `min_observed_values` observed values.
df_cleaned = df.dropna(axis=1, thresh=min_observed_values)

# Two imputation variants of the same cleaned table.
df_fillmean = df_cleaned.fillna(df_cleaned.mean())  # per-column mean
df_fillzero = df_cleaned.fillna(0)                  # constant zero

print(df_fillmean.shape, df_fillzero.shape)

(580, 279) (580, 279)


Standardizing the data

In [154]:
from sklearn import preprocessing

# Standardise every feature column while leaving the Diagnosis label untouched.
standard_scaler = preprocessing.StandardScaler()

fillmean_diagnosis = df_fillmean['Diagnosis']
fillzero_diagnosis = df_fillzero['Diagnosis']

df_fillmean_no_diagnosis = df_fillmean.drop(columns='Diagnosis')
df_fillzero_no_diagnosis = df_fillzero.drop(columns='Diagnosis')


def _standardize(features, labels):
    """Scale `features` with `standard_scaler` and append `labels` as 'Diagnosis'.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns only (no label column).
    labels : pandas.Series
        Diagnosis labels for the same rows as `features`.

    Returns
    -------
    pandas.DataFrame
        Scaled features with the row index and column names of `features`,
        plus a trailing 'Diagnosis' column.
    """
    scaled = pd.DataFrame(
        standard_scaler.fit_transform(features),
        # Keep the original row labels: the label assignment below aligns on
        # the index, so a fresh 0..n-1 index would only match by coincidence.
        index=features.index,
        columns=features.columns,
    )
    scaled['Diagnosis'] = labels
    return scaled


# The shared scaler is refit on every call, so after this cell it holds the
# statistics of the zero-filled table (the last one fitted).
m_standard_df = _standardize(df_fillmean_no_diagnosis, fillmean_diagnosis)
z_standard_df = _standardize(df_fillzero_no_diagnosis, fillzero_diagnosis)


Class balancing

In [155]:
#Class balancing using SMOTE
from imblearn.over_sampling import SMOTE

# Oversample the minority Diagnosis class with SMOTE.
# SMOTE generates its synthetic samples at random, so the seed is fixed to
# make the balanced tables (and the CSV files written from them) reproducible
# across kernel restarts.
oversample = SMOTE(sampling_strategy="auto", random_state=42)

X, y = oversample.fit_resample(m_standard_df.drop(columns='Diagnosis'), m_standard_df['Diagnosis'])
Z, w = oversample.fit_resample(z_standard_df.drop(columns='Diagnosis'), z_standard_df['Diagnosis'])


def _with_diagnosis(features, labels, feature_columns):
    """Return a new frame holding `features` plus `labels` as 'Diagnosis'.

    Parameters
    ----------
    features : array-like or pandas.DataFrame
        Resampled feature values.
    labels : array-like or pandas.Series
        Resampled Diagnosis labels, one per row of `features`.
    feature_columns : pandas.Index
        Column names to use for the feature values.

    Returns
    -------
    pandas.DataFrame
        Independent copy with the feature columns followed by 'Diagnosis'.
    """
    # The explicit copy yields a standalone, consolidated frame, so adding the
    # label column does not act on a derived view of the resampler's output.
    balanced = pd.DataFrame(features, columns=feature_columns).copy()
    balanced['Diagnosis'] = labels
    return balanced


m_df = _with_diagnosis(X, y, m_standard_df.drop(columns='Diagnosis').columns)
z_df = _with_diagnosis(Z, w, z_standard_df.drop(columns='Diagnosis').columns)

m_df.describe()

  m_df['Diagnosis'] = y
  z_df['Diagnosis'] = w


Unnamed: 0,Age,Height,Weight,S-1-pyrroline-5-carboxylate,spermidine,1-methylnicotinamide,3-phosphoglycerate,"5,6-dihydrothymine",hypoxanthine,quinolinate,...,X-25519,X-25520,X-25790,X-25810,X-25828,X-25855,X-25948,X-25957,X-26097,Diagnosis
count,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,...,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0,626.0
mean,0.007281,0.013231,0.014401,-0.019019,-0.006967,0.007775,-0.015831,0.022726,-0.007768,-0.00985,...,-0.00676,-0.008887,-0.00725,-0.012317,0.011702,-0.005203,-0.008883,-0.011663,-0.018642,0.5
std,0.991087,0.97859,0.996168,0.980483,0.964537,1.047648,0.969952,0.988012,0.986441,0.968697,...,0.972453,0.969794,0.964667,0.965131,0.98525,0.965049,0.964785,0.969907,0.971448,0.5004
min,-3.677439,-3.181103,-4.261623,-1.998392,-0.46332,-0.219073,-1.101068,-3.274967,-1.453987,-0.868538,...,-1.018346,-0.791425,-0.425937,-0.358053,-3.294444,-0.489258,-0.406981,-1.687716,-2.201596,0.0
25%,-0.657136,-0.466562,-0.657389,-0.708337,-0.223789,-0.164701,-0.477189,-0.638625,-0.685839,-0.363965,...,-0.471387,-0.460507,-0.205193,-0.280778,-0.646137,-0.227877,-0.213015,-0.450417,-0.469225,0.0
50%,0.039857,0.0,-0.095261,-0.180468,-0.115668,-0.139832,-0.226358,-0.024896,-0.259011,-0.200716,...,-0.218714,-0.27584,-0.116589,-0.218488,0.100297,-0.109241,-0.137063,-0.0672,-0.156216,0.5
75%,0.63001,0.551391,0.499934,0.498798,0.0,-0.097602,0.090974,0.572165,0.473678,0.083504,...,0.131996,0.092306,0.003415,-0.057701,0.698217,0.004223,-0.019494,0.201395,0.254564,1.0
max,2.363167,3.265933,4.302566,4.389207,20.648785,13.796128,8.486959,3.757618,5.256337,15.695124,...,10.548879,13.821,18.979855,19.947659,4.833957,16.171069,16.945407,13.295077,10.066393,1.0


In [156]:
# Write both balanced tables to CSV, without the row index.
for balanced_frame, csv_path in (
    (m_df, '../../data/ST002498_m.csv'),  # mean-imputed variant
    (z_df, '../../data/ST002498_z.csv'),  # zero-imputed variant
):
    balanced_frame.to_csv(csv_path, index=False)