In [1]:
import xmlrpc.client

import numpy as np
import pandas as pd
from skimage.metrics import mean_squared_error
from sklearn.cross_decomposition import PLSRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt


In [2]:
# Load the dataset from the CSV file into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../../data/ST001237_AN002055.csv')

In [3]:
# Drop, in place, four columns that the visible cells below do not use:
# the sample/trial identifiers, the raw file name and the OS censoring flag.
df.drop(
    columns=[
        'Sample ID',
        'Trial',
        'raw file name',
        'OS_Censor (1 means the time is a censoring time and 0 means a failure time in OS)',
    ],
    inplace=True,
)

Data cleaning and preprocessing

In [4]:

# Rename the verbose source headers to short identifiers used by the cells below.
df.rename(columns={'Time point': 'time_point',
                   'OS (Overall Survival, months)': 'os_time',
                   'Prior antiangiogenic regimens (≥2)': 'prior_antiangiogenic_regimens',
                   'Derived_Age_at_Consent': 'Age'
                   },
          inplace=True)

# NOTE: the encodings below overwrite their columns in place, so this cell is
# not idempotent -- re-run it only after reloading `df` from the CSV.

# Time point in days instead of weeks, with 'baseline' as day 0.
df['time_point'] = df['time_point'].apply(lambda x: 0 if x == 'baseline' else int(str(x).replace('week ', '')) * 7)
# Risk-group code: 'POOR' -> 0, 'INTERMEDIATE' -> 1, any other value -> 2.
df['CRF_MSKCC_Risk_Group'] = df['CRF_MSKCC_Risk_Group'].apply(
    lambda x: 0 if x == 'POOR' else 1 if x == 'INTERMEDIATE' else 2)
# Binary codes: the value tested maps to 0, every other value (missing ones
# included) maps to 1.
df['Sex'] = df['Sex'].apply(lambda x: 0 if x == 'F' else 1)
df['Treatment'] = df['Treatment'].apply(lambda x: 0 if x == 'NIVOLUMAB' else 1)
# Integer code per region, numbered in order of first appearance. The list of
# distinct regions is built once here rather than being recomputed for every row.
region_order = df['Region'].unique().tolist()
df['Region'] = df['Region'].apply(lambda x: region_order.index(x))
df['Race'] = df['Race'].apply(lambda x: 0 if x == 'WHITE' else 1)
# The string 'False' becomes 0; any other value is passed through int().
df['prior_antiangiogenic_regimens'] = df['prior_antiangiogenic_regimens'].apply(lambda x: 0 if x == 'False' else int(x))
# Cast survival time to integers (astype(int) drops any fractional part).
df['os_time'] = df['os_time'].astype(int)

# Replace the remaining missing values with the per-column mean. This applies to
# every numeric column, not only the metabolite measurements. numeric_only=True
# keeps the call working on pandas >= 2.0 if a non-numeric column is present.
df.fillna(df.mean(numeric_only=True), inplace=True)


Unnamed: 0,time_point,CRF_MSKCC_Risk_Group,Treatment,prior_antiangiogenic_regimens,os_time,Region,Age,Sex,Race,Kynurenine_μM,...,C36:2 PS plasmalogen,C16:0 ceramide (d18:1),C24:1 ceramide (d18:1),C14:0 SM,C16:1 SM,C16:0 SM,C18:1 SM,C18:2 SM,C18:0 SM,C20:0 SM
count,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,...,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0,1221.0
mean,18.437346,1.169533,0.333333,0.226044,25.097461,1.203112,61.226044,0.747748,0.143325,6.582139,...,5.791247,5.734799,6.380391,6.449352,6.616477,7.470586,6.444245,4.787712,6.788397,6.751793
std,24.349007,0.713974,0.471598,0.41844,15.295274,0.814635,10.469378,0.434483,0.350548,1.49958,...,0.24938,0.173388,0.162301,0.162414,0.128217,0.104635,0.156408,0.310861,0.134258,0.122432
min,0.0,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,3.373,...,5.272,5.139,5.853,5.943,6.18,7.113,5.915,2.684,6.31,6.263
25%,0.0,1.0,0.0,0.0,12.0,0.0,55.0,0.0,0.0,5.581,...,5.65,5.618,6.269,6.344,6.538,7.403,6.344,4.627,6.702,6.678
50%,0.0,1.0,0.0,0.0,23.0,1.0,62.0,1.0,0.0,6.337,...,5.755,5.731,6.379,6.451,6.614,7.473,6.447,4.811,6.792,6.754
75%,56.0,2.0,1.0,0.0,40.0,2.0,69.0,1.0,0.0,7.257,...,5.869,5.854,6.493,6.558,6.696,7.539,6.545,4.99,6.88,6.832
max,56.0,2.0,1.0,1.0,53.0,2.0,88.0,1.0,1.0,15.001,...,7.061,6.439,7.088,6.989,7.047,7.818,6.976,5.861,7.244,7.255


Train a PLS model

In [62]:
# Feature matrix: every column of `df` except the risk-group label.
X = df.drop(columns=['CRF_MSKCC_Risk_Group'])
# Target: the risk-group codes as a single-column 2-D array, shape (n_samples, 1).
y = df['CRF_MSKCC_Risk_Group'].to_numpy()[:, np.newaxis]


In [63]:
# Split the data into training and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a PLS model with the chosen number of components.
num_componenti = 2
modello_pls = PLSRegression(n_components=num_componenti)

# Fit the PLS model on the training split.
modello_pls.fit(X_train, y_train)

# Predict on the test split.
previsioni = modello_pls.predict(X_test)

# Mean squared error (MSE) on the test split, computed directly with NumPy
# instead of the image-comparison metric imported from scikit-image. Both
# arrays are flattened first so the result is the same whether `predict`
# returns shape (n, 1) or (n,), with no accidental broadcasting.
mse = np.mean((np.ravel(y_test).astype(float) - np.ravel(previsioni).astype(float)) ** 2)
print(f"Errore quadratico medio: {mse}")


Errore quadratico medio: 0.39451908581549255
