In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pyopls import OPLS
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import r2_score, accuracy_score
from sklearn.metrics import mean_absolute_error

# Load the raw CSV into a DataFrame; the relative path is resolved against the
# notebook's current working directory.
DATA_PATH = '../../data/ST001937_AN003150.csv'
df = pd.read_csv(DATA_PATH)

In [33]:
# Drop the 'Sample ID' and 'RAW_FILE_NAME' columns (presumably identifiers rather
# than features — TODO confirm against the data dictionary).
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a second
# execution the columns are already gone and a plain drop() would raise KeyError.
df = df.drop(columns=['Sample ID', 'RAW_FILE_NAME'], errors='ignore')

# Remove healthy controls
df = df[df['Phenotypes'] != 'Healthy Controls']

# Encode the target: 'Benign SPNS' -> -1, every other remaining phenotype -> +1.
# NOTE(review): any unexpected or missing phenotype value is also mapped to +1;
# verify that only two phenotypes remain after the filter above.
y = df['Phenotypes'].eq('Benign SPNS').map({True: -1, False: 1})

# Feature matrix: every remaining column except the label.
X = df.drop(columns='Phenotypes')

In [34]:
def _score_predictions(y_true, y_pred):
    """Score continuous predictions of -1/+1 labels.

    Parameters
    ----------
    y_true : array-like of -1/+1 class labels.
    y_pred : array-like of continuous predictions, shape (n,) or (n, 1).

    Returns
    -------
    tuple
        (q_squared, dq_squared, accuracy, mae) where
        - q_squared  is r2_score on the raw predictions,
        - dq_squared is r2_score after clipping predictions to [-1, 1],
        - accuracy   is accuracy_score on the sign of the predictions,
        - mae        is mean_absolute_error on the raw predictions.
    """
    # PLSRegression.predict may return (n, 1) or (n,) depending on the installed
    # scikit-learn version; flatten so every metric sees the same 1-D input.
    y_pred = np.ravel(y_pred)
    q_squared = r2_score(y_true, y_pred)
    dq_squared = r2_score(y_true, np.clip(y_pred, -1, 1))
    accuracy = accuracy_score(y_true, np.sign(y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return q_squared, dq_squared, accuracy, mae


def _print_metrics(heading, q_squared, dq_squared, accuracy, mae):
    """Print one metrics report under the given heading line."""
    print(heading)
    print(f"Q^2: {q_squared}")
    print(f"DQ^2: {dq_squared}")
    print(f"Accuracy: {accuracy}")
    print(f"MAE: {mae}")


# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Orthogonal Partial Least Squares (OPLS) transformation, fitted on the training
# split only so that nothing from the test split influences the filter.
opls = OPLS(5)
Z_train = opls.fit_transform(X_train, y_train)

# Partial Least Squares (PLS) regression with a single component, fitted on the
# OPLS-transformed training features.
pls = PLSRegression(n_components=1)
pls.fit(Z_train, y_train)

# Predictions on the training set
processed_y_pred_train = pls.predict(Z_train)

# Predictions on the testing set: reuse the OPLS filter learned from the training split
Z_test = opls.transform(X_test)
processed_y_pred_test = pls.predict(Z_test)

# Evaluation metrics on the training set.
# NOTE(review): on the data the model was fitted to, r2_score is the fitted R^2;
# the "Q^2" label is kept below so the printed output is unchanged.
(
    processed_q_squared_train,
    processed_dq_squared_train,
    processed_accuracy_train,
    processed_mae_train,
) = _score_predictions(y_train, processed_y_pred_train)

# Evaluation metrics on the testing set (held-out data).
(
    processed_q_squared_test,
    processed_dq_squared_test,
    processed_accuracy_test,
    processed_mae_test,
) = _score_predictions(y_test, processed_y_pred_test)

# Print the metrics side by side to analyze overfitting.
# NOTE(review): Q^2 and DQ^2 differ only by the clipping to [-1, 1], so a test Q^2
# far below the test DQ^2 means some test predictions lie far outside [-1, 1].
_print_metrics(
    "Training Set Metrics:",
    processed_q_squared_train,
    processed_dq_squared_train,
    processed_accuracy_train,
    processed_mae_train,
)
_print_metrics(
    "\nTesting Set Metrics:",
    processed_q_squared_test,
    processed_dq_squared_test,
    processed_accuracy_test,
    processed_mae_test,
)


Training Set Metrics:
Q^2: 0.3664636712606131
DQ^2: 0.3957351279233814
Accuracy: 0.8806818181818182
MAE: 0.45590173511358467

Testing Set Metrics:
Q^2: -5.936241401909724
DQ^2: -0.2405767421131657
Accuracy: 0.7613636363636364
MAE: 0.8567967390789721
