## <font color=FF4455>Regression for OSA</font>


# Data Preparation

### Dependencies

In [21]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt


We'll start by splitting the data into train and test sets.

In [17]:
def split_data(xlsx_data):
    """
    Load an Excel sheet and produce an 80/20 train/test split.

    The 'IAH' column is used as the regression target and every remaining
    column as a feature. A 'Patient' column, when present, is removed first
    so that it is not part of the feature matrix.

    Parameters
    ----------
    xlsx_data
        Source of the .xlsx data; passed unchanged to ``pd.read_excel``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature and target partitions. The split is seeded
        (``random_state=42``), so repeated calls give the same partition.
    """
    frame = pd.read_excel(xlsx_data)
    # errors='ignore' makes this a no-op when the sheet has no 'Patient' column
    frame = frame.drop(columns='Patient', errors='ignore')

    target = frame['IAH']
    features = frame.drop(columns='IAH')

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test

In [6]:
def split_data_val(xlsx_data):
    """
    Read an xlsx file and split it into train, validation and test sets.

    The 'IAH' column is the target; all other columns are features, except
    'Patient', which is dropped when present (the same handling split_data
    applies) so that it is not used as a predictor.

    Parameters
    ----------
    xlsx_data
        Source of the .xlsx data; passed unchanged to ``pd.read_excel``.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test
        Approximately 60% / 20% / 20% of the rows, respectively. Both splits
        are seeded (``random_state=42``) and therefore reproducible.
    """
    df = pd.read_excel(xlsx_data)
    # Keep the feature set consistent with split_data: 'Patient' is not a feature
    # (presumably a row identifier -- confirm against the data dictionary)
    if 'Patient' in df.columns:
        df = df.drop('Patient', axis=1)
    # Define feature matrix (X) and target variable (y)
    X = df.drop('IAH', axis=1)  # Features
    y = df['IAH']  # Target
    # Step 1: Split into training+validation (80%) and test (20%)
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Step 2: Split training+validation into training (75% of the 80% = 60% of
    # the original) and validation (25% of the 80% = 20% of the original)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [18]:
# Original clinical data
(X_train, X_test,
 y_train, y_test) = split_data("Clinical_data.xlsx")

# Normalized version of the clinical data
(X_train_norm, X_test_norm,
 y_train_norm, y_test_norm) = split_data("Clinical_data_norm.xlsx")

# Clinical data restricted to the selected features
(X_train_selec, X_test_selec,
 y_train_selec, y_test_selec) = split_data("Clinical_selected_features.xlsx")

In [9]:
# Sanity check: number of rows in each partition of the original dataset
print(f"Training set size: {len(X_train)}")
# No validation size is reported here: split_data() only returns train/test
# partitions (split_data_val() is the variant that also yields a validation set)
print(f"Test set size: {len(X_test)}")

Training set size: 519
Test set size: 130


# Models

We will fit a model on each of the 3 datasets we have: the original, the normalized, and the one containing only the features selected according to Pearson correlation.

### Useful functions

In [10]:
def fit_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Fit a regression model on the training split and score it on the test split.

    Both returned losses are computed on the TEST data (MSE and MAE); no
    training loss is calculated.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called
        on the object that is passed in; its return value is not used.
    X_train, y_train
        Training features and target.
    X_test, y_test
        Held-out features and target used for both metrics.

    Returns
    -------
    tuple
        ``(y_train_pred, y_test_pred, test_loss_mse, test_loss_mae)``.
    """
    model.fit(X_train, y_train)

    # Predict on the training split first, then on the test split
    train_predictions, test_predictions = (
        model.predict(split) for split in (X_train, X_test)
    )

    # Both metrics compare the held-out targets with the test predictions
    mse_on_test, mae_on_test = (
        metric(y_test, test_predictions)
        for metric in (mean_squared_error, mean_absolute_error)
    )

    return train_predictions, test_predictions, mse_on_test, mae_on_test

### Multiple Linear Regression

In [19]:
# Initialize the model
# The same estimator instance is re-fit on each dataset in turn, so after this
# cell `model` holds the fit on the selected-features dataset.
model = LinearRegression()

# fit_and_evaluate_model returns (train predictions, test predictions,
# test MSE, test MAE); it does not compute a training loss. The
# `train_loss*` / `test_loss*` names are kept because later cells read them:
# `train_loss*` holds the test MSE and `test_loss*` holds the test MAE.

# Fit and evaluate the model on the original dataset
y_train_pred, y_test_pred, train_loss, test_loss = fit_and_evaluate_model(model, X_train, y_train, X_test, y_test)
print(f"Original dataset - Test MSE: {train_loss}, Test MAE: {test_loss}")

# Fit and evaluate the model on the normalized dataset
y_train_pred_norm, y_test_pred_norm, train_loss_norm, test_loss_norm = fit_and_evaluate_model(model, X_train_norm, y_train_norm, X_test_norm, y_test_norm)
print(f"Normalized dataset - Test MSE: {train_loss_norm}, Test MAE: {test_loss_norm}")

# Fit and evaluate the model on the selected features dataset
y_train_pred_selec, y_test_pred_selec, train_loss_selec, test_loss_selec = fit_and_evaluate_model(model, X_train_selec, y_train_selec, X_test_selec, y_test_selec)
print(f"Selected features dataset - Test MSE: {train_loss_selec}, Test MAE: {test_loss_selec}")

Original dataset - Train Loss: 259.97991728514575, Test Loss: 303.38853474944176
Normalized dataset - Train Loss: 259.97991728514575, Test Loss: 303.3885347494418
Selected features dataset - Train Loss: 270.8814072980731, Test Loss: 306.43910031789835


Let's store the losses inside a dictionary so that we can use them later.

In [20]:
# Test-set losses of the multiple linear regression, keyed by dataset.
# Naming caveat: fit_and_evaluate_model returns (test MSE, test MAE) as its
# last two values, so each `train_loss*` variable actually carries the test
# MSE and each `test_loss*` variable the test MAE.
MLRegression_losses = {
    dataset: {"MSE": mse, "MAE": mae}
    for dataset, mse, mae in (
        ("original", train_loss, test_loss),
        ("normalized", train_loss_norm, test_loss_norm),
        ("selected_features", train_loss_selec, test_loss_selec),
    )
}