In [50]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd

class LoLiMoTNode:
    """
    One node of a LoLiMoT tree.

    A freshly created node has every field set to None. The tree builder
    fills in either the split fields and children (internal node) or the
    linear model (leaf node).
    """

    def __init__(self):
        # Local linear model; only set on leaf nodes
        self.linear_model = None
        # Split rule: index of the feature and the threshold compared against it
        self.split_feature = self.split_value = None
        # Child nodes (left / right of the split)
        self.left = self.right = None

class LoLiMoT:
    """
    Tree of local linear models.

    Internal nodes split the input space with axis-aligned thresholds
    (``x[split_feature] <= split_value`` goes left, otherwise right); every
    leaf holds a ``LinearRegression`` fitted on the training samples that
    were routed to it.
    """

    def __init__(self, max_depth=10, min_error=1e-3, min_samples_split=10):
        """
        Initializes the LoLiMoT model.

        Parameters:
        - max_depth (int): Maximum depth of the tree.
        - min_error (float): MSE threshold. A node whose own linear model already
          reaches a training MSE at or below this value is not split further.
        - min_samples_split (int): Minimum number of samples a node needs in order
          to be considered for splitting; each child of a split must also receive
          at least this many samples.
        """
        self.max_depth = max_depth
        self.min_error = min_error
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """
        Fits the LoLiMoT model to the data.

        Parameters:
        - X (array-like): Feature matrix of shape (n_samples, n_features).
        - y (array-like): Target values, one entry per row of X.

        Raises:
        - ValueError: If X is not 2-dimensional or X and y disagree on the
          number of samples.
        """
        # Accept DataFrames / lists as well: the tree indexes X positionally
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent sample counts: {X.shape[0]} vs {y.shape[0]}"
            )

        # Start recursion with the whole dataset
        self.root = self._fit_recursive(X, y, depth=0)

    @staticmethod
    def _make_leaf(node, linear_model, error):
        """Turns `node` into a leaf holding `linear_model` and its training MSE."""
        node.linear_model = linear_model
        # Stored so print_tree can report the leaf error without needing the data
        node.train_mse = error
        return node

    def _fit_recursive(self, X, y, depth):
        """
        Builds the subtree for the samples (X, y) at the given depth.

        Returns:
        - node (LoLiMoTNode): Either a leaf (linear model set) or an internal
          node (split rule and both children set).
        """
        node = LoLiMoTNode()

        # Fit a linear model on all samples that reached this node
        linear_model = LinearRegression()
        linear_model.fit(X, y)
        error = mean_squared_error(y, linear_model.predict(X))

        # Stopping criteria:
        # - Max depth reached
        # - Error is below the threshold
        # - Not enough samples to split
        if (
            depth >= self.max_depth
            or error <= self.min_error
            or X.shape[0] < self.min_samples_split
        ):
            return self._make_leaf(node, linear_model, error)

        # Find the best feature and value to split the data
        best_split_feature, best_split_value, best_error = self._find_best_split(X, y, linear_model)

        # A split is only kept if it strictly lowers the training MSE
        if best_split_feature is None or best_error >= error:
            return self._make_leaf(node, linear_model, error)

        left_indices = X[:, best_split_feature] <= best_split_value
        right_indices = X[:, best_split_feature] > best_split_value

        # Defensive re-check that both sides have enough samples; done before the
        # split rule is written so a leaf never carries split attributes
        if left_indices.sum() < self.min_samples_split or right_indices.sum() < self.min_samples_split:
            return self._make_leaf(node, linear_model, error)

        node.split_feature = best_split_feature
        node.split_value = best_split_value
        node.left = self._fit_recursive(X[left_indices], y[left_indices], depth + 1)
        node.right = self._fit_recursive(X[right_indices], y[right_indices], depth + 1)

        return node

    def _find_best_split(self, X, y, current_model):
        """
        Finds the best feature and value to split the data to minimize MSE.

        Every midpoint between two consecutive distinct values of every feature
        is tried; for each candidate a separate linear model is fitted on each
        side and the sample-weighted MSE of the two models is compared.

        Parameters:
        - X (np.ndarray): Feature matrix.
        - y (np.ndarray): Target vector.
        - current_model (LinearRegression): Linear model of the current node.
          Not used by the search; kept for interface compatibility.

        Returns:
        - best_feature (int or None): Index of the best feature to split on,
          None if no candidate leaves enough samples on both sides.
        - best_value (float or None): Value of the feature to split on.
        - best_error (float): The combined MSE after the split
          (inf if no valid split exists).
        """
        best_feature = None
        best_value = None
        best_error = float('inf')

        n_samples = len(y)

        for feature_idx in range(X.shape[1]):
            column = X[:, feature_idx]

            # Sorted distinct values; midpoints of neighbours are the candidates
            distinct_values = np.unique(column)
            candidates = (distinct_values[:-1] + distinct_values[1:]) / 2

            for split_value in candidates:
                left_indices = column <= split_value
                right_indices = column > split_value
                n_left = left_indices.sum()
                n_right = right_indices.sum()

                # Ensure both splits have enough samples
                if n_left < self.min_samples_split or n_right < self.min_samples_split:
                    continue

                X_left, y_left = X[left_indices], y[left_indices]
                X_right, y_right = X[right_indices], y[right_indices]

                # Fit linear models on both splits
                left_model = LinearRegression().fit(X_left, y_left)
                right_model = LinearRegression().fit(X_right, y_right)

                error_left = mean_squared_error(y_left, left_model.predict(X_left))
                error_right = mean_squared_error(y_right, right_model.predict(X_right))

                # Sample-weighted average of the two per-side MSEs
                combined_error = (error_left * n_left + error_right * n_right) / n_samples

                # Update the best split if this one is better
                if combined_error < best_error:
                    best_feature = feature_idx
                    best_value = split_value
                    best_error = combined_error

        return best_feature, best_value, best_error

    def predict(self, X):
        """
        Predicts the target values for the input samples.

        Parameters:
        - X (array-like): Feature matrix of shape (n_samples, n_features).

        Returns:
        - predictions (np.ndarray): Predicted target values.

        Raises:
        - RuntimeError: If the model has not been fitted.
        """
        if self.root is None:
            raise RuntimeError("LoLiMoT model is not fitted yet; call fit() first.")

        # Iterate over rows positionally (a DataFrame would iterate column names)
        X = np.asarray(X)
        return np.array([self._predict_recursive(x, self.root) for x in X])

    def _predict_recursive(self, x, node):
        """
        Recursively traverses the tree to make a prediction for a single sample.

        Parameters:
        - x (np.ndarray): Single sample feature vector.
        - node (LoLiMoTNode): Current node in the tree.

        Returns:
        - prediction (float): Predicted target value.
        """
        # If this is a leaf node, use the linear model to predict
        if node.linear_model is not None:
            return node.linear_model.predict(np.asarray(x).reshape(1, -1))[0]

        # Otherwise, recurse into the left or right child
        if x[node.split_feature] <= node.split_value:
            return self._predict_recursive(x, node.left)
        return self._predict_recursive(x, node.right)

    def print_tree(self, node=None, depth=0):
        """
        Optional: Prints the tree structure for debugging purposes.

        Leaves show the training MSE of their linear model on the samples of
        their own region, as recorded during fit ("n/a" if it was not recorded).

        Parameters:
        - node (LoLiMoTNode): Current node in the tree (defaults to the root).
        - depth (int): Current depth in the tree.
        """
        if node is None:
            node = self.root
        if node is None:
            print("(empty tree: model has not been fitted)")
            return

        indent = "  " * depth
        if node.linear_model is not None:
            leaf_mse = getattr(node, "train_mse", None)
            mse_text = "n/a" if leaf_mse is None else f"{leaf_mse:.4f}"
            print(f"{indent}Leaf: Depth={depth}, MSE={mse_text}")
            return

        print(f"{indent}Node: Feature {node.split_feature} <= {node.split_value:.4f}")
        if node.left is not None:
            self.print_tree(node.left, depth + 1)
        if node.right is not None:
            self.print_tree(node.right, depth + 1)

# Demo: train LoLiMoT on a small synthetic longitudinal dataset

if __name__ == "__main__":
    # Fixed seed so the synthetic data below is reproducible
    np.random.seed(42)

    n_patients, n_visits, n_features = 3, 4, 3

    # One row per (patient, visit) pair: ids repeat per visit, visits cycle per patient
    patient_ids = np.repeat(np.arange(1, n_patients + 1), n_visits)
    visit_numbers = np.tile(np.arange(1, n_visits + 1), n_patients)

    # Draw order matters for reproducibility: features first, then the target
    feature_data = np.random.rand(n_patients * n_visits, n_features)
    mds_updrs = np.random.rand(n_patients * n_visits) * 100

    df = pd.DataFrame(
        np.column_stack([patient_ids, visit_numbers, feature_data, mds_updrs]),
        columns=['patient_id', 'visit', 'feature_1', 'feature_2', 'feature_3', 'mds_updrs'],
    )

    # column_stack promotes every column to float; restore the integer id columns
    df = df.astype({'patient_id': int, 'visit': int})

    print("Longitudinal Dataset:")
    print(df)

    # Training data: the three feature columns as inputs, mds_updrs as target
    X = df[['feature_1', 'feature_2', 'feature_3']].to_numpy()
    y = df['mds_updrs'].to_numpy()

    # min_samples_split is lowered to 2 because the dataset has only 12 rows
    model = LoLiMoT(max_depth=5, min_error=1e-3, min_samples_split=2)
    model.fit(X, y)

    # In-sample predictions and their error
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    print("\nModel Evaluation:")
    print(f"Mean Squared Error: {mse:.4f}")

    # Dump the fitted tree for inspection
    print("\nTree Structure:")
    model.print_tree()

Longitudinal Dataset:
    patient_id  visit  feature_1  feature_2  feature_3  mds_updrs
0            1      1   0.374540   0.950714   0.731994  30.461377
1            1      2   0.598658   0.156019   0.155995   9.767211
2            1      3   0.058084   0.866176   0.601115  68.423303
3            1      4   0.708073   0.020584   0.969910  44.015249
4            2      1   0.832443   0.212339   0.181825  12.203823
5            2      2   0.183405   0.304242   0.524756  49.517691
6            2      3   0.431945   0.291229   0.611853   3.438852
7            2      4   0.139494   0.292145   0.366362  90.932040
8            3      1   0.456070   0.785176   0.199674  25.877998
9            3      2   0.514234   0.592415   0.046450  66.252228
10           3      3   0.607545   0.170524   0.065052  31.171108
11           3      4   0.948886   0.965632   0.808397  52.006802

Model Evaluation:
Mean Squared Error: 0.0000

Tree Structure:
Node: Feature 0 <= 0.4852
  Node: Feature 0 <= 0.1614
   

In [1]:
import os
import pickle
from pathlib import Path
import pandas as pd

# Locations of the generated PPMI clinical data and the snapshot to load
PPMI_CLINICAL_GEN_DATA_DIR_INSIDE = (
    Path('D:/data/raw/ppmi/behavior') / 'dadu_etal_generated_data/clinical/ppmi'
)
dataset_name = '01_22_2024'

# NOTE: read_pickle executes arbitrary code embedded in the file; only load
# pickles that come from a trusted source.

# Output of the preprocessing stage
preprocessed_data = pd.read_pickle(
    PPMI_CLINICAL_GEN_DATA_DIR_INSIDE / 'preprocessed' / f"{dataset_name}.pkl"
)

# Output of the representation-learning stage
representation_learning_data = pd.read_pickle(
    PPMI_CLINICAL_GEN_DATA_DIR_INSIDE / 'representation_learning' / f"{dataset_name}.pkl"
)

# Output of the clustering stage
clustering_data = pd.read_pickle(
    PPMI_CLINICAL_GEN_DATA_DIR_INSIDE / 'clustering' / f"{dataset_name}.pkl"
)

# Merge the three stage dictionaries; on duplicate keys the later stage wins
input_data = {
    **preprocessed_data,
    **representation_learning_data,
    **clustering_data,
}
datasets = input_data['data_names']
dset_name = 'paper_experiment_flip_outlier'

# Display the chosen wide-format matrix for this experiment (notebook cell output)
input_data['M_chosen'][dset_name]

Unnamed: 0_level_0,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN346RSP,CN346RSP,CN346RSP,CN346RSP,...,a_trait,a_trait,a_trait,a_trait,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL
EVENT_ID,BL,V04,V06,V08,V10,V12,BL,V04,V06,V08,...,V06,V08,V10,V12,BL,V04,V06,V08,V10,V12
PATNO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.371429,0.257143,0.371429,0.285714,0.294118,0.254902,0.333333,0.294118,0.333333,0.294118
3001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.571429,0.314286,0.371429,0.542857,0.509804,0.627451,0.509804,0.392157,0.392157,0.450980
3002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.657143,0.542857,0.685714,0.742857,0.529412,0.588235,0.745098,0.470588,0.490196,0.588235
3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085714,0.228571,0.142857,0.114286,0.607843,0.666667,0.568627,0.372549,0.509804,0.490196
3004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.257143,0.342857,0.228571,0.257143,0.411765,0.274510,0.352941,0.431373,0.294118,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200000,0.142857,0.028571,0.571429,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
60060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.314286,0.600000,0.542857,0.571429,0.882353,0.745098,0.803922,0.705882,0.823529,0.803922
60063,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.514286,0.885714,0.457143,0.800000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
65002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.771429,0.819048,0.866667,0.914286,0.647059,0.549020,0.725490,0.607843,0.490196,0.372549


In [2]:
# Working frame: the 'M_chosen' entry of the preprocessed data for the selected
# experiment (dset_name), wrapped in a DataFrame
dataframe = pd.DataFrame(preprocessed_data['M_chosen'][dset_name])


In [3]:
# Display the working frame (notebook cell output)
dataframe

Unnamed: 0_level_0,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN346RSP,CN346RSP,CN346RSP,CN346RSP,...,a_trait,a_trait,a_trait,a_trait,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL
EVENT_ID,BL,V04,V06,V08,V10,V12,BL,V04,V06,V08,...,V06,V08,V10,V12,BL,V04,V06,V08,V10,V12
PATNO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.371429,0.257143,0.371429,0.285714,0.294118,0.254902,0.333333,0.294118,0.333333,0.294118
3001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.571429,0.314286,0.371429,0.542857,0.509804,0.627451,0.509804,0.392157,0.392157,0.450980
3002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.657143,0.542857,0.685714,0.742857,0.529412,0.588235,0.745098,0.470588,0.490196,0.588235
3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085714,0.228571,0.142857,0.114286,0.607843,0.666667,0.568627,0.372549,0.509804,0.490196
3004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.257143,0.342857,0.228571,0.257143,0.411765,0.274510,0.352941,0.431373,0.294118,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200000,0.142857,0.028571,0.571429,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
60060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.314286,0.600000,0.542857,0.571429,0.882353,0.745098,0.803922,0.705882,0.823529,0.803922
60063,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.514286,0.885714,0.457143,0.800000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
65002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.771429,0.819048,0.866667,0.914286,0.647059,0.549020,0.725490,0.607843,0.490196,0.372549


In [34]:
def flatten_wide_format_updrs(df):
    """
    Flattens multi-index column names into single-level column names.
    Example: ('UPDRS', 'BL') becomes 'UPDRS_BL'

    Only the first two levels of each column label are used. The input frame
    is never modified: a frame with MultiIndex columns is copied before its
    columns are renamed, while a frame whose columns are already single-level
    is returned as-is.

    Parameters:
    df : pandas.DataFrame
        Frame whose columns may be a MultiIndex.

    Returns:
    pandas.DataFrame
        Frame with single-level column names.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return df

    # Rename on a copy so the caller's frame keeps its MultiIndex columns
    flat_df = df.copy()
    flat_df.columns = [f"{col[0]}_{col[1]}" for col in df.columns]
    return flat_df

def create_flattened_wide_format(df, columns_to_drop, target_column_name,
                                 visits=('BL', 'V04', 'V06', 'V08', 'V10', 'V12')):
    """
    Creates flattened wide format datasets where each dataset uses features from previous visits
    to predict the target of the next visit.

    Example: For BL_V04, features are from BL, target is from V04
            For BL_V04_V06, features are from BL and V04, target is from V06

    Parameters:
    df : pandas.DataFrame
        Wide-format frame; it is passed through modify_dataframe first.
    columns_to_drop : list
        Forwarded to modify_dataframe.
    target_column_name : str
        Forwarded to modify_dataframe and used to locate the target column.
    visits : sequence of str, optional
        Ordered visit labels. One scenario is produced for every visit after
        the first: all earlier visits provide the features, the visit itself
        provides the target. Defaults to BL, V04, V06, V08, V10, V12.

    Returns:
    list of tuples
        Each tuple contains (name, features_df, target_series)

    Raises:
    IndexError
        If no column matches target_column_name at a target visit (same
        exception type as before, now with a descriptive message).
    """
    wide_format_dataframe = modify_dataframe(df, columns_to_drop, target_column_name)

    visits = list(visits)
    all_columns = list(wide_format_dataframe.columns)

    flattened_dataframes = []

    for position in range(1, len(visits)):
        feature_visits = visits[:position]
        target_visit = visits[position]
        name = '_'.join(str(visit) for visit in visits[:position + 1])

        # Feature columns: every column whose label contains one of the earlier visits
        feature_columns = [col for col in all_columns
                           if any(visit in col for visit in feature_visits)]

        # Target column: first column whose label contains both the target visit
        # and the target column name
        target_column = next(
            (col for col in all_columns
             if target_visit in col and target_column_name in col),
            None,
        )
        if target_column is None:
            raise IndexError(
                f"No column found for target {target_column_name!r} at visit {target_visit!r}"
            )

        features_df = wide_format_dataframe[feature_columns]
        target_series = wide_format_dataframe[target_column]

        # Collapse the (name, visit) column labels into single strings
        flat_features_df = flatten_wide_format_updrs(features_df)

        flattened_dataframes.append((name, flat_features_df, target_series))

    return flattened_dataframes

# Usage example:
# flattened_data = create_flattened_wide_format(df, columns_to_drop, target_column_name)
# name, features_df, target = flattened_data[0]  # For BL_V04: features from BL, target from V04

In [35]:
# Build the (name, features, target) scenarios with the summed UPDRS3 score as
# target; the UPDRS1 and UPDRS2 item columns are dropped beforehand
flat_wide_format_updrs3 = create_flattened_wide_format(dataframe, ['UPDRS1', 'UPDRS2'], 'UPDRS3')

  df = df.drop(columns=drop_column_names)
  df = df.drop(columns=new_column_set)


In [47]:
# Unpack the fifth scenario (index 4) into its name, feature frame and target series
name, features_df, target = flat_wide_format_updrs3[4]

In [45]:
# Display the currently bound scenario name (notebook cell output)
name

'BL_V04_V06_V08_V10_V12'

In [52]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
import pandas as pd
from itertools import product

# Define parameter grid
param_grid = {
    'max_depth': [3, 5],
    'min_error': [1e-3, 1e-4],
    'min_samples_split': [5, 10]
}

# All parameter combinations; identical for every dataset, so built once
param_combinations = [dict(zip(param_grid.keys(), v))
                      for v in product(*param_grid.values())]

# Initialize results dictionary (one entry per successfully evaluated dataset)
results = {
    'Dataset': [],
    'Best_max_depth': [],
    'Best_min_error': [],
    'Best_min_samples_split': [],
    'CV_Train_MAE': [],
    'CV_Train_RMSE': [],
    'CV_Train_R2': [],
    'CV_Test_MAE': [],
    'CV_Test_RMSE': [],
    'CV_Test_R2': []
}

# Set random seed
np.random.seed(42)

# 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Run grid search with cross-validation for each dataset
for name, X, y in flat_wide_format_updrs3:
    print(f"\nProcessing dataset: {name}")

    # Convert to numpy arrays
    X = X.to_numpy()
    y = y.to_numpy()

    # Reset the search state for every dataset so that results from a previous
    # dataset can never leak into this one
    best_avg_test_rmse = float('inf')
    best_params = None
    best_metrics = None

    # Grid search with cross-validation
    for params in param_combinations:
        print(f"Testing parameters: {params}")

        # Store metrics for each fold
        fold_train_mae = []
        fold_train_rmse = []
        fold_train_r2 = []
        fold_test_mae = []
        fold_test_rmse = []
        fold_test_r2 = []

        try:
            # Perform k-fold cross-validation
            for train_idx, test_idx in kf.split(X):
                X_train, X_test = X[train_idx], X[test_idx]
                y_train, y_test = y[train_idx], y[test_idx]

                # Train model
                model = LoLiMoT(**params)
                model.fit(X_train, y_train)

                # Get predictions
                y_train_pred = model.predict(X_train)
                y_test_pred = model.predict(X_test)

                # Calculate metrics
                fold_train_mae.append(mean_absolute_error(y_train, y_train_pred))
                fold_train_rmse.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
                fold_train_r2.append(r2_score(y_train, y_train_pred))

                fold_test_mae.append(mean_absolute_error(y_test, y_test_pred))
                fold_test_rmse.append(np.sqrt(mean_squared_error(y_test, y_test_pred)))
                fold_test_r2.append(r2_score(y_test, y_test_pred))
        except Exception as e:
            # Best effort: a failing combination is reported and skipped
            print(f"Error with parameters {params}: {str(e)}")
            continue

        # Calculate average metrics across folds
        avg_test_rmse = np.mean(fold_test_rmse)

        # Update best parameters if current model is better
        if avg_test_rmse < best_avg_test_rmse:
            best_avg_test_rmse = avg_test_rmse
            best_params = params
            best_metrics = {
                'train_mae': np.mean(fold_train_mae),
                'train_rmse': np.mean(fold_train_rmse),
                'train_r2': np.mean(fold_train_r2),
                'test_mae': np.mean(fold_test_mae),
                'test_rmse': avg_test_rmse,
                'test_r2': np.mean(fold_test_r2)
            }

    # Every combination failed (or produced a NaN score): nothing to report
    if best_params is None:
        print(f"\nNo parameter combination could be evaluated for {name}; skipping.")
        continue

    print(f"\nBest parameters for {name}:")
    print(best_params)

    # Store results.
    # NOTE: the CV test metrics reported here are the same scores that were used
    # to pick the best parameters, so they are optimistically biased.
    results['Dataset'].append(name)
    results['Best_max_depth'].append(best_params['max_depth'])
    results['Best_min_error'].append(best_params['min_error'])
    results['Best_min_samples_split'].append(best_params['min_samples_split'])
    results['CV_Train_MAE'].append(round(best_metrics['train_mae'], 3))
    results['CV_Train_RMSE'].append(round(best_metrics['train_rmse'], 3))
    results['CV_Train_R2'].append(round(best_metrics['train_r2'], 3))
    results['CV_Test_MAE'].append(round(best_metrics['test_mae'], 3))
    results['CV_Test_RMSE'].append(round(best_metrics['test_rmse'], 3))
    results['CV_Test_R2'].append(round(best_metrics['test_r2'], 3))

    # Print current results
    print(f"\nResults for {name} with best parameters:")
    print("CV Training Set Metrics (averaged over folds):")
    print(f"MAE: {best_metrics['train_mae']:.3f}")
    print(f"RMSE: {best_metrics['train_rmse']:.3f}")
    print(f"R²: {best_metrics['train_r2']:.3f}")
    print("CV Test Set Metrics (averaged over folds):")
    print(f"MAE: {best_metrics['test_mae']:.3f}")
    print(f"RMSE: {best_metrics['test_rmse']:.3f}")
    print(f"R²: {best_metrics['test_r2']:.3f}")

# Create DataFrame with results
results_df = pd.DataFrame(results)

# Save to Excel
results_df.to_excel('lolimot_exp_result_grid_search_5cv.xlsx', index=False)

# Display final results table
print("\nFinal Results:")
print(results_df)


Processing dataset: BL_V04
Testing parameters: {'max_depth': 3, 'min_error': 0.001, 'min_samples_split': 5}
Testing parameters: {'max_depth': 3, 'min_error': 0.001, 'min_samples_split': 10}
Testing parameters: {'max_depth': 3, 'min_error': 0.001, 'min_samples_split': 15}
Testing parameters: {'max_depth': 3, 'min_error': 0.0001, 'min_samples_split': 5}
Testing parameters: {'max_depth': 3, 'min_error': 0.0001, 'min_samples_split': 10}
Testing parameters: {'max_depth': 3, 'min_error': 0.0001, 'min_samples_split': 15}
Testing parameters: {'max_depth': 5, 'min_error': 0.001, 'min_samples_split': 5}
Testing parameters: {'max_depth': 5, 'min_error': 0.001, 'min_samples_split': 10}
Testing parameters: {'max_depth': 5, 'min_error': 0.001, 'min_samples_split': 15}
Testing parameters: {'max_depth': 5, 'min_error': 0.0001, 'min_samples_split': 5}
Testing parameters: {'max_depth': 5, 'min_error': 0.0001, 'min_samples_split': 10}
Testing parameters: {'max_depth': 5, 'min_error': 0.0001, 'min_sample

KeyboardInterrupt: 

In [6]:
def extract_and_drop_target_from_wide_format(df, target_column_name, visit):
    """
    Separates one (target_column_name, visit) column from a wide-format frame.

    Returns:
    tuple
        (frame without that column, the column itself). The input frame is
        not modified.
    """
    target_key = (target_column_name, visit)

    # Pull the target out first, then build the frame without it
    extracted_target = df[target_key]
    remaining_df = df.drop(columns=[target_key])

    return remaining_df, extracted_target

In [9]:
def drop_updrs_2_3_4_from_wide_format(df):
    """
    Returns a copy of `df` without the UPDRS part 2 and part 3 item columns.

    NOTE: despite the function name, no UPDRS part 4 columns are dropped; the
    part 4 item list ("NP4WDYSK", "NP4DYSKI", "NP4OFF", "NP4FLCTI", "NP4FLCTX",
    "NP4DYSTN") is disabled in this function.
    """
    part2_items = (
        "NP2SPCH", "NP2SALV", "NP2SWAL", "NP2EAT", "NP2DRES", "NP2HYGN", "NP2HWRT",
        "NP2HOBB", "NP2TURN", "NP2TRMR", "NP2RISE", "NP2WALK", "NP2FREZ",
    )

    # NOTE(review): the original carried a commented-out list next to the part 3
    # items ('CMEDTM', 'EXAMTM', 'PN3RIGRL', 'DYSKPRES', 'DYSKIRAT',
    # 'ANNUAL_TIME_BTW_DOSE_NUPDRS', 'ON_OFF_DOSE', 'PD_MED_USE'); none of these
    # are dropped here - purpose to be confirmed.
    part3_items = (
        "NP3SPCH", "NP3FACXP", "NP3RIGN", "NP3RIGRU", "NP3RIGLU", "NP3RIGLL",
        "NP3FTAPR", "NP3FTAPL", "NP3HMOVR", "NP3HMOVL", "NP3PRSPR", "NP3PRSPL",
        "NP3TTAPR", "NP3TTAPL", "NP3LGAGR", "NP3LGAGL", "NP3RISNG", "NP3GAIT",
        "NP3FRZGT", "NP3PSTBL", "NP3POSTR", "NP3BRADY", "NP3PTRMR", "NP3PTRML",
        "NP3KTRMR", "NP3KTRML", "NP3RTARU", "NP3RTALU", "NP3RTARL", "NP3RTALL",
        "NP3RTALJ", "NP3RTCON",
        "NHY",
    )

    return df.drop(columns=[*part2_items, *part3_items])

def add_updrs1_column_in_wide_format(df):
    """
    Replaces the UPDRS part 1 item columns by one summed 'UPDRS1' column per visit.

    For every label in the second column level, the part 1 item columns of
    that visit are summed row-wise into ('UPDRS1', visit). The item columns
    are then removed and the new totals are appended on the right.
    """
    part1_items = [
        'NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT', 'NP1DDS',
        'NP1SLPN', 'NP1SLPD', 'NP1PAIN', 'NP1URIN', 'NP1CNST', 'NP1LTHD', 'NP1FATG',
    ]
    visit_labels = df.columns.levels[1]

    # Empty frame with one ('UPDRS1', visit) column per visit, filled in below
    totals_df = pd.DataFrame(
        index=df.index,
        columns=pd.MultiIndex.from_product([['UPDRS1'], visit_labels], names=df.columns.names),
    )
    for visit in visit_labels:
        visit_items = df.loc[:, (part1_items, visit)]
        totals_df[('UPDRS1', visit)] = visit_items.sum(axis=1)

    without_items = df.drop(columns=part1_items)
    return pd.concat([without_items, totals_df], axis=1)

def modify_dataframe(df, columns_to_drop, new_column_name):
    """
    Drops whole UPDRS parts and optionally collapses one part into a summed score.

    Parameters:
    df : pandas.DataFrame
        Wide-format frame whose columns are a MultiIndex; the first level holds
        the item names and the second level the visit labels.
    columns_to_drop : list of str
        UPDRS part names ('UPDRS1', 'UPDRS2', 'UPDRS3') whose item columns are
        removed.
    new_column_name : str
        If this is one of the known UPDRS part names, the item columns of that
        part are summed row-wise per visit into (new_column_name, visit) columns,
        the item columns are removed and the totals are appended on the right.
        Any other value leaves the frame unchanged apart from the drops.

    Returns:
    pandas.DataFrame
        The modified frame; the input frame is not changed.

    Raises:
    KeyError
        If columns_to_drop contains an unknown part name, or item columns that
        should be dropped or summed are missing from the frame.
    """
    updrs_columns = {
        'UPDRS1': ['NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT', 'NP1DDS', 'NP1SLPN', 'NP1SLPD', 'NP1PAIN', 'NP1URIN', 'NP1CNST', 'NP1LTHD', 'NP1FATG'],
        'UPDRS2': ['NP2SPCH', 'NP2SALV', 'NP2SWAL', 'NP2EAT', 'NP2DRES', 'NP2HYGN', 'NP2HWRT', 'NP2HOBB', 'NP2TURN', 'NP2TRMR', 'NP2RISE', 'NP2WALK', 'NP2FREZ'],
        'UPDRS3': ['NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML', 'NP3KTRMR', 'NP3KTRML', 'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL', 'NP3RTALJ', 'NP3RTCON', 'NHY']
        # UPDRS4 items (not handled): 'NP4WDYSK', 'NP4DYSKI', 'NP4OFF', 'NP4FLCTI', 'NP4FLCTX', 'NP4DYSTN'
    }

    # Item columns of every part that should disappear entirely
    drop_column_names = []
    for drop_target in columns_to_drop:
        drop_column_names.extend(updrs_columns[drop_target])

    df = df.drop(columns=drop_column_names)

    if new_column_name in updrs_columns:
        new_column_set = updrs_columns[new_column_name]

        # MultiIndex.levels keeps labels that are no longer used after column
        # subsetting; restrict to visits that really occur so that selecting a
        # stale visit cannot raise a KeyError. Level order is preserved.
        present_visits = set(df.columns.get_level_values(1))
        visits = [visit for visit in df.columns.levels[1] if visit in present_visits]

        new_column_multiindex = pd.MultiIndex.from_product([[new_column_name], visits], names=df.columns.names)
        new_column_df = pd.DataFrame(index=df.index, columns=new_column_multiindex)

        # Row-wise sum of the part's items, separately for every visit
        for visit in visits:
            new_column_df[(new_column_name, visit)] = df.loc[:, (new_column_set, visit)].sum(axis=1)

        df = df.drop(columns=new_column_set)
        df = pd.concat([df, new_column_df], axis=1)

    return df