In [1]:
# Long -> wide reshape: one row per `patient_id`; every remaining column is
# spread across the distinct `visit` values, giving two column levels.
# NOTE: `df` must already exist in the kernel when this cell runs; the output
# recorded for this cell is a NameError because it did not.
df_wide = df.pivot(index='patient_id', columns='visit')

# Relabel the second column level as 'visit_<n>' and name both levels.
# The columns stay a two-level MultiIndex (they are relabelled, not flattened).
df_wide.columns = pd.MultiIndex.from_tuples(
    [(feature, f'visit_{visit}') for feature, visit in df_wide.columns],
    names=['Feature', 'Visit'],
)

# Plain-text dump of the wide-format frame
print(df_wide)

NameError: name 'df' is not defined

In [3]:
# Preview the first rows of the wide-format frame (head() with its default row count).
df_wide.head()

Feature,feature_1,feature_1,feature_1,feature_1,feature_2,feature_2,feature_2,feature_2,feature_3,feature_3,feature_3,feature_3,mds_updrs,mds_updrs,mds_updrs,mds_updrs
Visit,visit_1,visit_2,visit_3,visit_4,visit_1,visit_2,visit_3,visit_4,visit_1,visit_2,visit_3,visit_4,visit_1,visit_2,visit_3,visit_4
patient_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1,0.37454,0.598658,0.058084,0.708073,0.950714,0.156019,0.866176,0.020584,0.731994,0.155995,0.601115,0.96991,30.461377,9.767211,68.423303,44.015249
2,0.832443,0.183405,0.431945,0.139494,0.212339,0.304242,0.291229,0.292145,0.181825,0.524756,0.611853,0.366362,12.203823,49.517691,3.438852,90.93204
3,0.45607,0.514234,0.607545,0.948886,0.785176,0.592415,0.170524,0.965632,0.199674,0.04645,0.065052,0.808397,25.877998,66.252228,31.171108,52.006802


In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd

class LoLiMoTNode:
    """
    A single node of the LoLiMoT tree.

    Every field starts as None and is filled in while the tree is built:
    a leaf carries `linear_model`, an internal node carries the split
    description (`split_feature`, `split_value`) and its two children.
    """

    def __init__(self):
        # Split rule: column index and threshold used to route samples
        self.split_feature = self.split_value = None
        # Children: samples with feature <= threshold go left, the others go right
        self.left = self.right = None
        # Local linear model for this region (set only on leaf nodes)
        self.linear_model = None

class LoLiMoT:
    """
    Binary regression tree with an ordinary least-squares linear model in each leaf.

    The tree is grown greedily: at every node all axis-aligned thresholds
    (midpoints between consecutive distinct feature values) are tried, and the
    one whose two child linear models give the lowest sample-weighted MSE is
    kept, provided it beats the node's own linear fit.
    """

    def __init__(self, max_depth=10, min_error=1e-3, min_samples_split=10):
        """
        Initializes the LoLiMoT model.

        Parameters:
        - max_depth (int): Maximum depth of the tree (the root is at depth 0).
        - min_error (float): MSE threshold; a node whose own linear fit already
          has a training MSE <= this value is kept as a leaf.
        - min_samples_split (int): Minimum number of samples a node needs to be
          considered for splitting. The same value is also the minimum number of
          samples each child of a split must receive.
        """
        self.max_depth = max_depth
        self.min_error = min_error
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """
        Fits the LoLiMoT model to the data.

        Parameters:
        - X (array-like): Feature matrix, 2-D (n_samples, n_features).
        - y (array-like): Target vector with one entry per row of X.

        Raises:
        - ValueError: If X is not 2-D or X and y have different lengths.
        """
        # Column slicing and boolean masking below need real arrays
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )

        # Start recursion with the whole dataset
        self.root = self._fit_recursive(X, y, depth=0)

    def _fit_recursive(self, X, y, depth):
        """
        Builds the subtree for the samples (X, y) and returns its root node.

        Parameters:
        - X (np.ndarray): Feature matrix of the samples that reached this node.
        - y (np.ndarray): Matching target values.
        - depth (int): Depth of the node being built.

        Returns:
        - node (LoLiMoTNode): A leaf (with `linear_model` set) or an internal
          node (with split fields and both children set).
        """
        node = LoLiMoTNode()

        # Fit a linear model on everything that reached this node
        linear_model = LinearRegression()
        linear_model.fit(X, y)
        error = mean_squared_error(y, linear_model.predict(X))

        # Keep the training error on the node so print_tree can report it
        # without needing access to the training data.
        node.train_mse = error

        # Stopping criteria:
        # - Max depth reached
        # - Error is below the threshold
        # - Not enough samples to split
        if (
            depth >= self.max_depth
            or error <= self.min_error
            or X.shape[0] < self.min_samples_split
        ):
            node.linear_model = linear_model  # Assign model only to leaf nodes
            return node

        # Find the best feature and value to split the data
        best_split_feature, best_split_value, best_error = self._find_best_split(X, y, linear_model)

        # No admissible split, or the split does not beat the single model: leaf
        if best_split_feature is None or best_error >= error:
            node.linear_model = linear_model  # Assign model only to leaf nodes
            return node

        # Otherwise, split the data and recurse. _find_best_split has already
        # checked that both sides of this split hold enough samples.
        node.split_feature = best_split_feature
        node.split_value = best_split_value

        left_indices = X[:, best_split_feature] <= best_split_value
        right_indices = X[:, best_split_feature] > best_split_value

        node.left = self._fit_recursive(X[left_indices], y[left_indices], depth + 1)
        node.right = self._fit_recursive(X[right_indices], y[right_indices], depth + 1)

        return node

    def _find_best_split(self, X, y, current_model):
        """
        Finds the best feature and value to split the data to minimize MSE.

        Parameters:
        - X (np.ndarray): Feature matrix.
        - y (np.ndarray): Target vector.
        - current_model (LinearRegression): Linear model of the current node.
          Not used by the search; kept so the signature stays unchanged.

        Returns:
        - best_feature (int or None): Index of the best feature to split on,
          or None when no candidate leaves enough samples on both sides.
        - best_value (float or None): Value of the feature to split on.
        - best_error (float): The sample-weighted MSE after the split
          (inf when no candidate was admissible).
        """
        best_feature = None
        best_value = None
        best_error = float('inf')

        n_samples, n_features = X.shape
        # A child needs at least one sample for a model to be fitted on it
        min_child_samples = max(1, self.min_samples_split)

        for feature_idx in range(n_features):
            column = X[:, feature_idx]

            # Candidate thresholds: midpoints between consecutive distinct values
            distinct_values = np.unique(column)
            candidates = (distinct_values[:-1] + distinct_values[1:]) / 2

            for split_value in candidates:
                left_indices = column <= split_value
                right_indices = column > split_value
                n_left = left_indices.sum()
                n_right = right_indices.sum()

                # Ensure both splits have enough samples
                if n_left < min_child_samples or n_right < min_child_samples:
                    continue

                # Fit linear models on both splits
                X_left, y_left = X[left_indices], y[left_indices]
                X_right, y_right = X[right_indices], y[right_indices]
                left_model = LinearRegression().fit(X_left, y_left)
                right_model = LinearRegression().fit(X_right, y_right)

                error_left = mean_squared_error(y_left, left_model.predict(X_left))
                error_right = mean_squared_error(y_right, right_model.predict(X_right))

                # Weight each side's MSE by its share of the samples
                combined_error = (error_left * n_left + error_right * n_right) / n_samples

                # Strict '<' keeps the first candidate found on ties
                if combined_error < best_error:
                    best_feature = feature_idx
                    best_value = split_value
                    best_error = combined_error

        return best_feature, best_value, best_error

    def predict(self, X):
        """
        Predicts the target values for the input samples.

        Parameters:
        - X (np.ndarray): Feature matrix, one sample per row.

        Returns:
        - predictions (np.ndarray): Predicted target values.

        Raises:
        - NotFittedError: If called before `fit`. The class derives from both
          ValueError and AttributeError, so callers that caught the previous
          AttributeError keep working.
        """
        if self.root is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This LoLiMoT instance is not fitted yet; call fit(X, y) first.")

        # Route every sample through the tree individually
        return np.array([self._predict_recursive(x, self.root) for x in X])

    def _predict_recursive(self, x, node):
        """
        Recursively traverses the tree to make a prediction for a single sample.

        Parameters:
        - x (np.ndarray): Single sample feature vector.
        - node (LoLiMoTNode): Current node in the tree.

        Returns:
        - prediction (float): Predicted target value.
        """
        # If this is a leaf node, use the linear model to predict
        if node.linear_model is not None:
            return node.linear_model.predict([x])[0]

        # Otherwise, recurse into the left or right child
        if x[node.split_feature] <= node.split_value:
            return self._predict_recursive(x, node.left)
        return self._predict_recursive(x, node.right)

    def print_tree(self, node=None, depth=0):
        """
        Optional: Prints the tree structure for debugging purposes.

        Each leaf line reports the training MSE of that leaf's linear model on
        the samples that reached it during `fit` ("n/a" when the node carries
        no recorded value).

        Parameters:
        - node (LoLiMoTNode): Current node in the tree (defaults to the root).
        - depth (int): Current depth in the tree.
        """
        if node is None:
            node = self.root
            if node is None:
                print("(tree has not been fitted)")
                return

        indent = "  " * depth
        if node.linear_model is not None:
            train_mse = getattr(node, 'train_mse', None)
            mse_text = "n/a" if train_mse is None else f"{train_mse:.4f}"
            print(f"{indent}Leaf: Depth={depth}, MSE={mse_text}")
            return

        print(f"{indent}Node: Feature {node.split_feature} <= {node.split_value:.4f}")
        if node.left is not None:
            self.print_tree(node.left, depth + 1)
        if node.right is not None:
            self.print_tree(node.right, depth + 1)

# Example Usage with the Provided Longitudinal Dataset

if __name__ == "__main__":
    # --- Synthetic longitudinal dataset (fixed seed so the run is repeatable) ---
    np.random.seed(42)

    n_patients, n_visits, n_features = 3, 4, 3

    # One row per (patient, visit): ids are repeated, visit numbers are cycled
    patient_ids = np.arange(1, n_patients + 1).repeat(n_visits)
    visit_numbers = np.tile(np.arange(1, n_visits + 1), n_patients)
    # Draw order matters for reproducibility: features first, then the target
    feature_data = np.random.rand(len(patient_ids), n_features)
    mds_updrs = 100 * np.random.rand(len(patient_ids))

    # Assemble the frame; the stacked array is float, so the two key columns
    # are cast back to integers afterwards.
    df = pd.DataFrame(
        np.column_stack([patient_ids, visit_numbers, feature_data, mds_updrs]),
        columns=['patient_id', 'visit', 'feature_1', 'feature_2', 'feature_3', 'mds_updrs'],
    ).astype({'patient_id': int, 'visit': int})

    print("Longitudinal Dataset:", df, sep="\n")

    # --- Training data: three feature columns -> mds_updrs target ---
    X = df[['feature_1', 'feature_2', 'feature_3']].to_numpy()
    y = df['mds_updrs'].to_numpy()

    # --- Fit (min_samples_split lowered to 2 because the dataset is tiny) ---
    model = LoLiMoT(max_depth=5, min_error=1e-3, min_samples_split=2)
    model.fit(X, y)

    # --- In-sample evaluation ---
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    print("\nModel Evaluation:")
    print(f"Mean Squared Error: {mse:.4f}")

    # --- Optional: dump the learned tree ---
    print("\nTree Structure:")
    model.print_tree()

In [2]:
import os
import pickle
from pathlib import Path
import pandas as pd

# Locations
# NOTE(review): absolute, machine-specific path; consider moving it to a
# configurable DATA_DIR so the notebook can run elsewhere.
dataset_name = '01_22_2024'
PPMI_CLINICAL_GEN_DATA_DIR_INSIDE = (
    Path('D:/data/raw/ppmi/behavior') / 'dadu_etal_generated_data/clinical/ppmi'
)

# The three pipeline stages store their output under the same file name,
# each in its own sub-directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
pickle_file = f"{dataset_name}.pkl"
preprocessed_data = pd.read_pickle(
    os.path.join(PPMI_CLINICAL_GEN_DATA_DIR_INSIDE, 'preprocessed', pickle_file)
)
representation_learning_data = pd.read_pickle(
    os.path.join(PPMI_CLINICAL_GEN_DATA_DIR_INSIDE, 'representation_learning', pickle_file)
)
clustering_data = pd.read_pickle(
    os.path.join(PPMI_CLINICAL_GEN_DATA_DIR_INSIDE, 'clustering', pickle_file)
)

# Merge the stage outputs into one mapping; on duplicate keys the later stage wins
input_data = {}
for stage_output in (preprocessed_data, representation_learning_data, clustering_data):
    input_data.update(stage_output)

datasets = input_data['data_names']
dset_name = 'paper_experiment_flip_outlier'

# Display the 'M_chosen' entry for the selected dataset
input_data['M_chosen'][dset_name]

Unnamed: 0_level_0,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN346RSP,CN346RSP,CN346RSP,CN346RSP,...,a_trait,a_trait,a_trait,a_trait,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL
EVENT_ID,BL,V04,V06,V08,V10,V12,BL,V04,V06,V08,...,V06,V08,V10,V12,BL,V04,V06,V08,V10,V12
PATNO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.371429,0.257143,0.371429,0.285714,0.294118,0.254902,0.333333,0.294118,0.333333,0.294118
3001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.571429,0.314286,0.371429,0.542857,0.509804,0.627451,0.509804,0.392157,0.392157,0.450980
3002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.657143,0.542857,0.685714,0.742857,0.529412,0.588235,0.745098,0.470588,0.490196,0.588235
3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085714,0.228571,0.142857,0.114286,0.607843,0.666667,0.568627,0.372549,0.509804,0.490196
3004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.257143,0.342857,0.228571,0.257143,0.411765,0.274510,0.352941,0.431373,0.294118,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200000,0.142857,0.028571,0.571429,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
60060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.314286,0.600000,0.542857,0.571429,0.882353,0.745098,0.803922,0.705882,0.823529,0.803922
60063,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.514286,0.885714,0.457143,0.800000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
65002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.771429,0.819048,0.866667,0.914286,0.647059,0.549020,0.725490,0.607843,0.490196,0.372549


In [3]:
# Wrap the selected 'M_chosen' entry in a DataFrame. This reads from
# `preprocessed_data` directly rather than from the merged `input_data`.
dataframe = pd.DataFrame(preprocessed_data['M_chosen'][dset_name])


In [4]:
# Bare last expression: the notebook renders the frame as the cell output.
dataframe

Unnamed: 0_level_0,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN2RSP,CN346RSP,CN346RSP,CN346RSP,CN346RSP,...,a_trait,a_trait,a_trait,a_trait,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL,SDMTOTAL
EVENT_ID,BL,V04,V06,V08,V10,V12,BL,V04,V06,V08,...,V06,V08,V10,V12,BL,V04,V06,V08,V10,V12
PATNO,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.371429,0.257143,0.371429,0.285714,0.294118,0.254902,0.333333,0.294118,0.333333,0.294118
3001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.571429,0.314286,0.371429,0.542857,0.509804,0.627451,0.509804,0.392157,0.392157,0.450980
3002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.657143,0.542857,0.685714,0.742857,0.529412,0.588235,0.745098,0.470588,0.490196,0.588235
3003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085714,0.228571,0.142857,0.114286,0.607843,0.666667,0.568627,0.372549,0.509804,0.490196
3004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.257143,0.342857,0.228571,0.257143,0.411765,0.274510,0.352941,0.431373,0.294118,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200000,0.142857,0.028571,0.571429,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
60060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.314286,0.600000,0.542857,0.571429,0.882353,0.745098,0.803922,0.705882,0.823529,0.803922
60063,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.514286,0.885714,0.457143,0.800000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
65002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.771429,0.819048,0.866667,0.914286,0.647059,0.549020,0.725490,0.607843,0.490196,0.372549
