In [44]:
#Mixed Effect Model Implementation
import itertools

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.model_selection import train_test_split
from statsmodels.regression.mixed_linear_model import MixedLM

def prepare_temporal_data(data, visit_col='visit', target_col='UPDRS3_total'):
    """Split long-format visit data into predictor rows and a per-row target.

    For each patient, the rows of that patient's last visit are removed from
    the predictors; the ``target_col`` value recorded at that last visit
    becomes the target of every remaining (earlier) row of the same patient.

    No model is fitted here: the caller in this file unpacks the result as
    ``X, y`` and performs the train/test split and fitting itself.

    Parameters
    ----------
    data : pd.DataFrame
        One row per patient visit. Its index must have a level named
        ``'PATNO'``. The frame itself is not modified.
    visit_col : str
        Name of the column holding the visit code.
    target_col : str
        Name of the column holding the score to predict.

    Returns
    -------
    X : pd.DataFrame
        All rows except each patient's last visit, without ``target_col`` and
        with an added ``'visit_order'`` column (0 for 'BL' up to 5 for 'V12';
        NaN for any other code).
    y : pd.Series
        Same index as ``X``; each entry is the patient's last-visit
        ``target_col`` value. Patients with a single visit contribute no rows.
    """
    visit_order = {
        'BL': 0, 'V04': 1, 'V06': 2,
        'V08': 3, 'V10': 4, 'V12': 5
    }

    data_copy = data.copy()
    data_copy['visit_order'] = data_copy[visit_col].map(visit_order)

    # The last visit is the per-patient maximum of the visit code itself.
    # For the codes listed in visit_order, string order ('BL' < 'V04' < ...)
    # coincides with the numeric order above.
    last_visit = data_copy.groupby(level='PATNO')[visit_col].transform('max')
    last_visit_mask = data_copy[visit_col] == last_visit

    # X: every visit except the last one, with the target column removed.
    X = data_copy[~last_visit_mask].drop(columns=[target_col])

    # One target value per patient, keyed by PATNO. If a patient has several
    # rows for the last visit, the first one is used.
    last_rows = data_copy[last_visit_mask]
    y_last = pd.Series(
        last_rows[target_col].to_numpy(),
        index=last_rows.index.get_level_values('PATNO'),
    )
    y_last = y_last[~y_last.index.duplicated(keep='first')]

    # Broadcast each patient's target onto all of that patient's rows in X.
    patient_ids = X.index.get_level_values('PATNO')
    y = pd.Series(
        y_last.reindex(patient_ids).to_numpy(),
        index=X.index,
        name=target_col,
    )

    return X, y

def run_longitudinal_experiments(data_dict, visit_col='visit', target_col='UPDRS3_total'):
    """Run the longitudinal mixed-model experiment on every dataset.

    For each dataset the data is turned into predictors/target with
    ``prepare_temporal_data``, patients are split 80/20 into train and test
    groups (so no patient appears on both sides), and a model is fitted on
    the training part with ``grid_search_mixed_lm``.

    Parameters
    ----------
    data_dict : dict
        Maps a dataset name to its long-format DataFrame.
    visit_col : str
        Name of the visit-code column, forwarded to ``prepare_temporal_data``.
    target_col : str
        Name of the target column, forwarded to ``prepare_temporal_data``.

    Returns
    -------
    dict
        Maps each dataset name to a dict with keys ``'model'``, ``'params'``,
        ``'grid_results'``, ``'X_test'`` and ``'y_test'``. Evaluation on the
        held-out split is not performed here.
    """
    results = {}

    for dataset_name, data in data_dict.items():
        print(f"\nProcessing dataset: {dataset_name}")
        print("-" * 50)
        print(f"Initial data shape: {data.shape}")

        # Prepare temporal data
        X, y = prepare_temporal_data(data, visit_col, target_col)
        print('After prepare_temporal_data:')
        print(f'X shape: {X.shape}, y shape: {y.shape}')
        print(f'X index unique PATNOs: {len(X.index.get_level_values("PATNO").unique())}')

        # Split patients (not rows) into train/test sets
        unique_patients = X.index.get_level_values('PATNO').unique()
        print(f"\nBefore split - unique patients: {len(unique_patients)}")

        train_patients, test_patients = train_test_split(unique_patients,
                                               test_size=0.2,
                                               random_state=42)
        print(f"After split - train patients: {len(train_patients)}, test patients: {len(test_patients)}")

        # Select rows by patient membership for both X and y
        X_patients = X.index.get_level_values('PATNO')
        y_patients = y.index.get_level_values('PATNO')
        X_train = X[X_patients.isin(train_patients)]
        X_test = X[X_patients.isin(test_patients)]
        y_train = y[y_patients.isin(train_patients)]
        y_test = y[y_patients.isin(test_patients)]

        print("\nAfter patient-based splitting:")
        print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

        # Fit on the training data only
        best_model, best_params, grid_results = grid_search_mixed_lm(
            X_train, y_train
        )

        # Keep the fitted model and the held-out split so they are not lost
        # when the loop moves on to the next dataset.
        results[dataset_name] = {
            'model': best_model,
            'params': best_params,
            'grid_results': grid_results,
            'X_test': X_test,
            'y_test': y_test,
        }

    return results

def grid_search_mixed_lm(X, y, n_folds=5):
    """Fit a single mixed linear model grouped by patient.

    Despite the name, no grid search or cross-validation is performed: one
    ``MixedLM`` is fitted on all rows of ``X`` with ``PATNO`` as the grouping
    variable.

    Parameters
    ----------
    X : pd.DataFrame
        Predictors; the index must have a level named ``'PATNO'``. The
        ``'visit'`` and ``'visit_order'`` columns are dropped when present.
    y : pd.Series
        Target values, one per row of ``X``.
    n_folds : int
        Currently unused; kept so existing calls remain valid.

    Returns
    -------
    tuple
        ``(fitted_model, {'features': [...exog column names...]}, None)``.

    Raises
    ------
    ValueError
        If the model cannot be fitted; the underlying exception is attached
        as ``__cause__``.
    """
    print("X index:", X.index)
    print("y index:", y.index)
    print("\nX shape:", X.shape)
    print("y shape:", y.shape)

    # Drop the non-numeric helper columns; tolerate frames that lack them so
    # the failure mode is not a bare KeyError before fitting even starts.
    X_subset = X.drop(['visit', 'visit_order'], axis=1, errors='ignore')

    # Add constant
    X_subset = sm.add_constant(X_subset)

    try:
        # Fit a single model
        test_model = MixedLM(
            endog=y,
            exog=X_subset,
            groups=X.index.get_level_values('PATNO')
        ).fit()

        return test_model, {'features': X_subset.columns.tolist()}, None

    except Exception as e:
        print(f"\nError fitting model: {str(e)}")
        # Chain the original exception so its traceback is not lost.
        raise ValueError("Could not fit model with basic configuration") from e
        
def calculate_metrics(y_true, y_pred):
    """Return MAE, RMSE and R² for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'mae'``, ``'rmse'`` and ``'r2'``.
    """
    # mean_absolute_error and r2_score must be imported from sklearn.metrics
    # next to mean_squared_error; without them this function raises NameError.
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred)
    }

def calculate_temporal_metrics(model, test_data):
    """Collect the temporal quality measures of a fitted model.

    ``test_data`` is accepted but not used. The three helpers called here
    (``calculate_icc``, ``assess_temporal_consistency``,
    ``calculate_reliability``) are not defined in this section and must be
    available in the notebook namespace.

    Returns a dict with keys ``'icc'``, ``'consistent_effects'`` and
    ``'longitudinal_reliability'``.
    """
    temporal = {}
    temporal['icc'] = calculate_icc(model)  # Intraclass Correlation Coefficient
    temporal['consistent_effects'] = assess_temporal_consistency(model)
    temporal['longitudinal_reliability'] = calculate_reliability(model)
    return temporal

def calculate_diagnostic_metrics(model):
    """Read the fit diagnostics off a fitted model object.

    Returns a dict mapping a report name to the value of the corresponding
    attribute of ``model``: ``aic``, ``bic``, ``llf``, ``cond_no`` and
    ``converged``.

    NOTE(review): confirm the results object passed in actually exposes
    ``cond_no``; a missing attribute raises AttributeError.
    """
    # (report key, attribute on the results object), read in this order
    attribute_for = (
        ('aic', 'aic'),
        ('bic', 'bic'),
        ('log_likelihood', 'llf'),
        ('condition_number', 'cond_no'),
        ('convergence_info', 'converged'),
    )
    return {label: getattr(model, attr) for label, attr in attribute_for}

In [3]:
def create_combined_visits_df_long(main_dataframe, visits):
    """Build cumulative long-format frames, one per visit prefix.

    ``main_dataframe`` has two-level columns whose second level is matched
    against the visit label. For the baseline and for each entry of
    ``visits`` the matching columns are pulled out, the second column level is
    dropped and a ``'visit'`` column holding the label is added. The frames
    are stacked row-wise cumulatively.

    Returns a dict with key ``'BL'`` (baseline only) followed by
    ``'BL_<v1>'``, ``'BL_<v1>_<v2>'``, ... each containing the baseline rows
    plus the rows of all visits up to and including that one.
    """

    def _long_slice(label):
        # A column belongs to a visit when the label occurs inside the
        # second element of its column key (substring match).
        matching = [col for col in main_dataframe.columns if label in col[1]]
        frame = main_dataframe.loc[:, matching].copy()
        frame.columns = frame.columns.droplevel(1)
        frame['visit'] = label
        return frame

    baseline = _long_slice('BL')
    combined_dfs = {'BL': baseline}

    stacked = baseline
    for position, label in enumerate(visits, start=1):
        stacked = pd.concat([stacked, _long_slice(label)], axis=0)
        combined_dfs['BL_' + '_'.join(visits[:position])] = stacked

    return combined_dfs

In [20]:
def process_updrs_columns(dataframe):
    """Replace the individual UPDRS item columns by a single ``UPDRS3_total``.

    ``UPDRS3_total`` is the row-wise sum of every column of the ``'UPDRS3'``
    list below that exists in ``dataframe``. All listed UPDRS1/2/3 item
    columns that exist are then removed.

    The input frame is left untouched; a new frame is returned. (Previously
    the total was written into the caller's frame as a side effect.)

    NOTE(review): ``'NHY'`` is part of the ``'UPDRS3'`` list and is therefore
    included in the sum together with the ``NP3*`` items — confirm this is
    intended.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame that may contain any subset of the listed item columns.

    Returns
    -------
    pd.DataFrame
        Copy of ``dataframe`` without the item columns and with
        ``UPDRS3_total`` added.
    """
    updrs_columns = {
        'UPDRS1': ['NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT', 'NP1DDS', 'NP1SLPN', 'NP1SLPD', 'NP1PAIN', 'NP1URIN', 'NP1CNST', 'NP1LTHD', 'NP1FATG'],
        'UPDRS2': ['NP2SPCH', 'NP2SALV', 'NP2SWAL', 'NP2EAT', 'NP2DRES', 'NP2HYGN', 'NP2HWRT', 'NP2HOBB', 'NP2TURN', 'NP2TRMR', 'NP2RISE', 'NP2WALK', 'NP2FREZ'],
        'UPDRS3': ['NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGLL', 'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL', 'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT', 'NP3FRZGT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML', 'NP3KTRMR', 'NP3KTRML', 'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL', 'NP3RTALJ', 'NP3RTCON', 'NHY']
    }

    # Only sum the UPDRS3 items that are actually present in this frame.
    existing_updrs3_columns = [col for col in updrs_columns['UPDRS3'] if col in dataframe.columns]
    updrs3_total = dataframe[existing_updrs3_columns].sum(axis=1)

    # Every listed item column that exists is removed from the result.
    columns_to_drop = [
        col
        for item_columns in updrs_columns.values()
        for col in item_columns
        if col in dataframe.columns
    ]

    # drop() and assign() both return new frames, so the caller's frame is
    # never modified.
    return dataframe.drop(columns=columns_to_drop).assign(UPDRS3_total=updrs3_total)

In [23]:
def apply_updrs_processing_to_combined_dfs(combined_dfs):
    """Run ``process_updrs_columns`` over every frame of a dict.

    Returns a new dict with the same keys, in the same order, whose values
    are the processed frames.
    """
    return {
        name: process_updrs_columns(frame)
        for name, frame in combined_dfs.items()
    }

In [2]:
# Maps the key of a cumulative visit frame ('BL', 'BL_V04', ...) to the visit
# whose score is predicted from it. Each tuple below lists the visits that
# make up the key followed by the visit to predict.
target_visits = {
    '_'.join(history): upcoming
    for *history, upcoming in (
        ('BL', 'V04'),
        ('BL', 'V04', 'V06'),
        ('BL', 'V04', 'V06', 'V08'),
        ('BL', 'V04', 'V06', 'V08', 'V10'),
        ('BL', 'V04', 'V06', 'V08', 'V10', 'V12'),
    )
}

In [29]:
def prepare_rnn_data_with_all_columns(processed_combined_dfs, patient_col='PATNO', updrs_col='UPDRS3_total', visit_col='visit', target_map=None):
    """Build per-patient input sequences and next-visit targets.

    For every frame in ``processed_combined_dfs`` the target visit is looked
    up by the frame's key. The target visit's ``updrs_col`` value is the
    label of a patient; the patient's remaining rows (sorted by visit code),
    without the patient, visit and score columns, form the input sequence.

    Patients that have no row for the target visit are skipped, as are
    frames whose key has no target visit, instead of raising IndexError.

    Parameters
    ----------
    processed_combined_dfs : dict
        Maps a key such as ``'BL_V04'`` to a long-format DataFrame.
    patient_col : str
        Patient identifier; may be a column or a named index level.
    updrs_col : str
        Column holding the score to predict.
    visit_col : str
        Column holding the visit code.
    target_map : dict, optional
        Maps frame key to target visit code. Defaults to the module-level
        ``target_visits``.

    Returns
    -------
    X : list of np.ndarray
        One 2-D array (visits x features) per patient and frame.
    y : list
        The matching target values, in the same order as ``X``.
    """
    if target_map is None:
        target_map = target_visits

    X = []  # List to hold input sequences
    y = []  # List to hold target values (next visit's score)

    for key, df in processed_combined_dfs.items():
        target_visit = target_map.get(key)
        if target_visit is None:
            # No target visit is defined for this frame.
            continue

        # Allow the patient identifier to live in the index instead of a column.
        if patient_col not in df.columns and patient_col in df.index.names:
            df = df.reset_index(level=patient_col)

        df = df.sort_values(by=[patient_col, visit_col])

        # One target value per patient, looked up once instead of re-filtering
        # the frame for every patient. The first row wins on duplicates.
        is_target = df[visit_col] == target_visit
        target_rows = df.loc[is_target, [patient_col, updrs_col]]
        target_by_patient = (
            target_rows
            .drop_duplicates(subset=patient_col, keep='first')
            .set_index(patient_col)[updrs_col]
        )

        # Everything that is not the target visit is input history.
        history = df[~is_target]

        # Drop columns that are not needed as input (patient, visit and score)
        input_columns = history.drop(columns=[patient_col, updrs_col, visit_col], errors='ignore').columns

        for patient_id, group in history.groupby(patient_col):
            if patient_id not in target_by_patient.index:
                # This patient was not assessed at the target visit.
                continue
            X.append(group[input_columns].values)
            y.append(target_by_patient.at[patient_id])

    return X, y

In [18]:
def extract_target_columns(dataframe):
    """Separate the UPDRS part scores from the remaining features.

    Sums the item columns of parts 1, 2 and 3 row-wise into ``'UPDRS1'``,
    ``'UPDRS2'`` and ``'UPDRS3'`` and removes those item columns from the
    feature frame. Every listed column must exist in ``dataframe``; a missing
    one raises KeyError. Part 4 items are not aggregated.

    Returns ``(target_dataframe, dataframe_without_item_columns)``; the
    targets share the index of ``dataframe``.
    """
    item_groups = {
        'UPDRS1': [
            'NP1COG', 'NP1HALL', 'NP1DPRS', 'NP1ANXS', 'NP1APAT', 'NP1DDS',
            'NP1SLPN', 'NP1SLPD', 'NP1PAIN', 'NP1URIN', 'NP1CNST', 'NP1LTHD',
            'NP1FATG',
        ],
        'UPDRS2': [
            'NP2SPCH', 'NP2SALV', 'NP2SWAL', 'NP2EAT', 'NP2DRES', 'NP2HYGN',
            'NP2HWRT', 'NP2HOBB', 'NP2TURN', 'NP2TRMR', 'NP2RISE', 'NP2WALK',
            'NP2FREZ',
        ],
        # 'NHY' is summed together with the NP3* items.
        'UPDRS3': [
            'NP3SPCH', 'NP3FACXP', 'NP3RIGN', 'NP3RIGRU', 'NP3RIGLU', 'NP3RIGLL',
            'NP3FTAPR', 'NP3FTAPL', 'NP3HMOVR', 'NP3HMOVL', 'NP3PRSPR', 'NP3PRSPL',
            'NP3TTAPR', 'NP3TTAPL', 'NP3LGAGR', 'NP3LGAGL', 'NP3RISNG', 'NP3GAIT',
            'NP3FRZGT', 'NP3PSTBL', 'NP3POSTR', 'NP3BRADY', 'NP3PTRMR', 'NP3PTRML',
            'NP3KTRMR', 'NP3KTRML', 'NP3RTARU', 'NP3RTALU', 'NP3RTARL', 'NP3RTALL',
            'NP3RTALJ', 'NP3RTCON', 'NHY',
        ],
    }

    target_dataframe = pd.DataFrame(index=dataframe.index)
    for score_name, item_columns in item_groups.items():
        target_dataframe[score_name] = dataframe[item_columns].sum(axis=1)

    consumed = [col for item_columns in item_groups.values() for col in item_columns]
    remaining = dataframe.drop(columns=consumed)

    return target_dataframe, remaining

In [7]:
import pandas as pd

def filter_bl_and_target(df, target_visit):
    """
    Keep every baseline (BL) column plus the UPDRS score of one later visit.

    Parameters:
    df (pd.DataFrame): Frame with two-level columns (feature, visit).
    target_visit (str): Visit whose UPDRS score is kept (e.g., 'V04').

    Returns:
    pd.DataFrame: The BL columns (visit level removed) followed by one
    column named 'UPDRS_<target_visit>'.

    Raises:
    ValueError: If df has no ('UPDRS', target_visit) column.
    """
    # Cross-section of all columns whose visit level is 'BL'
    baseline = df.xs('BL', level=1, axis=1)

    target_key = ('UPDRS', target_visit)
    if target_key not in df.columns:
        raise ValueError(f"UPDRS score for visit {target_visit} not found in the DataFrame.")

    target_scores = df[target_key].rename(f'UPDRS_{target_visit}')

    # Side-by-side: baseline features first, target score last
    return pd.concat([baseline, target_scores], axis=1)

In [38]:
# Prepare the dataset for RNN
def prepare_data(data, target_visit):
    """Flatten each patient's visits up to ``target_visit`` into one row.

    Only patients that have every visit from 'BL' through ``target_visit``
    (within the sequence BL, V04, V06) are used. For visits before the target
    the row holds Feature1, Feature2 and Target; for the target visit only
    Feature1 and Feature2 are included, and its Target becomes the label.

    Returns ``(X, y)`` as NumPy arrays. Raises ValueError when
    ``target_visit`` is not one of 'BL', 'V04', 'V06'.
    """
    visit_sequence = ['BL', 'V04', 'V06']
    required_visits = visit_sequence[:visit_sequence.index(target_visit) + 1]
    feature_names = ['Feature1', 'Feature2']

    rows = []
    labels = []
    for patient_id, visits_of_patient in data.groupby('PATNO'):
        by_visit = visits_of_patient.set_index('visit')

        # Skip patients that miss any of the required visits.
        if any(visit not in by_visit.index for visit in required_visits):
            continue

        flat = []
        for visit in required_visits:
            values = by_visit.loc[visit, feature_names].values
            if visit == target_visit:
                # The target visit contributes features only, never its score.
                flat.extend(values)
            else:
                past_score = by_visit.loc[visit, 'Target']
                flat.extend(np.concatenate((values, [past_score])))

        rows.append(flat)
        labels.append(by_visit.loc[target_visit, 'Target'])

    return np.array(rows), np.array(labels)

