# Tensor Pipeline

This notebook runs the LSR-structured tensor ridge regression model with the following parameters fixed.

1. Ridge regression coefficient
2. Max iterations
3. Separation Rank
4. Tucker Rank



## Install Libraries

In [7]:
%pip install dill

Note: you may need to restart the kernel to use updated packages.


## System Path

In [8]:
import sys
import platform

# Extend the module search path with the project directories of the current
# operating system; on any other system nothing is appended.
sys.path.extend({
    # Raw string, so the backslashes of the Windows path are taken literally
    "Windows": [
        r"D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression\Closed_Form_Solver\Code Files",
    ],
    # macOS
    "Darwin": [
        "/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Closed_Form_Solver/Code Files",
        "/Users/lakrama/Neuro Project Codes/Datasets/Data_Sets/HCP/Resting State FMRI",
    ],
}.get(platform.system(), []))

## Import Libraries

In [9]:
#Import sklearn stuff
import datetime
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import scipy
from sklearn.metrics import r2_score 
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import scipy
import time 


#Used to load data from pkl file
import dill

#Import External Files
from KFoldCV import KFoldCV
from train_test import train_test
from DataGenerationB import *

## Important Functions 

In [10]:
#conversion of the HCP data from Vector --> Matrix

def merging_modalities(vector1,vector2,outputdim): 
    '''
    Combine two vectorized triangles into one square matrix.

    The entries of vector1 fill the strict upper triangle row by row, the
    entries of vector2 fill the strict lower triangle in the mirrored order
    (element k of vector2 lands at the transpose position of element k of
    vector1), and every diagonal entry is set to 1. The result is therefore
    symmetric only when both vectors are identical.

    Parameters:
    vector1 (array-like): outputdim * (outputdim - 1) / 2 values for the upper triangle.
    vector2 (array-like): the same number of values for the lower triangle.
    outputdim (int): side length of the square output matrix.

    Returns:
    numpy.ndarray: float array of shape (outputdim, outputdim).

    Raises:
    ValueError: if the two vectors differ in length, or if their length is not
        outputdim * (outputdim - 1) / 2.
    '''

    # Accept any array-like (lists, column vectors, ...) as a flat sequence of values
    upper_values = np.asarray(vector1).ravel()
    lower_values = np.asarray(vector2).ravel()

    # n * (n - 1) is always even, so integer division is exact
    desired_length = outputdim * (outputdim - 1) // 2

    if upper_values.shape[0] != lower_values.shape[0]:
        raise ValueError("Vector length mismatch.")
    elif upper_values.shape[0] != desired_length:
        raise ValueError("Vector length is insufficient to construct the symmetric matrix.")

    # np.triu_indices enumerates the strict upper triangle row by row, i.e. in
    # the same order in which the vectors are consumed
    rows, cols = np.triu_indices(outputdim, k=1)

    # Start from the identity so the diagonal is already 1
    matrix = np.eye(outputdim)
    matrix[rows, cols] = upper_values
    # Swapping the index arrays writes to the transposed (lower-triangle) positions
    matrix[cols, rows] = lower_values

    return matrix

def samplestomat(dataset1,dataset2,outputdim,plot=True):
    
    '''
    Convert two vectorized data matrices into one 3D data tensor.

    Row p of dataset1 and row p of dataset2 are passed to merging_modalities and
    the resulting (outputdim, outputdim) matrix becomes slice p of the output.
    When plot is True and there is at least one sample, a heatmap of one
    randomly chosen output sample is displayed.

    dataset1 : nd:array - (samples*features), values handed to merging_modalities as vector1
    dataset2 : nd:array - (samples*features), values handed to merging_modalities as vector2
    outputdim : scalar - side length of each output matrix
    plot : bool - show the heatmap of a random sample (default True)

    Returns:
    nd:array - (samples*outputdim*outputdim)
    '''

    #number of samples
    n_samples = dataset1.shape[0]
    #3D matrix to hold the output
    out_dataset = np.zeros((n_samples,outputdim,outputdim))

    for p in range(n_samples):
        out_dataset[p] = merging_modalities(dataset1[p],dataset2[p],outputdim)

    # np.random.randint(0, 0) raises, so the heatmap is skipped when there is
    # nothing to draw
    if plot and n_samples > 0:
        random_index = np.random.randint(0, n_samples)
        random_sample = out_dataset[random_index]

        # Plot the heatmap
        plt.figure(figsize=(10, 8))
        sns.heatmap(random_sample, cmap='viridis', cbar=True)
        plt.title(f'Heatmap of Random Sample {random_index}')
        plt.show()

    return out_dataset

# Normalizing using the Frobenius norm

def normalize_by_frobenius_norm(samples):
    """
    Scale every 2D slice of a 3D array by the inverse of its Frobenius norm.

    Parameters:
    samples (numpy.ndarray): A 3D numpy array with dimensions [samples, rows, columns].

    Returns:
    numpy.ndarray: A 3D numpy array of the same shape in which each sample has
    been divided by its own Frobenius norm. A sample whose norm is 0 is divided
    by 0, so its entries are not finite.
    """
    # keepdims leaves the norms shaped [samples, 1, 1], which broadcasts
    # directly against the [samples, rows, columns] input
    per_sample_norm = np.linalg.norm(samples, axis=(1, 2), keepdims=True)

    return samples / per_sample_norm
        


## Import Data

In [17]:
### FMRI - Resting State

# Pick the resting-state feature file for the current operating system.
if platform.system() == "Windows": 
    file_path = r"D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression\Data_Sets\HCP\Resting State FMRI\fmri_rs.npy"
elif platform.system() == "Darwin":
    file_path = r"/Users/lakrama/Neuro Project Codes/Datasets/Data_Sets/HCP/Resting State FMRI/fmri_rs.npy"  # Adjust the path for macOS
else:
    # Fail with a clear message instead of a NameError on file_path
    raise ValueError("Unsupported platform")

with open(file_path, "rb") as f:
    fmri_rs = np.load(f)


### FMRI - Language
#loading data

# Pick the language feature file for the current operating system.
if platform.system() == 'Windows':
    file_path = r"D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression\Data_Sets\HCP\Resting State FMRI\features_lang.mat"
elif platform.system() == 'Darwin':
    # TODO: placeholder - replace with the real macOS path to features_lang.mat
    file_path = 'add the path'
else:
    raise ValueError("Unsupported platform")

# NOTE(review): despite its name, this variable holds the 'features_lang' entry
# of features_lang.mat (the "FMRI - Language" data) - confirm the naming.
dMRI_streamlog = (scipy.io.loadmat(file_path)['features_lang'])

# Each sample is a row
fmri_rs = fmri_rs.T
dMRI_streamlog = dMRI_streamlog.T


# ---------------------------------------------------------------------------
# Platform-specific locations of every input and output used by the seed loop
# ---------------------------------------------------------------------------
system_name = platform.system()
if system_name == "Windows":
    mat_file_path = r"D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression\Data_Sets\HCP\Resting State FMRI\MMP_HCP_60_splits.mat"
    subjects_file_path = r'D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression\Data_Sets\HCP\Resting State FMRI\MMP_HCP_753_subs.txt'
    csv_file_path = r"D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression\Data_Sets\HCP\Resting State FMRI\MMP_HCP_componentscores.csv"
    # Directory that receives the per-seed result CSV files
    file_path = r'D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression_All_Data\Closed_Form_Solver\Multimodal\LSR\Frobenious\Resting-Language\Across_Seeds\Seed_Results'
elif system_name == "Darwin":
    mat_file_path = "/Users/lakrama/Neuro Project Codes/Datasets/Data_Sets/HCP/Resting State FMRI/MMP_HCP_60_splits.mat"
    subjects_file_path = '/Users/lakrama/Neuro Project Codes/Datasets/Data_Sets/HCP/Resting State FMRI/MMP_HCP_753_subs.txt'
    csv_file_path = "/Users/lakrama/Neuro Project Codes/Datasets/Data_Sets/HCP/Resting State FMRI/MMP_HCP_componentscores.csv"
    # TODO: placeholder - replace 'addpath' with the actual results directory for macOS
    file_path = 'addpath'
else:
    raise ValueError("Unsupported platform")

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# The dataframe to hold the results across seeds (one row per seed)
columns = ['Seed', 'Best Lambda', 'NMSE', 'CORR', 'R2', 'Time Taken', 'Gradient']
results_df = pd.DataFrame(columns=columns)
# One dict per finished seed; results_df is rebuilt from this list so the frame
# is never grown by repeated concatenation
seed_records = []

number_of_seeds = 2

tensor_mode_ranks = np.array([4, 4])
separation_rank = 2

#For now, define finite alpha set that we are searching over
alphas =  [0.1,0.4] #,0.7,1,1.5,2,2.5,3,3.5,4,5,10,15,20]

#Define Number of Folds we want
k_folds = 2

# Solver settings; a fresh copy is handed to each of KFoldCV and train_test
hyper_template = {'max_iter': 50, 'threshold': 1e-4, 'ranks': tuple(tensor_mode_ranks), 'separation_rank': separation_rank}

# ---------------------------------------------------------------------------
# Inputs that do not depend on the seed: load them once
# ---------------------------------------------------------------------------
# Train/test split definitions for all seeds
mat_file = scipy.io.loadmat(mat_file_path)

# Subject ids, one per line. A failure here is raised immediately: continuing
# with an empty subject list would only fail later in a less obvious place.
with open(subjects_file_path, 'r') as file:
    HCP_753_Subjects = [int(line.strip()) for line in file if line.strip()]

# Component scores, restricted to the subjects listed above
df = pd.read_csv(csv_file_path)
df['Subject'] = pd.to_numeric(df['Subject'], errors='coerce')
df = df[df['Subject'].isin(HCP_753_Subjects)].reset_index(drop = True)

#iterating over the seeds
for seed in range (number_of_seeds):

    # Extract the test subject ids of this seed (seeds are 1-based in the .mat file)
    seed_folds = mat_file['folds'][f'seed_{seed+1}'][0, 0]
    subject_lists = seed_folds['sub_fold'][0, 0]['subject_list']
    test_subject_ids = [int(item[0]) for item in subject_lists[0, 0].flatten()]

    print(seed)

    #Split all our data into a Train and Test Set
    is_test = df['Subject'].isin(test_subject_ids)
    df_train, df_test = df[~is_test], df[is_test]

    # Row positions of df are used to index the feature arrays.
    # NOTE(review): assumes fmri_rs / dMRI_streamlog rows are ordered like the
    # filtered score table - TODO confirm.
    train_subjects = df_train.index.to_list()
    test_subjects = df_test.index.to_list()

    #Functional Connectivity Matrices, with the labels as 1D arrays
    X_train_vec_fmri, Y_train = fmri_rs[train_subjects], df_train["varimax_cog"].to_numpy().reshape(-1)
    X_test_vec_fmri, Y_test = fmri_rs[test_subjects], df_test["varimax_cog"].to_numpy().reshape(-1)

    # Second modality (the array loaded from features_lang.mat)
    X_train_vec_dmri = dMRI_streamlog[train_subjects]
    X_test_vec_dmri = dMRI_streamlog[test_subjects]

    #merging the modalities
    X_train = samplestomat(X_train_vec_fmri,X_train_vec_dmri,400)
    X_test  = samplestomat(X_test_vec_fmri,X_test_vec_dmri,400)

    print(X_train.shape)
    print(Y_train.shape)
    print(X_test.shape)
    print(Y_test.shape)

    #Kernel Equivalent Normalization Block
    X_train = normalize_by_frobenius_norm(X_train)
    X_test = normalize_by_frobenius_norm(X_test)

    #number of samples in train and test 
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]

    # Flatten each sample to one row so the scaler can treat every matrix entry
    # as a feature
    X_train_2D = X_train.reshape(n_train, -1)
    X_test_2D = X_test.reshape(n_test,-1)

    # with_std = False: features are mean-centred only, not scaled
    scaler = StandardScaler(with_std = False)

    # Fit scaler on train data and transform train data
    X_train_scaled = scaler.fit_transform(X_train_2D)
    # Transform test data using the scaler fitted on train data
    X_test_scaled = scaler.transform(X_test_2D)

    # Reshape the centred data back to 3D, each set with its own shape
    X_train = X_train_scaled.reshape(X_train.shape)
    X_test  = X_test_scaled.reshape(X_test.shape)

    #average response value
    Y_train_mean = np.mean(Y_train)
    # Only the training response is centred; Y_test is left untouched and
    # Y_train_mean is handed to train_test instead
    Y_train = Y_train - Y_train_mean

    print("Sample mean for each feature (across samples):",scaler.mean_)
    print("Sample variance for each feature (across samples):",scaler.var_)
    print('Response Average:',Y_train_mean)

    tensor_dimensions = np.array([X_train.shape[1], X_train.shape[2]])

    #Training 
    # lambda1 is presumably the selected ridge coefficient (it is reported as 'Best Lambda' below)
    hypers = dict(hyper_template)
    lambda1, validation_normalized_estimation_error, validation_nmse_losses, validation_correlations, validation_R2_scores, objective_function_information,gradient_information = KFoldCV(X_train, Y_train, alphas, k_folds, hypers, B_tensored = None, intercept= False)

    #Testing 
    start_time = time.time()
    hypers = dict(hyper_template)
    test_normalized_estimation_error, test_nmse_loss, test_R2_loss, test_correlation, objective_function_values,gradient_values,iterate_level_values,factor_core_iteration = train_test(X_train, Y_train, X_test, Y_test, lambda1, hypers,Y_train_mean, B_tensored = None, intercept= False)
    end_time = time.time()
    elapsed_time = end_time - start_time

    #saving data of one seed

    formatted_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    max_iter = hypers['max_iter']

    if system_name == "Windows":
        pkl_file = rf"D:\Tensor Based ML for Neuro Imaging\INSPIRE_CAHBHIR\Python Scripts\LSR-Tensor-Ridge-Regression_All_Data\Closed_Form_Solver\Multimodal\LSR\Frobenious\Resting-Language\Across_Seeds\HCP_lambdas_{alphas}_seed_{seed}_sep_{separation_rank}_tucker_{tensor_mode_ranks}.pkl"
    elif system_name == "Darwin":
        # NOTE(review): the macOS target uses a different directory and naming
        # scheme than the Windows one - confirm this is intended
        pkl_file = f"/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Stochastic LSR TRR/Experimental Results/ExecutionTime_intercept_5_{formatted_time}, n_train_{n_train},n_test_{n_test}, tensor_dimensions:{tensor_dimensions}, tensor_mode_= ranks:{tensor_mode_ranks}, separation_rank:{separation_rank}, max_iter={max_iter}.pkl"

    #printing error matrices for one seed

    print(f'SEED: {seed+1} : lambda1:{lambda1} : TNMSE:{test_nmse_loss} : TCORR:{test_correlation} : TR2: {test_R2_loss} ')

    #saving the data for one seed
    with open(pkl_file, "wb") as file:
        dill.dump((X_train,Y_train,X_test,Y_test, lambda1, validation_normalized_estimation_error, validation_nmse_losses, validation_correlations, validation_R2_scores, objective_function_information,gradient_information,test_normalized_estimation_error, test_nmse_loss, test_R2_loss, test_correlation, objective_function_values,gradient_values,iterate_level_values,factor_core_iteration), file)

    # Record this seed's summary; the last slice along the first axis of
    # gradient_values is kept
    seed_records.append({
        'Seed': seed,
        'Best Lambda': lambda1,
        'NMSE': test_nmse_loss,
        'CORR': test_correlation,
        'R2': test_R2_loss,
        'Time Taken': elapsed_time,
        'Gradient': gradient_values[-1,:,:]
    })

    # Rebuild the cumulative results table from all seeds finished so far
    results_df = pd.DataFrame(seed_records, columns=columns)

    # Save the cumulative results as a CSV (one file per seed)
    results_df.to_csv(f'{file_path}/results_with_matrix_{seed}.csv', index=False)

    

    


0


KeyboardInterrupt: 