# HCP Vector Pipeline

## Install Libraries

In [1]:
%pip install dill

Note: you may need to restart the kernel to use updated packages.


## Add Local Project Paths

In [2]:
import sys

# Extend the module search path with the project's code and data directories
# so that the project-local imports in the next cell can be resolved.
for extra_path in (
    '/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Closed_Form_Solver/Code Files',
    '/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Closed_Form_Solver/Data',
):
    sys.path.append(extra_path)

## Import Libraries

In [3]:
#Import sklearn stuff
import datetime
import numpy as np
import pandas as pd
import re

#scipy
import scipy
from scipy.io import loadmat

#Used to load data from pkl file
import dill
import pickle

#Import External Files
from KFoldCV import KFoldCV_Vectorized_HCP
from train_test import TrainTest_Vectorized
from DataGenerationB import *

#plotting
import matplotlib.pyplot as plt

#preprocessing 
from sklearn.preprocessing import StandardScaler

## Import Data

In [4]:
#Read the fMRI array from its .npy file and transpose it in the same step,
#so that each sample ends up as one row
fmri_path = "../Data/fmri_rs.npy"
with open(fmri_path, "rb") as fmri_file:
    fmri_rs = np.load(fmri_file).T

#Get Split to divide into train + test
# loadmat is imported explicitly (`from scipy.io import loadmat`): a bare
# `import scipy` does not guarantee that the `scipy.io` submodule is loaded,
# and older SciPy releases raise AttributeError on `scipy.io.loadmat`.
mat_file = loadmat("../Data/MMP_HCP_60_splits.mat")

# Walk the nested MATLAB structs: folds -> seed_1 -> sub_fold -> subject_list
seed_1 = mat_file['folds']['seed_1']
subject_lists = seed_1[0, 0]['sub_fold'][0, 0]['subject_list']

# Element [0, 0] of the subject lists is used as the held-out set (presumably
# the first fold -- TODO confirm); each entry wraps the subject ID in an
# indexable container, hence item[0].
test_subjects = [int(item[0]) for item in subject_lists[0,0].flatten()]

#Get the HCP subject IDs used to filter the component scores
# The file holds one integer subject ID per line. Blank lines (for example a
# trailing empty line at the end of the file) are skipped so they never reach
# int() and raise ValueError. int() itself ignores surrounding whitespace, so
# the newline does not need to be stripped separately.
with open('../Data/MMP_HCP_753_subs.txt', 'r') as file:
    HCP_753_Subjects = [int(line) for line in file if line.strip()]

#Load the component scores and keep only the subjects in HCP_753_Subjects
df = pd.read_csv("../Data/MMP_HCP_componentscores.csv")
# Non-numeric subject IDs become NaN and are dropped by the isin() filter below
df['Subject'] = pd.to_numeric(df['Subject'], errors='coerce')
df = df[df['Subject'].isin(HCP_753_Subjects)].reset_index(drop = True)

# The next cell uses the row labels of df (0..n-1 after reset_index) as row
# positions into fmri_rs. That only pairs features with the right labels when
# both hold the same number of samples, so fail loudly on a mismatch instead of
# silently training on misaligned data.
# NOTE(review): this also assumes the CSV rows are in the same subject order as
# the rows of fmri_rs -- confirm against how fmri_rs.npy was generated.
if len(df) != fmri_rs.shape[0]:
    raise ValueError(
        f"Component scores contain {len(df)} matching subjects but fmri_rs has "
        f"{fmri_rs.shape[0]} rows; features and labels would be misaligned."
    )

#Split all our data into a Train and Test Set
is_test = df['Subject'].isin(test_subjects)
df_train, df_test = df[~is_test], df[is_test]


In [5]:
#Row labels of each split. df was re-indexed to 0..n-1 before it was split,
#so these labels are used directly as row positions into fmri_rs.
train_subjects = df_train.index.tolist()
test_subjects = df_test.index.tolist()

#Features: the fmri_rs rows belonging to each split
X_train = fmri_rs[train_subjects]
X_test = fmri_rs[test_subjects]

#Labels: the "varimax_cog" column, reshaped into a column vector
Y_train = df_train["varimax_cog"].to_numpy().reshape(-1, 1)
Y_test = df_test["varimax_cog"].to_numpy().reshape(-1, 1)

## Standardize Features, Cross-Validate, and Evaluate

In [6]:
# Standardize the features; the scaler statistics come from the training data only
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# From here on X_train / X_test refer to the standardized arrays
X_train, X_test = X_train_scaled, X_test_scaled

print("Sample mean for each feature (across samples):",scaler.mean_)


# Candidate alpha values for the K-fold search, in ascending order. The
# original grid listed 5 before 4, which made per-alpha results non-monotone
# in alpha.
alphas = [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1, 1.5, 2, 2.5, 3, 4, 5, 10, 15, 20, 50, 100]
k_folds = 2

# Cross-validate on the training split. lambda1 is presumably the alpha chosen
# by the search (it is passed on to the final fit below) -- confirm in KFoldCV.
(
    lambda1,
    validation_normalized_estimation_errors,
    validation_nmse_losses,
    validation_correlations,
    validation_R2_scores,
    objective_function_values,
) = KFoldCV_Vectorized_HCP(X_train, Y_train, alphas, k_folds, intercept= False)

# Fit on the training split with lambda1 and score on the held-out test split
(
    test_normalized_estimation_error,
    test_nmse_loss,
    test_correlation,
    test_R2_score,
    Y_test_predicted,
    p_star,
) = TrainTest_Vectorized(X_train, Y_train, X_test, Y_test, lambda1,intercept= False)

#Show the raw predictions next to the ground-truth labels
print(Y_test_predicted)
print(Y_test)

# Overlay predicted and actual values, indexed by position in the test set
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(Y_test_predicted, label='Predicted', color='blue')
ax.plot(Y_test, label='Actual', color='red', linestyle='--')  # dashed to tell the two series apart
ax.set_title("Comparison of Predicted and Actual Values")
ax.set_xlabel('Index')
ax.set_ylabel('Values')
#ax.set_yscale('log')
ax.legend()
ax.grid(True)
plt.show()

#Timestamp embedded in the name of the results file
formatted_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
results_dir = "/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Closed_Form_Solver/Experimental Results/Vector_Base_Line"
pkl_file = f"{results_dir}/HCP_ExecutionTime_{formatted_time}.pkl"

#Report the selected lambda followed by the test-set metrics
print('best_lamda:',lambda1)
for metric_name, metric_value in (
    ("Test Normalized Estimation Error", test_normalized_estimation_error),
    ("Test NMSE Loss", test_nmse_loss),
    ("Test Correlation", test_correlation),
    ("Test R2 Score", test_R2_score),
    ("Objective Function Value", p_star),
):
    print(f"{metric_name}: {metric_value}")

#print("Validation NMSE Losses: ", validation_nmse_losses)
#print("Validation Correlations: ", validation_correlations)
#print("Validation R2 Scores: ", validation_R2_scores)

#with open(pkl_file, "wb") as file:
#  dill.dump((p_star,lambda1, validation_normalized_estimation_errors, validation_nmse_losses, validation_correlations, validation_R2_scores, test_normalized_estimation_error, test_nmse_loss, test_correlation, test_R2_score), file)






Sample mean for each feature (across samples): [0.33170613 0.35946642 0.20230985 ... 0.04708704 0.19885258 0.29165831]
Fold = 0, Alpha = 0, NMSE: 1.5968396669414675, Correlation: 0.4928766920606951, R^2 Score: -1.1908080325097905,Objective Function Value: 2840.0559514394827
Fold = 0, Alpha = 0.1, NMSE: 1.596834722694317, Correlation: 0.4928770542686001, R^2 Score: -1.190801249176062,Objective Function Value: 2840.0505512464993
Fold = 0, Alpha = 0.3, NMSE: 1.5968248343807518, Correlation: 0.492877778674695, R^2 Score: -1.190787682756568,Objective Function Value: 2840.0397511094966
Fold = 0, Alpha = 0.5, NMSE: 1.5968149463081627, Correlation: 0.4928785030678366, R^2 Score: -1.1907741166676846,Objective Function Value: 2840.0289513044304
Fold = 0, Alpha = 0.7, NMSE: 1.596805058476539, Correlation: 0.49287922744802615, R^2 Score: -1.1907605509093973,Objective Function Value: 2840.0181518312884
Fold = 0, Alpha = 0.9, NMSE: 1.596795170885873, Correlation: 0.49287995181526356, R^2 Score: -1.1