### Importing the Libraries

In [1]:
# Needed to import data
import sys
import pandas as pd
import re
import dill
import os 

# bases
import numpy as np
import scipy

#pre-processing 
from sklearn.preprocessing import StandardScaler

#k-Fold cross 
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

#Ridge 
from sklearn.linear_model import Ridge


### Setting up paths

In [2]:
sys.path.append('/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Data_Sets/HCP')
sys.path.append('/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Vector Regression Baseline/Experimental Results/Vector_Base_Line/with k-fold cv kernel mapping before centering')

### Importing Data 

In [3]:
#Load fMRI Resting State Data
with open("/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Data_Sets/HCP/Resting State FMRI/fmri_rs.npy", "rb") as f:
  fmri_rs = np.load(f)

#Take the Transpose so that each Sample is a Row
fmri_rs = fmri_rs.T

#Get Split to divide into train + test(loaded data is in the form of features * sampels so need to transpose)
mat_file = scipy.io.loadmat("/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Data_Sets/HCP/Resting State FMRI/MMP_HCP_60_splits.mat")
seed_1 = mat_file['folds']['seed_1']
subject_lists = seed_1[0, 0]['sub_fold'][0, 0]['subject_list']
test_subjects = [int(item[0]) for item in subject_lists[0,0].flatten()]

#Get HCP test subjects
HCP_753_Subjects = []
with open('/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Data_Sets/HCP/Resting State FMRI/MMP_HCP_753_subs.txt', 'r') as file:
    HCP_753_Subjects = [int(re.sub('\n', '', line)) for line in file.readlines()]

#Put the HCP test subjects into a dataframe
df = pd.read_csv("/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Data_Sets/HCP/Resting State FMRI/MMP_HCP_componentscores.csv")
df['Subject'] = pd.to_numeric(df['Subject'], errors='coerce')
df = df[df['Subject'].isin(HCP_753_Subjects)].reset_index(drop = True)

#Split all our data into a Train and Test Set
df_train, df_test = df[~df['Subject'].isin(test_subjects)], df[df['Subject'].isin(test_subjects)]

#Create Train and Test Arrays corresponding to Training and Test Subjects
train_subjects = df_train.index.to_list()
test_subjects = df_test.index.to_list()

#Reshape Labels into Column Vectors
X_train, Y_train = fmri_rs[train_subjects], df_train["varimax_cog"].to_numpy().reshape((-1, 1))
X_test, Y_test = fmri_rs[test_subjects], df_test["varimax_cog"].to_numpy().reshape((-1, 1))

### Preprocessing

1. Transforming to kernel hiblert space.

In [4]:
#Row Wise Normalization of Samples
def normalize_rows(matrix: np.ndarray):
    """
    Normalize each row of the given matrix by the norm of the row.
    
    Parameters:
    matrix (numpy.ndarray): The input matrix to be normalized.
    
    Returns:
    numpy.ndarray: The row-normalized matrix.
    """
    # Calculate the L2 norm for each row. Adding a small epsilon to avoid division by zero.
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    epsilon = 1e-10  # Small value to prevent division by zero
    row_norms[row_norms == 0] = epsilon
    
    # Normalize each row by its norm
    normalized_matrix = matrix / row_norms
    return normalized_matrix

#Preprocess Data
X_train = normalize_rows(X_train)
X_test = normalize_rows(X_test)

2. Removing the intercept from the data

In [5]:

def LRR(X_train,Y_train,X_validatioon,Y_validation,alpha,Y_train_mean = None):

    #model 
    ridge_regression = Ridge(alpha=alpha,fit_intercept=False)
    #fitting model
    ridge_regression.fit(X_train,Y_train)
    #testing 

    if Y_train_mean is not None:
        Y_predicted = ridge_regression.predict(X_validatioon).flatten().reshape(-1,1) + Y_train_mean
    else:
        Y_predicted = ridge_regression.predict(X_validatioon).flatten().reshape(-1,1) 

    #error matrices 

    #nmse
    nmse    = (np.linalg.norm(Y_validation - Y_predicted)**2/(np.linalg.norm(Y_validation)**2))
    #r2
    numerator = np.sum((Y_validation - Y_predicted)**2)
    denominator = np.sum((Y_validation - np.mean(Y_validation))**2)
    r2 = 1 - (numerator / denominator)
    #correlation
    corr    = np.corrcoef(Y_validation.flatten(), Y_predicted.flatten())[0,1]
    
    return nmse,r2,corr

In [6]:
#number of samples in train and test
n_train = X_train.shape[0]
n_test = X_test.shape[0]



# Initialize StandardScaler
scaler = StandardScaler(with_std = False) #standard scalar only

# Fit scaler on train data and transform train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform test data using the scaler fitted on train data
X_test_scaled = scaler.transform(X_test)


#average response value
Y_train_mean = np.mean(Y_train)

# Mean centering y_train and y_test
Y_train = Y_train - Y_train_mean

#printing the outcomes
print("Sample mean for each feature (across samples):",scaler.mean_)
print("Sample variance for each feature (across samples):",scaler.var_)
print('Response Average:',Y_train_mean)

Sample mean for each feature (across samples): [0.00856716 0.00931574 0.00521223 ... 0.00123731 0.00516826 0.00765609]
Sample variance for each feature (across samples): None
Response Average: 1.127096656182391


### K-Fold Cross Validation

In [7]:


#number of folds we are using 
n_splits = 10
kfold = KFold(n_splits=n_splits, shuffle= True)

#the regularization coefficients to search over
alphas = [0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19] 
k_fold_results = np.zeros(shape=[n_splits,len(alphas),3])

#iterating over the folds
for fold,(train_ids,validation_ids) in enumerate(kfold.split(X_train)):

    #the training and validatiion data 
    X_train_fold,Y_train_fold = X_train[train_ids],Y_train[train_ids]
    X_validation_fold,Y_validation_fold = X_train[validation_ids],Y_train[validation_ids]
    
    #iterating over the alpha values 
    for alpha_idx,alpha in enumerate(alphas):
        nmse,r2,corr = LRR(X_train_fold,Y_train_fold,X_validation_fold,Y_validation_fold,alpha)
        k_fold_results[fold,alpha_idx,0] = nmse
        k_fold_results[fold,alpha_idx,1] = r2
        k_fold_results[fold,alpha_idx,2] = corr

        print(f"fold = {fold},alpha = {alpha},nmse = {nmse},r2 = {r2}, corr = {corr}")

#Choosing the best lambda

#array to hold all the sums
nmse_sum = np.zeros(len(alphas))

# Iterate over the folds and alpha values to accumulate nmse values
for alpha_idx in range(len(alphas)):
    for fold in range(n_splits):
        nmse_sum[alpha_idx] += k_fold_results[fold, alpha_idx, 0]

#best lambda 

best_alpha_idx = np.argmin(nmse_sum)
best_alpha = alphas[best_alpha_idx]        
print(f"The best alpha value is {best_alpha}")
        



fold = 0,alpha = 0.05,nmse = 0.5931910042481722,r2 = 0.404796952893397, corr = 0.6530016491779435
fold = 0,alpha = 0.06,nmse = 0.5909306685383792,r2 = 0.4070649554293724, corr = 0.6521687200200885
fold = 0,alpha = 0.07,nmse = 0.5894718204492765,r2 = 0.4085287517811057, corr = 0.6512085799782409
fold = 0,alpha = 0.08,nmse = 0.5886174461853381,r2 = 0.40938602399464896, corr = 0.6501608119753686
fold = 0,alpha = 0.09,nmse = 0.5882262161305655,r2 = 0.4097785810615171, corr = 0.6490536613566105
fold = 0,alpha = 0.1,nmse = 0.5881944260588534,r2 = 0.40981047896188183, corr = 0.6479076148413828
fold = 0,alpha = 0.11,nmse = 0.5884445008815222,r2 = 0.4095595559111421, corr = 0.6467377289970906
fold = 0,alpha = 0.12,nmse = 0.5889174443312843,r2 = 0.4090850082858555, corr = 0.6455551850364151
fold = 0,alpha = 0.13,nmse = 0.5895677441815769,r2 = 0.408432502685374, corr = 0.6443683510584973
fold = 0,alpha = 0.14,nmse = 0.5903598519749022,r2 = 0.40763770814390654, corr = 0.6431835226598307
fold = 0,a

### Training and Testing with best Lambda

In [11]:
nmse_best,r2_best,corr_best =  LRR(X_train,Y_train,X_test,Y_test,best_alpha,Y_train_mean=Y_train_mean)

### Reporting Errors

In [12]:
print(f"The best Lamda Values Selected After KFold CV: {best_alpha}")
print(f"Testing Normalized Mean Squred Error: {nmse_best}")
print(f"Testing R2: {r2_best}")
print(f"Testing Correlation: {corr_best}")

The best Lamda Values Selected After KFold CV: 0.13
Testing Normalized Mean Squred Error: 0.7347483036434304
Testing R2: 0.2123021022487751
Testing Correlation: 0.49637869310294774


### Saving Stuff

In [10]:

# Convert alphas list to a string for the file name
alphas_str = '_'.join(map(str, alphas))

# Define the content to write
results = (
    f"The best Lambda Values Selected After KFold CV: {best_alpha}\n"
    f"Testing Normalized Mean Squared Error: {nmse_best}\n"
    f"Testing R2: {r2_best}\n"
    f"Testing Correlation: {corr_best}\n"
)

# Define the file path
file_path = f'/Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Vector Regression Baseline/Experimental Results/Vector_Base_Line/with k-fold cv kernel mapping before centering/{alphas_str}results.txt'

# Ensure the directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write the results to a text file
with open(file_path, 'w') as file:
    file.write(results)

print(f"Results saved to {file_path}")


Results saved to /Users/lakrama/Neuro Project Codes/LSR-Tensor-Ridge-Regression/Vector Regression Baseline/Experimental Results/Vector_Base_Line/with k-fold cv kernel mapping before centering/0.05_0.06_0.07_0.08_0.09_0.1_0.11_0.12_0.13_0.14_0.15_0.16_0.17_0.18_0.19results.txt
