In [3]:
import csv
import os
import random
import pickle
import gc
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import nilearn
from nilearn import connectome
from nilearn.connectome import ConnectivityMeasure
import sklearn
import warnings
import skbold
from skbold.preproc import ConfoundRegressor
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [8]:
# Define ConfoundRegressor: skbold
def confound_regressor_skbold(features_train, features_test, confounds_train, confounds_test):
    # Scale features (train and test sets)
    scaler_features = StandardScaler()
    features_train_scaled = scaler_features.fit_transform(features_train)
    features_test_scaled = scaler_features.transform(features_test)
    
    # Scale confounds (train and test sets)
    scaler_confounds = StandardScaler()
    confounds_train_scaled = scaler_confounds.fit_transform(confounds_train)
    confounds_test_scaled = scaler_confounds.transform(confounds_test)

    # Convert full sets into np.array
    features_full_scaled_np = np.array(pd.concat([pd.DataFrame(features_train_scaled, columns = features_train.columns), pd.DataFrame(features_test_scaled, columns = features_test.columns)], axis=0))
    confounds_full_scaled_np = np.array(pd.concat([pd.DataFrame(confounds_train_scaled, columns = confounds_train.columns), pd.DataFrame(confounds_test_scaled, columns = confounds_test.columns)], axis=0))
    
    # Define ConfoundRegressor on a FULL set (train and test)
    cfr = ConfoundRegressor(confound=confounds_full_scaled_np, X=features_full_scaled_np)
    features_train_corrected = cfr.fit_transform(features_train_scaled)
    features_test_corrected = cfr.transform(features_test_scaled)


    return features_train_corrected, features_test_corrected, features_train_scaled, features_test_scaled, scaler_features

# Get timeseries for 25 ICA components

In [None]:
# Make a list of folder paths
folder_paths = ["/Resting_State/rsfMRI_bulk_main/1-4000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/5000-14000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/14000_24000_split/unzipped/",
"/Resting_State/rsfMRI_bulk_main/14000-24000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/24000-34000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/24000-34000_split/unzipped/",
"/Resting_State/rsfMRI_bulk_main/34000-44000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/34000-44000_split/unzipped/",
"/Resting_State/rsfMRI_bulk_main/44000-54413/unzipped/"]

missing_file_count = 0

timeseries_list = []
index_list = []
skipped_folders = []

for folder_path in folder_paths:

    print(f'Started {folder_path}')
    for subject_folder in sorted(os.listdir(folder_path)):

        print(f'Started {subject_folder}')
        subfolder_path = os.path.join(folder_path, subject_folder)
        subject_folder_name = os.path.basename(subfolder_path)

        if os.path.isdir(os.path.join(subfolder_path, 'fMRI')):
            subfolder_path = os.path.join(subfolder_path, 'fMRI')
        else:
            subfolder_path = subfolder_path

        timeseries_file = os.path.join(subfolder_path, "rfMRI_25.dr", "dr_stage1.txt")

        if not os.path.exists(timeseries_file):
            missing_file_count += 1
            skipped_folders.append(subfolder_path)
            continue

        print('Appending files')
        timeseries_25 = np.loadtxt(timeseries_file)
        timeseries_list.append(timeseries_25)
        index_list.append(subject_folder_name)

print('Number of folders without the file:', missing_file_count)
print('Skipped folders:', *skipped_folders, sep='\n')

index_list_df = pd.DataFrame(index_list)
index_list_df.columns = ['eid']
index_list_df.sort_values(by='eid')

print('Started instance 2')
timeseries_instance_2 = []
index_instance_2 = []
for folder_name, timeseries in zip(index_list, timeseries_list):
    mid_part = folder_name.split("_")[-2]
    if mid_part == '2':
        timeseries_instance_2.append(timeseries)
        index_instance_2.append(folder_name)

index_25_df_full = pd.DataFrame(index_list, columns=['eid']).to_csv('/Cog-Ment/PLS/brain/rs/ica_tangent/files/index_25_ica_full.csv', index=False)
index_instance_2_25_df = pd.DataFrame(index_instance_2, columns=['eid'])
index_instance_2_25_df['eid'] = index_instance_2_25_df['eid'].str.replace('_20227_2_0', '').astype(int)
index_instance_2_25_df.to_csv('/Cog-Ment/PLS/brain/rs/ica_tangent/files/index_25_ica_instance_2.csv', index=False)

* vectorize=True returned flattened lower triangular parts

In [None]:
# Get dictionary and save it
timeseries_25_dict = {}
for index, timeseries in zip(index_instance_2_25_df['eid'], timeseries_instance_2):
    timeseries_25_dict[index] = timeseries

with open(f'/Cog-Ment/PLS/brain/rs/ica_tangent/files/timeseries_25_dict.pkl', "wb") as f:
    pickle.dump(timeseries_25_dict, f)

In [None]:
# Load the dictionary
with open('/Cog-Ment/PLS/brain/rs/ica_tangent/files/timeseries_25_dict.pkl', "rb") as a:
    timeseries_25_dict = pickle.load(a)

# Extract tangent matrices for 21 'good' components

(outlined [here](https://www.fmrib.ox.ac.uk/ukbiobank/group_means/rfMRI_GoodComponents_d25_v1.txt))

In [None]:
folds = ["0", "1", "2", "3", "4"]
for fold in folds:

    print("__________________________________________")

    print(f"Started fold {fold}")

    print('Setting ConnectivityMeasure model')

    tangent_measure = ConnectivityMeasure(
    kind="tangent",
    standardize="zscore_sample",
    vectorize = True,
    discard_diagonal = True)

    print("Uploading train and test id")

    train_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/train_id_fold_{fold}.csv')
    test_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/test_id_fold_{fold}.csv')

    tangent_25_train = []
    tangent_25_test = []
    
    tangent_train_id = []
    tangent_test_id = []
    
    print("Getting train set")

    for id_val in train_id['eid'].values:
        if id_val in timeseries_25_dict.keys():
            tangent_25_train.append(timeseries_25_dict[id_val])
            tangent_train_id.append(id_val)

    print("Get 21 components, fit, and transform, train set")
    tangent_21_train = np.array(tangent_25_train)[:, :, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]]
    tangent_matrices_21_train = tangent_measure.fit_transform(tangent_21_train)
    pd.DataFrame(tangent_matrices_21_train, columns = [f'Component {i+1} Tangent (21 IC)' for i in range(tangent_matrices_21_train.shape[1])], index=tangent_train_id).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_21_train_fold_{fold}.csv')
    
    print("__________________________________________")
    print("Getting test set")
            
    for id_val in test_id['eid'].values:
        if id_val in timeseries_25_dict.keys():
            tangent_25_test.append(timeseries_25_dict[id_val])
            tangent_test_id.append(id_val)

    print("Get 21 components and transform, test set")
    tangent_21_test = np.array(tangent_25_test)[:, :, [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]]
    tangent_matrices_21_test = tangent_measure.transform(tangent_21_test)
    pd.DataFrame(tangent_matrices_21_test, columns = [f'Component {i+1} Tangent (21 IC)' for i in range(tangent_matrices_21_test.shape[1])], index=tangent_test_id).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_21_test_fold_{fold}.csv')


__________________________________________
Started fold 0
Setting ConnectivityMeasure model
Uploading train and test id
Getting train set
Get 21 components, fit, and transform, train set
__________________________________________
Getting test set
Get 21 components and transform, test set
__________________________________________
Started fold 1
Setting ConnectivityMeasure model
Uploading train and test id
Getting train set
Get 21 components, fit, and transform, train set
__________________________________________
Getting test set
Get 21 components and transform, test set
__________________________________________
Started fold 2
Setting ConnectivityMeasure model
Uploading train and test id
Getting train set
Get 21 components, fit, and transform, train set
__________________________________________
Getting test set
Get 21 components and transform, test set
__________________________________________
Started fold 3
Setting ConnectivityMeasure model
Uploading train and test id
Getting train

Generate column names that will reflect connections between components (lower triangular part with diagonal discarded)

In [96]:
def generate_column_names(components):
    column_names = []
    for i in range(1, len(components)):
        for j in range(i):
            column_names.append(f'Component {components[i]} & Component {components[j]} Tangent (55 IC)')
    return column_names

# Original components
good_21_orig = [1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

# Generate column names
column_names = generate_column_names(good_21_orig)
column_names

['Component 2 & Component 1 Tangent (55 IC)',
 'Component 3 & Component 1 Tangent (55 IC)',
 'Component 3 & Component 2 Tangent (55 IC)',
 'Component 5 & Component 1 Tangent (55 IC)',
 'Component 5 & Component 2 Tangent (55 IC)',
 'Component 5 & Component 3 Tangent (55 IC)',
 'Component 6 & Component 1 Tangent (55 IC)',
 'Component 6 & Component 2 Tangent (55 IC)',
 'Component 6 & Component 3 Tangent (55 IC)',
 'Component 6 & Component 5 Tangent (55 IC)',
 'Component 7 & Component 1 Tangent (55 IC)',
 'Component 7 & Component 2 Tangent (55 IC)',
 'Component 7 & Component 3 Tangent (55 IC)',
 'Component 7 & Component 5 Tangent (55 IC)',
 'Component 7 & Component 6 Tangent (55 IC)',
 'Component 8 & Component 1 Tangent (55 IC)',
 'Component 8 & Component 2 Tangent (55 IC)',
 'Component 8 & Component 3 Tangent (55 IC)',
 'Component 8 & Component 5 Tangent (55 IC)',
 'Component 8 & Component 6 Tangent (55 IC)',
 'Component 8 & Component 7 Tangent (55 IC)',
 'Component 9 & Component 1 Tangen

Rename columns to reflect connections between components

In [None]:
for fold in folds:
    print(f"Renaming columns for fold {fold}")

    tangent_matrices_21_train = pd.read_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_21_train_fold_{fold}.csv')
    tangent_matrices_21_test = pd.read_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_21_test_fold_{fold}.csv')

    # Rename the columns
    tangent_matrices_21_train.columns = ['Unnamed: 0'] + column_names
    tangent_matrices_21_test.columns = ['Unnamed: 0'] + column_names

    # Save the updated DataFrames back to CSV files
    tangent_matrices_21_train.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_renamed/tangent_matrices_21_train_fold_{fold}.csv', index=False)
    tangent_matrices_21_test.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_renamed/tangent_matrices_21_test_fold_{fold}.csv', index=False)

    print(f"Columns renamed for fold {fold}")

Renaming columns for fold 0
Columns renamed for fold 0
Renaming columns for fold 1
Columns renamed for fold 1
Renaming columns for fold 2
Columns renamed for fold 2
Renaming columns for fold 3
Columns renamed for fold 3
Renaming columns for fold 4
Columns renamed for fold 4


If needed, convert a 1D array back into NxN matrix

In [None]:
tangent_matrices_21_test = pd.read_csv('/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_renamed/tangent_matrices_21_test_fold_0.csv')
def vector_to_full_matrix(vector):
    # Calculate the size of the original matrix N
    k = len(vector)
    N = int((1 + np.sqrt(1 + 8 * k)) / 2)
    
    # Initialize an NxN matrix with zeros
    matrix = np.zeros((N, N))
    
    # Fill the lower triangular part (excluding the diagonal)
    index = 0
    for i in range(1, N):
        for j in range(i):
            matrix[i, j] = vector[index]
            matrix[j, i] = vector[index]
            index += 1
    np.fill_diagonal(matrix, vector[index:index + N])
    return matrix

# Example usage
matrix = vector_to_full_matrix(tangent_matrices_21_test.drop(columns='Unnamed: 0').iloc[0].values)
matrices = [vector_to_full_matrix(row) for row in tangent_matrices_21_test.drop(columns='Unnamed: 0').iloc[:3].values]

# Get timeseries for 100 ICA components

In [None]:
# Make a list of folder paths
folder_paths = ["/Resting_State/rsfMRI_bulk_main/1-4000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/5000-14000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/14000_24000_split/unzipped/",
"/Resting_State/rsfMRI_bulk_main/14000-24000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/24000-34000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/24000-34000_split/unzipped/",
"/Resting_State/rsfMRI_bulk_main/34000-44000/unzipped/",
"/Resting_State/rsfMRI_bulk_main/34000-44000_split/unzipped/",
"/Resting_State/rsfMRI_bulk_main/44000-54413/unzipped/"]

missing_file_count = 0

timeseries_list = []
index_list = []
skipped_folders = []

for folder_path in folder_paths:

    print(f'Started {folder_path}')
    for subject_folder in sorted(os.listdir(folder_path)):

        print(f'Started {subject_folder}')
        subfolder_path = os.path.join(folder_path, subject_folder)
        subject_folder_name = os.path.basename(subfolder_path)

        if os.path.isdir(os.path.join(subfolder_path, 'fMRI')):
            subfolder_path = os.path.join(subfolder_path, 'fMRI')
        else:
            subfolder_path = subfolder_path

        timeseries_file = os.path.join(subfolder_path, "rfMRI_100.dr", "dr_stage1.txt")

        if not os.path.exists(timeseries_file):
            missing_file_count += 1
            skipped_folders.append(subfolder_path)
            continue

        print('Appending files')
        timeseries_100 = np.loadtxt(timeseries_file)
        timeseries_list.append(timeseries_100)
        index_list.append(subject_folder_name)

print('Number of folders without the file:', missing_file_count)
print('Skipped folders:', *skipped_folders, sep='\n')

index_list_df = pd.DataFrame(index_list)
index_list_df.columns = ['eid']
index_list_df.sort_values(by='eid')

print('Started instance 2')
timeseries_instance_2 = []
index_instance_2 = []
for folder_name, timeseries in zip(index_list, timeseries_list):
    mid_part = folder_name.split("_")[-2]
    if mid_part == '2':
        timeseries_instance_2.append(timeseries)
        index_instance_2.append(folder_name)

index_100_df_full = pd.DataFrame(index_list, columns=['eid']).to_csv('/Cog-Ment/PLS/brain/rs/ica_tangent/files/index_100_ica_full.csv', index=False)
index_instance_2_100_df = pd.DataFrame(index_instance_2, columns=['eid'])
index_instance_2_100_df['eid'] = index_instance_2_100_df['eid'].str.replace('_20227_2_0', '').astype(int)
index_instance_2_100_df.to_csv('/Cog-Ment/PLS/brain/rs/ica_tangent/files/index_100_ica_instance_2.csv', index=False)

In [None]:
# Get dictionary and save it
timeseries_100_dict = {}
for index, timeseries in zip(index_instance_2_100_df['eid'], timeseries_instance_2):
    timeseries_100_dict[index] = timeseries

with open(f'/Cog-Ment/PLS/brain/rs/ica_tangent/files/timeseries_100_dict.pkl', "wb") as f:
    pickle.dump(timeseries_100_dict, f)


# Load the dictionary
with open('/Cog-Ment/PLS/brain/rs/ica_tangent/files/timeseries_100_dict.pkl', "rb") as a:
    timeseries_100_dict = pickle.load(a)

# Extract tangent matrices for 55 'good' components

(outlined [here](https://www.fmrib.ox.ac.uk/ukbiobank/group_means/rfMRI_GoodComponents_d100_v1.txt))

In [None]:
folds = ["0", "1", "2", "3", "4"]
for fold in folds:

    print("__________________________________________")

    print(f"Started fold {fold}")

    print('Setting ConnectivityMeasure model')

    tangent_measure = ConnectivityMeasure(
    kind="tangent",
    standardize="zscore_sample",
    vectorize = True,
    discard_diagonal = True)

    # Extract 55 good ICA out of 100

    good_55_orig = [2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                40, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 57, 58, 60, 63, 64, 93]
    good_55 = [ i-1 for i in good_55_orig ]

    print("Uploading train and test id")

    train_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/train_id_fold_{fold}.csv')
    test_id = pd.read_csv(f'/Cog-Ment/g_factor_5_folds_python/fold_{fold}/test_id_fold_{fold}.csv')

    tangent_100_train = []
    tangent_100_test = []
    
    tangent_train_id = []
    tangent_test_id = []
    
    print("Getting train set")

    for id_val in train_id['eid'].values:
        if id_val in timeseries_100_dict.keys():
            tangent_100_train.append(timeseries_100_dict[id_val])
            tangent_train_id.append(id_val)

    print("Get 55 components, fit, and transform, train set")

    tangent_55_train = np.array(tangent_100_train)[:, :, good_55]
    tangent_matrices_55_train = tangent_measure.fit_transform(tangent_55_train)
    pd.DataFrame(tangent_matrices_55_train, columns = [f'Component {i+1} Tangent (55 IC)' for i in range(tangent_matrices_55_train.shape[1])], index=tangent_train_id).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_55_train_fold_{fold}.csv')

    print("__________________________________________")
    print("Getting test set")
            
    for id_val in test_id['eid'].values:
        if id_val in timeseries_100_dict.keys():
            tangent_100_test.append(timeseries_100_dict[id_val])
            tangent_test_id.append(id_val)

    print("Get 55 components and transform, test set")
    tangent_55_test = np.array(tangent_100_test)[:, :, good_55]
    tangent_matrices_55_test = tangent_measure.transform(tangent_55_test)
    pd.DataFrame(tangent_matrices_55_test, columns = [f'Component {i+1} Tangent (55 IC)' for i in range(tangent_matrices_55_test.shape[1])], index=tangent_test_id).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_55_test_fold_{fold}.csv')

__________________________________________
Started fold 0
Setting ConnectivityMeasure model
Uploading train and test id
Getting train set
Get 21 components, fit, and transform, train set
__________________________________________
Getting test set
Get 21 components and transform, test set
__________________________________________
Started fold 1
Setting ConnectivityMeasure model
Uploading train and test id
Getting train set
Get 21 components, fit, and transform, train set
__________________________________________
Getting test set
Get 21 components and transform, test set
__________________________________________
Started fold 2
Setting ConnectivityMeasure model
Uploading train and test id
Getting train set
Get 21 components, fit, and transform, train set
__________________________________________
Getting test set
Get 21 components and transform, test set
__________________________________________
Started fold 3
Setting ConnectivityMeasure model
Uploading train and test id
Getting train

Generate column names that will reflect connections between components (lower triangular part with diagonal discarded)

In [80]:
def generate_column_names(components):
    column_names = []
    for i in range(1, len(components)):
        for j in range(i):
            column_names.append(f'Component {components[i]} & Component {components[j]} Tangent (55 IC)')
    return column_names

# Original components
good_55_orig = [2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
                40, 41, 42, 43, 45, 46, 48, 49, 50, 52, 53, 57, 58, 60, 63, 64, 93]

# Generate column names
column_names = generate_column_names(good_55_orig)
column_names

['Component 3 & Component 2 Tangent (55 IC)',
 'Component 4 & Component 2 Tangent (55 IC)',
 'Component 4 & Component 3 Tangent (55 IC)',
 'Component 5 & Component 2 Tangent (55 IC)',
 'Component 5 & Component 3 Tangent (55 IC)',
 'Component 5 & Component 4 Tangent (55 IC)',
 'Component 6 & Component 2 Tangent (55 IC)',
 'Component 6 & Component 3 Tangent (55 IC)',
 'Component 6 & Component 4 Tangent (55 IC)',
 'Component 6 & Component 5 Tangent (55 IC)',
 'Component 7 & Component 2 Tangent (55 IC)',
 'Component 7 & Component 3 Tangent (55 IC)',
 'Component 7 & Component 4 Tangent (55 IC)',
 'Component 7 & Component 5 Tangent (55 IC)',
 'Component 7 & Component 6 Tangent (55 IC)',
 'Component 8 & Component 2 Tangent (55 IC)',
 'Component 8 & Component 3 Tangent (55 IC)',
 'Component 8 & Component 4 Tangent (55 IC)',
 'Component 8 & Component 5 Tangent (55 IC)',
 'Component 8 & Component 6 Tangent (55 IC)',
 'Component 8 & Component 7 Tangent (55 IC)',
 'Component 9 & Component 2 Tangen

Rename columns to reflect connections between components

In [None]:
for fold in folds:
    print(f"Renaming columns for fold {fold}")

    tangent_matrices_55_train = pd.read_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_55_train_fold_{fold}.csv')
    tangent_matrices_55_test = pd.read_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_55_test_fold_{fold}.csv')

    # Rename the columns
    tangent_matrices_55_train.columns = ['Unnamed: 0'] + column_names
    tangent_matrices_55_test.columns = ['Unnamed: 0'] + column_names

    # Save the updated DataFrames back to CSV files
    tangent_matrices_55_train.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_renamed/tangent_matrices_55_train_fold_{fold}.csv', index=False)
    tangent_matrices_55_test.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_renamed/tangent_matrices_55_test_fold_{fold}.csv', index=False)

    print(f"Columns renamed for fold {fold}")

Renaming columns for fold 0
Columns renamed for fold 0
Renaming columns for fold 1
Columns renamed for fold 1
Renaming columns for fold 2
Columns renamed for fold 2
Renaming columns for fold 3
Columns renamed for fold 3
Renaming columns for fold 4
Columns renamed for fold 4


If needed, convert a 1D array back into NxN matrix

In [None]:
tangent_matrices_55_test = pd.read_csv('/Cog-Ment/PLS/brain/rs/ica_tangent/tangent_matrices_renamed/tangent_matrices_55_test_fold_0.csv')
def vector_to_full_matrix(vector):
    # Calculate the size of the original matrix N
    k = len(vector)
    N = int((1 + np.sqrt(1 + 8 * k)) / 2)
    
    # Initialize an NxN matrix with zeros
    matrix = np.zeros((N, N))
    
    # Fill the lower triangular part (excluding the diagonal)
    index = 0
    for i in range(1, N):
        for j in range(i):
            matrix[i, j] = vector[index]
            matrix[j, i] = vector[index]
            index += 1
    np.fill_diagonal(matrix, vector[index:index + N])
    
    return matrix

# Example usage
matrix = vector_to_full_matrix(tangent_matrices_55_test.drop(columns='Unnamed: 0').iloc[0].values)
matrices = [vector_to_full_matrix(row) for row in tangent_matrices_55_test.drop(columns='Unnamed: 0').iloc[:3].values]

# PLS on RS ICA tangent

In [None]:
modalities = ['tangent_matrices_21', 'tangent_matrices_55']
confounds = pd.read_csv('/Cog-Ment/PLS/brain/rs/ica_main/data_tables/rs_confounds.csv')

warnings.simplefilter(action='ignore', category=FutureWarning)

############## 1
seed = 42

for modality in modalities:

    print(f'Started {modality}', flush=True)

    folds = ["0", "1", "2", "3", "4"]
    pls_result = {}

    for fold in folds:

        tangent_train_id = pd.read_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/{modality}_train_fold_{fold}.csv').rename(columns={'Unnamed: 0': 'eid'})
        tangent_test_id = pd.read_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/{modality}_test_fold_{fold}.csv').rename(columns={'Unnamed: 0': 'eid'})
        
        # Match confounds to MRI
        print(f'Matching brain data to confounds in {modality} fold {fold}', flush=True)
        
        print('____Train____')
        conf_to_brain_match_train = pd.merge(confounds, tangent_train_id['eid'], on='eid')
        conf_to_brain_match_train.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_conf_to_brain_match_train_fold_{fold}.csv', index=False)

        brain_to_conf_match_train = pd.merge(conf_to_brain_match_train['eid'], tangent_train_id, on='eid')
        brain_to_conf_match_train.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_brain_to_conf_match_train_fold_{fold}.csv', index=False)
        
        print('____Test___') 
        conf_to_brain_match_test = pd.merge(confounds, tangent_test_id['eid'], on='eid')
        conf_to_brain_match_test.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_conf_to_brain_match_test_fold_{fold}.csv', index=False)

        brain_to_conf_match_test = pd.merge(conf_to_brain_match_test['eid'], tangent_test_id, on='eid')
        brain_to_conf_match_test.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_brain_to_conf_match_test_fold_{fold}.csv', index=False)
        
        # Upload g-factor with ID
        g_train_full = pd.read_csv(f'/Cog-Ment/PLS/g_factor/g_train_with_id_fold_{fold}.csv')
        g_test_full = pd.read_csv(f'/Cog-Ment/PLS/g_factor/g_test_with_id_fold_{fold}.csv')

        
        ############## 2
        print(f'Matching confounds to {modality} fold {fold}', flush=True)
        
        # Match confounds to MRI
        print('Getting train and test without IDs')
        brain_train, brain_test, conf_train, conf_test = brain_to_conf_match_train.drop(columns=['eid']), brain_to_conf_match_test.drop(columns=['eid']), conf_to_brain_match_train.drop(columns=['eid']), conf_to_brain_match_test.drop(columns=['eid'])
        
        ############## 3
        print(f'Matching g-factor to {modality} fold {fold}', flush=True)
        
        # Match g-factor back to MRI
        print('Metching g-factor to brain')
        g_train, g_test, g_train_id, g_test_id = pd.merge(g_train_full, brain_to_conf_match_train['eid'], on='eid').drop(columns=['eid']), pd.merge(g_test_full, brain_to_conf_match_test['eid'], on='eid').drop(columns=['eid']), pd.merge(g_train_full, brain_to_conf_match_train['eid'], on='eid')['eid'], pd.merge(g_test_full, brain_to_conf_match_test['eid'], on='eid')['eid']
        g_train.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/g_train_{modality}_matched_fold_{fold}.csv', index=False)
        g_test.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/g_test_{modality}_matched_fold_{fold}.csv', index=False)
        g_train_id.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/g_train_id_{modality}_matched_fold_{fold}.csv', index=False)
        g_test_id.to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/g_test_id_{modality}_matched_fold_{fold}.csv', index=False)
        
        ############## 4
        print(f'Applying ConfoundRegressor to {modality} fold {fold}', flush=True)
        
        # Apply ConfoundRegressor
        features_train_corr, features_test_corr, features_train_scaled, features_test_scaled, scaler_features = confound_regressor_skbold(brain_train, brain_test, conf_train, conf_test)
        
        pd.DataFrame(features_train_corr, columns = brain_train.columns).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_train_corr_{fold}.csv', index=False)
        pd.DataFrame(features_test_corr, columns = brain_test.columns).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_test_corr_{fold}.csv', index=False)
        
        pd.DataFrame(features_train_scaled, columns = brain_train.columns).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_train_scaled_{fold}.csv', index=False)
        pd.DataFrame(features_test_scaled, columns = brain_test.columns).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/{modality}_test_scaled_{fold}.csv', index=False)
        

        with open(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/pls_output/scaler_features_{modality}_fold_{fold}.pkl', "wb") as f:
            pickle.dump(scaler_features, f)
            

        # Initiate and run PLS
        parameters = {'n_components': range(1, 36, 1)}
        pls = PLSRegression()
        model = GridSearchCV(pls, parameters, scoring = 'neg_mean_absolute_erro', cv=KFold(10, shuffle = True, random_state=seed), verbose=4, n_jobs = 8)
        
        
        print(f'Fitting PLS to {modality} fold {fold}', flush=True)
        model.fit(features_train_corr, np.array(g_train))
        
        print(f'Model parameters for fold {fold}:', model.cv_results_['params'])
        print(f'Mean test score for fold {fold}:', model.cv_results_['mean_test_score'])
        print(f'Rank test score for fold {fold}:', model.cv_results_['rank_test_score'])
        print(model)
        
        print(f'Saving PLS model for {modality} fold {fold}')
        with open(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/models/pkl/{modality}_model_fold_{fold}.pkl', "wb") as f:
            pickle.dump(model, f)
            
        print(f'Best params in fold {fold} = ', model.best_params_)
        print(f'Best score (neg_mean_absolute_error) in fold {fold} = ', model.best_score_)
            
        # Predict the values
        print(f'Predicting & saving g_test for {modality} fold {fold}', flush=True)
        g_pred_test = model.predict(np.array(features_test_corr))
        pd.DataFrame(g_pred_test, columns=['g predicted test']).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/g_pred/{modality}_test_fold_{fold}.csv')

        g_pred_test_with_id = pd.concat([g_test_id.astype(int), pd.DataFrame(g_pred_test, columns=['g predicted test'])], axis=1).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/g_pred/{modality}_g_pred_test_id_fold_{fold}.csv')

        
        print(f'Predicting & saving g_train for {modality} fold {fold}', flush=True)
        g_pred_train = model.predict(np.array(features_train_corr))
        pd.DataFrame(g_pred_train, columns=['g predicted train']).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/g_pred/{modality}_g_pred_train_fold_{fold}.csv')
        

        g_pred_train_with_id = pd.concat([g_train_id.astype(int), pd.DataFrame(g_pred_train, columns=['g predicted train'])], axis=1).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/g_pred/{modality}_g_pred_train_id_fold_{fold}.csv')
        
            
        print(f"Fold = {fold}")
        print("----------")
        print("MSE = ", mean_squared_error(np.array(g_test)[:,0], g_pred_test[:,0]))
        print("MAE = ", mean_absolute_error(np.array(g_test)[:,0], g_pred_test[:,0]))
        print("R2 = ", r2_score(np.array(g_test)[:,0], g_pred_test[:,0]))
        print("Pearson's r = ", pearsonr(np.array(g_test)[:,0], g_pred_test[:,0]))
        print("----------")
            
        pls_result['fold'] = fold
        pls_result['modality'] = modality
        pls_result['n_components'] = model.best_params_
        pls_result['MSE'] = mean_squared_error(np.array(g_test)[:,0], g_pred_test[:,0])
        pls_result['MAE'] = mean_absolute_error(np.array(g_test)[:,0], g_pred_test[:,0])
        pls_result['R2'] = r2_score(np.array(g_test)[:,0], g_pred_test[:,0])
        pls_result['Pearson '] = pearsonr(np.array(g_test)[:,0], g_pred_test[:,0])
            
        with open(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/models/csv/{modality}_fold_{fold}_PLS_result.csv', 'a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=pls_result.keys())
            writer.writerow(pls_result)
            
        pls_result.clear()
        
        corr, pval = stats.pearsonr(np.squeeze(np.array(g_test)), np.squeeze(g_pred_test))
        r2 = r2_score(np.squeeze(np.array(g_test)), np.squeeze(g_pred_test))
        mse = mean_squared_error(np.squeeze(np.array(g_test)), np.squeeze(g_pred_test))
        result = pd.DataFrame([modality, fold, corr, pval, r2, mse, model.best_params_], index=['Modality', 'Fold', 'Correlation', 'P-value', 'R2', 'MSE', 'n components'], columns=['Values']).to_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/models/csv/{modality}_fold_{fold}_full_result.csv')

Started tangent_matrices_21
Matching brain data to confounds in tangent_matrices_21 fold 0
____Train____
____Test___
Matching confounds to tangent_matrices_21 fold 0
Getting train and test without IDs
Matching g-factor to tangent_matrices_21 fold 0
Metching g-factor to brain
Applying ConfoundRegressor to tangent_matrices_21 fold 0
Fitting PLS to tangent_matrices_21 fold 0
Fitting 10 folds for each of 35 candidates, totalling 350 fits
[CV 2/10] END ..................n_components=1;, score=-0.559 total time=   0.2s
[CV 1/10] END ..................n_components=1;, score=-0.550 total time=   0.2s
[CV 4/10] END ..................n_components=1;, score=-0.548 total time=   0.2s
[CV 5/10] END ..................n_components=1;, score=-0.562 total time=   0.1s
[CV 3/10] END ..................n_components=1;, score=-0.552 total time=   0.2s
[CV 6/10] END ..................n_components=1;, score=-0.558 total time=   0.1s
[CV 7/10] END ..................n_components=1;, score=-0.548 total time=   

## Display and average results across five folds

In [None]:
five_folds = []
folds = ["0", "1", "2", "3", "4"]
for modality in modalities:
    for fold in folds:
        pls = pd.read_csv(f'/Cog-Ment/PLS/brain/rs/ica_tangent/fold_{fold}/models/csv/{modality}_fold_{fold}_PLS_result.csv', header=None)
        pls.columns = ['Fold', 'Modality', 'n components', 'MSE', 'MAE', 'R2', 'Pearson ']
        #pls.index = [modality] * len(pls)
        five_folds.append(pls)
        five_folds_all_modalities = pd.concat(five_folds, ignore_index=False)

five_folds_all_modalities['Pearson '] = five_folds_all_modalities['Pearson '].astype(str).str.replace('PearsonRResult\(statistic=|pvalue=|\)', '', regex=True)
five_folds_all_modalities[['Pearson ', 'p-value']] = five_folds_all_modalities['Pearson '].str.split(',', expand=True).astype(float).round(decimals=3)
five_folds_all_modalities = five_folds_all_modalities.round(decimals=3)
with pd.option_context('display.max_rows', None):
    display(five_folds_all_modalities)

Unnamed: 0,Fold,Modality,n components,MSE,MAE,R2,Pearson r,p-value
0,0,tangent_matrices_21,{'n_components': 4},0.46,0.541,0.085,0.293,0.0
0,1,tangent_matrices_21,{'n_components': 3},0.666,0.652,0.033,0.188,0.0
0,2,tangent_matrices_21,{'n_components': 3},0.754,0.696,0.035,0.189,0.0
0,3,tangent_matrices_21,{'n_components': 4},0.479,0.548,0.059,0.246,0.0
0,4,tangent_matrices_21,{'n_components': 4},0.495,0.556,0.057,0.243,0.0
0,0,tangent_matrices_55,{'n_components': 3},0.435,0.528,0.135,0.369,0.0
0,1,tangent_matrices_55,{'n_components': 3},0.655,0.645,0.05,0.242,0.0
0,2,tangent_matrices_55,{'n_components': 3},0.743,0.687,0.048,0.237,0.0
0,3,tangent_matrices_55,{'n_components': 3},0.458,0.536,0.101,0.327,0.0
0,4,tangent_matrices_55,{'n_components': 3},0.468,0.541,0.108,0.333,0.0


In [None]:
# Average across folds
five_folds_all_modalities_mean = five_folds_all_modalities[['R2', 'Pearson ', 'Modality', 'MSE', 'MAE']]
five_folds_all_modalities_mean.groupby(['Modality']).mean().round(3).sort_values(by='R2', ascending=False)

Unnamed: 0_level_0,R2,Pearson r,MSE,MAE
Modality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tangent_matrices_55,0.088,0.302,0.552,0.587
tangent_matrices_21,0.054,0.232,0.571,0.599
