In [None]:
import os
from concurrent.futures import ThreadPoolExecutor
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import scipy
import seaborn as sns
from scipy.io import loadmat
from scipy.sparse import csgraph
from scipy.sparse.linalg import eigsh, ArpackError
from scipy.stats import multivariate_normal
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.utils as utils
from torch_geometric.data import Data
from torch_geometric.datasets import TUDataset, Planetoid
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.utils import to_networkx

# Simulated

In [None]:
# Adapted from https://github.com/basiralab/ReMI-Net-Star/tree/9f427d47e99dfef495119b82f43ceb06e3272bbe
# Anti-vectorize: rebuild a symmetric adjacency matrix from the vector of its strictly-lower-triangular entries
def antiVectorize(vec, m):
    """Build an m x m symmetric matrix from its strictly-lower-triangular entries.

    Parameters
    ----------
    vec : array-like
        Values for the strict lower triangle, in the order produced by
        ``np.tril_indices(m, k=-1)``.
    m : int
        Number of rows/columns of the matrix to build.

    Returns
    -------
    numpy.ndarray
        Float array of shape (m, m) with ``vec`` mirrored across a zero diagonal.
    """
    adjacency = np.zeros((m, m))
    rows, cols = np.tril_indices(m, k=-1)
    # Write each value at (row, col) and at its mirrored position (col, row).
    adjacency[rows, cols] = vec
    adjacency[cols, rows] = vec
    return adjacency

# Vectorize graph adjacency matrix into a vector
def vectorize(M):
    """Return the strictly-lower-triangular entries of ``M`` as a 1-D vector.

    Entries come out in the order of ``np.tril_indices(M.shape[0], k=-1)``;
    the diagonal is excluded.
    """
    n_rows = M.shape[0]
    lower_rows, lower_cols = np.tril_indices(n_rows, k=-1)
    return M[lower_rows, lower_cols]

def multivariate_simulate(n_samples=200,n_time=2,n_views=4):
    """Simulate multi-view, multi-timepoint connectivity matrices.

    For every (timepoint, view) pair, ``n_samples`` connectivity vectors are
    drawn from a multivariate normal distribution whose mean and covariance are
    read from the statistics files under ``./stats``. Each vector is turned
    into a symmetric 35 x 35 matrix with ``antiVectorize`` and shifted by the
    stored time-difference matrix of that view, scaled by
    ``tanh(t / n_time) + 1`` so that later timepoints drift further from the
    baseline.

    The node count is fixed at 35 because the statistics files were computed
    from 35-node data.

    Parameters
    ----------
    n_samples : int
        Number of subjects to simulate.
    n_time : int
        Number of timepoints to simulate.
    n_views : int
        Number of views to simulate.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, n_time, 35, 35, n_views).

    Notes
    -----
    A fresh random state is drawn for every (timepoint, view) pair, so each
    call yields a different dataset.
    """
    n_nodes = 35

    # Import all required statistical information.
    allstats = np.load("./stats/REALDATA_LH_AVGMEANS.npy") # Connectivity mean values of LH. You can also try with RH.
    allcorrs = np.load("./stats/REALDATA_LH_AVGCORRS.npy") # Correlation matrix in LH. You can also try with RH.
    all_diffs = np.load("./stats/REAL_TIME_DIFF.npy") # This is an overall representation of time differences in both (LH and RH) datasets.

    times = []
    for t in range(n_time):
        # The first two timepoints use their own statistics; any further
        # timepoint reuses the statistics of the last stored timepoint.
        # NOTE(review): this presumes the files hold two real timepoints - confirm.
        stats_t = t if t < 2 else -1
        # Nonlinear (tanh) scaling of the time difference; it only depends on t,
        # so it is computed once per timepoint instead of once per sample.
        drift_scale = np.tanh( t / n_time ) + 1

        views = []
        for v in range(n_views):
            data = multivariate_normal.rvs(
                allstats[stats_t, v],
                allcorrs[stats_t, v],
                n_samples,
                random_state=randint(1,9999),
            )
            # rvs() squeezes its output, so a single sample comes back 1-D;
            # restore the (n_samples, n_features) layout before iterating rows.
            data = np.atleast_2d(data)

            drift = all_diffs[:,:,v] * drift_scale
            adj = [antiVectorize(sample, n_nodes) + drift for sample in data]
            views.append(np.array(adj))

        times.append(np.array(views))

    # (time, view, sample, node, node) -> (sample, time, node, node, view)
    alldata = np.transpose(np.array(times), (2,0,3,4,1))
    return alldata

def prepare_data(new_data=False, n_samples=200, n_times=6):
    """Return the simulated dataset, generating and caching it when needed.

    The dataset is cached in ``./multivariate_simulation_data.npy``. When
    ``new_data`` is False the cache is loaded; if it is missing or unreadable,
    or when ``new_data`` is True, a new dataset is simulated with
    ``multivariate_simulate(n_samples, n_times)`` and written to the cache.

    Note that data with 200 samples and 6 timepoints is very large (5.8M data
    points); check your GPU memory to make sure there is enough space to
    allocate. If not, try:
    - reducing the dataset size by changing n_samples or n_times;
    - running on CPU (this allocates memory in RAM), which might work if you
      have e.g. 1GB GPU memory but 16GB RAM;
    - another computer with a better NVIDIA graphics card (2GB GPU memory
      will not be enough for 5.8M data points).

    Parameters
    ----------
    new_data : bool
        Force regeneration even if a cached file exists.
    n_samples : int
        Number of samples to simulate when generating.
    n_times : int
        Number of timepoints to simulate when generating.

    Returns
    -------
    numpy.ndarray
        The loaded or newly simulated dataset.

    Raises
    ------
    Exception
        Errors raised while simulating or saving propagate to the caller;
        they are not retried.
    """
    cache_path = './multivariate_simulation_data.npy'

    if not new_data:
        try:
            return np.load(cache_path)
        except (OSError, ValueError, EOFError):
            # Cache file is missing or cannot be read: fall through and regenerate.
            pass

    samples = multivariate_simulate(n_samples,n_times)
    np.save(cache_path,samples)
    return samples

In [None]:
# Simulate a fresh dataset (100 samples, 3 timepoints); new_data=True regenerates it and rewrites the cached .npy file.
X = prepare_data(n_samples=100, n_times=3, new_data=True)

# EMCI-AD

In [None]:
# Read the 'view2' arrays for both timepoints and the label array from the .mat files.
# NOTE(review): data_path is an absolute, machine-specific location; adjust it before running elsewhere.
data_path = '/Users/gitaayusalsabila/Documents/0thesis/datasets/eMCI_AD_data/'
data_t1 = loadmat(data_path + '67subjectslh_view2_t1.mat')['view2']
data_t2 = loadmat(data_path + '67subjectslh_view2_t2.mat')['view2']
labels = loadmat(data_path + 'label_dataLH.mat')['label_data']
print(labels.shape)

# Define the antiVectorize function
def antiVectorize(vec, m):
    """Expand a strict-lower-triangle vector into a symmetric m x m matrix.

    ``vec`` supplies the entries below the diagonal in ``np.tril_indices(m, k=-1)``
    order; they are mirrored above the diagonal and the diagonal stays zero.
    Returns a float array of shape (m, m).
    """
    symmetric = np.zeros((m, m))
    below_rows, below_cols = np.tril_indices(m, k=-1)
    symmetric[below_rows, below_cols] = vec
    # Mirror the same values into the upper triangle.
    symmetric[below_cols, below_rows] = vec
    return symmetric

# Subject count comes from the rows of the vectorized data.
num_subjects = data_t1.shape[0]
# A strict lower triangle with k entries belongs to an m x m matrix where k = m(m-1)/2; solve for m.
matrix_size = int((np.sqrt(8 * data_t1.shape[1] + 1) + 1) / 2)

# Pre-allocate the outputs: (subject, timepoint, row, col) and a variant with one extra singleton axis.
emci_ad_timepoints = np.zeros(shape=(num_subjects, 2, matrix_size, matrix_size))
emci_ad_timepoints_labeled = np.zeros(shape=(num_subjects, 2, 1, matrix_size, matrix_size))  # allocated here but not filled in this cell

# Rebuild the symmetric matrix of every subject at both timepoints.
for i in range(num_subjects):
    for timepoint, vectors in enumerate((data_t1, data_t2)):
        emci_ad_timepoints[i, timepoint] = antiVectorize(vectors[i], matrix_size)

# Persist the reconstructed matrices as a .npy file.
np.save('emci-ad_timepoints.npy', emci_ad_timepoints)

# SLIM160

In [None]:
def extract_data_from_mat(file_path):
    """Load the MATLAB file at ``file_path`` and return the variable stored under key ``'r'``."""
    return loadmat(file_path)['r']

def traverse_and_extract_complete_sessions(base_path,atlas_name,sessions=('session_1', 'session_2', 'session_3')):
    """Collect the data of every subject that has all requested sessions.

    The file ``<base_path>/<subject>/<session>/<atlas_name>/ROI_FC.mat`` is read
    with ``extract_data_from_mat`` for each session of each subject folder.
    Subjects missing the file for any session are skipped.

    Parameters
    ----------
    base_path : str
        Directory containing one sub-directory per subject.
    atlas_name : str
        Name of the atlas folder inside each session folder.
    sessions : sequence of str, optional
        Session folder names that must all be present for a subject to be kept.
        Defaults to the three sessions 'session_1', 'session_2', 'session_3'.

    Returns
    -------
    numpy.ndarray
        ``np.array`` over the kept subjects, each a list with one entry per
        session, in sorted subject-folder order. Empty (shape ``(0,)``) when no
        subject is complete.
    """
    subject_data = []

    # os.listdir returns entries in arbitrary order; sort so that the subject
    # axis of the result is reproducible across runs and machines.
    for subject in sorted(os.listdir(base_path)):
        subject_path = os.path.join(base_path, subject)
        if not os.path.isdir(subject_path):
            continue

        session_data = []
        for session in sessions:
            mat_file_path = os.path.join(subject_path, session, atlas_name, 'ROI_FC.mat')
            if os.path.isfile(mat_file_path):
                print(f"Extracting data from: {mat_file_path}")
                session_data.append(extract_data_from_mat(mat_file_path))

        # Keep the subject only if every requested session was found.
        if len(session_data) == len(sessions):
            subject_data.append(session_data)

    return np.array(subject_data)
    

In [None]:
# Root folder with one sub-folder per subject, and the atlas folder to read inside each session.
# NOTE(review): path_160 is an absolute, machine-specific location.
path_160 = '/Users/gitaayusalsabila/Documents/0 Thesis/datasets/slim/SWU'  # Replace this with the actual path
atlas_name_160 = 'Dosenbach_160'

# Gather the data of every subject that has all three sessions.
slim160 = traverse_and_extract_complete_sessions(base_path=path_160, atlas_name=atlas_name_160)

# Store the consolidated array on disk for later use, then report its shape.
np.save('slim160.npy', slim160)
print(f"Consolidated data shape: {slim160.shape}")