### External label and SC-FC correlation

In [1]:
import pickle 
import numpy as np
import pandas as pd

with open('../dataset/processed/data_dict_5_classes.pkl', 'rb') as f:
    data_dict = pickle.load(f)


In [4]:
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def calculate_correlation(matrix1, matrix2):
    # Flatten the matrices
    m1_flat = matrix1.flatten()
    m2_flat = matrix2.flatten()
    # Calculate Pearson correlation manually
    m1_mean = m1_flat.mean()
    m2_mean = m2_flat.mean()
    numerator = ((m1_flat - m1_mean) * (m2_flat - m2_mean)).sum()
    denominator = torch.sqrt(((m1_flat - m1_mean) ** 2).sum() * ((m2_flat - m2_mean) ** 2).sum())
    cor = numerator / denominator
    return cor

X = []
Y = []
for i in tqdm(range(len(data_dict))):
    # get FC, and SC
    FC = torch.tensor(data_dict[i]['FC'])
    SC = torch.tensor(data_dict[i]['SC'])
    # calculate the correlation
    cor = calculate_correlation(FC, SC)
    # get the external label
    y = np.array(data_dict[i]['label'])
    ext_label = (y[2] + y[3] + y[4] >= 1)
    # append to the list
    X.append(cor)
    Y.append(ext_label)


X = torch.tensor(X)
Y = torch.tensor(Y)

# split the data into train and test
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, random_state=42)
train_X = train_X.reshape(-1, 1)
test_X = test_X.reshape(-1, 1)


100%|██████████| 6298/6298 [00:04<00:00, 1481.77it/s]
  Y = torch.tensor(Y)


In [11]:

# balance the data
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
train_X_res, train_Y_res = smote.fit_resample(train_X, train_Y)

# down sample
from imblearn.under_sampling import RandomUnderSampler

# Apply RandomUnderSampler to downsample the data
rus = RandomUnderSampler(random_state=42)
train_X_down, train_Y_down = rus.fit_resample(train_X, train_Y)


# train the mode
model = LogisticRegression()
model.fit(train_X_down, train_Y_down)
# evaluate the model
yhat = model.predict(test_X)
# evaluate predictions with AUC
auc = roc_auc_score(test_Y, yhat)

In [14]:
# random forest
model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model.fit(train_X_down, train_Y_down)
yhat = model.predict(test_X)
auc = roc_auc_score(test_Y, yhat)

In [15]:
auc

np.float64(0.5261384185434819)