In [9]:
from nilearn.connectome import ConnectivityMeasure
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import numpy as np
import scipy.io
import os
import pickle

In [10]:
def filter_strings_not_in_list(strings_list, substrings_list):
    filtered_list = []
    for s in strings_list:
        if not any(substring in s for substring in substrings_list):
            filtered_list.append(s)
    return filtered_list
folder_path = '/path_to/ROISignals_FunImgARCWF'  # Pfad zum Ordner
#folder_path = '/path_to/ROISignals_FunImgARglobalCWF'
strings_list = []
for file_name in os.listdir(folder_path):
    strings_list.append(file_name)
with open('/Users/mieeesh/Desktop/Project/BA/coding_notebooks/data_tools/first_id_list.pkl', 'rb') as file:
    data_list_substrings = pickle.load(file)

filtered_list = filter_strings_not_in_list(strings_list, data_list_substrings)
print(len(filtered_list))

labels = []

for j, file_name in enumerate(filtered_list[:1300]):
    
    degree_list = []
    pagerank_list = []
    closeness_list = []
    betweenness_list = []
    clustering_list = []

    mat = scipy.io.loadmat(os.path.join(folder_path, file_name))
    matrix = mat['ROISignals']
    aal_array = matrix[:, :116]  # 1~116: Automated Anatomical Labeling (AAL) atlas (Tzourio-Mazoyer et al., 2002)
    #hoac_array = matrix[:, 116:212]   # 117~212: Harvard-Oxford atlas (Kennedy et al., 1998)– cortical areas
    #hoas_array = matrix[:, 212:228]   # 213~228: Harvard-Oxford atlas (Kennedy et al., 1998)– subcortical areas
    #ccl_array = matrix[:, 228:428]    # 229~428: Craddock’s clustering 200 ROIs (Craddock et al., 2012)
    #zrp_array = matrix[:, 428:1408]   # 429~1408: Zalesky’s random parcelations (compact version: 980 ROIs) (Zalesky et al., 2010)
    #dbf_array = matrix[:, 1408:1568]  # 1409~1568: Dosenbach’s 160 functional ROIs (Dosenbach et al., 2010)

    correlation_measure = ConnectivityMeasure(kind='correlation')
    correlation_matrix = correlation_measure.fit_transform([aal_array])[0]
    
    for factor in range(10, 35):
        threshold = factor/100
        # Anwenden des Schwellwerts auf die Konnektivitätsmatrix
        thresholded_matrix = np.where(correlation_matrix >= threshold, 1, 0)
        np.fill_diagonal(thresholded_matrix, 0)
        # Erstellen des Graphen aus Konnektivitätsmatrix
        graph = nx.from_numpy_array(thresholded_matrix)

        zahl = int(file_name.split('-')[1])

        # degree model
        degree_list.extend(dict(nx.degree(graph)).values())

        # PageRank Centrality
        pagerank_list.extend(nx.pagerank(graph).values())

        # Closeness Centrality
        closeness_list.extend(nx.closeness_centrality(graph).values())

        # Betweenness Centrality
        betweenness_list.extend(nx.betweenness_centrality(graph).values())

        # Clustering Coefficient
        clustering_list.extend(nx.clustering(graph).values())

    lists_array = np.array([degree_list, pagerank_list, closeness_list, betweenness_list, clustering_list]).flatten()

    if j == 0:
        features_array = lists_array
    else:
        features_array = np.vstack((features_array, lists_array))

    labels.append(zahl)
    if (j+1) % 100 == 0:
        print(f'----{j+1} Done!')  

1530
99
199
299
399
499
599
699
799
899
999
1099
1199
1299


In [8]:
import numpy as np
a = np.array([[5,6,7],[7,8,7],[3,2,3]]).flatten()
b = np.array([[0,0,4],[7,5,7],[7,8,3]]).flatten()
c = np.vstack((a,b))
c.shape

(2, 9)

In [13]:
X = features_array
y = np.array(labels)

In [26]:
features.shape

(1300, 14500)

In [30]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve

#fpath = 'path/for/feature/file/(csv)'
#lpath = 'path/for/label/file/(csv)'
seed = 81 # random seed
no_folds = 10 # number of folds in out_loop
no_nested_folds = 10 # number of folds in nested_loop

skf = StratifiedKFold(n_splits=no_folds, shuffle=True, random_state=101)
nested_skf = StratifiedKFold(n_splits=no_nested_folds, shuffle=True)
param_grid = {'n_estimators': [50, 100, 200, 300], 'max_depth': [None, 10, 20, 30]}  # Modify the parameter grid for RandomForestClassifier
eval_metrics = np.zeros((skf.n_splits, 4))
print('Loading data ...')
# features = np.loadtxt(fpath, delimiter=',')
# labels = np.loadtxt(lpath, dtype='int32')

features = X
labels = y

print(f'Finished')
print(f'{np.sum(labels == 1)} MDD & {np.sum(labels == 2)} HC')

alphas = [0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.25, 0.3]
for alpha in alphas:
    # Feature Selection with Lasso-Method
    lasso_model = Lasso(alpha=alpha)  # choose hyperparameter 
    lasso_model.fit(features, labels)
    features_df = pd.DataFrame(features)
    selected_features = features_df.columns[lasso_model.coef_ != 0]
    features_array = features[:,selected_features]
    print(f"Ausgewählte Features: {features_array.shape[1]} für alpha = {alpha}")

    # ROC plotting preparation
    TPR, AUC = [], []
    mean_fpr = np.linspace(0, 1, 100)

    for n_cv, (train_ind, test_ind) in enumerate(skf.split(features_array, labels)):
        print('Processing the No.%i cross-validation in %i-fold CV' % (n_cv + 1, skf.n_splits))
        x_train, y_train = features_array[train_ind, ], labels[train_ind, ]
        x_test, y_test = features_array[test_ind, ], labels[test_ind, ]

        # Training
        init_clf = RandomForestClassifier()
        grid = GridSearchCV(init_clf, param_grid, cv=nested_skf, scoring='accuracy', n_jobs=5)
        grid.fit(x_train, y_train)
        print('----The best parameters: n_estimators=%d, max_depth=%s with accuracy of %f' % (
            grid.best_params_['n_estimators'], grid.best_params_['max_depth'], grid.best_score_))

        clf = RandomForestClassifier(n_estimators=grid.best_params_['n_estimators'],
                                     max_depth=grid.best_params_['max_depth'])
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        y_proba = clf.predict_proba(x_test)

        # ROC information for each fold
        cv_fpr, cv_tpr, cv_thresholds = roc_curve(y_test, y_proba[:, 1], pos_label=2)
        cv_auc = auc(cv_fpr, cv_tpr)
        interp_tpr = np.interp(mean_fpr, cv_fpr, cv_tpr)
        interp_tpr[0] = 0.0
        TPR.append(interp_tpr)
        AUC.append(cv_auc)

        # Evaluation
        tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
        cv_accuracy = (tn + tp) / (tn + fp + fn + tp)
        cv_sensitivity = tp / (tp + fn)
        cv_specificity = tn / (tn + fp)
        eval_metrics[n_cv, 0] = cv_accuracy
        eval_metrics[n_cv, 1] = cv_sensitivity
        eval_metrics[n_cv, 2] = cv_specificity
        eval_metrics[n_cv, 3] = cv_auc

    # reporting model evaluation measures
    df = pd.DataFrame(eval_metrics)
    df.columns = ['ACC', 'SEN', 'SPE', 'AUC']
    df.index = ['CV_' + str(i + 1) for i in range(skf.n_splits)]
    print(df)
    print('\nAverage Accuracy: %.4f' % (eval_metrics[:, 0].mean()))
    print('Average Sensitivity: %.4f' % (eval_metrics[:, 1].mean()))
    print('Average Specificity: %.4f' % (eval_metrics[:, 2].mean()))
    print('Average area under ROC curve: %.4f' % (eval_metrics[:, 3].mean()))

    # saving ROC plotting information
    # mean_tpr = np.mean(TPR, axis=0)
    # mean_tpr[-1] = 1.0
    # mean_auc = auc(mean_fpr, mean_tpr)
    # np.savez(fpath + '/../ROC_MTR.npz', tpr=mean_tpr, fpr=mean_fpr, auc=mean_auc)

Loading data ...
Finished
717 MDD & 583 HC
Ausgewählte Features: 137 für alpha = 0.1
Processing the No.1 cross-validation in 10-fold CV


  model = cd_fast.enet_coordinate_descent(


----The best parameters: n_estimators=200, max_depth=20 with accuracy of 0.603419
Processing the No.2 cross-validation in 10-fold CV
----The best parameters: n_estimators=300, max_depth=20 with accuracy of 0.617094
Processing the No.3 cross-validation in 10-fold CV
----The best parameters: n_estimators=200, max_depth=30 with accuracy of 0.612821
Processing the No.4 cross-validation in 10-fold CV
----The best parameters: n_estimators=50, max_depth=20 with accuracy of 0.609402
Processing the No.5 cross-validation in 10-fold CV
----The best parameters: n_estimators=200, max_depth=None with accuracy of 0.606838
Processing the No.6 cross-validation in 10-fold CV
----The best parameters: n_estimators=200, max_depth=20 with accuracy of 0.613675
Processing the No.7 cross-validation in 10-fold CV
----The best parameters: n_estimators=300, max_depth=20 with accuracy of 0.609402
Processing the No.8 cross-validation in 10-fold CV
----The best parameters: n_estimators=200, max_depth=30 with accurac

  model = cd_fast.enet_coordinate_descent(


Ausgewählte Features: 120 für alpha = 0.12
Processing the No.1 cross-validation in 10-fold CV
----The best parameters: n_estimators=200, max_depth=20 with accuracy of 0.604274
Processing the No.2 cross-validation in 10-fold CV
----The best parameters: n_estimators=300, max_depth=30 with accuracy of 0.602564
Processing the No.3 cross-validation in 10-fold CV
----The best parameters: n_estimators=100, max_depth=30 with accuracy of 0.624786
Processing the No.4 cross-validation in 10-fold CV
----The best parameters: n_estimators=300, max_depth=30 with accuracy of 0.616239
Processing the No.5 cross-validation in 10-fold CV
----The best parameters: n_estimators=100, max_depth=20 with accuracy of 0.615385
Processing the No.6 cross-validation in 10-fold CV
----The best parameters: n_estimators=200, max_depth=10 with accuracy of 0.605128
Processing the No.7 cross-validation in 10-fold CV
----The best parameters: n_estimators=100, max_depth=None with accuracy of 0.608547
Processing the No.8 cros