In [1]:
import os
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

In [2]:
labels_file = 'Data/participants.tsv'
data_directory = 'cc_matrices/'

In [3]:
def read_matrix_from_file(file_path):
    with open(file_path, 'r') as file:
        content = file.readlines()
        matrix = []
        for line in content:
            values = line.rstrip().split(',')
            row = []
            for val in values:
                if val.strip():
                    try:
                        row.append(float(val))
                    except ValueError:
                        print(f"Warning: Non-numeric value '{val}' found. Skipping.")
            if row:
                matrix.append(row)
    return np.array(matrix)

In [31]:
participant_data = pd.read_csv(labels_file, sep='\t')
diagnosis_data = participant_data[['participant_id', 'diagnosis']]
labels_dict = dict(zip(diagnosis_data['participant_id'], diagnosis_data['diagnosis']))
unique_labels = np.unique(list(labels_dict.values()))

labels_dict = dict(zip(diagnosis_data['participant_id'], diagnosis_data['diagnosis']))
label_to_number = {label: i + 1 for i, label in enumerate(unique_labels)}
labels_dict = {participant_id: label_to_number[label] for participant_id, label in labels_dict.items()}
labels_dict = {int(key.replace('sub-', '')): value for key, value in labels_dict.items()}

print("Unique Labels:", unique_labels)
print("Labels:\n", labels_dict)

Unique Labels: ['ADHD' 'BIPOLAR' 'CONTROL' 'SCHZ']
Labels:
 {10159: 3, 10171: 3, 10189: 3, 10193: 3, 10206: 3, 10217: 3, 10225: 3, 10227: 3, 10228: 3, 10235: 3, 10249: 3, 10269: 3, 10271: 3, 10273: 3, 10274: 3, 10280: 3, 10290: 3, 10292: 3, 10299: 3, 10304: 3, 10316: 3, 10321: 3, 10325: 3, 10329: 3, 10339: 3, 10340: 3, 10345: 3, 10347: 3, 10356: 3, 10361: 3, 10365: 3, 10376: 3, 10377: 3, 10388: 3, 10428: 3, 10429: 3, 10438: 3, 10440: 3, 10448: 3, 10455: 3, 10460: 3, 10471: 3, 10478: 3, 10487: 3, 10492: 3, 10501: 3, 10506: 3, 10517: 3, 10523: 3, 10524: 3, 10525: 3, 10527: 3, 10530: 3, 10557: 3, 10565: 3, 10570: 3, 10575: 3, 10624: 3, 10629: 3, 10631: 3, 10638: 3, 10668: 3, 10672: 3, 10674: 3, 10678: 3, 10680: 3, 10686: 3, 10692: 3, 10696: 3, 10697: 3, 10704: 3, 10707: 3, 10708: 3, 10719: 3, 10724: 3, 10746: 3, 10762: 3, 10779: 3, 10785: 3, 10788: 3, 10844: 3, 10855: 3, 10871: 3, 10877: 3, 10882: 3, 10891: 3, 10893: 3, 10912: 3, 10934: 3, 10940: 3, 10948: 3, 10949: 3, 10958: 3, 10963: 3,

In [32]:
from collections import Counter
print(unique_labels)
occurrences = Counter(labels_dict.values())
print("Occurrences of each key:")
for key, count in occurrences.items():
    print(f"{unique_labels[key-1]}: {count}")

['ADHD' 'BIPOLAR' 'CONTROL' 'SCHZ']
Occurrences of each key:
CONTROL: 130
SCHZ: 50
BIPOLAR: 49
ADHD: 43


In [5]:
def read_data():
    data = []
    target = []
    for filename in os.listdir(data_directory):
        if filename.endswith('.txt'):
            file_id = int(filename.split('-')[-1].split('.')[0])
            if int(file_id) in labels_dict:
                file_path = os.path.join(data_directory, filename)
                matrix = read_matrix_from_file(file_path)
                data.append(matrix.flatten())
                target.append(labels_dict[file_id])
    return data, target

In [6]:
data, target = read_data()

In [14]:
for idx,d in enumerate(data):
    print(len(d), idx, target[idx])
    if len(d) != 13689:
        del data[idx]
        del target[idx]

13689 0 3
13689 1 3
13689 2 3
13689 3 3
13689 4 2
13689 5 4
13689 6 2
13689 7 3
13689 8 4
13689 9 3
13689 10 3
13689 11 3
13689 12 3
13689 13 1
13689 14 3
13689 15 2
13689 16 3
13689 17 3
13689 18 4
13689 19 4
13689 20 4
13689 21 3
13689 22 3
13689 23 3
13689 24 4
13689 25 4
13689 26 4
13689 27 4
13689 28 4
13689 29 2
13689 30 2
13689 31 3
13689 32 3
13689 33 3
13689 34 3
13689 35 4
13689 36 4
13689 37 4
13689 38 3
13689 39 3
13689 40 1
13689 41 2
13689 42 2
13689 43 4
13689 44 3
13689 45 2
13689 46 3
13689 47 4
13689 48 4
13689 49 4
13689 50 3
13689 51 4
13689 52 3
13689 53 3
13689 54 2
13689 55 2
13689 56 1
13689 57 4
12544 58 4
13689 59 2
13689 60 3
13689 61 4
13689 62 4
12769 63 3
13689 64 4
13689 65 3
13689 66 4
13689 67 3
13689 68 2
13689 69 4
13689 70 4
13689 71 4


In [17]:
print(len(data))
print(len(target))

72
72


In [20]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
print(len(X_train))
print(len(y_train))

57
57


In [21]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)

SVC()

In [22]:
y_pred = svm_model.predict(X_test)

In [24]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
f1 = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5333333333333333
Precision: 0.4791666666666667
Recall: 0.6666666666666666
F1 Score: 0.5339912280701754
Confusion Matrix:
 [[0 1 2]
 [0 7 0]
 [0 4 1]]
