In [1]:
import os

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
labels_file = 'Data/participants.tsv'
data_directory = 'cc_matrices/'

In [3]:
def read_matrix_from_file(file_path):
    with open(file_path, 'r') as file:
        content = file.readlines()
        matrix = []
        for line in content:
            values = line.rstrip().split(',')
            row = []
            for val in values:
                if val.strip():
                    try:
                        row.append(float(val))
                    except ValueError:
                        print(f"Warning: Non-numeric value '{val}' found. Skipping.")
            if row:
                matrix.append(row)
    return np.array(matrix)

In [26]:
participant_data = pd.read_csv(labels_file, sep='\t')
diagnosis_data = participant_data[['participant_id', 'diagnosis']]
unique_labels = ['CONTROL', 'ADHD', 'BIPOLAR', 'SCHZ']

labels_dict = dict(zip(diagnosis_data['participant_id'], diagnosis_data['diagnosis']))
labels_dict = {int(key.replace('sub-', '')): value for key, value in labels_dict.items()}
label_to_number = {label: i for i, label in enumerate(unique_labels)}
numerical_labels_dict = {participant_id: label_to_number[label] for participant_id, label in labels_dict.items()}

print("Unique Labels:", unique_labels)
print("Labels:\n", numerical_labels_dict)

Unique Labels: ['CONTROL', 'ADHD', 'BIPOLAR', 'SCHZ']
Labels:
 {10159: 0, 10171: 0, 10189: 0, 10193: 0, 10206: 0, 10217: 0, 10225: 0, 10227: 0, 10228: 0, 10235: 0, 10249: 0, 10269: 0, 10271: 0, 10273: 0, 10274: 0, 10280: 0, 10290: 0, 10292: 0, 10299: 0, 10304: 0, 10316: 0, 10321: 0, 10325: 0, 10329: 0, 10339: 0, 10340: 0, 10345: 0, 10347: 0, 10356: 0, 10361: 0, 10365: 0, 10376: 0, 10377: 0, 10388: 0, 10428: 0, 10429: 0, 10438: 0, 10440: 0, 10448: 0, 10455: 0, 10460: 0, 10471: 0, 10478: 0, 10487: 0, 10492: 0, 10501: 0, 10506: 0, 10517: 0, 10523: 0, 10524: 0, 10525: 0, 10527: 0, 10530: 0, 10557: 0, 10565: 0, 10570: 0, 10575: 0, 10624: 0, 10629: 0, 10631: 0, 10638: 0, 10668: 0, 10672: 0, 10674: 0, 10678: 0, 10680: 0, 10686: 0, 10692: 0, 10696: 0, 10697: 0, 10704: 0, 10707: 0, 10708: 0, 10719: 0, 10724: 0, 10746: 0, 10762: 0, 10779: 0, 10785: 0, 10788: 0, 10844: 0, 10855: 0, 10871: 0, 10877: 0, 10882: 0, 10891: 0, 10893: 0, 10912: 0, 10934: 0, 10940: 0, 10948: 0, 10949: 0, 10958: 0, 10963:

In [28]:
from collections import Counter

occurrences = Counter(labels_dict.values())
print("Whole dataset")
print(occurrences)

Counter({'CONTROL': 130, 'SCHZ': 50, 'BIPOLAR': 49, 'ADHD': 43})


In [31]:
def read_data():
    data = []
    target = []
    for filename in os.listdir(data_directory):
        if filename.endswith('.txt'):
            file_id = int(filename.split('-')[-1].split('.')[0])
            if int(file_id) in labels_dict:
                file_path = os.path.join(data_directory, filename)
                matrix = read_matrix_from_file(file_path)
                data.append(matrix.flatten())
                label = labels_dict[file_id]
                if label == "CONTROL":
                    target.append(0)
                else:
                    target.append(1)
            else:
                raise Exception(f'{file_id} not in labels_dict')
    return data, target

In [61]:
data, target = read_data()

In [62]:
print("Current dataset")
for idx, d in enumerate(data):
    if len(d) != 13689:
        print(f'Deleted file: size:{len(d)}, idx:{idx}, label:{target[idx]}')
        del data[idx]
        del target[idx]

unique_values = set(target)
for val in unique_values:
    count = target.count(val)
    print(f"Count of {val}: {count}")

Current dataset
Deleted file: size:12544, idx:27, label:0
Deleted file: size:12544, idx:58, label:1
Deleted file: size:12769, idx:63, label:0
Count of 0: 29
Count of 1: 43


In [63]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)
print(len(X_train),len(y_train))
print(len(X_test),len(y_test))

57 57
15 15


In [58]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)

SVC()

In [59]:
y_pred = svm_model.predict(X_test)

In [60]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.5333333333333333
Precision: 0.5333333333333333
Recall: 1.0
F1 Score: 0.6956521739130436
Confusion Matrix:
 [[0 7]
 [0 8]]
