In [14]:
import os
import polars as pl

source_directory = 'data'
output_directory = 'out'

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

def clean_csv(file_path):
    data = pl.read_csv(file_path, delimiter=';')
    data.dropna(axis=1, how='all', inplace=True)
    output_file_path = os.path.join(output_directory, os.path.basename(file_path))
    data.to_csv(output_file_path, index=False, sep=';')


for file_name in os.listdir(source_directory):
    if file_name.endswith('.csv'):
        file_path = os.path.join(source_directory, file_name)
        clean_csv(file_path)


In [15]:
import numpy as np
datadir = "data"

X_data = []
Y_data = []
X_val = []
Y_val = []


# control
files = os.listdir(os.path.join(datadir, "ctrl"))
files.sort()
for file in files:
    path = os.path.join(datadir, "ctrl", file)
    df = pl.read_csv(path, has_header=False, columns=[0], separator=";")
    X_data.append(np.reshape(df.to_numpy(), (-1,)))
    Y_data.append(np.array([1.0, 0.0]))

# positive
files = os.listdir(os.path.join(datadir, "pt"))
files.sort()
for file in files:
    path = os.path.join(datadir, "pt", file)
    df = pl.read_csv(path, has_header=False, columns=[0], separator=";")
    X_data.append(np.reshape(df.to_numpy(), (-1,)))
    Y_data.append(np.array([0.0, 1.0]))

# validation
files = os.listdir(os.path.join(datadir, "v"))
files.sort()
for file in files:
    path = os.path.join(datadir, "v", file)
    df = pl.read_csv(path, has_header=False, columns=[0], separator=";")
    X_val.append(np.reshape(df.to_numpy(), (-1,)))
    Y_val.append(np.array([0.0, 1.0]) if 'd' in file else np.array([1.0, 0.0]))

assert len(X_data) == len(Y_data)
assert len(X_val) == len(Y_val)
assert len(X_data[0]) == len(X_val[0])
assert len(Y_data[0]) == len(Y_val[0])

print("Training samples:", len(X_data))
print("Testing samples: ", len(X_val))

Training samples: 57
Testing samples:  6


In [17]:
X_data

[array([-0.000957, -0.000863, -0.000702, ..., -0.00106 , -0.000734,
        -0.000483]),
 array([-1.02e-03, -9.30e-04, -8.28e-04, ...,  9.04e-05,  2.81e-04,
         2.60e-04]),
 array([-0.00125 , -0.0011  , -0.000955, ...,  0.000673,  0.000955,
         0.000893]),
 array([-0.00162, -0.00133, -0.00115, ...,  0.00144,  0.00177,  0.00174]),
 array([-0.0023 , -0.00198, -0.00175, ...,  0.00193,  0.00245,  0.0026 ]),
 array([-0.0021 , -0.00182, -0.00168, ...,  0.00173,  0.00234,  0.00264]),
 array([-0.00132, -0.00164, -0.00188, ..., -0.00091, -0.00114, -0.00121]),
 array([-0.00124 , -0.00161 , -0.00189 , ..., -0.000881, -0.00106 ,
        -0.0011  ]),
 array([-0.000997, -0.00143 , -0.00176 , ..., -0.0005  , -0.000584,
        -0.000623]),
 array([-6.02e-04, -1.10e-03, -1.47e-03, ..., -7.61e-05, -4.88e-05,
        -5.19e-05]),
 array([-0.000211, -0.000739, -0.00109 , ...,  0.000118,  0.00025 ,
         0.000326]),
 array([-5.91e-05, -5.78e-04, -8.89e-04, ...,  1.20e-04,  3.07e-04,
         

In [37]:
def get_average_change(array):
    x = np.linspace(0, 1, len(array)-1)
    arr = (array[1:]-array[:-1]) * x * (1 - x) * 6
    return sum(arr)

In [23]:
def extract_features(arrays):
    features = []
    for array in arrays:
        mean = np.mean(array)
        std = np.std(array)
        min_val = np.min(array)
        max_val = np.max(array)
        median = np.median(array)
        range_val = max_val - min_val
        q1 = np.percentile(array, 25)
        q3 = np.percentile(array, 75)

        half = len(array)//2
        start_median = np.median(array[:half])
        end_median = np.median(array[half:])

        mean_square = np.mean(array ** 2)
        change = get_average_change(array)

        features.append({
            "mean": mean,
            "std": std,
            "min": min_val,
            "max": max_val,
            "median": median,
            "range": range_val,
            "q1": q1,
            "q3": q3,
            "start_median": start_median,
            "end_median": end_median,
            "mean_square": mean_square,
            "avg_change": change,
        })

    features = pl.DataFrame(features)
    return features


In [24]:
feature_set = extract_features(X_data)
feature_set_val = extract_features(X_val)


In [25]:
feature_set

mean,std,min,max,median,range,q1,q3,start_median,end_median,mean_square
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
-0.000001,0.001234,-0.00582,0.00595,0.000005,0.01177,-0.000778,0.00079,0.000011,-0.000003,0.000002
1.2860e-7,0.001142,-0.00438,0.00532,-0.000005,0.0097,-0.000739,0.000728,3.9560e-7,-0.00001,0.000001
5.8977e-7,0.001124,-0.00396,0.00506,-0.000012,0.00902,-0.000728,0.000719,-0.00001,-0.000017,0.000001
0.000001,0.001193,-0.00461,0.00468,-0.000018,0.00929,-0.000766,0.00075,-0.000008,-0.00003,0.000001
8.4025e-7,0.001402,-0.00495,0.00559,-0.00004,0.01054,-0.000893,0.000864,-0.000034,-0.000046,0.000002
…,…,…,…,…,…,…,…,…,…,…
-2.0739e-7,0.00092,-0.00407,0.00557,-0.000045,0.00964,-0.000521,0.000468,-0.000047,-0.000044,8.4592e-7
-3.7262e-7,0.000911,-0.00345,0.00529,-0.000071,0.00874,-0.000551,0.000477,-0.00007,-0.000071,8.2919e-7
7.2132e-7,0.000955,-0.00526,0.00335,0.0000614,0.00861,-0.000505,0.000593,0.00005,0.0000711,9.1269e-7
7.8934e-7,0.000986,-0.00554,0.00318,0.0000791,0.00872,-0.000503,0.000603,0.000083,0.0000739,9.7123e-7


In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(feature_set, Y_data)
y_pred = clf.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, y_pred))
print("Classification Report:\n", classification_report(Y_val, y_pred))


Accuracy: 0.8333333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.67      0.80         3

   micro avg       0.83      0.83      0.83         6
   macro avg       0.88      0.83      0.83         6
weighted avg       0.88      0.83      0.83         6
 samples avg       0.83      0.83      0.83         6



In [27]:
from sklearn.model_selection import GridSearchCV

param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, verbose=1)
grid_search_rf.fit(feature_set, Y_data)
best_rf = grid_search_rf.best_estimator_


Fitting 5 folds for each of 144 candidates, totalling 720 fits


240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mvogt/ufsc/infmed/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mvogt/ufsc/infmed/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/mvogt/ufsc/infmed/.venv/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/mvogt/ufsc/infmed/.venv/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", li

In [28]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(feature_set, Y_data)
knn_pred = knn_model.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, knn_pred))
print("Classification Report:\n", classification_report(Y_val, knn_pred))


Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.60      1.00      0.75         3

   micro avg       0.67      0.67      0.67         6
   macro avg       0.80      0.67      0.62         6
weighted avg       0.80      0.67      0.62         6
 samples avg       0.67      0.67      0.67         6



In [29]:
Y_data = np.argmax(Y_data, axis=1)
Y_val = np.argmax(Y_val, axis=1)


In [31]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', C=1.0)  # Linear kernel
svm_model.fit(feature_set, Y_data)
svm_pred = svm_model.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, svm_pred))
print("Classification Report:\n", classification_report(Y_val, svm_pred))


Accuracy: 0.5
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.50      1.00      0.67         3

    accuracy                           0.50         6
   macro avg       0.25      0.50      0.33         6
weighted avg       0.25      0.50      0.33         6



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

param_dist_svm = {
    'C': uniform(0.1, 10),
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm = RandomizedSearchCV(SVC(), param_distributions=param_dist_svm, n_iter=100, cv=5, verbose=1, random_state=42)
random_search_svm.fit(feature_set, Y_data)
best_svm = random_search_svm.best_estimator_


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [33]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(feature_set, Y_data)
nb_pred = nb_model.predict(feature_set_val)

print("Accuracy:", accuracy_score(Y_val, nb_pred))
print("Classification Report:\n", classification_report(Y_val, nb_pred))


Accuracy: 0.6666666666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.67      0.67      0.67         3

    accuracy                           0.67         6
   macro avg       0.67      0.67      0.67         6
weighted avg       0.67      0.67      0.67         6

