# Transformação do dataset

Neste _notebook_ é realizada a transformação do _dataset_ EEG de `.txt` para um arquivo `.npy` (_numpy array_).

In [6]:
import os
import numpy as np

### Métodos para filtragem dos dados

Na seguinte célula são definidas algumas funções para automatizar a remoção dos trechos ruidosos dos dados.

In [17]:
def remove_times_ranges(data_array, times_ranges, sampling_rate=250):
    """Return a copy of ``data_array`` without the samples in ``times_ranges``.

    Args:
        data_array: Sequence or NumPy array of samples; time runs along the
            first axis (one entry/row per sample).
        times_ranges: Iterable of ``(start, end)`` tuples, in seconds. Each
            range removes the samples from ``start`` (inclusive) up to
            ``end`` (exclusive). Ranges may be unsorted or overlapping.
        sampling_rate: Samples per second used to convert seconds into
            indices. Defaults to 250, the factor that used to be hard-coded.

    Returns:
        A new NumPy array holding only the samples outside every range. The
        input is never modified.
    """
    data = np.asarray(data_array)
    kept_segments = []
    previous_end_index = 0

    # Sorting lets a single forward pass handle unsorted/overlapping ranges.
    for start, end in sorted(times_ranges):
        start_index = int(start * sampling_rate)
        end_index = int(end * sampling_rate)

        # Keep whatever lies between the previous removed range and this one.
        if start_index > previous_end_index:
            kept_segments.append(data[previous_end_index:start_index])

        # max() guards against a range fully contained in an earlier one.
        previous_end_index = max(previous_end_index, end_index)

    # Keep the tail that follows the last removed range.
    kept_segments.append(data[previous_end_index:])
    return np.concatenate(kept_segments)

def convert_minutes_to_seconds(time):
    """Convert a ``"min:sec"`` string into a total number of seconds.

    Args:
        time: Text with exactly two integer fields separated by ``":"``,
            e.g. ``"2:50"``.

    Returns:
        The equivalent duration in whole seconds.

    Raises:
        ValueError: If ``time`` does not split into exactly two integer
            fields.
    """
    minute_part, second_part = time.split(":")
    return int(minute_part) * 60 + int(second_part)

def create_tuples_of_times_range_in_seconds(times_minutes):
    """Convert ``("min:sec", "min:sec")`` pairs into ``(start, end)`` seconds.

    Args:
        times_minutes: Iterable of ``(start, end)`` pairs, each element being
            a ``"min:sec"`` string accepted by ``convert_minutes_to_seconds``.

    Returns:
        A list of ``(start_seconds, end_seconds)`` tuples, in input order.
    """
    return [
        (convert_minutes_to_seconds(begin), convert_minutes_to_seconds(finish))
        for begin, finish in times_minutes
    ]

### Análise e filtragem dos dados

Nesta célula são definidos os trechos dos dados a serem removidos de cada arquivo e são removidos efetivamente através das funções definidas acima.

In [18]:
# Cell purpose: load every OpenBCI ``.txt`` recording found in ``directory``,
# run the first loaded recording through remove_times_ranges with the time
# ranges listed below, and save the result as ``data.npy``.
directory = '../dataset-s6/'

# Get list of files in directory
# NOTE(review): os.listdir returns entries in arbitrary order, so the position
# of a given recording inside ``data_array`` is not guaranteed — confirm that
# index 0 really is the 90 MB file.
files = os.listdir(directory)

# Filter files that are txt and start with 'OpenBCI...'
dataset_files = [file for file in files if file.endswith('.txt') and file.startswith('OpenBCI')]

# Load data from each file into an array
# Each file is parsed as comma-separated floats, skipping the first 5 lines
# and reading columns 1 through 8 (presumably the header and the 8 EEG
# channels — TODO confirm against the OpenBCI file format).
data_array = []
for file in dataset_files:
    file_path = os.path.join(directory, file)
    data = np.loadtxt(file_path, dtype=float, delimiter=',', skiprows=5, usecols=range(1, 9))
    data_array.append(data)


# Cutting out the noisy data segments

# File 1 - 90 MB # data_array[i]
FILE_1_90MB = 0
# (start, end) pairs as "min:sec" strings marking the segments to remove.
times_ranges_in_minutes = [
    ("0:00", "0:03"),
    ("0:07", "0:10"),
    ("2:50", "2:53"),
    ("4:14", "4:16"),
    ("4:35", "4:38"),
    ("8:01", "8:03"),
    ("9:33", "9:36"),
    ("15:54", "15:56"),
    ("16:59", "17:01"),
    ("19:19", "19:22"),
    ("19:39", "19:49")
]

# NOTE(review): the empty-list initialization is immediately overwritten by
# the next line and has no effect.
times_ranges_in_seconds = []
times_ranges_in_seconds = create_tuples_of_times_range_in_seconds(times_ranges_in_minutes)

# NOTE(review): same here — the empty list is overwritten right away.
data_array_filtered = []
data_array_filtered = remove_times_ranges(data_array[FILE_1_90MB], times_ranges_in_seconds)
# Persist the result next to the raw recordings.
np.save('../dataset-s6/data.npy', data_array_filtered)


# File 2 - 14,5 MB
# FILE_2_14MB = 1
# times_ranges_in_minutes = [
#     ("0:00", "0:08"),
#     ("0:58", "1:07"),
#     ("2:59", "3:08")
# ]


# File 3 - 36,4 MB
# FILE_3_36MB = 2
# times_ranges_in_minutes = [
#     ("0:00", "0:08"),
#     ("1:58", "2:02"),
#     ("7:37", "7:39")
# ]





# TODO: concatenar todos os arquivos .txt do dataset em um único .npy array
# data_final = np.concatenate((data1[500:,:], data2[500:-1750,:], data3[500:,:]), axis=0)

# np.save('../dataset-s6/data.npy', data_final)