In [2]:
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import dask.dataframe as dd
import dask.array as da
from dask import delayed
from tqdm import tqdm
from dask.diagnostics import ProgressBarc   

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

ImportError: cannot import name 'ProgressBarc' from 'dask.diagnostics' (/home/haroon/miniconda3/envs/tf/lib/python3.11/site-packages/dask/diagnostics/__init__.py)

In [17]:
def convert_parquet_to_npy(file_path, save_path):
    dataframe = pd.read_parquet(file_path)
    data_array = dataframe.to_numpy()
    np.save(save_path, data_array)

def convert_folder_to_npy(folder_path, save_folder):
    os.makedirs(save_folder, exist_ok=True) 

    files_to_convert = [file for file in os.listdir(folder_path) if file.endswith('.parquet')]
    
    for file in tqdm(files_to_convert, desc=f"Converting files in {folder_path} to .npy", unit="file"):
        file_path = os.path.join(folder_path, file)
        save_path = os.path.join(save_folder, f"{os.path.splitext(file)[0]}.npy")
        convert_parquet_to_npy(file_path, save_path)

def pad_or_truncate(data_point, target_shape):
    current_shape = data_point.shape
    if current_shape == target_shape:
        return data_point
    elif current_shape[0] < target_shape[0]:
        # Pad the data point along the first axis
        padding = [(0, target_shape[0] - current_shape[0])] + [(0, 0)] * (len(current_shape) - 1)
        return np.pad(data_point, padding, mode='constant', constant_values=0)
    else:
        # Truncate the data point along the first axis
        return data_point[:target_shape[0], ...]

def chunked_concatenate(file_paths, chunk_size, target_shape):
    chunks = []
    for chunk_start in range(0, len(file_paths), chunk_size):
        chunk_end = min(chunk_start + chunk_size, len(file_paths))
        chunk_data = [np.load(file) for file in file_paths[chunk_start:chunk_end]]
        chunks.append(chunk_data)
    return chunks

def flatten_chunks(chunks, target_shape):
    flattened_data = [pad_or_truncate(data_point, target_shape) for chunk in chunks for data_point in chunk]
    return flattened_data

def load_data(path, num_files=None, chunk_size=100, target_shape = (85199 - 10000 + 1, 20)):
    npy_train_save_folder = os.path.join(path, 'npy_train')
    npy_test_save_folder = os.path.join(path, 'npy_test')

    # Read Numpy files for train data in chunks
    train_eeg_files = [os.path.join(npy_train_save_folder, file) for file in os.listdir(npy_train_save_folder)[:num_files] if file.endswith('.npy')]
    train_eeg_chunks = chunked_concatenate(train_eeg_files, chunk_size, target_shape) if train_eeg_files else None

    # Read Numpy files for test data in chunks
    test_eeg_files = [os.path.join(npy_test_save_folder, file) for file in os.listdir(npy_test_save_folder) if file.endswith('.npy')]
    test_eeg_chunks = chunked_concatenate(test_eeg_files, chunk_size, target_shape) if test_eeg_files else None

    if train_eeg_chunks is not None:
        # Interpolate NaN values for each chunk
        for i, chunk in enumerate(train_eeg_chunks):
            train_eeg_chunks[i] = [np.nan_to_num(data_point, nan=np.nanmean(data_point)) for data_point in chunk]
            print("NaN values in train_eeg chunk {}: {}".format(i, any(np.isnan(data_point).any() for data_point in train_eeg_chunks[i])))

    # Read labels
    train_labels = pd.read_csv('train.csv', nrows=num_files)
    test_labels = pd.read_csv('test.csv')

    # Flatten the chunks to get a list of data points
    flattened_train_eeg = flatten_chunks(train_eeg_chunks, target_shape) if train_eeg_chunks else None
    flattened_test_eeg = flatten_chunks(test_eeg_chunks, target_shape) if test_eeg_chunks else None

    return flattened_train_eeg, flattened_test_eeg, train_labels, test_labels


In [18]:
# run this once
# convert_folder_to_npy('data/train_eegs', 'data/npy_train')
# convert_folder_to_npy('data/test_eegs', 'data/npy_test')

In [19]:
flattened_train_eeg, flattened_test_eeg, train_labels, test_labels = load_data('data', num_files=100, chunk_size=50)

NaN values in train_eeg chunk 0: False
NaN values in train_eeg chunk 1: False


In [20]:
xt = np.array(flattened_train_eeg)