In [17]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def to_categorical(y, num_classes=None):
    """One-hot encode an array of integer class labels.

    Args:
        y: array-like of integer class indices; values are expected to lie
            in ``[0, num_classes - 1]``.
        num_classes: width of the one-hot axis. When falsy (``None`` or 0)
            it is inferred from the data as ``max(y) + 1``.

    Returns:
        A ``float32`` array shaped like `y` plus one trailing axis of length
        `num_classes` (a trailing axis of size 1 in `y` is dropped first),
        holding 1.0 at each label's index and 0.0 everywhere else.
    """
    labels = np.array(y, dtype=np.int32)
    leading_shape = labels.shape
    # A column vector of shape (..., 1) is treated like the same array
    # without that last axis, so (n, 1) input yields (n, num_classes).
    if len(leading_shape) > 1 and leading_shape[-1] == 1:
        leading_shape = tuple(leading_shape[:-1])

    flat_labels = labels.ravel()
    if not num_classes:
        num_classes = np.max(flat_labels) + 1

    num_samples = flat_labels.shape[0]
    one_hot = np.zeros((num_samples, num_classes), dtype=np.float32)
    # Set exactly one entry per row: row i, column flat_labels[i].
    one_hot[np.arange(num_samples), flat_labels] = 1
    return np.reshape(one_hot, leading_shape + (num_classes,))

In [3]:
train_data_file = "../processed_data/train_ids_and_labels.txt"
data_train = pd.read_csv(train_data_file)

# Every column except 'class' is a feature.
X = data_train.drop(columns=['class']).values

# Shift the labels down by one before one-hot encoding; to_categorical
# expects class indices starting at 0 (presumably the file stores 1-based
# class ids -- confirm against the data).
labels_zero_based = data_train['class'].values - 1
y = to_categorical(labels_zero_based)

In [15]:
# Shuffle samples and labels with one shared permutation so rows stay aligned.
# The global NumPy seed is fixed first to make the order reproducible.
np.random.seed(42)
indices_shuffled = np.random.permutation(range(len(X)))
X_shuffled, y_shuffled = X[indices_shuffled], y[indices_shuffled]

In [27]:
# Hold out the last 10% of the shuffled samples for validation.
val_fraction = 0.1
num_val_samples = int(val_fraction * len(y))

# Negative offset of the validation slice, kept under its original name in
# case other cells read it.
val_sample_index = -1 * num_val_samples

# Slice on a non-negative index instead of the negative offset: with fewer
# than 10 samples the offset is -0 == 0, and `[:0]` / `[0:]` would leave the
# training set empty and put every sample into validation.
split_index = len(y) - num_val_samples
X_train, X_val = X_shuffled[:split_index], X_shuffled[split_index:]
y_train, y_val = y_shuffled[:split_index], y_shuffled[split_index:]

In [20]:
save_dir = "../processed_data/word/train"

# Create the output directory (and any missing parents) if it is not there.
os.makedirs(save_dir, exist_ok=True)

batch_size = 128

# Write only the shuffled training split. Saving the full, unshuffled X / y
# here would put the held-out validation rows into the train directory and
# leave extra trailing batch files behind once the smaller training split is
# written to the same place.
num_train_samples = len(X_train)
# Ceiling division: the last batch may hold fewer than batch_size samples.
num_batches = ((num_train_samples - 1) // batch_size) + 1
for i in tqdm(range(num_batches)):
    start_index = i * batch_size
    end_index = min(batch_size * (i + 1), num_train_samples)
    save_path = os.path.join(save_dir, '%d.npz' % i)
    X_batch, y_batch = X_train[start_index:end_index], y_train[start_index:end_index]
    # Each archive stores the batch under the keys 'X' and 'y'.
    np.savez(save_path, X=X_batch, y=y_batch)

100%|██████████████████████████████████████████| 40/40 [00:00<00:00, 85.47it/s]


In [28]:
def save_train_data_to_batches(save_dir, X, y, batch_size=128):
    """Save a training (or validation) set to disk as small ``.npz`` batches.

    Samples are written in order to ``<save_dir>/<i>.npz`` for
    ``i = 0, 1, ...``. Each archive holds the arrays ``X`` and ``y`` for one
    batch; the last batch may contain fewer than `batch_size` samples.
    Files already present under the same names are overwritten; any other
    files in `save_dir` are left untouched.

    Args:
        save_dir: output directory; created (with parents) if missing.
        X: sliceable sequence of samples.
        y: sliceable sequence of labels, aligned with `X` row by row.
        batch_size: maximum number of samples stored per file.

    Raises:
        ValueError: if `X` and `y` do not have the same length.
    """
    num_samples = len(X)
    if len(y) != num_samples:
        raise ValueError(
            "X and y must have the same length, got %d and %d"
            % (num_samples, len(y))
        )

    os.makedirs(save_dir, exist_ok=True)

    # Ceiling division; evaluates to 0 for an empty input, so nothing is written.
    num_batches = ((num_samples - 1) // batch_size) + 1
    for i in tqdm(range(num_batches)):
        start_index = batch_size * i
        end_index = min(batch_size * (i + 1), num_samples)
        save_path = os.path.join(save_dir, '%d.npz' % i)
        # Both slices are bound to local names so the labels written always
        # belong to this batch rather than to a same-named notebook global.
        X_batch = X[start_index:end_index]
        y_batch = y[start_index:end_index]
        np.savez(save_path, X=X_batch, y=y_batch)

In [30]:
# Write the training split to disk in batches of 128 samples.
save_dir = "../processed_data/word/train/"
save_train_data_to_batches(
    save_dir=save_dir,
    X=X_train,
    y=y_train,
    batch_size=128,
)

100%|█████████████████████████████████████████| 40/40 [00:00<00:00, 108.11it/s]


In [31]:
# Load the first saved training batch back from disk (presumably a sanity
# check of the files written above).
file_path = "../processed_data/word/train/0.npz"
# For an .npz archive np.load returns a dict-like NpzFile; the batches above
# were saved under the keys 'X' and 'y'.
temp = np.load(file_path)