In [1]:
import warnings
warnings.filterwarnings(action="ignore")
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA

In [2]:
# ---- Input files (one comma-separated sample per line) ----
input_file_train = '../data/v8.2/progress.txt'   # training set
test_file_test = '../data/v8.2/progress2.txt'    # test set

# ---- Tunable settings ----
n_components = 30    # number of principal components kept by the PCA
batch_size = 1000    # number of lines read from disk per batch
padding = 10         # extra zero columns added on top of the measured row length

# ---- Accumulators for the projected batches ----
# Each list collects the per-batch PCA projections of its data set; the
# later cells stack them into a single array.
transformed_train_data = []
transformed_test_data = []

In [3]:
def _max_field_count(path):
    """Return the largest number of comma-separated fields on any line of ``path``.

    Every line is inspected, not just the first one, so the result is a true
    upper bound for the row length. The file is read in binary mode and only
    separators are counted: no value is parsed, so stray characters inside a
    field cannot make the measurement fail. Whitespace-only lines are ignored.
    An empty file yields 0.
    """
    widest = 0
    with open(path, 'rb') as f:
        for raw_line in f:
            if raw_line.strip():
                widest = max(widest, raw_line.count(b',') + 1)
    return widest


# Common row width for both data sets: the longest row found in either file
# plus `padding` extra columns. Shorter rows are left-padded with zeros up to
# this width before being handed to the PCA.
max_len = max(_max_field_count(input_file_train), _max_field_count(test_file_test)) + padding

In [4]:
# Display the padded row width (longest measured row + `padding`) computed in the previous cell.
max_len

229899

In [5]:
def _iter_padded_batches(path, rows_per_batch, width):
    """Yield batches of rows from a comma-separated text file as 2-D float arrays.

    Each line of ``path`` is one sample. The character ``'\\x13'`` is removed
    from the line before parsing, whitespace-only lines are skipped, and every
    row is left-padded with zeros to exactly ``width`` columns.

    Rows are written directly into a preallocated array, so only one row at a
    time exists as a Python list (instead of a whole batch of padded lists).

    Parameters
    ----------
    path : str
        File to read (decoded as UTF-8).
    rows_per_batch : int
        Maximum number of rows per yielded array.
    width : int
        Number of columns of every yielded array.

    Yields
    ------
    numpy.ndarray
        Array of shape ``(k, width)`` with ``1 <= k <= rows_per_batch``; only
        the last batch can have fewer than ``rows_per_batch`` rows.

    Raises
    ------
    ValueError
        If a field cannot be parsed as a float, or if a row has more than
        ``width`` fields (it could not be padded to the common width).
    """
    batch = np.zeros((rows_per_batch, width))
    filled = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.replace('\x13', '').strip()  # Remove specific problematic character
            if not line:
                continue
            values = [float(x) for x in line.split(',')]
            if len(values) > width:
                raise ValueError(
                    f"{path}: line {line_no} has {len(values)} fields, "
                    f"more than the padded width {width}"
                )
            # Left padding: the zeros stay in front, the values fill the tail.
            batch[filled, width - len(values):] = values
            filled += 1
            if filled == rows_per_batch:
                yield batch
                # Fresh buffer: the consumer may still hold the one just yielded.
                batch = np.zeros((rows_per_batch, width))
                filled = 0
    if filled:
        yield batch[:filled]


# Initialize IPCA
ipca = IncrementalPCA(n_components=n_components)

# Pass 1: fit incrementally. One batch is held back so that a final batch with
# fewer rows than `n_components` is merged into the previous batch instead of
# being fitted on its own (some scikit-learn releases reject such a batch in
# partial_fit).
pending = None
for batch in _iter_padded_batches(input_file_train, batch_size, max_len):
    if pending is None:
        pending = batch
    elif batch.shape[0] < n_components:
        pending = np.vstack([pending, batch])
    else:
        ipca.partial_fit(pending)
        pending = batch

if pending is None:
    raise ValueError(f"No data rows found in {input_file_train!r}")
ipca.partial_fit(pending)
del pending  # release the last batch before the second pass

# Pass 2: project the training set with the fitted model, batch by batch.
# A local list is used so that re-running this cell does not depend on
# `transformed_train_data` still being an empty list.
train_chunks = [
    ipca.transform(batch)
    for batch in _iter_padded_batches(input_file_train, batch_size, max_len)
]

# Convert list of arrays to a single numpy array
transformed_train_data = np.vstack(train_chunks)

In [6]:
# Write the PCA-projected training set to disk as comma-separated text;
# the file name records how many components were kept.
train_output_path = f'../data/v8.2/transformed_train_data_comp_{n_components}.csv'
np.savetxt(train_output_path, transformed_train_data, delimiter=',')

In [7]:
# Project the test set with the previously fitted PCA, reading it in batches of
# `batch_size` lines. The file is read exactly like the training file (UTF-8,
# '\x13' removed) so both sets go through the same preprocessing.
#
# A local list collects the projected batches, so re-running this cell does not
# depend on `transformed_test_data` still being an empty list.
test_chunks = []

# Reusable batch buffer: every row is fully overwritten (zeros on the left,
# values on the right) before it is used, so no stale data can leak between
# batches, and only one row at a time exists as a Python list.
batch = np.zeros((batch_size, max_len))
filled = 0

with open(test_file_test, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        line = line.replace('\x13', '').strip()  # Remove specific problematic character
        if not line:
            continue  # skip blank lines instead of failing on float('')
        values = [float(x) for x in line.split(',')]
        if len(values) > max_len:
            raise ValueError(
                f"{test_file_test}: line {line_no} has {len(values)} fields, "
                f"more than the padded width {max_len}"
            )
        # Pad the row with zeros on the left to match the common width
        split_at = max_len - len(values)
        batch[filled, :split_at] = 0.0
        batch[filled, split_at:] = values
        filled += 1

        if filled == batch_size:
            # Apply the transformation with the previously fitted PCA
            test_chunks.append(ipca.transform(batch))
            filled = 0

# Last, possibly partial, batch
if filled:
    test_chunks.append(ipca.transform(batch[:filled]))

if not test_chunks:
    raise ValueError(f"No data rows found in {test_file_test!r}")

# Convert the list of arrays to a single numpy array
transformed_test_data = np.vstack(test_chunks)

# Save the transformed data
np.savetxt(f'../data/v8.2/transformed_test_data_comp_{n_components}.csv', transformed_test_data, delimiter=',')