In [1]:
import warnings
warnings.filterwarnings(action="ignore")
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA

In [2]:
# Run configuration for the incremental PCA
n_components = 2                        # principal components to keep
batch_size = 1_000                      # samples loaded per batch
input_file = '../data/v6/progress.txt'  # path to the (large) input file

In [3]:
def _iter_batches(path, rows_per_batch):
    """Yield the rows of a comma-separated text file as 2-D float arrays.

    Args:
        path: Path of the text file. Every non-blank line is split on ','
            and each field is parsed with ``float``.
        rows_per_batch: Maximum number of rows in each yielded array.

    Yields:
        numpy.ndarray holding up to ``rows_per_batch`` rows; only the final
        array can be shorter.

    Raises:
        ValueError: If a field cannot be parsed by ``float``.
    """
    rows = []
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a stray trailing newline) holds no sample
                # and would otherwise make float('') raise.
                continue
            rows.append([float(x) for x in line.split(',')])
            if len(rows) == rows_per_batch:
                yield np.array(rows)
                rows = []
    if rows:
        yield np.array(rows)


# Initialize IPCA
ipca = IncrementalPCA(n_components=n_components)

# First pass: fit incrementally, one batch at a time.
# Fitting runs one batch behind the reader so that a trailing batch with fewer
# rows than n_components can be merged into the previous batch; some
# scikit-learn versions reject such a short batch in partial_fit.
held_batch = None
for batch in _iter_batches(input_file, batch_size):
    if held_batch is None:
        held_batch = batch
    elif len(batch) < n_components:
        held_batch = np.vstack([held_batch, batch])
    else:
        ipca.partial_fit(held_batch)
        held_batch = batch

if held_batch is None:
    # Fail early with a clear message instead of an obscure np.vstack error.
    raise ValueError(f"No data rows found in {input_file!r}")
ipca.partial_fit(held_batch)

# Second pass: project the whole file with the fitted model, batch by batch,
# and stack the per-batch results into a single array.
transformed_data = np.vstack(
    [ipca.transform(batch) for batch in _iter_batches(input_file, batch_size)]
)

In [4]:
# Write the projected training data to a comma-delimited text file
np.savetxt(
    fname=f'../data/v6/transformed_train_data_comp_{n_components}.csv',
    X=transformed_data,
    delimiter=',',
)

In [5]:
# Test-set parameters
batch_size = 1_000                      # batch size, the same one used before
test_file = '../data/v6/progress2.txt'  # path to the test file

# Accumulates the test batches once they are transformed with the
# previously fitted PCA
transformed_test_data = list()

In [6]:
# Transform the test file with the previously fitted PCA, batch by batch.
# The per-batch results go into a list created in this cell, so the cell can be
# re-run on its own: appending to `transformed_test_data` directly would fail on
# a second run because that name is rebound to an ndarray below.
test_batches = []

with open(test_file, 'r') as f:
    rows = []
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. a stray trailing newline) holds no sample
            # and would otherwise make float('') raise.
            continue
        rows.append([float(x) for x in line.split(',')])
        if len(rows) == batch_size:
            # A full batch of batch_size rows is ready: project it
            test_batches.append(ipca.transform(np.array(rows)))
            rows = []
    if rows:
        # Remaining rows that did not fill a whole batch
        test_batches.append(ipca.transform(np.array(rows)))

if not test_batches:
    # Fail with a clear message instead of an obscure np.vstack error.
    raise ValueError(f"No data rows found in {test_file!r}")

# Convert the list of arrays into a single numpy array
transformed_test_data = np.vstack(test_batches)

# Save the transformed data
np.savetxt(f'../data/v6/transformed_test_data_comp_{n_components}.csv', transformed_test_data, delimiter=',')