# Reducción de dimensionalidad de los datos

## Librerías utilizadas

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection


## Lectura del dataframe

In [3]:
# Carga de dataframe de parquet
df_filtered = pd.read_parquet("../../dataset/features_filtered_balanced.parquet")
print(f"Parquet cargado: {df_filtered.shape}")

Parquet cargado: (465697, 201)


In [4]:
metadata_cols = ['id_original', 'text', 'sentence_num', 'model', 'domain']
feature_columns = [col for col in df_filtered.columns if col not in metadata_cols]

print(f"\nTotal de features: {len(feature_columns)}")
print(f"Primeros 10 features: {feature_columns[:10]}")


Total de features: 196
Primeros 10 features: ['POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP']


In [5]:
# 3. Obtener IDs únicos y crear mapping de ID a clase
unique_ids = df_filtered['id_original'].unique()
print(f"\nTotal de documentos únicos: {len(unique_ids)}")

# Mapping de ID a clase (binaria: humano vs IA)
id_to_class = df_filtered.groupby('id_original')['model'].first().to_dict()


Total de documentos únicos: 37977


In [6]:
# 4. Split de IDs (no de oraciones) - IMPORTANTE para evitar data leakage
ids_list = list(id_to_class.keys())
labels_list = [id_to_class[id_] for id_ in ids_list]

# Verificar si podemos usar stratify
class_counts = pd.Series(labels_list).value_counts()
can_stratify = class_counts.min() >= 2

if can_stratify:
    print("✓ Usando stratified split (mantiene proporción de clases)")
    train_ids, test_ids = train_test_split(
        ids_list, 
        test_size=0.2, 
        random_state=42, 
        stratify=labels_list
    )
else:
    print(f"⚠️ No se puede usar stratify (clase mínima: {class_counts.min()} documentos)")
    print("Usando split aleatorio sin stratify")
    train_ids, test_ids = train_test_split(
        ids_list, 
        test_size=0.2, 
        random_state=42, 
        stratify=None  # Sin estratificación
    )

print(f"\nDocumentos en train: {len(train_ids)}")
print(f"Documentos en test: {len(test_ids)}")

✓ Usando stratified split (mantiene proporción de clases)

Documentos en train: 30381
Documentos en test: 7596

Documentos en train: 30381
Documentos en test: 7596


In [7]:
# 5. Filtrar oraciones según los IDs
# train_sentences = train_df[train_df['id'].isin(train_ids)]
train_sentences = df_filtered[df_filtered['id_original'].isin(train_ids)]
test_sentences = df_filtered[df_filtered['id_original'].isin(test_ids)]

print(f"\nOraciones en train: {len(train_sentences)}")
print(f"Oraciones en test: {len(test_sentences)}")

# Verificar distribución en cada conjunto
print(f"\nDistribución en train:")
# print(train_sentences['is_human'].value_counts())
print(train_sentences['model'].value_counts())
print(f"\nDistribución en test:")
# print(test_sentences['is_human'].value_counts())
print(test_sentences['model'].value_counts())


Oraciones en train: 373687
Oraciones en test: 92010

Distribución en train:
model
human         130439
gpt4           57716
chatgpt        55447
llama-chat     54041
mpt            51677
mpt-chat       24367
Name: count, dtype: int64

Distribución en test:
model
human         31549
gpt4          14601
chatgpt       13648
llama-chat    13204
mpt           13057
mpt-chat       5951
Name: count, dtype: int64


In [8]:
# 2. Definir columnas de features (excluir metadatos y target)
# Excluir: id, sentence_num, model, domain
metadata_cols = ['id_original', 'text', 'sentence_num', 'model', 'domain']
feature_columns = [col for col in df_filtered.columns if col not in metadata_cols]

print(f"\nTotal de features: {len(feature_columns)}")
print(f"Primeros 10 features: {feature_columns[:10]}")


Total de features: 196
Primeros 10 features: ['POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP']


In [9]:
# 6. Preparar X e y
X_train = train_sentences[feature_columns].values
y_train = train_sentences['model'].apply(lambda x: 1 if x == 'human' else 0).values
X_test = test_sentences[feature_columns].values
y_test = test_sentences['model'].apply(lambda x: 1 if x == 'human' else 0).values

print(f"\n{'='*60}")
print(f"Shape de X_train: {X_train.shape}")
print(f"Shape de y_train: {y_train.shape}")
print(f"Shape de X_test: {X_test.shape}")
print(f"Shape de y_test: {y_test.shape}")
print(f"{'='*60}")


Shape de X_train: (373687, 196)
Shape de y_train: (373687,)
Shape de X_test: (92010, 196)
Shape de y_test: (92010,)


In [10]:
# Verificar si hay valores NaN en los datos
print("Verificando valores NaN en el dataset...")
# print(f"\nNaNs en train_df: {train_df.isna().sum().sum()}")
print(f"\nNaNs en train_df: {train_sentences.isna().sum().sum()}")
print(f"NaNs en feature_columns:")
nan_features = train_sentences[feature_columns].isna().sum()
nan_features_with_nans = nan_features[nan_features > 0]
if len(nan_features_with_nans) > 0:
    print(nan_features_with_nans)
else:
    print("No hay NaNs en las features ✓")

print(f"\nNaNs en X_train: {np.isnan(X_train).sum()}")
print(f"NaNs en X_test: {np.isnan(X_test).sum()}")
print(f"NaNs en y_train: {np.isnan(y_train).sum()}")
print(f"NaNs en y_test: {np.isnan(y_test).sum()}")

Verificando valores NaN en el dataset...

NaNs en train_df: 932
NaNs en feature_columns:

NaNs en train_df: 932
NaNs en feature_columns:
POS_VERB    1
POS_NOUN    1
POS_ADJ     1
POS_ADV     1
POS_DET     1
           ..
DMC         1
OR          1
QAS         1
PA          1
PR          1
Length: 196, dtype: int64
POS_VERB    1
POS_NOUN    1
POS_ADJ     1
POS_ADV     1
POS_DET     1
           ..
DMC         1
OR          1
QAS         1
PA          1
PR          1
Length: 196, dtype: int64

NaNs en X_train: 932
NaNs en X_test: 171
NaNs en y_train: 0
NaNs en y_test: 0

NaNs en X_train: 932
NaNs en X_test: 171
NaNs en y_train: 0
NaNs en y_test: 0


In [11]:
# Solución: Imputar valores NaN antes de entrenar
from sklearn.impute import SimpleImputer

# Opción 1: Imputar con la media de cada feature
imputer = SimpleImputer(strategy='mean')

# Ajustar el imputer con los datos de entrenamiento
X_train_clean = imputer.fit_transform(X_train)
# Transformar los datos de test con el mismo imputer
X_test_clean = imputer.transform(X_test)

print("✓ Valores NaN imputados con la media de cada feature")
print(f"NaNs en X_train_clean: {np.isnan(X_train_clean).sum()}")
print(f"NaNs en X_test_clean: {np.isnan(X_test_clean).sum()}")

✓ Valores NaN imputados con la media de cada feature
NaNs en X_train_clean: 0
NaNs en X_test_clean: 0
NaNs en X_train_clean: 0
NaNs en X_test_clean: 0


## Reducción de dimensionalidad

### PCA

In [12]:
pca = PCA().fit(X_train_clean)

explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)
num_components = np.argmax(cumulative_variance >= 0.95) + 1

print(f"Número de componentes para explicar el 95% de la varianza: {num_components}")

Número de componentes para explicar el 95% de la varianza: 19


In [13]:
pca = PCA(n_components=19)
X_train_pca = pca.fit_transform(X_train_clean)
X_test_pca = pca.transform(X_test_clean)

### LDA

In [14]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Aplicar LDA para reducir la dimensionalidad
lda = LinearDiscriminantAnalysis(n_components=1)  # La cantidad de componentes debe ser <= número de clases - 1
x_train_lda = lda.fit_transform(X_train_clean, y_train)
x_test_lda = lda.transform(X_test_clean)

### FA

In [15]:
fa = FactorAnalysis().fit(X_train_clean)

singular_values = fa.components_
explained_variance = np.var(singular_values, axis=1) / np.var(X_train_clean, axis=0).sum()

# Calcula la varianza explicada acumulativa
cumulative_explained_variance = np.cumsum(explained_variance)

# Determina el número de factores necesarios para explicar al menos el 95% de la varianza
num_factors = np.argmax(cumulative_explained_variance >= 0.95) + 1

print(f"Number of factors to retain: {num_factors}")

Number of factors to retain: 1


In [16]:
fa = FactorAnalysis(n_components=1)
x_train_fa = fa.fit_transform(X_train_clean)
x_test_fa = fa.transform(X_test_clean)

### SVD

In [17]:
svd = TruncatedSVD(n_components=min(X_train_clean.shape) - 1)
svd.fit(X_train_clean)

# Calcula la varianza explicada por los componentes
explained_variance = svd.explained_variance_ratio_

# Calcula la varianza explicada acumulativa
cumulative_explained_variance = np.cumsum(explained_variance)

# Determina el número de componentes necesarios para explicar al menos el 95% de la varianza
num_components = np.argmax(cumulative_explained_variance >= 0.95) + 1

print(f"Number of components to retain: {num_components}")

Number of components to retain: 20


In [18]:
svd = TruncatedSVD(n_components=20)
x_train_svd = svd.fit_transform(X_train_clean)
x_test_svd = svd.transform(X_test_clean)

### JL

In [19]:
epsilon = 0.9

# Calcula el número mínimo de componentes necesarios usando la Proyección de Johnson-Lindenstrauss
n_samples = X_train_clean.shape[0]
n_components = johnson_lindenstrauss_min_dim(n_samples, eps=epsilon)
print(f"Number of components to retain using JL: {n_components}")

Number of components to retain using JL: 316


In [20]:
jl = SparseRandomProjection(n_components=224)
x_train_jl = jl.fit_transform(X_train_clean)
x_test_jl = jl.transform(X_test_clean)



## Guardar datos transformados

In [21]:
import os

# Crear directorio para los datos transformados si no existe
output_dir = "."
os.makedirs(output_dir, exist_ok=True)

# Diccionario con los datos a guardar
reduction_methods = {
    'pca': (X_train_pca, X_test_pca),
    'lda': (x_train_lda, x_test_lda),
    'fa': (x_train_fa, x_test_fa),
    'svd': (x_train_svd, x_test_svd),
    'jl': (x_train_jl, x_test_jl)
}

# Guardar cada método en archivos NPZ (comprimido)
for method_name, (X_train_transformed, X_test_transformed) in reduction_methods.items():
    if X_test_transformed is not None:
        # Guardar train y test juntos
        np.savez_compressed(
            os.path.join(output_dir, f'{method_name}_reduced.npz'),
            X_train=X_train_transformed,
            y_train=y_train,
            X_test=X_test_transformed,
            y_test=y_test
        )
        print(f"✓ Guardado {method_name}_reduced.npz (train: {X_train_transformed.shape}, test: {X_test_transformed.shape})")
    else:
        # Solo train (caso t-SNE)
        np.savez_compressed(
            os.path.join(output_dir, f'{method_name}_reduced.npz'),
            X_train=X_train_transformed,
            y_train=y_train
        )
        print(f"✓ Guardado {method_name}_reduced.npz (solo train: {X_train_transformed.shape})")

print(f"\n{'='*70}")
print("Todos los archivos guardados exitosamente en el directorio actual")
print(f"{'='*70}")

✓ Guardado pca_reduced.npz (train: (373687, 19), test: (92010, 19))
✓ Guardado lda_reduced.npz (train: (373687, 1), test: (92010, 1))
✓ Guardado lda_reduced.npz (train: (373687, 1), test: (92010, 1))
✓ Guardado fa_reduced.npz (train: (373687, 1), test: (92010, 1))
✓ Guardado fa_reduced.npz (train: (373687, 1), test: (92010, 1))
✓ Guardado svd_reduced.npz (train: (373687, 20), test: (92010, 20))
✓ Guardado svd_reduced.npz (train: (373687, 20), test: (92010, 20))
✓ Guardado jl_reduced.npz (train: (373687, 224), test: (92010, 224))

Todos los archivos guardados exitosamente en el directorio actual
✓ Guardado jl_reduced.npz (train: (373687, 224), test: (92010, 224))

Todos los archivos guardados exitosamente en el directorio actual


### Ejemplo de carga de datos

Para usar estos archivos en otros notebooks:

In [22]:
# Ejemplo de cómo cargar los datos en otro archivo:
# 
# import numpy as np
# 
# # Cargar datos de PCA
# data = np.load('pca_reduced.npz')
# X_train_pca = data['X_train']
# y_train = data['y_train']
# X_test_pca = data['X_test']
# y_test = data['y_test']
# 
# # Para t-SNE (solo tiene train)
# data_tsne = np.load('tsne_reduced.npz')
# X_train_tsne = data_tsne['X_train']
# y_train = data_tsne['y_train']