In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis
from sklearn.random_projection import johnson_lindenstrauss_min_dim
from sklearn.random_projection import SparseRandomProjection

In [2]:
# Verificar que parquet funciona
df = pd.read_parquet("features_combined.parquet")
print(f"Parquet cargado: {df.shape}")

Parquet cargado: (4251000, 201)


In [3]:
# pdtrainDF = pd.DataFrame(df)
# pdtrainDF.to_csv('train_df.csv', index=False)

In [4]:
# Ver los primeros registros
print(df.head())

                            id_original  model     domain  sentence_num  \
0  e5e058ce-be2b-459d-af36-32532aaba5ff  human  abstracts             0   
1  e5e058ce-be2b-459d-af36-32532aaba5ff  human  abstracts             1   
2  e5e058ce-be2b-459d-af36-32532aaba5ff  human  abstracts             2   
3  e5e058ce-be2b-459d-af36-32532aaba5ff  human  abstracts             3   
4  e5e058ce-be2b-459d-af36-32532aaba5ff  human  abstracts             4   

                                                text  POS_VERB  POS_NOUN  \
0  The recent advancements in artificial intellig...  0.127660  0.446809   
1  Notwithstanding the successes and future poten...  0.139535  0.255814   
2  Despite these concerns and risks, there are cu...  0.107143  0.321429   
3  To bridge this gap, this paper introduces a ca...  0.156250  0.250000   
4  These guiding principles are named FUTURE-AI a...  0.160000  0.360000   

    POS_ADJ   POS_ADV   POS_DET  ...   RE  ASF  ASM   OM  RCI       DMC   OR  \
0  0.148936 

In [5]:
# Verificar documentos con solo una oración (sentence_num = 0 únicamente)
# Contar oraciones por ID
oraciones_por_id = df.groupby('id_original')['sentence_num'].apply(lambda x: x.tolist())

# Filtrar IDs que solo tienen sentence_num = 0
ids_una_oracion = oraciones_por_id[oraciones_por_id.apply(lambda x: x == [0])].index

print(f"Número de documentos con solo una oración: {len(ids_una_oracion)}")
print(f"Total de documentos: {df['id_original'].nunique()}")
print(f"Porcentaje: {len(ids_una_oracion) / df['id_original'].nunique() * 100:.2f}%")

# Mostrar df con los documentos con solo una oración
df_una_oracion = df[df['id_original'].isin(ids_una_oracion)]
display(df_una_oracion)

Número de documentos con solo una oración: 24608
Total de documentos: 408435
Porcentaje: 6.02%


Unnamed: 0,id_original,model,domain,sentence_num,text,POS_VERB,POS_NOUN,POS_ADJ,POS_ADV,POS_DET,...,RE,ASF,ASM,OM,RCI,DMC,OR,QAS,PA,PR
13589,8098ecbb-b1a5-4c28-8490-823f24e6fd36,mpt,abstracts,0,"There have been significant interest, investme...",0.230769,0.230769,0.153846,0.230769,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
13681,49e7772f-7100-4d77-be0f-64d4f3860697,mpt,abstracts,0,https://arxivnotebook-dataverse3d9m5c01hz4vhwu...,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
13715,8f1824b0-4f23-417a-82b3-2da431cdfd6a,mpt,abstracts,0,"<div align=""center""> <img src=""http://imgur.co...",0.000000,1.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
13773,0a65cc0c-0b77-43ae-8e8f-75321a878a1d,mpt,abstracts,0,http://arxiv​.​org/abs/​1605﻿...c95e9a6f4b582a...,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
13911,2a6d1b68-faf8-4551-a116-23e68dcd41ad,mpt,abstracts,0,A simple but effective visual tracking algorit...,0.140496,0.371901,0.157025,0.033058,0.024793,...,0.000000,0.008264,0.024793,0.008264,0.0,0.016529,0.008264,0.00000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4248178,5ebd895c-a74e-41c9-af53-9de045227d5e,mpt,wiki,0,*{{Infobox railroadstation|image = [[Image;Fil...,0.008929,0.071429,0.000000,0.004464,0.000000,...,0.004464,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.000000
4248203,94c8ec93-71ec-473b-8505-867dda26211b,mpt,wiki,0,A B C D E F G H I J L M N O P R S T U V W X Y ...,0.045113,0.556391,0.015038,0.007519,0.007519,...,0.000000,0.000000,0.000000,0.007519,0.0,0.000000,0.000000,0.00000,0.0,0.000000
4248481,f77a2b9b-6ec6-44bd-81ea-16a6d4d25d2e,mpt,wiki,0,[1] Cornelie or Cornele (also known as Kornél)...,0.146965,0.389776,0.067093,0.031949,0.012780,...,0.003195,0.000000,0.006390,0.009585,0.0,0.003195,0.006390,0.01278,0.0,0.003195
4248680,08e6aa7e-f86a-468b-9124-e21994ab57a9,mpt,wiki,0,This band name has already been used by other ...,0.046296,0.060185,0.023148,0.013889,0.018519,...,0.000000,0.000000,0.004630,0.004630,0.0,0.000000,0.000000,0.00463,0.0,0.000000


In [6]:
# Crear dataframe excluyendo documentos con solo una oración
# Identificar documentos con más de una oración
oraciones_por_doc = df.groupby('id_original')['sentence_num'].max()
ids_multiples_oraciones = oraciones_por_doc[oraciones_por_doc > 0].index

# Crear nuevo dataframe filtrado
df_filtered = df[df['id_original'].isin(ids_multiples_oraciones)].copy()

print("="*60)
print("COMPARACIÓN: DATAFRAME ORIGINAL vs FILTRADO")
print("="*60)
print(f"\nDataframe ORIGINAL:")
print(f"  Total de oraciones: {len(df):,}")
print(f"  Total de documentos: {df['id_original'].nunique():,}")
print(f"  Oraciones por documento (promedio): {len(df) / df['id_original'].nunique():.2f}")

print(f"\nDataframe FILTRADO (sin docs de 1 oración):")
print(f"  Total de oraciones: {len(df_filtered):,}")
print(f"  Total de documentos: {df_filtered['id_original'].nunique():,}")
print(f"  Oraciones por documento (promedio): {len(df_filtered) / df_filtered['id_original'].nunique():.2f}")

print(f"\nDocumentos eliminados: {df['id_original'].nunique() - df_filtered['id_original'].nunique():,}")
print(f"Oraciones eliminadas: {len(df) - len(df_filtered):,}")
print(f"Porcentaje de datos retenidos: {len(df_filtered) / len(df) * 100:.2f}%")

# Verificar distribución por clase
print(f"\nDistribución por clase (FILTRADO):")
print(df_filtered['model'].value_counts())


COMPARACIÓN: DATAFRAME ORIGINAL vs FILTRADO

Dataframe ORIGINAL:
  Total de oraciones: 4,251,000
  Total de documentos: 408,435
  Oraciones por documento (promedio): 10.41

Dataframe FILTRADO (sin docs de 1 oración):
  Total de oraciones: 4,226,384
  Total de documentos: 383,823
  Oraciones por documento (promedio): 11.01

Documentos eliminados: 24,612
Oraciones eliminadas: 24,616
Porcentaje de datos retenidos: 99.42%

Distribución por clase (FILTRADO):
model
gpt2            645296
llama-chat      560308
mistral         496884
mpt             485978
mistral-chat    410953
gpt4            309527
chatgpt         290701
mpt-chat        271297
cohere          242967
cohere-chat     197063
human           161988
gpt3            153422
Name: count, dtype: int64


In [7]:
print("Información del dataset de entrenamiento:")
print(f"Forma del dataset: {df_filtered.shape}")
print(f"Columnas: {list(df.columns)}")
print(f"Modelos unicos: {df['model'].unique()}")
print(f"Dominios unicos: {df['domain'].unique()}")

Información del dataset de entrenamiento:
Forma del dataset: (4226384, 201)
Columnas: ['id_original', 'model', 'domain', 'sentence_num', 'text', 'POS_VERB', 'POS_NOUN', 'POS_ADJ', 'POS_ADV', 'POS_DET', 'POS_INTJ', 'POS_CONJ', 'POS_PART', 'POS_NUM', 'POS_PREP', 'POS_PRO', 'L_REF', 'L_HASHTAG', 'L_MENTION', 'L_RT', 'L_LINKS', 'L_CONT_A', 'L_FUNC_A', 'L_CONT_T', 'L_FUNC_T', 'L_PLURAL_NOUNS', 'L_SINGULAR_NOUNS', 'L_PROPER_NAME', 'L_PERSONAL_NAME', 'L_NOUN_PHRASES', 'L_PUNCT', 'L_PUNCT_DOT', 'L_PUNCT_COM', 'L_PUNCT_SEMC', 'L_PUNCT_COL', 'L_PUNCT_DASH', 'L_POSSESSIVES', 'L_ADJ_POSITIVE', 'L_ADJ_COMPARATIVE', 'L_ADJ_SUPERLATIVE', 'L_ADV_POSITIVE', 'L_ADV_COMPARATIVE', 'L_ADV_SUPERLATIVE', 'PS_CONTRADICTION', 'PS_AGREEMENT', 'PS_EXAMPLES', 'PS_CONSEQUENCE', 'PS_CAUSE', 'PS_LOCATION', 'PS_TIME', 'PS_CONDITION', 'PS_MANNER', 'SY_QUESTION', 'SY_NARRATIVE', 'SY_NEGATIVE_QUESTIONS', 'SY_SPECIAL_QUESTIONS', 'SY_TAG_QUESTIONS', 'SY_GENERAL_QUESTIONS', 'SY_EXCLAMATION', 'SY_IMPERATIVE', 'SY_SUBORD_SEN

In [8]:
# 1. Crear etiqueta binaria: 1 = Humano (model == 'human'), 0 = IA (resto)
print("Distribución de clases:")
print((df_filtered['model'] == 'human').value_counts())
print(f"\nTotal de oraciones: {len(df)}")

Distribución de clases:
model
False    4064396
True      161988
Name: count, dtype: int64

Total de oraciones: 4251000


In [9]:
# 3. Obtener IDs únicos y crear mapping de ID a clase
unique_ids = df_filtered['id_original'].unique()
print(f"\nTotal de documentos únicos: {len(unique_ids)}")

# Mapping de ID a clase (binaria: humano vs IA)
id_to_class = df_filtered.groupby('id_original')['model'].first().to_dict()


Total de documentos únicos: 383823


In [10]:
# Verificar cuántos documentos hay por clase
print("Distribución de documentos por clase:")
class_distribution = pd.Series(id_to_class.values()).value_counts()
print(class_distribution)

# Contar correctamente humanos vs IA
human_count = sum(1 for model in id_to_class.values() if model == 'human')
ai_count = sum(1 for model in id_to_class.values() if model != 'human')

print(f"\nClase IA (no-human): {ai_count} documentos")
print(f"Clase Humano: {human_count} documentos")
print(f"\nTotal de documentos: {human_count + ai_count}")
print(f"Proporción Humano/IA: {human_count / (human_count + ai_count) * 100:.2f}% / {ai_count / (human_count + ai_count) * 100:.2f}%")


Distribución de documentos por clase:
mistral-chat    44736
llama-chat      44692
mpt-chat        44664
mistral         42295
gpt2            42222
mpt             41645
gpt4            23154
chatgpt         23044
cohere          22370
cohere-chat     22029
gpt3            21242
human           11730
Name: count, dtype: int64

Clase IA (no-human): 372093 documentos
Clase Humano: 11730 documentos

Total de documentos: 383823
Proporción Humano/IA: 3.06% / 96.94%


In [11]:
# Balancear dataset manteniendo proporción 5:1 (IA:Humano) máximo
# Balanceo por modelo y dominio, sin fraccionar documentos

print("="*70)
print("BALANCEO DE DATASET (RATIO MÁXIMO 5:1)")
print("="*70)

# 1. Analizar distribución actual
print("\n1. DISTRIBUCIÓN ACTUAL:")
print(f"   Total documentos: {df_filtered['id_original'].nunique()}")
print(f"   Total oraciones: {len(df_filtered)}")

# Contar documentos por modelo
docs_por_modelo = df_filtered.groupby('model')['id_original'].nunique()
print(f"\n   Documentos por modelo:")
for modelo, count in docs_por_modelo.items():
    print(f"   - {modelo}: {count}")

# Contar documentos humanos vs IA
human_docs = df_filtered[df_filtered['model'] == 'human']['id_original'].nunique()
ai_docs = df_filtered[df_filtered['model'] != 'human']['id_original'].nunique()
ratio_actual = ai_docs / human_docs if human_docs > 0 else 0

print(f"\n   Humanos: {human_docs} docs")
print(f"   IA: {ai_docs} docs")
print(f"   Ratio actual (IA/Humano): {ratio_actual:.2f}:1")

# 2. Calcular cuántos documentos IA necesitamos para ratio 5:1
MAX_RATIO = 5
target_ai_docs = min(ai_docs, int(human_docs * MAX_RATIO))

print(f"\n2. OBJETIVO DE BALANCEO:")
print(f"   Mantener {human_docs} docs humanos")
print(f"   Reducir a {target_ai_docs} docs IA (ratio {target_ai_docs/human_docs:.2f}:1)")

# 3. Balancear por modelo y dominio
# Obtener modelos IA únicos
ai_models = df_filtered[df_filtered['model'] != 'human']['model'].unique()
num_ai_models = len(ai_models)

# Distribuir documentos IA equitativamente entre modelos
docs_per_ai_model = target_ai_docs // num_ai_models
remainder = target_ai_docs % num_ai_models

print(f"\n3. ESTRATEGIA DE SAMPLING:")
print(f"   {num_ai_models} modelos IA detectados: {list(ai_models)}")
print(f"   Documentos por modelo IA: ~{docs_per_ai_model}")

# 4. Seleccionar documentos balanceados
selected_doc_ids = []

# Mantener todos los documentos humanos
human_doc_ids = df_filtered[df_filtered['model'] == 'human']['id_original'].unique()
selected_doc_ids.extend(human_doc_ids)

# Para cada modelo IA, seleccionar documentos balanceados por dominio
for i, ai_model in enumerate(ai_models):
    # Obtener documentos de este modelo
    model_df = df_filtered[df_filtered['model'] == ai_model]
    model_doc_ids = model_df['id_original'].unique()
    
    # Calcular cuántos docs seleccionar (distribución equitativa con resto)
    n_docs_to_select = docs_per_ai_model + (1 if i < remainder else 0)
    
    # Obtener dominios disponibles para este modelo
    domains = model_df['domain'].unique()
    
    # Balancear por dominio
    docs_per_domain = n_docs_to_select // len(domains)
    domain_remainder = n_docs_to_select % len(domains)
    
    model_selected = []
    for j, domain in enumerate(domains):
        # Docs de este modelo y dominio
        domain_docs = model_df[model_df['domain'] == domain]['id_original'].unique()
        
        # Cuántos seleccionar de este dominio
        n_select = docs_per_domain + (1 if j < domain_remainder else 0)
        n_select = min(n_select, len(domain_docs))
        
        # Sample aleatorio
        if n_select > 0:
            sampled = pd.Series(domain_docs).sample(n=n_select, random_state=42).tolist()
            model_selected.extend(sampled)
    
    selected_doc_ids.extend(model_selected)
    print(f"   - {ai_model}: {len(model_selected)} docs seleccionados")

# 5. Crear dataframe balanceado
df_balanced = df_filtered[df_filtered['id_original'].isin(selected_doc_ids)].copy()

print(f"\n{'='*70}")
print("RESULTADO DEL BALANCEO:")
print(f"{'='*70}")

# Estadísticas finales
balanced_human_docs = df_balanced[df_balanced['model'] == 'human']['id_original'].nunique()
balanced_ai_docs = df_balanced[df_balanced['model'] != 'human']['id_original'].nunique()
balanced_ratio = balanced_ai_docs / balanced_human_docs if balanced_human_docs > 0 else 0

print(f"\nDocumentos ANTES del balanceo:")
print(f"  Humanos: {human_docs} | IA: {ai_docs} | Ratio: {ratio_actual:.2f}:1")
print(f"\nDocumentos DESPUÉS del balanceo:")
print(f"  Humanos: {balanced_human_docs} | IA: {balanced_ai_docs} | Ratio: {balanced_ratio:.2f}:1")

print(f"\nOraciones ANTES: {len(df_filtered):,}")
print(f"Oraciones DESPUÉS: {len(df_balanced):,}")
print(f"Reducción: {(1 - len(df_balanced)/len(df_filtered))*100:.1f}%")

print(f"\nDistribución por modelo (documentos):")
docs_por_modelo_balanced = df_balanced.groupby('model')['id_original'].nunique()
for modelo, count in docs_por_modelo_balanced.items():
    print(f"  - {modelo}: {count}")

print(f"\nDistribución por dominio (documentos):")
docs_por_dominio_balanced = df_balanced.groupby('domain')['id_original'].nunique()
for dominio, count in docs_por_dominio_balanced.items():
    print(f"  - {dominio}: {count}")

print(f"\n{'='*70}")

# Actualizar df_filtered con el dataset balanceado
df_filtered = df_balanced.copy()
print(f"✓ df_filtered actualizado con dataset balanceado")
print(f"  {len(df_filtered):,} oraciones de {df_filtered['id_original'].nunique()} documentos")
# ocumentos para mantener solo modelos específicos
modelos_deseados = ['human', 'gpt4', 'chatgpt', 'llama-chat', 'mpt', 'mpt-chat']

# Filtrar el dataframe df_filtered para incluir solo los modelos deseados
df_filtered = df_filtered[df_filtered['model'].isin(modelos_deseados)].copy()

# Recrear el mapping id_to_class con los datos filtrados
id_to_class = df_filtered.groupby('id_original')['model'].first().to_dict()

print("="*60)
print("FILTRADO POR MODELOS ESPECÍFICOS")
print("="*60)
print(f"Modelos incluidos: {modelos_deseados}")
print(f"\nDistribución de documentos por modelo:")
model_distribution = pd.Series(id_to_class.values()).value_counts()
print(model_distribution)

# Contar correctamente humanos vs IA
human_count = sum(1 for model in id_to_class.values() if model == 'human')
ai_count = sum(1 for model in id_to_class.values() if model != 'human')

print(f"\n{'='*60}")
print(f"Clase IA (no-human): {ai_count} documentos")
print(f"Clase Humano: {human_count} documentos")
print(f"\nTotal de documentos: {human_count + ai_count}")
print(f"Proporción Humano/IA: {human_count / (human_count + ai_count) * 100:.2f}% / {ai_count / (human_count + ai_count) * 100:.2f}%")
print(f"{'='*60}")

print(f"\nTotal de oraciones en df_filtered: {len(df_filtered):,}")


BALANCEO DE DATASET (RATIO MÁXIMO 5:1)

1. DISTRIBUCIÓN ACTUAL:
   Total documentos: 383823
   Total oraciones: 4226384

   Documentos por modelo:
   - chatgpt: 23044
   - cohere: 22370
   - cohere-chat: 22029
   - gpt2: 42222
   - gpt3: 21242
   - gpt4: 23154
   - human: 11730
   - llama-chat: 44692
   - mistral: 42295
   - mistral-chat: 44736
   - mpt: 41645
   - mpt-chat: 44664

   Humanos: 11730 docs
   IA: 372093 docs
   Ratio actual (IA/Humano): 31.72:1

2. OBJETIVO DE BALANCEO:
   Mantener 11730 docs humanos
   Reducir a 58650 docs IA (ratio 5.00:1)

3. ESTRATEGIA DE SAMPLING:
   11 modelos IA detectados: ['llama-chat', 'mpt', 'mpt-chat', 'gpt2', 'mistral', 'mistral-chat', 'gpt3', 'cohere', 'chatgpt', 'gpt4', 'cohere-chat']
   Documentos por modelo IA: ~5331
   - llama-chat: 5332 docs seleccionados
   - mpt: 5332 docs seleccionados
   - mpt-chat: 4920 docs seleccionados
   - gpt2: 5332 docs seleccionados
   - mistral: 5332 docs seleccionados
   - mistral-chat: 5332 docs seleccio

In [12]:
# # Guardar el dataframe filtrado y balanceado
df_filtered.to_parquet("features_filtered_balanced.parquet", index=False)
print("Dataframe filtrado y balanceado guardado en 'features_filtered_balanced.parquet'")

Dataframe filtrado y balanceado guardado en 'features_filtered_balanced.parquet'
