In [13]:
"""
NOTEBOOK 02.5: TRADUCCIÓN DE NOMBRES DE COLUMNAS A ESPAÑOL
===========================================================

Objetivo: Renombrar columnas a español para análisis más profesional
"""

import pandas as pd

# Title banner for the notebook output.
print("=" * 80, "üîÑ TRADUCCI√ìN DE NOMBRES DE COLUMNAS A ESPA√ëOL", "=" * 80, sep="\n")

# ================================================================
# STEP 1: Load the dataset and show its current column names
# ================================================================

df = pd.read_csv('PCOS_data_winsorized.csv')

# Normalise the headers in a single chained pass: trim leading/trailing
# whitespace, then collapse every internal whitespace run to one space.
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

print("\nüìä Dataset original:")
print(f"   Filas: {df.shape[0]}")
print(f"   Columnas: {df.shape[1]}")

# Numbered listing (1-based) of the normalised column names.
print("\nüìù COLUMNAS ACTUALES (INGL√âS):")
for position, column_name in enumerate(df.columns, start=1):
    print(f"   {position:2}. '{column_name}'")

# ================================================================
# PASO 2: Diccionario de traducci√≥n
# ================================================================

print("\n" + "="*80)
print("üîÑ APLICANDO TRADUCCI√ìN")
print("="*80)

# Mapping from the original (English) column names to their Spanish
# equivalents, grouped by clinical category.
#
# Keys must be written in normalised form (no leading/trailing whitespace,
# single internal spaces): the column headers are stripped and collapsed
# right after the CSV is loaded, so a key carrying padding could never
# match and would only be dead weight in the mapping.
translation = {
    # Target variable
    'PCOS (Y/N)': 'SOP (S/N)',

    # Demographics
    'Age (yrs)': 'Edad (a√±os)',
    'Weight (Kg)': 'Peso (Kg)',
    'Height(Cm)': 'Altura (cm)',
    'BMI': 'IMC',
    'Blood Group': 'Grupo Sangu√≠neo',
    'Marraige Status (Yrs)': 'A√±os Casada',  # key spelling follows the dataset header

    # Vital signs
    'Pulse rate(bpm)': 'Frecuencia Cardiaca (lpm)',
    'RR (breaths/min)': 'Frecuencia Respiratoria (rpm)',
    'Hb(g/dl)': 'Hemoglobina (g/dl)',
    'BP _Systolic (mmHg)': 'Presi√≥n Sist√≥lica (mmHg)',
    'BP _Diastolic (mmHg)': 'Presi√≥n Diast√≥lica (mmHg)',

    # Menstrual cycle
    'Cycle(R/I)': 'Ciclo (R/I)',
    'Cycle length(days)': 'Duraci√≥n Ciclo (d√≠as)',
    'Pregnant(Y/N)': 'Embarazada (S/N)',
    'No. of abortions': 'N√∫mero Abortos',

    # Hormones
    'FSH(mIU/mL)': 'FSH (mUI/mL)',
    'LH(mIU/mL)': 'LH (mUI/mL)',
    'FSH/LH': 'Ratio FSH/LH',
    'TSH (mIU/L)': 'TSH (mUI/L)',
    'AMH(ng/mL)': 'AMH (ng/mL)',
    'PRL(ng/mL)': 'Prolactina (ng/mL)',
    'PRG(ng/mL)': 'Progesterona (ng/mL)',
    'II beta-HCG(mIU/mL)': 'beta-HCG II (mUI/mL)',
    'I beta-HCG(mIU/mL)': 'beta-HCG I (mUI/mL)',

    # Anthropometry
    'Waist(inch)': 'Cintura (pulg)',
    'Hip(inch)': 'Cadera (pulg)',
    'Waist:Hip Ratio': 'Ratio Cintura-Cadera',

    # Symptoms
    'Weight gain(Y/N)': 'Aumento Peso (S/N)',
    'hair growth(Y/N)': 'Crecimiento Vello (S/N)',
    'Skin darkening (Y/N)': 'Oscurecimiento Piel (S/N)',
    'Hair loss(Y/N)': 'P√©rdida Cabello (S/N)',
    'Pimples(Y/N)': 'Acn√© (S/N)',

    # Lifestyle
    'Fast food (Y/N)': 'Comida R√°pida (S/N)',
    'Reg.Exercise(Y/N)': 'Ejercicio Regular (S/N)',

    # Laboratory
    'RBS(mg/dl)': 'Glucosa (mg/dl)',
    'Vit D3 (ng/mL)': 'Vitamina D3 (ng/mL)',

    # Ultrasound
    'Follicle No. (L)': 'Num Fol√≠culos (I)',
    'Follicle No. (R)': 'Num Fol√≠culos (D)',
    'Avg. F size (L) (mm)': 'Tama√±o Fol√≠culo Prom (I) (mm)',
    'Avg. F size (R) (mm)': 'Tama√±o Fol√≠culo Prom (D) (mm)',
    'Endometrium (mm)': 'Endometrio (mm)',
}

# Rename the columns; the result is a new frame and `df` keeps its
# original headers so both versions can be compared below.
df_spanish = df.rename(columns=translation)

print("\n‚úÖ Traducci√≥n aplicada")
# Count only the mapping keys that actually occur among the dataset's columns.
renamed_count = sum(1 for source_name in translation if source_name in df.columns)
print(f"   Columnas renombradas: {renamed_count}")

# ================================================================
# STEP 3: Review the translated columns
# ================================================================

print("\nüìù COLUMNAS NUEVAS (ESPA√ëOL):")
# Walk the old and new headers side by side and flag every name that changed.
for position, (before, after) in enumerate(zip(df.columns, df_spanish.columns), start=1):
    marker = "  " if before == after else "‚úÖ"
    print(f"   {marker} {position:2}. '{after}'")

# ================================================================
# PASO 4: Identificar columnas sin traducir
# ================================================================

print()
print("=" * 80)
print("‚ö†Ô∏è COLUMNAS SIN TRADUCIR (si las hay)")
print("=" * 80)

# A column counts as untranslated when the rename left its name unchanged.
# NOTE(review): the extra guard skips a column that is already called
# 'SOP (S/N)' in the input; presumably meant to tolerate a pre-translated
# target column -- confirm the intent.
untranslated = [
    before
    for before, after in zip(df.columns, df_spanish.columns)
    if before == after and before != 'SOP (S/N)'
]

if untranslated:
    print(f"\n‚ö†Ô∏è {len(untranslated)} columnas sin traducir:")
    for pending_name in untranslated:
        print(f"   - '{pending_name}'")
    print("\nüí° Agrega estas traducciones al diccionario si es necesario")
else:
    print("\n‚úÖ Todas las columnas fueron traducidas")

# ================================================================
# PASO 5: Guardar dataset con nombres en espa√±ol
# ================================================================

print()
print("=" * 80, "üíæ GUARDANDO DATASET EN ESPA√ëOL", "=" * 80, sep="\n")

# Write the Spanish-named frame to disk without the pandas row index,
# then report its dimensions.
df_spanish.to_csv('PCOS_data_espanol.csv', index=False)
row_count, column_count = df_spanish.shape
print("\n‚úÖ Dataset guardado: PCOS_data_espanol.csv")
print(f"   Filas: {row_count}")
print(f"   Columnas: {column_count}")

# ================================================================
# PASO 6: Generar diccionario de mapeo para el c√≥digo
# ================================================================

print()
print("=" * 80, "üìã DICCIONARIO DE MAPEO PARA C√ìDIGO", "=" * 80, sep="\n")

# Emit the mapping as a ready-to-paste Python dict literal, restricted to
# the keys that occur among the dataset's columns.
print("\n# Usar este diccionario en el c√≥digo de an√°lisis:")
print("\nmapeo_columnas = {")
applied_pairs = [
    (source_name, spanish_name)
    for source_name, spanish_name in translation.items()
    if source_name in df.columns
]
for source_name, spanish_name in applied_pairs:
    print(f"    '{source_name}': '{spanish_name}',")
print("}")

# ================================================================
# PASO 7: Clasificaci√≥n de variables en espa√±ol
# ================================================================

print()
print("=" * 80, "üìä CLASIFICACI√ìN DE VARIABLES (ESPA√ëOL)", "=" * 80, sep="\n")

# Categorical / binary variables: the candidate names below are kept only
# if they are present as columns of the translated frame.
categoricas_esp = [
    name
    for name in (
        'Grupo Sangu√≠neo',
        'Ciclo (R/I)',
        'Embarazada (S/N)',
        'Aumento Peso (S/N)',
        'Crecimiento Vello (S/N)',
        'Oscurecimiento Piel (S/N)',
        'P√©rdida Cabello (S/N)',
        'Acn√© (S/N)',
        'Comida R√°pida (S/N)',
        'Ejercicio Regular (S/N)',
    )
    if name in df_spanish.columns
]

# Count (discrete) variables, filtered the same way.
count_esp = [
    name
    for name in (
        'N√∫mero Abortos',
        'Duraci√≥n Ciclo (d√≠as)',
        'A√±os Casada',
        'Num Fol√≠culos (I)',
        'Num Fol√≠culos (D)',
    )
    if name in df_spanish.columns
]

# Continuous variables: every remaining column except the target,
# in the frame's column order.
already_classified = set(categoricas_esp) | set(count_esp) | {'SOP (S/N)'}
continuas_esp = [name for name in df_spanish.columns if name not in already_classified]

n_categoricas = len(categoricas_esp)
n_count = len(count_esp)
n_continuas = len(continuas_esp)

print("\nüìä RESUMEN:")
print(f"   Categ√≥ricas/Binarias: {n_categoricas}")
print(f"   Count (discretas): {n_count}")
print(f"   Continuas: {n_continuas}")
print(f"   TOTAL: {n_categoricas + n_count + n_continuas}")

# Print each group under its own heading.
for heading, members in (
    ("\nüìù Variables categ√≥ricas:", categoricas_esp),
    ("\nüìù Variables count:", count_esp),
    ("\nüìù Variables continuas:", continuas_esp),
):
    print(heading)
    for member in members:
        print(f"   - {member}")

# Build a two-column table (variable name, type label) from the three
# groups, in group order, and save it as CSV without the row index.
labelled_groups = [
    ('Categ√≥rica', categoricas_esp),
    ('Count', count_esp),
    ('Continua', continuas_esp),
]
clasificacion = pd.DataFrame({
    'Variable': [name for _, group in labelled_groups for name in group],
    'Tipo': [label for label, group in labelled_groups for _ in group],
})

clasificacion.to_csv('clasificacion_variables_esp.csv', index=False)
print("\nüíæ Clasificaci√≥n guardada: clasificacion_variables_esp.csv")

# Closing banner and pointer to the next notebook step.
print()
print("=" * 80, "‚úÖ TRADUCCI√ìN COMPLETADA", "=" * 80, sep="\n")
print("\nüéØ Siguiente paso: Ejecutar an√°lisis estad√≠stico con nombres en espa√±ol")
print("   Usar: PCOS_data_espanol.csv")

🔄 TRADUCCIÓN DE NOMBRES DE COLUMNAS A ESPAÑOL

📊 Dataset original:
   Filas: 538
   Columnas: 42

📝 COLUMNAS ACTUALES (INGLÉS):
    1. 'PCOS (Y/N)'
    2. 'Age (yrs)'
    3. 'Weight (Kg)'
    4. 'Height(Cm)'
    5. 'BMI'
    6. 'Blood Group'
    7. 'Pulse rate(bpm)'
    8. 'RR (breaths/min)'
    9. 'Hb(g/dl)'
   10. 'Cycle(R/I)'
   11. 'Cycle length(days)'
   12. 'Marraige Status (Yrs)'
   13. 'Pregnant(Y/N)'
   14. 'No. of abortions'
   15. 'I beta-HCG(mIU/mL)'
   16. 'II beta-HCG(mIU/mL)'
   17. 'FSH(mIU/mL)'
   18. 'LH(mIU/mL)'
   19. 'FSH/LH'
   20. 'Hip(inch)'
   21. 'Waist(inch)'
   22. 'Waist:Hip Ratio'
   23. 'TSH (mIU/L)'
   24. 'AMH(ng/mL)'
   25. 'PRL(ng/mL)'
   26. 'Vit D3 (ng/mL)'
   27. 'PRG(ng/mL)'
   28. 'RBS(mg/dl)'
   29. 'Weight gain(Y/N)'
   30. 'hair growth(Y/N)'
   31. 'Skin darkening (Y/N)'
   32. 'Hair loss(Y/N)'
   33. 'Pimples(Y/N)'
   34. 'Fast food (Y/N)'
   35. 'Reg.Exercise(Y/N)'
   36. 'BP _Systolic (mmHg)'
   37. 'BP _Diastolic (mmHg)'
   38.