# Laboratorio 2: Data Understanding

**Universidad del Valle de Guatemala**  
**Facultad de Ingeniería**  
**Departamento de Ciencias de la Computación**  
**Machine Learning Operations** 

## Integrantes

- Arturo Argueta - 21527 
- Edwin de León - 22809 
- Diego Leiva - 21752 
- Pablo Orellana - 21970

## Librerías

In [1]:
# %% [markdown]
# --- Limpieza avanzada de codificación para cliente.csv ---

# %%
import pandas as pd
from pathlib import Path
import unicodedata
from itertools import product
from functools import lru_cache

# (recommended) pip install ftfy rapidfuzz
# ftfy is optional: when it cannot be imported, an identity function with
# the same name is defined so callers can use fix_text unconditionally.
try:
    from ftfy import fix_text
except ImportError:
    def fix_text(s):
        """Fallback used when ftfy is missing: return the input unchanged."""
        return s

# === Parameters ===
# Directory the raw CSV input is read from.
DATA_DIR = Path("data/raw")
# Presumably the destination for cleaned output; nothing in this cell writes to it.
OUT_DIR  = Path("data/processed")
# Create the output directory (and any missing parents); no error if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# === Diagnostic utilities ===
# Substrings counted by stats() as signs of text decoded with the wrong codec.
# NOTE(review): an empty-string entry was removed from this list, because
# str.count("") returns len(text) + 1 and therefore flagged every string.
# If that entry stood for an unprintable character, restore it as an
# explicit escape (e.g. "\x81") rather than as a raw character.
MOJIBAKE_TOKENS = ['Ã', 'Â', 'Ì', '�', '©', '™']

def stats(text: str):
    """Count encoding-damage indicators in *text*.

    Args:
        text: Value to inspect. Anything that is not a ``str`` is accepted
            and yields an empty dict.

    Returns:
        ``{}`` for non-string input; otherwise a dict with two integer keys:
        ``"combining"`` (number of Unicode combining characters) and
        ``"mojibake_tokens"`` (total occurrences of the MOJIBAKE_TOKENS
        entries).
    """
    if not isinstance(text, str):
        return {}
    n_comb = sum(1 for ch in text if unicodedata.combining(ch))
    # Skip empty tokens defensively: counting "" matches at every position.
    n_moji = sum(text.count(tok) for tok in MOJIBAKE_TOKENS if tok)
    return {"combining": n_comb, "mojibake_tokens": n_moji}

# === Reparador “mangled” ===

# diacríticos combinantes comunes que sí queremos permitir (el resto los limpiamos)
_ALLOWED_COMBINING = {
    0x0300, # grave
    0x0301, # agudo
    0x0302, # circunflejo
    0x0303, # tilde
    0x0308, # diéresis
    0x0327, # cedilla
}

# encodings a probar en rutas decode/encode/decode
_ENCODINGS = ["utf-8", "cp1252", "latin1", "cp1258", "mac_roman"]

def _normalize_keep_common_marks(s: str) -> str:
    # NFD → elimina combinantes “raros” → NFC
    nfd = unicodedata.normalize("NFD", s)
    cleaned = "".join(
        ch for ch in nfd
        if not unicodedata.combining(ch) or (ord(ch) in _ALLOWED_COMBINING)
    )
    return unicodedata.normalize("NFC", cleaned)

def _score(t: str):
    # Menos tokens mojibake y combinantes, más letras latinas extendidas “bien puestas”
    bad = sum(t.count(x) for x in ["Ã", "Â", "Ì", "�", ""])
    combining = sum(1 for ch in t if unicodedata.combining(ch))
    latin_ext = sum(1 for ch in t if "LATIN" in unicodedata.name(ch, "") and ord(ch) > 127)
    letters_spaces = sum(ch.isalpha() or ch.isspace() for ch in t)
    return (-bad, -combining, latin_ext, letters_spaces, -len(t))

def _routes():
    """Build the list of repair routes tried on each cell.

    Returns:
        A list of tuples of step strings. First come the one-step routes
        ``("decode:<enc>",)`` for each entry of _ENCODINGS, then every
        three-step ``decode -> encode -> decode`` combination of those
        encodings, in ``itertools.product`` order.
    """
    # One step: decode the bytes directly.
    single_step = [(f"decode:{enc}",) for enc in _ENCODINGS]
    # Three steps: decode -> encode -> decode (the classic mangling pattern).
    three_step = [
        (f"decode:{first}", f"encode:{second}", f"decode:{third}")
        for first, second, third in product(_ENCODINGS, repeat=3)
    ]
    return single_step + three_step

# Computed once at import time and reused for every cell.
_ROUTES = _routes()

def _try_route(raw_bytes: bytes, route):
    """
    route: tupla de "decode:enc" o "encode:enc".
    Inicia desde bytes y debe terminar en decode:* para retornar str.
    """
    obj = raw_bytes
    try:
        for step in route:
            op, enc = step.split(":")
            if op == "decode":
                obj = obj.decode(enc, errors="ignore")
            elif op == "encode":
                obj = obj.encode(enc, errors="ignore")
            else:
                raise ValueError("Paso inválido")
        if not isinstance(obj, str):
            return None
        return _normalize_keep_common_marks(obj)
    except Exception:
        return None

@lru_cache(maxsize=100000)
def fix_cell_mangled_cached(s: str) -> str:
    """Return the best-scoring repair of *s* (memoised per distinct string).

    Candidates are the input itself, its normalised form, and the output
    of every route in _ROUTES; they are ranked with _score.
    """
    # 1) Quick clean-up: keep the normalised form unless it scores worse.
    normalized = _normalize_keep_common_marks(s)
    best = normalized if _score(normalized) >= _score(s) else s
    best_score = _score(best)

    # 2) Turn the text back into bytes as if it had been read as latin1
    #    (characters outside latin1 are dropped by errors="ignore").
    raw = s.encode("latin1", errors="ignore")

    # 3) Try each route and keep strict improvements only.
    for route in _ROUTES:
        candidate = _try_route(raw, route)
        if not isinstance(candidate, str):
            continue
        candidate_score = _score(candidate)
        if candidate_score <= best_score:
            continue
        best, best_score = candidate, candidate_score
        # Shortcut: no mojibake tokens and no combining marks left.
        if best_score[0] == 0 and best_score[1] == 0:
            break
    return best

def fix_cell_mangled(s: object) -> object:
    """Repair one cell value; non-strings and empty strings pass through.

    For non-empty strings, fix_text runs first and its result is kept only
    when it scores strictly higher than the input. The chosen text is then
    handed to fix_cell_mangled_cached.
    """
    if isinstance(s, str) and s != "":
        # fix_text first (a no-op when ftfy is not installed); sometimes it is enough.
        pre_fixed = fix_text(s)
        chosen = pre_fixed if _score(pre_fixed) > _score(s) else s
        return fix_cell_mangled_cached(chosen)
    return s

def fix_dataframe_text(df: pd.DataFrame) -> pd.DataFrame:
    """Apply fix_cell_mangled to every cell of the text columns of *df*.

    Text columns are those selected by dtype ``object`` or ``string``.
    The frame is modified in place and also returned.
    """
    text_columns = df.select_dtypes(include=["object", "string"]).columns
    repaired = df[text_columns].apply(
        lambda column: column.map(fix_cell_mangled)
    )
    df[text_columns] = repaired
    return df

# === (Optional) Gazetteer of frequent place names ===
# Switch read by gazetteer_fix: when False, values are returned untouched.
USE_GAZETTEER = True
# Canonical spellings that gazetteer_fix fuzzy-matches cell values against.
GAZ = [
    "Asnières-sur-Seine",
    "Orléans",
    "Göteborg",
    "Örebro",
    "Doctor Juan León Mallorquín",
    "København",
    "Hà Nội",
    "Thành phố Hồ Chí Minh",
    "Đà Nẵng",
    "Trần Ngọc Hải",
]

# gazetteer_fix needs rapidfuzz; without it the function is an identity.
try:
    from rapidfuzz import process, fuzz
except ImportError:
    def gazetteer_fix(s: object, score_cutoff=92):
        """Return *s* unchanged (rapidfuzz is not installed)."""
        return s
else:
    def gazetteer_fix(s: object, score_cutoff=92):
        """Replace *s* with its closest GAZ entry when the match is strong.

        Args:
            s: Cell value. Non-strings and empty strings are returned as is,
                as is everything when USE_GAZETTEER is false.
            score_cutoff: Minimum ``fuzz.WRatio`` score for a replacement.

        Returns:
            The matching GAZ entry, or *s* when nothing reaches the cutoff.
        """
        if not USE_GAZETTEER or not isinstance(s, str) or not s:
            return s
        hit = process.extractOne(
            s, GAZ, scorer=fuzz.WRatio, score_cutoff=score_cutoff
        )
        if not hit:
            return s
        # extractOne yields (choice, score, index); only the choice is needed.
        choice, _, _ = hit
        return choice

def apply_gazetteer(df: pd.DataFrame) -> pd.DataFrame:
    """Run gazetteer_fix over the place-like columns of *df*.

    Columns are chosen by name: any label whose lowercase text contains one
    of the place hints below. When no label matches, all ``object`` /
    ``string`` dtype columns are processed instead. The frame is modified
    in place and also returned.
    """
    place_hints = ("city", "ciudad", "municipio", "localidad", "poblacion",
                   "comuna", "provincia", "departamento", "region", "ville", "villé")
    # str(c) keeps non-string column labels (ints, tuples) from raising on .lower().
    cols = [c for c in df.columns if any(h in str(c).lower() for h in place_hints)]
    if not cols:
        # No name matched: fall back to every text column (soft heuristic).
        cols = list(df.select_dtypes(include=["object", "string"]).columns)
    if not cols:
        # Nothing to process (e.g. an all-numeric frame).
        return df
    df[cols] = df[cols].apply(lambda col: col.map(gazetteer_fix))
    return df

# === Read + repair, used ONLY for cliente.csv ===
def read_and_fix_csv(path: Path) -> pd.DataFrame:
    """Read the CSV at *path* and return it with text columns repaired.

    The file is read as UTF-8 (BOM tolerated); if that raises
    ``UnicodeDecodeError`` it is re-read as latin1. The frame then goes
    through fix_dataframe_text and apply_gazetteer, in that order.
    """
    try:
        frame = pd.read_csv(path, encoding="utf-8-sig")
    except UnicodeDecodeError:
        frame = pd.read_csv(path, encoding="latin1")
    # The gazetteer pass is optional; it does nothing when USE_GAZETTEER is False.
    return apply_gazetteer(fix_dataframe_text(frame))

# %% [markdown]
# --- Cargar y limpiar cliente.csv ---

# %%
src = DATA_DIR / "cliente.csv"

# Before: load the untouched data to show suspicious samples (to validate
# the change). UTF-8 is tried first with latin1 as the fallback, because
# latin1 accepts every byte sequence and would never trigger the fallback
# if it were tried first. This is also the order read_and_fix_csv uses, so
# the "before" and "after" frames start from the same decoding.
try:
    df_cliente_raw = pd.read_csv(src, encoding="utf-8-sig")
except UnicodeDecodeError:
    df_cliente_raw = pd.read_csv(src, encoding="latin1")

def _row_looks_bad(row: pd.Series) -> bool:
    for v in row.values:
        if isinstance(v, str):
            s = unicodedata.normalize("NFC", v)
            st = stats(s)
            if st.get("mojibake_tokens", 0) > 0 or st.get("combining", 0) > 0:
                return True
    return False

# First ten raw rows in which at least one string value looks damaged.
flagged_rows = df_cliente_raw.apply(_row_looks_bad, axis=1)
sample_bad = df_cliente_raw[flagged_rows].head(10)

print("Muestras problemáticas (antes de limpiar):")
display(sample_bad)

# Clean
df_cliente = read_and_fix_csv(src)

# After: show the same rows again to check that they improved.
shared_index = sample_bad.index.intersection(df_cliente.index)
sample_bad_after = df_cliente.loc[shared_index]
print("Mismas filas tras limpieza:")
display(sample_bad_after)

# Quick look (rendered by the notebook as the cell's final expression)
df_cliente.head(20)


Muestras problemáticas (antes de limpiar):


Unnamed: 0,id,nombre,apellido,nacimiento,genero,empresa,idioma,nit,puesto,ciudad,correo,telefono
5,350566.0,Donna,Hansen,9/5/59,Female,Eazzy,German,980-60-6458,Graphic Designer,Doctor Juan LeÌ_n MallorquÌ_n,dhansen5@nymag.com,595-(777)710-9393
9,1406787.0,Stephanie,Hudson,1/6/91,Female,Agimba,Chinese,365-74-2748,Technical Writer,ThÈÜ Tr¼´n N®¡ÈÝc Hai,shudson9@cdbaby.com,84-(468)360-5193
15,61806.0,Louis,Sullivan,6/18/69,Male,Blogspan,Tswana,173-76-5361,Marketing Manager,AsniÌ¬res-sur-Seine,lsullivanf@yahoo.com,33-(877)605-5009
16,1331125.0,Charles,Brown,1/27/59,Male,Zoomcast,Kurdish,729-75-6825,Data Coordiator,TsagaandÌ¦rvÌ¦lj,cbrowng@disqus.com,976-(850)945-3086
17,911093.0,Jimmy,Day,1/31/47,Male,Dabfeed,Filipino,288-18-5216,Editor,BªezovÌÁ,jdayh@delicious.com,420-(837)677-9354
40,1252850.0,Virginia,Myers,10/29/48,Female,Eamia,Fijian,838-98-1024,Structural Engineer,SakÌ©tÌ©,vmyers14@creativecommons.org,229-(166)878-9589
41,34648.0,Lisa,Alexander,4/8/91,Female,Meevee,Chinese,416-28-2357,Administrative Assistant I,BollÌ¬ne,lalexander15@infoseek.co.jp,33-(789)744-3806
49,291283.0,Janice,Carroll,7/3/73,Female,Photojam,Haitian Creole,136-27-4932,Junior Executive,Khallat _liü©,jcarroll1d@desdev.cn,970-(718)913-3710
62,138131.0,Debra,Woods,11/11/66,Female,Zooveo,Aymara,820-51-4878,Nurse Practicioner,GaviÌ£o,dwoods1q@answers.com,351-(695)784-1916
76,455557.0,Norma,Burns,6/23/88,Female,Youopia,Kazakh,708-65-3349,GIS Technical Architect,VeverskÌÁ BÌ_tÌ_Áka,nburns24@gnu.org,420-(725)556-5416


Mismas filas tras limpieza:


Unnamed: 0,id,nombre,apellido,nacimiento,genero,empresa,idioma,nit,puesto,ciudad,correo,telefono
5,350566.0,Donna,Hansen,9/5/59,Female,Eazzy,German,980-60-6458,Graphic Designer,Doctor Juan Le_n Mallorqu_n,dhansen5@nymag.com,595-(777)710-9393
9,1406787.0,Stephanie,Hudson,1/6/91,Female,Agimba,Chinese,365-74-2748,Technical Writer,ThÈÜ Tr¼´n N®¡ÈÝc Hai,shudson9@cdbaby.com,84-(468)360-5193
15,61806.0,Louis,Sullivan,6/18/69,Male,Blogspan,Tswana,173-76-5361,Marketing Manager,Asnires-sur-Seine,lsullivanf@yahoo.com,33-(877)605-5009
16,1331125.0,Charles,Brown,1/27/59,Male,Zoomcast,Kurdish,729-75-6825,Data Coordiator,Tsagaandrvlj,cbrowng@disqus.com,976-(850)945-3086
17,911093.0,Jimmy,Day,1/31/47,Male,Dabfeed,Filipino,288-18-5216,Editor,Bezov,jdayh@delicious.com,420-(837)677-9354
40,1252850.0,Virginia,Myers,10/29/48,Female,Eamia,Fijian,838-98-1024,Structural Engineer,Sakt,vmyers14@creativecommons.org,229-(166)878-9589
41,34648.0,Lisa,Alexander,4/8/91,Female,Meevee,Chinese,416-28-2357,Administrative Assistant I,Bollne,lalexander15@infoseek.co.jp,33-(789)744-3806
49,291283.0,Janice,Carroll,7/3/73,Female,Photojam,Haitian Creole,136-27-4932,Junior Executive,Khallat _liü©,jcarroll1d@desdev.cn,970-(718)913-3710
62,138131.0,Debra,Woods,11/11/66,Female,Zooveo,Aymara,820-51-4878,Nurse Practicioner,Gavịo,dwoods1q@answers.com,351-(695)784-1916
76,455557.0,Norma,Burns,6/23/88,Female,Youopia,Kazakh,708-65-3349,GIS Technical Architect,Veversk B_t_ka,nburns24@gnu.org,420-(725)556-5416


Unnamed: 0,id,nombre,apellido,nacimiento,genero,empresa,idioma,nit,puesto,ciudad,correo,telefono
0,599528.0,Samuel,Ward,4/6/89,Male,Yakijo,Marathi,411-44-7088,Geologist IV,Wangjing,sward0@tamu.edu,86-(786)608-5061
1,121688.0,Willie,Gonzales,6/29/72,Male,Zoonoodle,Maltese,701-87-7540,Programmer III,El Corozo,wgonzales1@apache.org,58-(265)301-3397
2,552148.0,Betty,Spencer,9/2/83,Female,Youtags,Dhivehi,373-88-4503,Engineer III,Jinhua,bspencer2@shutterfly.com,86-(195)193-9042
3,102019.0,Beverly,Jordan,1/15/72,Female,Fivespan,Hindi,447-80-5871,Software Test Engineer IV,Salvacion,bjordan3@vimeo.com,63-(652)708-7688
4,189384.0,Cynthia,Flores,2/6/71,Female,Jabbersphere,Tsonga,803-60-8259,Speech Pathologist,Khorol,cflores4@webeden.co.uk,380-(373)389-5435
5,350566.0,Donna,Hansen,9/5/59,Female,Eazzy,German,980-60-6458,Graphic Designer,Doctor Juan Le_n Mallorqu_n,dhansen5@nymag.com,595-(777)710-9393
6,404403.0,Anthony,Reid,3/2/61,Male,Photolist,Hiri Motu,208-70-5661,Mechanical Systems Engineer,Mlawat,areid6@nyu.edu,62-(269)292-3345
7,505565.0,Sharon,Webb,3/30/73,Female,Oyoba,Hindi,467-72-1311,Desktop Support Technician,Villeta,swebb7@posterous.com,57-(932)286-3710
8,945184.0,Nancy,Banks,8/10/80,Female,Wikivu,Swahili,208-63-2112,Health Coach III,Kaliska,nbanks8@163.com,48-(284)988-8563
9,1406787.0,Stephanie,Hudson,1/6/91,Female,Agimba,Chinese,365-74-2748,Technical Writer,ThÈÜ Tr¼´n N®¡ÈÝc Hai,shudson9@cdbaby.com,84-(468)360-5193
