# Proyecto 02

## Tema: MAP - Charting Student Math Misunderstandings

### Preprocesamiento de datos


#### Configuración de limpieza

In [9]:
from pathlib import Path
import re, html, json, unicodedata
import numpy as np
import pandas as pd

# General settings
SEED = 42

# Input/output locations, all rooted at DATA_DIR
DATA_DIR = Path("./data")
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")
OUT_DIR = DATA_DIR.joinpath("preprocessed")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder if it is missing

# Text-preprocessing switches
LOWERCASE_TEXT = False  # keep the original case; recommended for "cased" BERT-style models
NORMALIZE_FORM = "NFC"  # NFC preserves symbols/LaTeX better than NFKC
STRIP_URLS = True       # replace URLs/emails with placeholder tokens
KEEP_PUNCT = True       # keep punctuation (it matters in math text)

#### Utilidades de limpieza de texto

In [10]:
# Invisible code points collected into one string:
#   U+200B zero-width space, U+200C zero-width non-joiner,
#   U+200D zero-width joiner, U+FEFF byte-order mark.
_ZW_CHARS = "\u200b" "\u200c" "\u200d" "\ufeff"

def _remove_control_chars(s: str) -> str:
    """Drop ASCII control characters below U+0020, keeping tab, LF and CR.

    Every code point from U+0020 upwards is kept as-is; that includes DEL
    (U+007F) and the C1 range, which this helper does not filter.
    """
    kept_whitespace = {0x09, 0x0A, 0x0D}  # \t, \n, \r
    drop_table = {code: None for code in range(32) if code not in kept_whitespace}
    return s.translate(drop_table)

def _standardize_quotes_dashes(s: str) -> str:
    """Map typographic quotes and long dashes to their plain ASCII forms.

    Curly double quotes become ``"``, curly single quotes become ``'`` and
    em/en dashes become ``-``. No other character is rewritten, so math signs
    such as + − × ÷ = / ^ pass through untouched.
    """
    ascii_forms = str.maketrans({
        "“": "\"", "”": "\"",
        "‘": "'", "’": "'",
        "—": "-", "–": "-",
    })
    return s.translate(ascii_forms)

# Web addresses: http(s)://... or www.... up to the next whitespace (case-insensitive).
URL_RE = re.compile(r"https?://\S+|www\.\S+", flags=re.I)
# E-mail addresses of the form local@domain.tld, with a TLD of at least two letters.
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

# Regex fragments that hint at LaTeX/math markup inside a text.
MATH_LATEX_HINTS = (
    r"\$.*?\$",          # inline math delimited by dollar signs: $...$
    r"\\frac",
    r"\\sqrt",
    r"\\times",
    r"\\cdot",
    r"\\left",
    r"\\right",
    r"\\begin\{.*?}",    # environment opening, e.g. \begin{...}
    r"\\end\{.*?}",      # environment closing, e.g. \end{...}
)
# One alternation over all hints; a match anywhere in the text counts.
LATEX_RE = re.compile("|".join(MATH_LATEX_HINTS))

# Deletion table for the zero-width characters. str.translate removes each of
# these code points individually wherever it occurs, whereas
# str.replace(_ZW_CHARS, "") would only match the exact four-character sequence.
_ZW_DELETE_TABLE = str.maketrans("", "", _ZW_CHARS)


def clean_text(text: str,
               lowercase: bool = LOWERCASE_TEXT,
               normalize_form: str = NORMALIZE_FORM,
               strip_urls: bool = STRIP_URLS,
               keep_punct: bool = KEEP_PUNCT) -> str:
    """Non-destructive cleanup aimed at free-text student explanations.

    Steps, in order:
      1. Unescape HTML entities and apply Unicode normalization.
      2. Remove zero-width characters and ASCII control characters.
      3. Replace typographic quotes/dashes with ASCII equivalents.
      4. Optionally replace URLs and e-mail addresses with ``<URL>`` / ``<EMAIL>``.
      5. Optionally lowercase the text.
      6. Collapse every whitespace run into a single space and trim the ends.

    Punctuation, LaTeX markup and math symbols are not removed.

    Args:
        text: Raw value to clean. Missing values (as detected by ``pd.isna``)
            yield an empty string; anything else is converted with ``str()``.
        lowercase: If true, lowercase the result.
        normalize_form: Name of the Unicode normalization form passed to
            ``unicodedata.normalize``. An invalid value falls back to ``"NFC"``.
        strip_urls: If true, substitute URLs and e-mail addresses by tokens.
        keep_punct: Accepted for interface compatibility; it is not consulted
            by this function, which never strips punctuation.

    Returns:
        The cleaned text (possibly empty).
    """
    if pd.isna(text):
        return ""

    s = str(text)

    # 1) HTML entities & Unicode normalization
    s = html.unescape(s)
    try:
        s = unicodedata.normalize(normalize_form, s)
    except (ValueError, TypeError):
        # Unknown form name (ValueError) or non-string form (TypeError): use NFC.
        s = unicodedata.normalize("NFC", s)

    # 2) Remove zero-width characters (each one individually) and control chars
    s = s.translate(_ZW_DELETE_TABLE)
    s = _remove_control_chars(s)

    # 3) Standardize quotes/dashes
    s = _standardize_quotes_dashes(s)

    # 4) Replace URLs/e-mails with neutral tokens (URLs first, then e-mails)
    if strip_urls:
        s = URL_RE.sub(" <URL> ", s)
        s = EMAIL_RE.sub(" <EMAIL> ", s)

    # 5) Optional lowercasing (leave disabled for "cased" models)
    if lowercase:
        s = s.lower()

    # 6) Punctuation is kept; only surplus whitespace is collapsed
    s = re.sub(r"\s+", " ", s).strip()
    return s


#### Fase de carga

In [11]:

# Read the raw train and test splits
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Columns each split is expected to provide (the label columns are train-only)
expected_test_cols = {"QuestionId", "QuestionText", "MC_Answer", "StudentExplanation"}
expected_train_cols = expected_test_cols | {"Category", "Misconception"}

# Report, without aborting, any expected column that a file lacks
missing_train = expected_train_cols.difference(train.columns)
missing_test = expected_test_cols.difference(test.columns)

if missing_train:
    print("ADVERTENCIA: Faltan columnas en train:", missing_train)
if missing_test:
    print("ADVERTENCIA: Faltan columnas en test:", missing_test)

# Preview the first two rows of each split
train.head(2), test.head(2)


(   row_id  QuestionId                                       QuestionText  \
 0       0       31772  What fraction of the shape is not shaded? Give...   
 1       1       31772  What fraction of the shape is not shaded? Give...   
 
            MC_Answer                                 StudentExplanation  \
 0  \( \frac{1}{3} \)                  0ne third is equal to tree nineth   
 1  \( \frac{1}{3} \)  1 / 3 because 6 over 9 is 2 thirds and 1 third...   
 
        Category Misconception  
 0  True_Correct           NaN  
 1  True_Correct           NaN  ,
    row_id  QuestionId                                       QuestionText  \
 0   36696       31772  What fraction of the shape is not shaded? Give...   
 1   36697       31772  What fraction of the shape is not shaded? Give...   
 
            MC_Answer                                 StudentExplanation  
 0  \( \frac{1}{3} \)  I think that 1/3 is the answer, as it's the si...  
 1  \( \frac{3}{6} \)  i think this answer is because 

#### Limpieza de StudentExplanation

In [12]:
# ---- Clean only the StudentExplanation column; the result is stored in a new
# ---- StudentExplanation_clean column, so the raw text stays available.
for df in (train, test):
    df["StudentExplanation_clean"] = df["StudentExplanation"].apply(clean_text)

# ---- Señales/rasgos útiles para EDA/modelado posterior (no destructivos)
def count_chars(s):
    """Return the length of ``s`` in characters."""
    n_chars = len(s)
    return n_chars
def count_words(s):
    """Return the number of whitespace-separated tokens in ``s``.

    Falsy inputs (``""``, ``None``) count as zero words.
    """
    if not s:
        return 0
    tokens = s.split()
    return len(tokens)
def count_digits(s):
    """Count the characters of ``s`` for which ``str.isdigit()`` is true."""
    total = 0
    for ch in s:
        if ch.isdigit():
            total += 1
    return total
def count_math_ops(s):
    """Count occurrences of the ASCII operator characters + - * / ^ = in ``s``.

    Only these six characters are counted; other symbols are ignored.
    """
    operators = ("+", "-", "*", "/", "^", "=")
    total = 0
    for op in operators:
        total += s.count(op)
    return total
def has_latex(s):
    """Tell whether ``s`` contains any LaTeX/math hint matched by ``LATEX_RE``.

    Falsy inputs (``None``, ``""``) are searched as an empty string.
    """
    haystack = s if s else ""
    return LATEX_RE.search(haystack) is not None

# Feature column -> function computing it from the cleaned explanation.
# Dict order fixes the order in which the new columns are appended.
_TEXT_FEATURES = {
    "len_chars": count_chars,
    "len_words": count_words,
    "num_digits": count_digits,
    "num_mathops": count_math_ops,
    "has_latex": has_latex,
}

col = "StudentExplanation_clean"
for df in (train, test):
    for feature_name, feature_fn in _TEXT_FEATURES.items():
        df[feature_name] = df[col].apply(feature_fn)


#### Manejo de valores faltantes

In [13]:
# StudentExplanation needs no handling here: clean_text already maps NaN to "".
# The label columns (train only), Category and Misconception, may contain NaN,
# so each one gets an explicit placeholder value.
# NOTE(review): pandas.read_csv treats the literal string "None" as a missing
# value by default in recent versions, so this placeholder may come back as NaN
# when an exported CSV is re-read — confirm how downstream code loads the file.
label_fill_values = {"Category": "Unknown", "Misconception": "None"}
for label_col, fill_value in label_fill_values.items():
    train[label_col] = train[label_col].fillna(fill_value)

# Integer-encode the labels for later modelling.
# Note: the label space itself is NOT altered; each distinct value is simply
# assigned an ID according to its position in sorted order.
cat_classes = sorted(train["Category"].unique())
miscon_classes = sorted(train["Misconception"].unique())

cat2id = dict(zip(cat_classes, range(len(cat_classes))))
mis2id = dict(zip(miscon_classes, range(len(miscon_classes))))

category_ids = train["Category"].map(cat2id)
misconception_ids = train["Misconception"].map(mis2id)
train["Category_id"] = category_ids.astype("int32")
train["Misconception_id"] = misconception_ids.astype("int32")

# Persist both label -> ID maps as JSON in the output folder.
# json.dumps(..., ensure_ascii=False) keeps non-ASCII characters verbatim, so
# the file is written explicitly as UTF-8 instead of relying on the platform's
# default encoding (which may be unable to encode them).
# As the cell's last expression, the number of characters written is displayed.
(Path(OUT_DIR) / "label_maps.json").write_text(
    json.dumps({"Category": cat2id, "Misconception": mis2id}, indent=2, ensure_ascii=False),
    encoding="utf-8",
)


1048

#### Exportación de limpieza

In [14]:
# ---- Output locations for the preprocessed splits
train_out = OUT_DIR / "train_preprocessed.csv"
test_out  = OUT_DIR / "test_preprocessed.csv"

# ---- Validate BEFORE exporting, so a failed check never leaves files on disk
assert train["StudentExplanation_clean"].isna().sum() == 0, \
    "train: StudentExplanation_clean contains missing values"
assert test["StudentExplanation_clean"].isna().sum() == 0, \
    "test: StudentExplanation_clean contains missing values"

# ---- Save the preprocessed datasets (row index is not written)
# NOTE(review): empty strings in StudentExplanation_clean are written as empty
# CSV fields, which pandas.read_csv reads back as NaN by default — confirm that
# downstream loaders handle this (e.g. keep_default_na=False or fillna("")).
train.to_csv(train_out, index=False)
test.to_csv(test_out, index=False)

print("✅ Preprocesamiento completado. Archivos en:", OUT_DIR.resolve())


✅ Preprocesamiento completado. Archivos en: /Users/gerco/UVG/8th_semester/Data_Science/DS-Proyecto02/data/preprocessed
