# Limpieza y carga del dataset Students Performance


Este cuaderno ingiere los datos crudos, normaliza los nombres de las columnas, revisa la calidad de los datos (valores nulos y registros duplicados) y exporta un CSV limpio, listo para el pipeline.


In [4]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

# Locate config.json: look in the current working directory first, then two
# directories above it. Every candidate is resolved to an absolute path so that
# project_root (and the dataset paths built from it) is absolute no matter
# which candidate matched.
_CONFIG_CANDIDATES = (Path("config.json"), Path("../../config.json"))
CONFIG_PATH = next(
    (candidate.resolve() for candidate in _CONFIG_CANDIDATES if candidate.exists()),
    None,
)
if CONFIG_PATH is None:
    # Fail early with every location that was tried instead of a bare open() error.
    tried = ", ".join(str(candidate.resolve()) for candidate in _CONFIG_CANDIDATES)
    raise FileNotFoundError(f"No se encontró config.json; rutas probadas: {tried}")

with CONFIG_PATH.open(encoding="utf-8") as cfg:
    config = json.load(cfg)

# Dataset locations from the config are joined onto the directory holding config.json.
project_root = CONFIG_PATH.parent
raw_path = project_root / config["data"]["raw_dataset"]
clean_path = project_root / config["data"]["clean_dataset"]
print(f"Ruta de datos crudos: {raw_path}")
print(f"Ruta de salida limpia: {clean_path}")


Ruta de datos crudos: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\Base_de_datos.csv
Ruta de salida limpia: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\data\clean_students.csv


In [5]:
# Load the raw CSV from the path resolved in the configuration cell and report
# its (rows, columns) shape before previewing the first records.
df_raw = pd.read_csv(raw_path)
original_shape = df_raw.shape
print(f"Dimensiones originales: {original_shape}")
df_raw.head(5)


Dimensiones originales: (1000, 8)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
def to_snake(name: str) -> str:
    """Convert a column label to snake_case.

    The label is stripped and lower-cased. Every character that is not a
    letter or a digit (spaces, "/", "-", ":", tabs, ...) is treated as a
    separator, and each run of separators becomes a single underscore.
    Leading and trailing separators are dropped.

    Args:
        name: Original column label, e.g. "race/ethnicity".

    Returns:
        The normalized label, e.g. "race_ethnicity".
    """
    lowered = name.strip().lower()
    # Map every non-alphanumeric character to "_" so all separators look alike.
    separated = "".join(ch if ch.isalnum() else "_" for ch in lowered)
    # Splitting on "_" and discarding empty tokens collapses repeated separators
    # ("math - score" -> "math_score") and trims leading/trailing ones.
    return "_".join(token for token in separated.split("_") if token)

# Build the working frame with snake_case column labels; rename() returns a new
# DataFrame, so df_raw keeps its original headers.
print("Normalizando nombres de columnas...")
df = df_raw.rename(columns=to_snake)
df.head()


Normalizando nombres de columnas...


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [8]:
# Data-quality report: null count per column, number of fully duplicated rows,
# and a statistical summary covering both categorical and numeric columns.
print("Revisando valores nulos y duplicados...")
nulls = df.isna().sum()
duplicates = df.duplicated().sum()

print(nulls)
print(f"Registros duplicados: {duplicates}")
print("\nResumen estadístico:")
# Keep the summary as the cell's last expression so the notebook renders it as
# a table; print() emits a line-wrapped text dump that is hard to read.
df.describe(include="all")


Revisando valores nulos y duplicados...
gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64
Registros duplicados: 0

Resumen estadístico:
        gender race_ethnicity parental_level_of_education     lunch  \
count     1000           1000                        1000      1000   
unique       2              5                           6         2   
top     female        group C                some college  standard   
freq       518            319                         226       645   
mean       NaN            NaN                         NaN       NaN   
std        NaN            NaN                         NaN       NaN   
min        NaN            NaN                         NaN       NaN   
25%        NaN            NaN                         NaN       NaN  

In [9]:
# Make sure the destination folder exists, then write the cleaned frame as
# UTF-8 CSV without the index column.
output_dir = clean_path.parent
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_path, encoding="utf-8", index=False)
print(f"Dataset limpio guardado en {clean_path}")


Dataset limpio guardado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\data\clean_students.csv
