# Análisis exploratorio de Students Performance


Exploramos la distribución de variables, correlaciones y guardamos figuras clave en `mlops_pipeline/reports/`.


In [3]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

CONFIG_PATH = Path("config.json")
if not CONFIG_PATH.exists():
    CONFIG_PATH = Path("../../config.json").resolve()

with CONFIG_PATH.open(encoding="utf-8") as cfg:
    config = json.load(cfg)

project_root = CONFIG_PATH.parent
clean_path = project_root / config["data"]["clean_dataset"]
reports_dir = project_root / config["paths"]["eda_reports"]
reports_dir.mkdir(parents=True, exist_ok=True)

sns.set_theme(style="whitegrid")
df = pd.read_csv(clean_path)
print(f"Datos listos: {df.shape} registros")
numeric_features = config["features"]["numeric"] + [config["features"]["target"]]
categorical_features = config["features"]["categorical"]
print("Variables numéricas:", numeric_features)
print("Variables categóricas:", categorical_features)


Datos listos: (1000, 8) registros
Variables numéricas: ['reading_score', 'writing_score', 'math_score']
Variables categóricas: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [4]:
display(df.head())

print("\nResumen estadístico solo numéricas:")
display(df[numeric_features].describe().T)

print("\nDistribución de variables categóricas:")
for col in categorical_features:
    print(f"\nColumna: {col}")
    print(df[col].value_counts())


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75



Resumen estadístico solo numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
reading_score,1000.0,69.169,14.600192,17.0,59.0,70.0,79.0,100.0
writing_score,1000.0,68.054,15.195657,10.0,57.75,69.0,79.0,100.0
math_score,1000.0,66.089,15.16308,0.0,57.0,66.0,77.0,100.0



Distribución de variables categóricas:

Columna: gender
gender
female    518
male      482
Name: count, dtype: int64

Columna: race_ethnicity
race_ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

Columna: parental_level_of_education
parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

Columna: lunch
lunch
standard        645
free/reduced    355
Name: count, dtype: int64

Columna: test_preparation_course
test_preparation_course
none         642
completed    358
Name: count, dtype: int64


In [5]:
print("Calculando matriz de correlación...")
corr = df[numeric_features].corr()
display(corr)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap="YlGnBu", fmt=".2f", ax=ax)
ax.set_title("Correlación de variables numéricas")
figure_path = reports_dir / "heatmap_correlaciones.png"
fig.tight_layout()
fig.savefig(figure_path, dpi=200)
plt.close(fig)
print(f"Heatmap guardado en {figure_path}")


Calculando matriz de correlación...


Unnamed: 0,reading_score,writing_score,math_score
reading_score,1.0,0.954598,0.81758
writing_score,0.954598,1.0,0.802642
math_score,0.81758,0.802642,1.0


Heatmap guardado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\heatmap_correlaciones.png


In [6]:
print("Generando histogramas...")
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.histplot(df[col], kde=True, ax=ax, color="#4C72B0")
    ax.set_title(f"Histograma de {col}")
    ax.set_xlabel(col)
    fig.tight_layout()
    out_path = reports_dir / f"hist_{col}.png"
    fig.savefig(out_path, dpi=200)
    plt.close(fig)
    print(f"Histograma guardado: {out_path}")


Generando histogramas...
Histograma guardado: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\hist_reading_score.png
Histograma guardado: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\hist_writing_score.png
Histograma guardado: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\hist_math_score.png


In [7]:
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="gender", y=config["features"]["target"], palette="Set2", ax=ax)
ax.set_title("Math score por género")
ax.set_xlabel("Género")
ax.set_ylabel("Math score")
fig.tight_layout()
box_path = reports_dir / "boxplot_math_por_genero.png"
fig.savefig(box_path, dpi=200)
plt.close(fig)
print(f"Boxplot guardado en {box_path}")


Boxplot guardado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\boxplot_math_por_genero.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x="gender", y=config["features"]["target"], palette="Set2", ax=ax)
