# Análisis exploratorio de Students Performance


Exploramos la distribución de variables, correlaciones y guardamos figuras clave en `mlops_pipeline/reports/`.


In [3]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

CONFIG_PATH = Path("config.json")
if not CONFIG_PATH.exists():
    CONFIG_PATH = Path("../../config.json").resolve()

with CONFIG_PATH.open(encoding="utf-8") as cfg:
    config = json.load(cfg)

project_root = CONFIG_PATH.parent
clean_path = project_root / config["data"]["clean_dataset"]
reports_dir = project_root / config["paths"]["eda_reports"]
reports_dir.mkdir(parents=True, exist_ok=True)

sns.set_theme(style="whitegrid")
df = pd.read_csv(clean_path)
print(f"Datos listos: {df.shape} registros")
target = config["features"]["target"]
numeric_features = config["features"]["numeric"] + [target]
categorical_features = config["features"]["categorical"]
print("Variables numéricas:", numeric_features)
print("Variables categóricas:", categorical_features)


Datos listos: (1000, 8) registros
Variables numéricas: ['reading_score', 'writing_score', 'math_score']
Variables categóricas: ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [None]:
print('Diagnóstico inicial del dataset limpio')
print(f'Observaciones: {df.shape[0]} | Variables: {df.shape[1]}')
df.info()
null_tokens = ['', 'NA', 'Na', 'na', 'N/A', 'n/a', 'Null', 'NULL']
df[categorical_features] = df[categorical_features].replace(null_tokens, pd.NA)
df[categorical_features] = df[categorical_features].astype('category')
df[config['features']['numeric']] = df[config['features']['numeric']].apply(pd.to_numeric, errors='coerce')
duplicate_rows = df.duplicated().sum()
if duplicate_rows:
    df = df.drop_duplicates().reset_index(drop=True)
print(f'Duplicados eliminados: {duplicate_rows}')
null_summary = df.isna().sum().sort_values(ascending=False)
print('Nulos por columna:')
print(null_summary)
describe_df = df[numeric_features].describe().T
display(describe_df)
describe_path = reports_dir / 'describe_numeric.csv'
describe_df.to_csv(describe_path, encoding='utf-8')
print(f'report describe guardado en {describe_path}')


In [4]:
display(df.head())

print("\nResumen estadístico solo numéricas:")
display(df[numeric_features].describe().T)

print("\nDistribución de variables categóricas:")
for col in categorical_features:
    print(f"\nColumna: {col}")
    print(df[col].value_counts())


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75



Resumen estadístico solo numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
reading_score,1000.0,69.169,14.600192,17.0,59.0,70.0,79.0,100.0
writing_score,1000.0,68.054,15.195657,10.0,57.75,69.0,79.0,100.0
math_score,1000.0,66.089,15.16308,0.0,57.0,66.0,77.0,100.0



Distribución de variables categóricas:

Columna: gender
gender
female    518
male      482
Name: count, dtype: int64

Columna: race_ethnicity
race_ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

Columna: parental_level_of_education
parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

Columna: lunch
lunch
standard        645
free/reduced    355
Name: count, dtype: int64

Columna: test_preparation_course
test_preparation_course
none         642
completed    358
Name: count, dtype: int64


In [None]:
print('Calculando medidas estadísticas extendidas')
num_cols = numeric_features
mode_values = df[num_cols].mode().iloc[0]
stats_df = pd.DataFrame({
    'mean': df[num_cols].mean(),
    'median': df[num_cols].median(),
    'mode': mode_values,
    'range': df[num_cols].max() - df[num_cols].min(),
    'iqr': df[num_cols].quantile(0.75) - df[num_cols].quantile(0.25),
    'variance': df[num_cols].var(),
    'std_dev': df[num_cols].std(),
    'skewness': df[num_cols].skew(),
    'kurtosis': df[num_cols].kurtosis(),
})
stats_df = stats_df.round(3)
display(stats_df)
stats_path = reports_dir / 'numeric_stats.csv'
stats_df.to_csv(stats_path, encoding='utf-8')
print(f'Estadísticos guardados en {stats_path}')


In [None]:
print('Tablas pivote del target por cada variable categórica')
for col in categorical_features:
    pivot = (
        df.groupby(col)[target]
        .agg(count='count', mean='mean', median='median')
        .sort_values('mean', ascending=False)
    )
    display(pivot)
    out_path = reports_dir / f'pivot_{col}_target.csv'
    pivot.to_csv(out_path, encoding='utf-8')
    print(f'Tabla pivote guardada en {out_path}')


In [None]:
import math
n_cols = 2
n_rows = math.ceil(len(categorical_features) / n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4 * n_rows))
axes = axes.flatten()
for ax, col in zip(axes, categorical_features):
    sns.countplot(data=df, x=col, palette='viridis', ax=ax)
    ax.set_title(f'Distribución de {col}')
    ax.tick_params(axis='x', rotation=30)
for ax in axes[len(categorical_features):]:
    ax.axis('off')
fig.tight_layout()
countplot_path = reports_dir / 'countplots_categoricas.png'
fig.savefig(countplot_path, dpi=200)
plt.close(fig)
print(f'Countplots guardados en {countplot_path}')


In [5]:
print("Calculando matriz de correlación...")
corr = df[numeric_features].corr()
display(corr)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap="YlGnBu", fmt=".2f", ax=ax)
ax.set_title("Correlación de variables numéricas")
figure_path = reports_dir / "heatmap_correlaciones.png"
fig.tight_layout()
fig.savefig(figure_path, dpi=200)
plt.close(fig)
print(f"Heatmap guardado en {figure_path}")


Calculando matriz de correlación...


Unnamed: 0,reading_score,writing_score,math_score
reading_score,1.0,0.954598,0.81758
writing_score,0.954598,1.0,0.802642
math_score,0.81758,0.802642,1.0


Heatmap guardado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\heatmap_correlaciones.png


In [6]:
print("Generando histogramas...")
for col in numeric_features:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.histplot(df[col], kde=True, ax=ax, color="#4C72B0")
    ax.set_title(f"Histograma de {col}")
    ax.set_xlabel(col)
    fig.tight_layout()
    out_path = reports_dir / f"hist_{col}.png"
    fig.savefig(out_path, dpi=200)
    plt.close(fig)
    print(f"Histograma guardado: {out_path}")


Generando histogramas...
Histograma guardado: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\hist_reading_score.png
Histograma guardado: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\hist_writing_score.png
Histograma guardado: C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\hist_math_score.png


In [7]:
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=df, x="gender", y=config["features"]["target"], palette="Set2", ax=ax)
ax.set_title("Math score por género")
ax.set_xlabel("Género")
ax.set_ylabel("Math score")
fig.tight_layout()
box_path = reports_dir / "boxplot_math_por_genero.png"
fig.savefig(box_path, dpi=200)
plt.close(fig)
print(f"Boxplot guardado en {box_path}")


Boxplot guardado en C:\Users\juanp\OneDrive\Escritorio\ML\Proyecto\mlops_pipeline\reports\boxplot_math_por_genero.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x="gender", y=config["features"]["target"], palette="Set2", ax=ax)


In [None]:
pair_cols = list(dict.fromkeys(numeric_features))
pairplot = sns.pairplot(df[pair_cols + ['gender']], hue='gender', corner=True, diag_kind='hist')
pairplot.fig.suptitle('Pairplot de variables numéricas', y=1.02)
pairplot_path = reports_dir / 'pairplot_numeric.png'
pairplot.fig.savefig(pairplot_path, dpi=200)
plt.close('all')
print(f'Pairplot guardado en {pairplot_path}')


In [None]:
fig, axes = plt.subplots(1, len(config['features']['numeric']), figsize=(12, 4))
axes = np.atleast_1d(axes)
for ax, col in zip(axes, config['features']['numeric']):
    sns.scatterplot(data=df, x=col, y=target, hue='test_preparation_course', palette='Set2', ax=ax)
    ax.set_title(f'{col} vs {target}')
    ax.legend(loc='best', fontsize=8)
fig.tight_layout()
scatter_path = reports_dir / 'scatter_vs_target.png'
fig.savefig(scatter_path, dpi=200)
plt.close(fig)
print(f'Relación con el target guardada en {scatter_path}')


### Reglas de validación y atributos derivados sugeridos
- Validar que `math_score`, `reading_score` y `writing_score` permanezcan entre 0 y 100.
- Revisar que `test_preparation_course` solo tome los valores ['none', 'completed'].
- Confirmar que `lunch` conserve las etiquetas ['standard', 'free/reduced'].
- Crear atributos como `language_avg = (reading_score + writing_score)/2` o banderas binarias para `test_preparation_course` y `parental_level_of_education` ordenado.
- Monitorear cambios en la proporción de `gender` y `race_ethnicity` para detectar sesgos tempranos.
