<a href="https://colab.research.google.com/github/GuilleC1987/Modulo2_Tarea1/blob/main/Modulo_2_Tarea1_Guillermo_Canas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Load the student-performance CSV from its absolute /content path and preview the first rows.
csv_path = '/content/Student_Performance.csv'
df = pd.read_csv(csv_path)
df.head()

## EDA

In [None]:
# Print the DataFrame summary (column dtypes and non-null counts) to check typing and missing data.
df.info()

In [None]:
# Descriptive statistics, transposed so that each described column is shown as a row.
df.describe().transpose()

In [118]:
# Pairwise Pearson correlation (the pandas default method) over the numeric columns only.
correlation_matrix = df.corr(method="pearson", numeric_only=True)

In [None]:
# Heatmap of the correlation matrix.
heatmap_options = dict(
    annot=True,                # write the numeric value inside each cell
    cmap='coolwarm',           # diverging colour scheme
    center=0,                  # centre the colour scale on 0
    square=True,               # draw square cells
    fmt='.2f',                 # two-decimal number format
    cbar_kws={'shrink': 0.8},  # shrink the colour bar
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Matriz de Correlación - Heatmap', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

Se observa una alta correlación entre las columnas Previous Scores y Performance Index.

In [None]:
# Count missing values per column. Per the notebook author's note, Student_Performance has
# no nulls in any column, so no imputation is done and no high-null columns are removed.
null_values = df.isna().sum()
null_values

In [None]:
# Pairwise scatter plots of df with KDE curves on the diagonal; the trailing ';' hides the repr.
sns.pairplot(data=df, diag_kind="kde");

In [123]:
# Separate the target ("Performance Index") from the input features.
y = df["Performance Index"]
X = df.drop(columns="Performance Index")

In [None]:
# Names of the numeric feature columns.
numeric_cols = X.select_dtypes(include=["number"]).columns
numeric_cols

In [None]:
# Categorical columns: every non-numeric feature, i.e. the exact complement of the
# include="number" selection used for the numeric columns. Selecting only "object" would
# miss category, bool or pandas string columns, and ColumnTransformer (remainder="drop"
# by default) would then discard them silently. These columns must be converted to
# numeric form before the regression models are trained.
categorical_cols = X.select_dtypes(exclude="number").columns
categorical_cols

# Escalamiento y codificación

In [129]:
# Numeric branch: a single standardisation step named "escalador".
numeric_pipeline = Pipeline([("escalador", StandardScaler())])

# Feature Engineering. Transformación de variables categóricas a numéricas haciendo uso de sklearn.preprocessing.OneHotEncoder

In [133]:
# Categorical branch: dense one-hot encoding; categories unseen during fit are ignored at transform time.
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_pipeline = Pipeline([("onehot", onehot_encoder)])

In [134]:
# Route each column group to its own preprocessing branch.
preprocessor = ColumnTransformer([
    ("numerico", numeric_pipeline, numeric_cols),
    ("categorico", categorical_pipeline, categorical_cols),
])

In [200]:
# Regression models compared in this exercise, all created with scikit-learn's default
# hyperparameters. The estimator classes are imported in the notebook's import cell.
modelo_lineal = LinearRegression()
modelo_ridge = Ridge()
modelo_lasso = Lasso()
modelo_elasticnet = ElasticNet()

# Pipeline Modelo Regresión Lineal

In [201]:
# Linear-regression pipeline: the shared preprocessor followed by the estimator.
pipeline_reg_lineal = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_lineal),
])

In [None]:
# Training: hold out 20% of the rows as a test set (fixed seed for a reproducible split)
# and fit the linear-regression pipeline on the remaining 80%.
# train_test_split is already imported in the notebook's import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline_reg_lineal.fit(X_train, y_train)

In [203]:
# Predict the target for the held-out test features with the fitted linear-regression pipeline.
y_pred = pipeline_reg_lineal.predict(X_test)

In [None]:
# Residuals (actual minus predicted) on the test split, shown as a 30-bin histogram with a KDE curve.
residuos = y_test - y_pred
sns.histplot(data=residuos, bins=30, kde=True);

In [None]:
# Test-split metrics for the linear model: mean absolute error and R².
MAE = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MAE: {MAE}", f"R2: {r2}", sep="\n")

In [206]:
# Cross-validation setup for linear regression: a new LinearRegression instance wrapped
# in a pipeline with the shared preprocessor.
modelo_lineal = LinearRegression()
pipeline_reg_lineal = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_lineal),
])

In [None]:
# Bare expression: renders the pipeline's representation as the cell output.
pipeline_reg_lineal

In [208]:
# 80/20 hold-out split with a fixed seed; shuffle=True is stated explicitly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [209]:
# KFold is the k-fold splitter used by the cross-validation cells that follow.
# The bare `KFold` expression that only echoed the class repr has been removed.
from sklearn.model_selection import KFold

In [None]:
# 5-fold splitter with shuffling and a fixed seed, so the folds are reproducible.
fold = KFold(n_splits=5, random_state=42, shuffle=True)
fold

In [211]:
# Per-fold R² of the linear-regression pipeline, cross-validated on the training split only.
scores_linear = cross_val_score(
    pipeline_reg_lineal,
    X_train,
    y_train,
    scoring="r2",
    cv=fold,
)

In [None]:
# Bare expression: shows the per-fold R² scores as the cell output.
scores_linear

# Pipeline Modelo Ridge

In [213]:
# Ridge pipeline: the shared preprocessor followed by the Ridge estimator.
pipeline_ridge = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_ridge),
])

In [None]:
# Training: 80/20 hold-out split with a fixed seed, then fit the Ridge pipeline on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
pipeline_ridge.fit(X_train, y_train)

In [215]:
# Predict the target for the held-out test features with the fitted Ridge pipeline.
# Note: this rebinds y_pred, replacing the previous model's predictions.
y_pred = pipeline_ridge.predict(X_test)

In [None]:
# Ridge residuals (actual minus predicted) on the test split, as a 30-bin histogram with a KDE curve.
residuos = y_test - y_pred
sns.histplot(data=residuos, bins=30, kde=True);

In [None]:
# Test-split metrics for Ridge: mean absolute error and R².
MAE_ridge = mean_absolute_error(y_test, y_pred)
r2_ridge = r2_score(y_test, y_pred)
print(f"MAE: {MAE_ridge}", f"R2: {r2_ridge}", sep="\n")

In [218]:
# Cross-validation setup for Ridge: a new Ridge instance wrapped in a pipeline with the
# shared preprocessor.
modelo_ridge = Ridge()
pipeline_ridge = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_ridge),
])

In [None]:
# Bare expression: renders the Ridge pipeline's representation as the cell output.
pipeline_ridge

In [220]:
# 80/20 hold-out split with a fixed seed; shuffle=True is stated explicitly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# 5-fold splitter with shuffling and a fixed seed, so the folds are reproducible.
fold = KFold(n_splits=5, random_state=42, shuffle=True)
fold

In [None]:
# Per-fold R² of the Ridge pipeline, cross-validated on the training split only.
scores_ridge = cross_val_score(
    pipeline_ridge,
    X_train,
    y_train,
    scoring="r2",
    cv=fold,
)
scores_ridge

# Pipeline modelo Lasso

In [223]:
# Lasso pipeline: the shared preprocessor followed by the Lasso estimator.
pipeline_lasso = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_lasso),
])

In [None]:
# Training: 80/20 hold-out split with a fixed seed, then fit the Lasso pipeline on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
pipeline_lasso.fit(X_train, y_train)

In [225]:
# Predict the target for the held-out test features with the fitted Lasso pipeline.
# Note: this rebinds y_pred, replacing the previous model's predictions.
y_pred = pipeline_lasso.predict(X_test)

In [None]:
# Lasso residuals (actual minus predicted) on the test split, as a 30-bin histogram with a KDE curve.
residuos = y_test - y_pred
sns.histplot(data=residuos, bins=30, kde=True);

In [None]:
# Test-split metrics for Lasso: mean absolute error and R².
MAE_lasso = mean_absolute_error(y_test, y_pred)
r2_lasso = r2_score(y_test, y_pred)
print(f"MAE: {MAE_lasso}", f"R2: {r2_lasso}", sep="\n")

In [228]:
# Cross-validation setup for Lasso: a new Lasso instance wrapped in a pipeline with the
# shared preprocessor.
modelo_lasso = Lasso()
pipeline_lasso = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_lasso),
])

In [229]:
# 80/20 hold-out split with a fixed seed; shuffle=True is stated explicitly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# 5-fold splitter with shuffling and a fixed seed, so the folds are reproducible.
fold = KFold(n_splits=5, random_state=42, shuffle=True)
fold

In [None]:
# Per-fold R² of the Lasso pipeline, cross-validated on the training split only.
scores_lasso = cross_val_score(
    pipeline_lasso,
    X_train,
    y_train,
    scoring="r2",
    cv=fold,
)
scores_lasso

# Pipeline Modelo ElasticNet

In [232]:
# ElasticNet pipeline: the shared preprocessor followed by the ElasticNet estimator.
pipeline_elasticnet = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_elasticnet),
])

In [None]:
# Training: 80/20 hold-out split with a fixed seed, then fit the ElasticNet pipeline on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
pipeline_elasticnet.fit(X_train, y_train)

In [234]:
# Predict the target for the held-out test features with the fitted ElasticNet pipeline.
# Note: this rebinds y_pred, replacing the previous model's predictions.
y_pred = pipeline_elasticnet.predict(X_test)

In [None]:
# ElasticNet residuals (actual minus predicted) on the test split, as a 30-bin histogram with a KDE curve.
residuos = y_test - y_pred
sns.histplot(data=residuos, bins=30, kde=True);

In [None]:
# Test-split metrics for ElasticNet: mean absolute error and R².
MAE_elasticnet = mean_absolute_error(y_test, y_pred)
r2_elasticnet = r2_score(y_test, y_pred)
print(f"MAE: {MAE_elasticnet}", f"R2: {r2_elasticnet}", sep="\n")

In [237]:
# Cross-validation setup for ElasticNet: a new ElasticNet instance wrapped in a pipeline
# with the shared preprocessor.
modelo_elasticnet = ElasticNet()
pipeline_elasticnet = Pipeline([
    ("preprocessor", preprocessor),
    ("modelo", modelo_elasticnet),
])

In [238]:
# 80/20 hold-out split with a fixed seed; shuffle=True is stated explicitly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# 5-fold splitter with shuffling and a fixed seed, so the folds are reproducible.
fold = KFold(n_splits=5, random_state=42, shuffle=True)
fold

In [None]:
# Per-fold R² of the ElasticNet pipeline, cross-validated on the training split only.
scores_elasticnet = cross_val_score(
    pipeline_elasticnet,
    X_train,
    y_train,
    scoring="r2",
    cv=fold,
)
scores_elasticnet

In [241]:
# Collect the per-fold R² scores into one table: one column per model, one row per fold.
df_scores = pd.DataFrame(
    dict(
        LinearRegression=scores_linear,
        Ridge=scores_ridge,
        Lasso=scores_lasso,
        ElasticNet=scores_elasticnet,
    )
)

In [None]:
# Bare expression: shows the cross-validation score table as the cell output.
df_scores

De los modelos anteriores, y con el fin de evitar el sobreajuste (overfitting), se selecciona el modelo ElasticNet, dado que predice los datos con un R² alto sin memorizarlos.