In [2]:
import pandas as pd
import numpy as np
# Load the student performance dataset from the working directory.
df = pd.read_csv("StudentPerformance.csv")

# Count missing values per column; the recorded output shows zero for
# every column, i.e. there are no null values.
print(df.isna().sum())

# Preview the first five rows.
df.head(5)

StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64


Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [11]:
# Name of the column the regressor is trained to predict.
target = "GPA"

# Sensitive attributes; the feature-selection cell keeps these out of the
# model inputs.
sensitive = [
    "Gender",
    "Ethnicity",
]

# Columns that are never used as features: the row identifier, the target
# itself, and the GradeClass column.
drop_cols = [
    "StudentID",
    target,
    "GradeClass",
]

In [12]:
# Numeric features are named explicitly; every other retained feature is
# treated as categorical.
num_cols = ["Age", "StudyTimeWeekly", "Absences"]

# Keep every column that is neither in drop_cols nor a sensitive attribute,
# preserving the original column order.
train_cols = [
    col
    for col in df.columns
    if col not in drop_cols and col not in sensitive
]
cat_cols = [col for col in train_cols if col not in num_cols]

# Feature matrix (copied so later edits cannot touch df) and float target.
X = df[train_cols].copy()
y = df[target].astype(float)

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Full-width rows (all original columns, including the sensitive ones) for
# the same train/test partition, looked up by index label.
df_train = df.loc[X_train.index].copy()
df_test = df.loc[X_test.index].copy()

In [13]:
# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. Categories not seen during fit are ignored at
# transform time, no category level is dropped, and any column not listed
# in num_cols or cat_cols is discarded.
preprocess = ColumnTransformer(
    [
        ("num", StandardScaler(), num_cols),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", drop=None),
            cat_cols,
        ),
    ],
    remainder="drop",
)


In [14]:
# Elastic-net regressor; alpha and the L1/L2 mix are selected by 5-fold
# cross-validation over the candidate l1_ratio values listed here.
model = ElasticNetCV(
    l1_ratio=[0.05, 0.1, 0.5, 0.9, 0.95, 1.0],
    cv=5,
    random_state=42,
)

# Preprocessing and estimator combined into a single pipeline.
reg = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", model),
    ]
)

# Fit on the training split (fit returns the pipeline itself), then predict
# GPA for the held-out rows.
y_pred = reg.fit(X_train, y_train).predict(X_test)


In [15]:
# Hold-out regression metrics for the predictions made on X_test.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # square root of the MSE
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}")

# Hyperparameters selected by the cross-validated elastic net inside the
# fitted pipeline.
best_alpha, best_l1 = (
    reg.named_steps["model"].alpha_,
    reg.named_steps["model"].l1_ratio_,
)
print(f"Best alpha: {best_alpha:.6f} | Best l1_ratio: {best_l1}")

MAE: 0.156 | RMSE: 0.197 | R²: 0.953
Best alpha: 0.000842 | Best l1_ratio: 1.0


In [16]:
import joblib

# Persist the whole fitted pipeline (preprocessing + model) to disk. The call
# is left as the cell's last expression so the list of written file names is
# displayed as the cell output.
joblib.dump(value=reg, filename="student_gpa_predictor.pkl")

['student_gpa_predictor.pkl']