In [61]:
import sklearn
import joblib

import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [62]:
df = pd.read_csv('data.csv')
y = df['GPA']
x = df.drop(columns=['StudentID', 'GradeClass', 'GPA', 'Gender', 'Ethnicity'])
x = pd.get_dummies(x, columns=['ParentalEducation'], drop_first=True)


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Train size:", x_train.shape, y_train.shape)
print("Test size:", x_test.shape, y_test.shape)

Train size: (1913, 13) (1913,)
Test size: (479, 13) (479,)


In [63]:
# 1. Lineal
linear = LinearRegression()
linear.fit(x_train, y_train)
y_pred_lin = linear.predict(x_test)

# 2. Polinómico grado 2
poly2 = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lin", LinearRegression())
])
poly2.fit(x_train, y_train)
y_pred_poly2 = poly2.predict(x_test)

# 3. Árbol de decisión
tree = DecisionTreeRegressor(random_state=42)
tree.fit(x_train, y_train)
y_pred_tree = tree.predict(x_test)

# 4. Random Forest
forest = RandomForestRegressor(n_estimators=200, random_state=42)
forest.fit(x_train, y_train)
y_pred_forest = forest.predict(x_test)

In [64]:
model_names = ["Lineal", "Polinómica (deg=2)", "Tree", "RandomForest"]
predictions = [y_pred_lin, y_pred_poly2, y_pred_tree, y_pred_forest]

results = []

for name, y_pred in zip(model_names, predictions):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    results.append((name, rmse, r2))
    print(f"{name:20}: RMSE={rmse:.3f}, R²={r2:.3f}")

Lineal              : RMSE=0.196, R²=0.953
Polinómica (deg=2)  : RMSE=0.201, R²=0.951
Tree                : RMSE=0.332, R²=0.866
RandomForest        : RMSE=0.239, R²=0.931


In [65]:
best_model = max(results, key=lambda x: x[2]) 
print(f"\nEl modelo más óptimo según R² es: {best_model[0]} con R²={best_model[2]:.3f} y RMSE={best_model[1]:.3f}")


El modelo más óptimo según R² es: Lineal con R²=0.953 y RMSE=0.196


In [66]:
print("\n=== Importancia de variables ===")
if best_model[0] == "Lineal":
    for var, peso in zip(x_train.columns, linear.coef_):
        print(f"{var}: {peso:.4f}")
elif best_model[0] == "Polinómica (deg=2)":
    # Obtener nombres de features polinómicas
    feature_names = poly2.named_steps['poly'].get_feature_names_out(x_train.columns)
    for var, peso in zip(feature_names, poly2.named_steps['lin'].coef_):
        print(f"{var}: {peso:.4f}")
elif best_model[0] in ["Tree", "RandomForest"]:
    model_to_use = tree if best_model[0]=="Tree" else forest
    for var, imp in zip(x_train.columns, model_to_use.feature_importances_):
        print(f"{var}: {imp:.4f}")


=== Importancia de variables ===
Age: -0.0057
StudyTimeWeekly: 0.0291
Absences: -0.0995
Tutoring: 0.2576
ParentalSupport: 0.1479
Extracurricular: 0.1900
Sports: 0.1842
Music: 0.1513
Volunteering: -0.0049
ParentalEducation_1: -0.0019
ParentalEducation_2: 0.0082
ParentalEducation_3: -0.0120
ParentalEducation_4: 0.0149


In [67]:
if best_model[0] == "Lineal":
    joblib.dump(linear, 'best_model.pkl')
elif best_model[0] == "Polinómica (deg=2)":
    joblib.dump(poly2, 'best_model.pkl')
elif best_model[0] == "Tree":
    joblib.dump(tree, 'best_model.pkl')
else:
    joblib.dump(forest, 'best_model.pkl')

print("\nModelo óptimo guardado en 'best_model.pkl'")


Modelo óptimo guardado en 'best_model.pkl'
