In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Balance the dataset: draw 222 random rows from every ethnicity group.
# Each group is sampled explicitly instead of going through groupby.apply:
# applying a function over the grouping column is deprecated since pandas 2.2,
# and newer releases exclude that column, which would silently remove
# 'Ethnicity' from the balanced frame. Iterating the groups keeps every column
# and selects the same rows (same seed per group, groups in sorted key order).
# NOTE: sample() raises ValueError if any group has fewer than 222 rows.
df_balanced_gpa = pd.concat(
    [grupo.sample(n=222, random_state=42) for _, grupo in df.groupby('Ethnicity')]
).reset_index(drop=True)

# Remove the StudentID and GradeClass columns before building the feature matrix.
df_balanced_gpa = df_balanced_gpa.drop(columns=['StudentID', 'GradeClass'])

# Features are every remaining column except GPA; GPA is the regression target.
X_gpa = df_balanced_gpa.drop(columns='GPA')
y_gpa = df_balanced_gpa['GPA']

# Split into train and test sets (80% train, 20% test) with a fixed seed.
X_train_gpa, X_test_gpa, y_train_gpa, y_test_gpa = train_test_split(
    X_gpa, y_gpa, test_size=0.2, random_state=42
)

In [None]:
# Report the dimensions of each piece of the GPA train/test split,
# one line per piece, in the order train/test features then train/test targets.
for etiqueta, particion in (
    ("X_train_gpa", X_train_gpa),
    ("X_test_gpa", X_test_gpa),
    ("y_train_gpa", y_train_gpa),
    ("y_test_gpa", y_test_gpa),
):
    print(f"{etiqueta} shape:", particion.shape)

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
import joblib

# Column names routed to each preprocessing branch.
numeric_features = ['Age', 'StudyTimeWeekly', 'Absences']
categorical_features = ['Gender', 'Ethnicity', 'ParentalEducation', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering']

# Preprocessor: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' encodes categories not seen during
# fit as all zeros at predict time instead of raising. Columns not listed in
# either group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Full pipeline: preprocessing followed by a Ridge regressor (default alpha).
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', Ridge()),
])

# Fit on the balanced GPA split built earlier in this notebook.
# The previous version fit on X_train / y_train, which are not the GPA split:
# on a fresh kernel that raises NameError, and with leftover kernel state it
# would train this GPA regressor on a different dataset/target.
pipeline.fit(X_train_gpa, y_train_gpa)

# Persist the fitted pipeline (preprocessing + model) to disk.
joblib.dump(pipeline, 'pipeline_regresion.joblib')

['pipeline_regresion.joblib']

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
import joblib

# Column names routed to each preprocessing branch.
numeric_features = ['Age', 'StudyTimeWeekly', 'Absences']
categorical_features = ['Gender', 'Ethnicity', 'ParentalEducation', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 'Volunteering']

# Allowed codes for each categorical column. The list is positional: entry i
# is the category set for categorical_features[i].
categorical_values = [
    [0, 1],                # Gender
    [0, 1, 2, 3],          # Ethnicity
    [0, 1, 2, 3, 4],       # ParentalEducation
    [0, 1],                # Tutoring
    [0, 1, 2, 3, 4],       # ParentalSupport
    [0, 1],                # Extracurricular
    [0, 1],                # Sports
    [0, 1],                # Music
    [0, 1]                 # Volunteering
]

# Guard the positional pairing: a missing or extra entry would otherwise
# attach category sets to the wrong columns or fail later inside fit.
assert len(categorical_values) == len(categorical_features), (
    "categorical_values must have one entry per categorical feature"
)

# Preprocessor with fixed categories: the one-hot output always has the same
# columns regardless of which codes happen to appear in the training sample.
# handle_unknown='ignore' encodes codes outside these lists as all zeros.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(categories=categorical_values, handle_unknown='ignore'), categorical_features),
    ]
)

# Full pipeline: preprocessing followed by a Ridge regressor (default alpha).
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('reg', Ridge()),
])

# Fit on the balanced GPA split built earlier in this notebook.
# The previous version fit on X_train / y_train, which are not the GPA split:
# on a fresh kernel that raises NameError, and with leftover kernel state it
# would train this GPA regressor on a different dataset/target.
pipeline.fit(X_train_gpa, y_train_gpa)

# Persist the fitted pipeline; this overwrites any earlier file of the same name.
joblib.dump(pipeline, 'pipeline_regresion.joblib')

In [None]:
# Pick two random rows from the held-out GPA test set (seeded for
# reproducibility). The previous version sampled from X_test, which is not the
# split produced for the GPA model and is undefined on a fresh kernel.
muestra_aleatoria = X_test_gpa.sample(n=2, random_state=123)
print("Registros seleccionados:\n", muestra_aleatoria)

# Predict GPA with the in-memory fitted pipeline (it is not reloaded from disk here).
predicciones_aleatorias = pipeline.predict(muestra_aleatoria)
print("Predicciones de GPA para los dos registros aleatorios:", predicciones_aleatorias)