In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/student-grade-prediction/student-mat.csv


In [2]:
pd.set_option('future.no_silent_downcasting', True)

In [3]:
dataset = pd.read_csv("/kaggle/input/student-grade-prediction/student-mat.csv")
dataset.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
dataset = pd.DataFrame(dataset, columns = ["sex", "age", "failures", "studytime", "paid", "higher", "internet", "goout", "Medu", "Fedu", "Mjob", "Fjob", "G3"])
dataset.head()

Unnamed: 0,sex,age,failures,studytime,paid,higher,internet,goout,Medu,Fedu,Mjob,Fjob,G3
0,F,18,0,2,no,yes,no,4,4,4,at_home,teacher,6
1,F,17,0,2,no,yes,yes,3,1,1,at_home,other,6
2,F,15,3,2,yes,yes,yes,2,1,1,at_home,other,10
3,F,15,0,3,yes,yes,yes,2,4,2,health,services,15
4,F,16,0,2,yes,yes,no,2,3,3,other,other,10


## Pré-Processamento

In [5]:
y = dataset.G3
X = dataset.drop("G3", axis = 1)

X.head()

Unnamed: 0,sex,age,failures,studytime,paid,higher,internet,goout,Medu,Fedu,Mjob,Fjob
0,F,18,0,2,no,yes,no,4,4,4,at_home,teacher
1,F,17,0,2,no,yes,yes,3,1,1,at_home,other
2,F,15,3,2,yes,yes,yes,2,1,1,at_home,other
3,F,15,0,3,yes,yes,yes,2,4,2,health,services
4,F,16,0,2,yes,yes,no,2,3,3,other,other


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

binary_features = ['sex', 'paid', 'higher', 'internet']
binary_map = {'yes': 1, 'no': 0, 'F': 0, 'M': 1}
categorical_cols = ['Mjob', 'Fjob']
numerical_cols = ['age', 'failures', 'studytime', 'goout', 'Medu', 'Fedu']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()

Unnamed: 0,sex,age,failures,studytime,paid,higher,internet,goout,Medu,Fedu,Mjob,Fjob
181,M,16,0,2,yes,yes,yes,3,3,3,services,other
194,M,16,0,1,no,yes,yes,3,2,3,other,other
173,F,16,3,2,no,yes,yes,5,1,3,at_home,services
63,F,16,0,3,yes,yes,yes,4,4,3,teacher,health
253,M,16,0,1,no,yes,no,2,2,1,other,other


In [8]:
X_train[binary_features] = X_train[binary_features].replace(binary_map)
X_test[binary_features] = X_test[binary_features].replace(binary_map)

cat_encoder = OneHotEncoder(drop='first', sparse=False)
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ],
    remainder='passthrough' 
)

In [9]:
X_train_array = preprocessor.fit_transform(X_train)
X_test_array = preprocessor.transform(X_test)

In [10]:
cat_encoded_cols = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
processed_columns = list(cat_encoded_cols) + numerical_cols + binary_features  # Note order matches remainder='passthrough'

X_train_processed = pd.DataFrame(X_train_array, columns=processed_columns, index=X_train.index)
X_test_processed = pd.DataFrame(X_test_array, columns=processed_columns, index=X_test.index)

X_train_processed.head()

Unnamed: 0,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher,age,failures,studytime,goout,Medu,Fedu,sex,paid,higher,internet
181,-0.586396,-0.456744,-0.056853,-0.144396,0.246437,0.423207,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,1,1,1
194,-0.586396,-0.456744,-1.254553,-0.144396,-0.680636,0.423207,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0,1,1
173,-0.586396,3.628097,-0.056853,1.644982,-1.607709,0.423207,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,1,1
63,-0.586396,-0.456744,1.140847,0.750293,1.17351,0.423207,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1,1,1
253,-0.586396,-0.456744,-1.254553,-1.039085,-0.680636,-1.434203,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0,1,0


In [11]:
joblib.dump(preprocessor, 'preprocessor_pipeline.pkl')

['preprocessor_pipeline.pkl']

## Treinamento do Modelo

In [12]:
X_train_processed = X_train_processed.apply(pd.to_numeric)
X_test_processed = X_test_processed.apply(pd.to_numeric)

In [13]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict, cross_validate

def avalia_modelo_regressao(modelo, x, y, cv=10):
    resultados = cross_validate(modelo, x, y,
                                scoring=["r2", "neg_mean_absolute_error", "neg_mean_squared_error"],
                                cv=cv, n_jobs=-1, return_estimator=True)
    
    print(f"Modelo: {modelo}\n")
    
    r2 = np.mean(resultados["test_r2"])
    mae = -np.mean(resultados["test_neg_mean_absolute_error"])
    mse = -np.mean(resultados["test_neg_mean_squared_error"])
    rmse = np.sqrt(mse)

    print("Métricas de Regressão (validação cruzada):")
    print(f"R²   --> {r2:.4f}")
    print(f"MAE  --> {mae:.4f}")
    print(f"MSE  --> {mse:.4f}")
    print(f"RMSE --> {rmse:.4f}")

    y_pred = cross_val_predict(modelo, x, y, cv=cv)
    
    print("\nExemplo de comparação real vs previsto:")
    comparacao = pd.DataFrame({'Real': y, 'Previsto': y_pred})
    print(comparacao.head())


In [14]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor,
    ExtraTreesRegressor, GradientBoostingRegressor
)
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [15]:
RANDOM_STATE = 42

models = [
    ('SVR', SVR()),
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(random_state=RANDOM_STATE)),
    ('Lasso Regression', Lasso(random_state=RANDOM_STATE)),
    ('KNN Regressor', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ('Random Forest', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('AdaBoost', AdaBoostRegressor(random_state=RANDOM_STATE)),
    ('Extra Trees', ExtraTreesRegressor(random_state=RANDOM_STATE)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=RANDOM_STATE)),
    ('XGBoost', XGBRegressor(random_state=RANDOM_STATE, verbosity=0)),
    ('LightGBM', LGBMRegressor(random_state=RANDOM_STATE, verbose=-1)),
    ('CatBoost', CatBoostRegressor(random_state=RANDOM_STATE, verbose=False)),
    ('MLP Regressor', MLPRegressor(random_state=RANDOM_STATE, max_iter=1000))
]

for name, model in models:
    print("\n" + "-" * 100)
    print(f"\nAvaliando o Modelo: {name}")
    avalia_modelo_regressao(model, X_train_processed, y_train)


----------------------------------------------------------------------------------------------------

Avaliando o Modelo: SVR
Modelo: SVR()

Métricas de Regressão (validação cruzada):
R²   --> 0.0983
MAE  --> 3.1541
MSE  --> 18.6470
RMSE --> 4.3182

Exemplo de comparação real vs previsto:
     Real   Previsto
181    12  11.851425
194    14  11.653194
173     0   7.951339
63      9  11.033049
253     8  11.781594

----------------------------------------------------------------------------------------------------

Avaliando o Modelo: Linear Regression
Modelo: LinearRegression()

Métricas de Regressão (validação cruzada):
R²   --> 0.0368
MAE  --> 3.3209
MSE  --> 19.4897
RMSE --> 4.4147

Exemplo de comparação real vs previsto:
     Real   Previsto
181    12  12.484393
194    14  10.552316
173     0   2.263182
63      9  10.913481
253     8  11.435653

----------------------------------------------------------------------------------------------------

Avaliando o Modelo: Ridge Regression



Modelo: MLPRegressor(max_iter=1000, random_state=42)

Métricas de Regressão (validação cruzada):
R²   --> -0.2348
MAE  --> 3.8275
MSE  --> 24.5437
RMSE --> 4.9542





Exemplo de comparação real vs previsto:
     Real   Previsto
181    12  12.905004
194    14  12.724791
173     0   1.073056
63      9  11.359972
253     8  12.189819




**Baseline - Regressão Linear**

R²   --> 0.0368

MAE  --> 3.3209

MSE  --> 19.4897

RMSE --> 4.4147

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso

# SVR 
svr_param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.1, 0.5, 1],
    'kernel': ['linear', 'rbf']
}
svr_grid = GridSearchCV(SVR(), svr_param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
svr_grid.fit(X_train_processed, y_train)

# Ridge 
ridge_param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100]
}
ridge_grid = GridSearchCV(Ridge(random_state=42), ridge_param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
ridge_grid.fit(X_train_processed, y_train)

# Lasso 
lasso_param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1]
}
lasso_grid = GridSearchCV(Lasso(random_state=42), lasso_param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
lasso_grid.fit(X_train_processed, y_train)

# Print best results
print("\nMelhor SVR:", svr_grid.best_params_)
print("Melhor Ridge:", ridge_grid.best_params_)
print("Melhor Lasso:", lasso_grid.best_params_)


Melhor SVR: {'C': 0.1, 'epsilon': 0.1, 'kernel': 'linear'}
Melhor Ridge: {'alpha': 100}
Melhor Lasso: {'alpha': 0.1}


In [17]:
models = [
    ('SVR', SVR(C = 0.1, kernel="linear")),
    ('Ridge Regression', Ridge(alpha = 100, random_state=RANDOM_STATE)),
    ('Lasso Regression', Lasso(alpha = 0.1, random_state=RANDOM_STATE)),
]

for name, model in models:
    print("\n" + "-" * 100)
    print(f"\nAvaliando o Modelo: {name}")
    avalia_modelo_regressao(model, X_train_processed, y_train)


----------------------------------------------------------------------------------------------------

Avaliando o Modelo: SVR
Modelo: SVR(C=0.1, kernel='linear')

Métricas de Regressão (validação cruzada):
R²   --> 0.0982
MAE  --> 3.2337
MSE  --> 18.6521
RMSE --> 4.3188

Exemplo de comparação real vs previsto:
     Real   Previsto
181    12  11.597760
194    14  10.516455
173     0   4.973691
63      9  11.399503
253     8  11.518044

----------------------------------------------------------------------------------------------------

Avaliando o Modelo: Ridge Regression
Modelo: Ridge(alpha=100, random_state=42)

Métricas de Regressão (validação cruzada):
R²   --> 0.1200
MAE  --> 3.1765
MSE  --> 18.0807
RMSE --> 4.2521

Exemplo de comparação real vs previsto:
     Real   Previsto
181    12  11.660469
194    14  10.394351
173     0   4.932365
63      9  11.532994
253     8  10.784688

----------------------------------------------------------------------------------------------------



In [18]:
end_model = Ridge(alpha = 100, random_state=RANDOM_STATE)

In [19]:
x_full = np.concatenate([X_train_processed, X_test_processed], axis=0)

y_full = pd.concat([y_train, y_test], axis=0)

In [20]:
end_model.fit(x_full, y_full)

In [21]:
joblib.dump(end_model, "grade_prediction_model.pkl")

['grade_prediction_model.pkl']