In [1]:
import numpy as np
import pandas as pd
import seaborn as sea 
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import make_pipeline as pipe
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer

from sklearn.linear_model import ElasticNet

import joblib
import optuna

import warnings
warnings.filterwarnings("ignore")

In [2]:
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, ndarray, pandas Series, ...).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    # Convert to plain float arrays before subtracting so the difference is
    # always positional. Two pandas Series with different indexes would
    # otherwise be aligned by label and produce NaNs instead of an error.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [3]:
# Read the three competition files. The two identifier columns are dropped
# from the train and test frames right after loading.
train = pd.read_csv(
    "/kaggle/input/student-performance-prediction-machine-learning-challenge/train.csv"
).drop(columns=["id", "student_id"])

test = pd.read_csv(
    "/kaggle/input/student-performance-prediction-machine-learning-challenge/test.csv"
).drop(columns=["id", "student_id"])

# The sample submission is kept as loaded; it is later filled in and written out.
sample_submission = pd.read_csv(
    "/kaggle/input/student-performance-prediction-machine-learning-challenge/sample_submission.csv"
)

In [4]:
def cross_validate(model, X, y, n_splits=5):
    """Evaluate ``model`` with shuffled K-fold cross-validation.

    A fresh clone of ``model`` is fitted on every training fold and scored
    with ``RMSE`` on the matching validation fold, so the estimator passed in
    is never fitted or modified by this function.

    Parameters
    ----------
    model : estimator
        Unfitted estimator (or pipeline) accepted by ``sklearn.base.clone``.
    X : pandas.DataFrame or array-like
        Feature matrix with one row per sample.
    y : pandas.Series or array-like
        Target values, in the same row order as ``X``.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_splits`` holding the validation RMSE of each fold.
    """
    kfold  = KFold(n_splits=n_splits, shuffle=True, random_state=3126)
    scores = np.zeros(n_splits)

    # KFold.split yields *positional* indices. Plain ``y[idx]`` on a pandas
    # Series is label-based and only works with a default RangeIndex, so
    # select by position for pandas objects and fall back to ndarray indexing.
    X_rows = X.iloc if hasattr(X, "iloc") else np.asarray(X)
    y_rows = y.iloc if hasattr(y, "iloc") else np.asarray(y)

    for i, (train_idx, val_idx) in enumerate(kfold.split(X)):
        X_train, y_train = X_rows[train_idx], y_rows[train_idx]
        X_val,   y_val   = X_rows[val_idx],   y_rows[val_idx]
        ## Fits cloned model
        cloned_model = clone(model)
        cloned_model.fit(X_train, y_train)
        ## Stores the RMSE score
        scores[i] = RMSE(y_val, cloned_model.predict(X_val))
    return scores

### The 6 features used below were obtained in my other <a href="https://www.kaggle.com/code/khoatran311/tuned-lgbm-model-295">notebook</a>.

In [5]:
# Names of the six predictor columns used by every model in this notebook.
top_features = [
    "study_hours_per_day", "sleep_hours", "phone_usage_hours",
    "focus_score", "stress_level", "attendance_percentage",
]

# Feature matrix and regression target taken from the training frame.
X = train.loc[:, top_features]
y = train.loc[:, "productivity_score"]

In [6]:
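# NOTE: Optuna hyper-parameter search for the ElasticNet pipelines, left commented out
# (presumably so the notebook does not re-run 200 trials); the alpha / l1_ratio values
# used in the next cell are hard-coded.
# NOTE(review): `objective` never calls trial.report()/trial.should_prune(), so the
# MedianPruner configured below cannot prune any trial.
# def objective(trial):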
#     params = {
#         "alpha":trial.suggest_float("alpha", .01, 1.0),
#         "l1_ratio":trial.suggest_float("l1_ratio", 0.0, 1.0),
#     }
#     params["random_state"] = 3126

#     model = pipe(
#         SplineTransformer(degree=3, n_knots=5),
#         StandardScaler(),
#         ElasticNet(**params)
#     )
#     scores = cross_validate(model, X, y)
#     return np.mean(scores)


# study = optuna.create_study(
#     direction="minimize",
#     sampler=optuna.samplers.TPESampler(seed=3126),
#     pruner=optuna.pruners.MedianPruner(n_warmup_steps=15)
# )
# study.optimize(objective, n_trials=200)

In [7]:
# ElasticNet settings for the polynomial pipeline -- RMSE 0.0556
# (values presumably come from the Optuna search; confirm before re-tuning).
poly_params = dict(
    alpha=0.012428994041973219,
    l1_ratio=0.9987648159703304,
    random_state=3126,
)
# ElasticNet settings for the spline pipeline -- RMSE 0.0373
spline_params = dict(
    alpha=0.010052639570436513,
    l1_ratio=0.8594188337125841,
    random_state=3126,
)

# Degree-2 polynomial expansion -> standardisation -> ElasticNet.
polynomial_model = pipe(
    PolynomialFeatures(degree=2, interaction_only=False, include_bias=False),
    StandardScaler(),
    ElasticNet(**poly_params),
)

# Cubic spline basis with 5 knots -> standardisation -> ElasticNet.
spline_model = pipe(
    SplineTransformer(n_knots=5, degree=3),
    StandardScaler(),
    ElasticNet(**spline_params),
)

In [8]:
# Per-fold RMSE (default 5 folds) of the polynomial ElasticNet pipeline.
cross_validate(polynomial_model, X, y)

array([0.05660995, 0.05648402, 0.05480889, 0.05387881, 0.05637703])

In [9]:
# Per-fold RMSE (default 5 folds) of the spline ElasticNet pipeline.
cross_validate(spline_model, X, y)

array([0.03740415, 0.03714253, 0.03722461, 0.03740488, 0.03733535])

In [10]:
# Fit the spline pipeline on the full training data; it is used for the test predictions.
spline_model.fit(X, y)

In [11]:
# Predict the target for the test rows using the same feature columns as in training.
test_features = test[top_features]
predictions = spline_model.predict(test_features)

# Fill the prediction column of the sample submission and write it out without the index.
sample_submission["productivity_score"] = predictions
sample_submission.to_csv(path_or_buf="submission.csv", index=False)

### And just like that, a simple spline + ElasticNet model reaches a cross-validated RMSE of about 0.037!