In [None]:
import kagglehub

# Download the latest version of the dataset; `path` is the location reported by kagglehub
DATASET_HANDLE = "jayaantanaath/student-habits-vs-academic-performance"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Build the CSV location from the directory returned by kagglehub rather than a
# user-specific absolute path, so the notebook runs on any machine.
df = pd.read_csv(Path(path) / "student_habits_performance.csv")

In [None]:
# Overview of the raw frame: columns, dtypes and non-null counts
df.info()

### Split Data

In [None]:
# Discard rows without a parental education level.
# Rebinding (instead of inplace=True) keeps the cell free of in-place mutation
# of the frame that was loaded and displayed above.
df = df.dropna(subset=["parental_education_level"])

In [None]:
# Target is the exam score; every remaining column is a candidate feature
y = df["exam_score"]
X = df.drop(columns="exam_score")

In [None]:
# Hold out 20% of the rows as the final test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### EDA

#### Target

In [None]:
# Histogram of the training target with a kernel density estimate drawn on top.
# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(9, 5))
# Draw both layers on the explicit axes instead of relying on the "current axes".
y_train.hist(grid=False, bins=20, edgecolor="lightgrey", density=True, ax=ax)
y_train.plot(kind="kde", color="darkblue", ax=ax)
ax.set_title("Distribution of exam scores")
ax.set_xlabel("Exam score")
ax.set_ylabel("")  # set after the KDE plot so the y-axis ends up unlabelled
ax.set_xlim(0, 120)
ax.tick_params(left=False, bottom=False)
ax.set_yticks([])
# Hide the frame; iterate over the spine objects only so `ax` is not rebound.
for spine in ax.spines.values():
    spine.set_visible(False)
plt.show()

#### Categorical

In [None]:
categorical_columns = ["gender", "parental_education_level", "diet_quality", "part_time_job", "extracurricular_participation", "internet_quality"]
cmap = plt.cm.twilight

# The reference line is the same in every panel, so compute it once.
overall_median = y_train.median()

# One box plot of exam_score per categorical feature, laid out on a 2x3 grid.
fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(16, 9))
for panel_idx, (col, ax) in enumerate(zip(categorical_columns, axes.flat)):
    data = pd.concat([X_train[col], y_train], axis=1)
    sns.boxplot(
        data=data,
        x=col,
        y="exam_score",
        color=cmap(panel_idx * 0.05),
        ax=ax,
    )
    ax.set_title(col, fontdict={"fontweight": "bold", "fontsize": 10})
    ax.set_xlabel("")
    ax.set_ylabel("exam score")

    # Reference line: median exam score over the whole training set
    ax.axhline(overall_median, linestyle="--", color="white", linewidth=0.75)

    # Append the number of observations to each category's tick label
    observations = data.groupby(col)["exam_score"].count().to_dict()
    xlabels = [
        f"{tick.get_text()}\nn={observations[tick.get_text()]}"
        for tick in ax.get_xticklabels()
    ]

    # Fix the tick positions before assigning the new label texts
    ax.set_xticks(np.arange(len(xlabels)))
    ax.set_xticklabels(xlabels)

plt.subplots_adjust(hspace=0.3)
plt.show()

- gender = "Other" has too few records to train and evaluate predictions on folds and will therefore be excluded
- parental_education seems to favor students with "Bachelor"-parents, while "master"-parents have the lowest median, which is somewhat unexpected (sample size is not very large though)
- poor diet_quality is detrimental and one of the strongest effects among all categorical features
- having a part_time_job is very slightly a negative condition, but probably insignificant
- extracurricular_participation shows no difference between its classes
- poor internet_quality seems to favor good grades (a possible explanation could be that students substitute the internet with books, which might provide a more thorough learning experience)

In [None]:
# Number of training rows for every combination of the categorical features.
# Naming the counts column explicitly keeps the "count" references below valid
# regardless of the default column name chosen by the installed pandas version.
categorical_df = (
    X_train
    .value_counts(subset=categorical_columns)
    .reset_index(name="count")
)

# Sunburst of the combination counts along diet -> parental education -> internet
fig = px.sunburst(
    categorical_df,
    path=["diet_quality", "parental_education_level", "internet_quality"],
    values="count",
    color="count",
    color_continuous_scale="rdbu_r",
    width=960,
    height=600
)
fig.show()

#### Numerical

In [None]:
# Re-attach the target to the training features and pick out the numeric columns
train_df = pd.concat(
    [X_train, y_train],
    axis=1,
)
numeric_cols = train_df.select_dtypes(include="number").columns

In [None]:
# Pairwise correlations of the numeric columns, shaded so larger values stand out
corr_df = train_df.loc[:, numeric_cols].corr()
corr_df.style.background_gradient(cmap="Blues")

- Besides study hours, mental health seems to have the biggest impact on grades
- We should keep this rating balanced across all future folds in order to get accurate evaluation scores

### Preprocess

In [None]:
def remove_other(X, y):
    """Drop, in place, every row whose ``gender`` equals ``"Other"``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a ``gender`` column. Modified in place.
    y : pandas.Series
        Target indexed by the same labels as ``X``. Modified in place.

    Returns
    -------
    None
        Both arguments are mutated; nothing is returned.
    """
    is_other = (X["gender"] == "Other").to_numpy()
    other_labels = X.index[is_other]
    # Remove the same index labels from features and target so they stay aligned
    for obj in (X, y):
        obj.drop(other_labels, inplace=True)

In [None]:
# Drop the gender == "Other" rows from the training features and target (in place)
remove_other(X_train, y_train)

In [None]:
class DatasetPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless cleaning step applied before the column-wise encoders.

    Removes the ``student_id`` column when present and maps the "Yes"/"No"
    columns ``part_time_job`` and ``extracurricular_participation`` to booleans.
    Optionally replaces those two columns by their logical OR.

    Parameters
    ----------
    add_job_or_extracurricular : bool, default=True
        If True, add ``job_or_extracurricular`` (``part_time_job`` OR
        ``extracurricular_participation``) and drop the two source columns.
    """

    def __init__(self, add_job_or_extracurricular=True):
        self.add_job_or_extracurricular = add_job_or_extracurricular

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched."""
        yes_no = {"Yes": True, "No": False}

        # Work on a copy: without it, the column assignments below would be
        # written into the caller's DataFrame whenever `student_id` is absent.
        X = X.copy()
        X = X.drop(columns=["student_id"], errors="ignore")

        # Values other than "Yes"/"No" are not in the mapping and become NaN.
        X["part_time_job"] = X["part_time_job"].map(yes_no)
        X["extracurricular_participation"] = X["extracurricular_participation"].map(yes_no)

        if self.add_job_or_extracurricular:
            X["job_or_extracurricular"] = X["part_time_job"] | X["extracurricular_participation"]
            X = X.drop(columns=["part_time_job", "extracurricular_participation"])
        return X
        

In [None]:
# Per-dtype encoding: numeric columns are standardised, all others ordinal-encoded
column_encoder = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), make_column_selector(dtype_include=np.number)),
        ("cat", OrdinalEncoder(), make_column_selector(dtype_exclude=np.number)),
    ]
)

# Full feature pipeline: dataset-specific cleaning first, then the encoders
pipe = make_pipeline(
    DatasetPreprocessor(add_job_or_extracurricular=True),
    column_encoder,
)

### Model Selection

In [None]:
# Candidate regressors; the stochastic ones are seeded so a re-run reproduces the scores.
model = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
]

# Folds are stratified on mental_health_rating so each fold has a similar rating mix.
skf = StratifiedKFold(n_splits=5)
# Materialise the splits once so that every model is scored on the same folds.
folds = list(skf.split(X_train, X_train.mental_health_rating))

# Evaluate EVERY model on EVERY fold. Previously model i was fitted on fold i
# only and then re-fitted by cross_val_score on the validation slice alone,
# so the printed scores were not comparable between models.
for estimator in model:
    r2_scores = []
    for dev_index, val_index in folds:
        X_dev, y_dev = X_train.iloc[dev_index], y_train.iloc[dev_index]
        X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]

        # Fit the preprocessing on the development part only, then apply it to
        # the validation part, so no validation statistics leak into the fit.
        X_dev = pipe.fit_transform(X_dev)
        X_val = pipe.transform(X_val)

        estimator.fit(X_dev, y_dev)
        r2_scores.append(r2_score(y_val, estimator.predict(X_val)))

    mean_scores = np.mean(r2_scores)
    sd_scores = np.std(r2_scores)
    print(type(estimator).__name__, r2_scores, mean_scores, sd_scores)

Ridge-Regression has best R^2

### Fine-Tuning Ridge

In [None]:
# Regularisation strengths to try for the Ridge model
grid = {
    "alpha": [0.5, 1, 2, 10]
}

# model[1] is the Ridge estimator; each candidate alpha is scored with 5-fold CV
grid_search = GridSearchCV(model[1], param_grid=grid, cv=5)

# Keep the raw training frame intact and store the encoded features under a new
# name: overwriting X_train would make this cell fail when it is run a second time.
X_train_prepared = pipe.fit_transform(X_train)

grid_search.fit(X_train_prepared, y_train)

In [None]:
# Show the cross-validation results as a table (best-ranked candidate first)
# instead of dumping the raw dictionary of arrays.
pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")

In [None]:
# Estimator selected by the grid search (the Ridge candidate with the best CV score)
grid_search.best_estimator_

### Predict and Evaluate on test data

In [None]:
# Apply the same row filter as for the training data (in place)
remove_other(X_test, y_test)

# Encode the test features with the already-fitted pipeline. A new name is used
# so that X_test stays a DataFrame and this cell can be re-run.
X_test_prepared = pipe.transform(X_test)

# Predict with the tuned estimator chosen by the grid search. model[1] is only
# the untuned Ridge left over from the model-selection loop.
y_pred = grid_search.best_estimator_.predict(X_test_prepared)

print("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2: ", r2_score(y_test, y_pred))