In [1]:
#libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_row', None)

In [2]:
data1 = pd.read_csv('train.csv')
data2 = pd.read_csv('test.csv')
data3 = pd.read_csv('Exam_Score_Prediction.csv')

train_df = data1.copy()
test_df = data2.copy()
previous_df = data3.copy()

print("First 5 rows of train data: \n", train_df.head())
print("First 5 rows of test data: \n", test_df.head())

First 5 rows of train data: 
    id  age  gender   course  study_hours  class_attendance internet_access  \
0   0   21  female     b.sc         7.91              98.8              no   
1   1   18   other  diploma         4.95              94.8             yes   
2   2   20  female     b.sc         4.68              92.6             yes   
3   3   19    male     b.sc         2.00              49.5             yes   
4   4   23    male      bca         7.65              86.9             yes   

   sleep_hours sleep_quality   study_method facility_rating exam_difficulty  \
0          4.9       average  online videos             low            easy   
1          4.7          poor     self-study          medium        moderate   
2          5.8          poor       coaching            high        moderate   
3          8.3       average    group study            high        moderate   
4          9.6          good     self-study            high            easy   

   exam_score  
0        7

In [3]:
sleep_map = {
    "poor": 0,
    "average": 1,
    "good": 2

}

facility_map = {
    "low": 0,
    "medium": 1,
    "high": 2
}

difficulty_map = {
    "hard": 0,
    "moderate": 1,
    "easy": 2
}

method_map = {
    "self-study": 0,
    "online videos": 1,
    "group study": 2,
    "mixed": 3,
    "coaching": 4
}

train_df['sleep_quality'] = train_df['sleep_quality'].map(sleep_map)
test_df['sleep_quality'] = test_df['sleep_quality'].map(sleep_map)

train_df['facility_rating'] = train_df['facility_rating'].map(facility_map)
test_df['facility_rating'] = test_df['facility_rating'].map(facility_map)

train_df['exam_difficulty'] = train_df['exam_difficulty'].map(difficulty_map)
test_df['exam_difficulty'] = test_df['exam_difficulty'].map(difficulty_map)

train_df['study_method'] = train_df['study_method'].map(method_map)
test_df['study_method'] = test_df['study_method'].map(method_map)

In [4]:
#dropping course

train_df = train_df.drop(columns='course')
test_df = test_df.drop(columns='course')

In [5]:
gender_map = {
    "other": "female",
    "male": "male",
    "female": "female"
}

train_df['gender'] = train_df['gender'].map(gender_map)
test_df['gender'] = test_df['gender'].map(gender_map)

In [6]:
train_df_encoded = pd.get_dummies(train_df, columns=['gender', 'internet_access'], drop_first= True)
test_df_encoded = pd.get_dummies(test_df, columns=['gender', 'internet_access'], drop_first= True)

In [7]:
#train test split

from sklearn.model_selection import train_test_split

X = train_df_encoded.drop(columns='exam_score')
y = train_df_encoded['exam_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=42)

In [8]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

In [9]:
def objective(trial):
    """Optuna objective function for XGBRegressor hyperparameter optimization"""
    
    params = {
        # Core Parameters
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        
        # Tree Structure
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'max_leaves': trial.suggest_int('max_leaves', 0, 256),
        
        # Regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        
        # Sampling
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        
        # Learning Task
        'objective': 'reg:squarederror',
        
        # Fixed Parameters
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0,
        'tree_method': 'hist',  # Faster than 'exact'
        'enable_categorical': False,
    }
    
    model = XGBRegressor(**params)
    
    # Use cross-validation
    cv_scores = cross_val_score(
        model, X, y, 
        cv=5, 
        scoring='neg_root_mean_squared_error',  # Or 'neg_mean_squared_error'
        n_jobs=-1
    )
    
    return np.mean(cv_scores)

In [10]:
study = optuna.create_study(direction='maximize')  # Maximize negative RMSE = minimize RMSE
study.optimize(objective, n_trials=50, timeout=3600)

print("Best trial:")
trial = study.best_trial
print(f"  Score (negative RMSE): {trial.value:.4f}")
print("  Best parameters:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2026-01-02 21:26:29,625] A new study created in memory with name: no-name-77dcc7ac-fdcc-44f9-9738-798c834fe4e2
[I 2026-01-02 21:30:36,997] Trial 0 finished with value: -8.79746542422575 and parameters: {'n_estimators': 1198, 'learning_rate': 0.014920458434350392, 'max_depth': 11, 'min_child_weight': 13, 'gamma': 3.871737186578253, 'max_delta_step': 0, 'max_leaves': 239, 'reg_alpha': 0.21428707417688483, 'reg_lambda': 1.42220818454185, 'subsample': 0.7198587872721647, 'colsample_bytree': 0.5608722265128161, 'colsample_bylevel': 0.6279742478063277, 'colsample_bynode': 0.8250291650303799}. Best is trial 0 with value: -8.79746542422575.
[I 2026-01-02 21:34:52,425] Trial 1 finished with value: -8.796628968742544 and parameters: {'n_estimators': 1281, 'learning_rate': 0.015981094031392716, 'max_depth': 11, 'min_child_weight': 4, 'gamma': 1.91070706035072, 'max_delta_step': 3, 'max_leaves': 245, 'reg_alpha': 2.298058985412718e-06, 'reg_lambda': 4.352096762386114, 'subsample': 0.89517096901

KeyboardInterrupt: 