In [18]:
#libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_row', None)

In [19]:
data1 = pd.read_csv('train.csv')
data2 = pd.read_csv('test.csv')
data3 = pd.read_csv('Exam_Score_Prediction.csv')

train_df = data1.copy()
test_df = data2.copy()
previous_df = data3.copy()

print("First 5 rows of train data: \n", train_df.head())
print("First 5 rows of test data: \n", test_df.head())

First 5 rows of train data: 
    id  age  gender   course  study_hours  class_attendance internet_access  \
0   0   21  female     b.sc         7.91              98.8              no   
1   1   18   other  diploma         4.95              94.8             yes   
2   2   20  female     b.sc         4.68              92.6             yes   
3   3   19    male     b.sc         2.00              49.5             yes   
4   4   23    male      bca         7.65              86.9             yes   

   sleep_hours sleep_quality   study_method facility_rating exam_difficulty  \
0          4.9       average  online videos             low            easy   
1          4.7          poor     self-study          medium        moderate   
2          5.8          poor       coaching            high        moderate   
3          8.3       average    group study            high        moderate   
4          9.6          good     self-study            high            easy   

   exam_score  
0        7

In [20]:
sleep_map = {
    "poor": 0,
    "average": 1,
    "good": 2

}

facility_map = {
    "low": 0,
    "medium": 1,
    "high": 2
}

difficulty_map = {
    "hard": 0,
    "moderate": 1,
    "easy": 2
}

method_map = {
    "self-study": 0,
    "online videos": 1,
    "group study": 2,
    "mixed": 3,
    "coaching": 4
}

train_df['sleep_quality'] = train_df['sleep_quality'].map(sleep_map)
test_df['sleep_quality'] = test_df['sleep_quality'].map(sleep_map)

train_df['facility_rating'] = train_df['facility_rating'].map(facility_map)
test_df['facility_rating'] = test_df['facility_rating'].map(facility_map)

train_df['exam_difficulty'] = train_df['exam_difficulty'].map(difficulty_map)
test_df['exam_difficulty'] = test_df['exam_difficulty'].map(difficulty_map)

train_df['study_method'] = train_df['study_method'].map(method_map)
test_df['study_method'] = test_df['study_method'].map(method_map)

In [21]:
gender_map = {
    "other": "female",
    "male": "male",
    "female": "female"
}

train_df['gender'] = train_df['gender'].map(gender_map)
test_df['gender'] = test_df['gender'].map(gender_map)

In [22]:
#dropping course

train_df = train_df.drop(columns='course')
test_df = test_df.drop(columns='course')

In [23]:
train_df_encoded = pd.get_dummies(train_df, columns=['gender', 'internet_access'], drop_first= True)
test_df_encoded = pd.get_dummies(test_df, columns=['gender', 'internet_access'], drop_first= True)

In [24]:
#target to the right

target_placeholder = train_df.pop('exam_score')

train_df_encoded['exam_score'] = target_placeholder

In [25]:
from xgboost import XGBRegressor

labeled_data = train_df_encoded
unlabeled_data = test_df_encoded.copy()


In [26]:
train_df_encoded.head()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score,gender_male,internet_access_yes
0,0,21,7.91,98.8,4.9,1,1,0,2,78.3,False,False
1,1,18,4.95,94.8,4.7,0,0,1,1,46.7,False,True
2,2,20,4.68,92.6,5.8,0,4,2,1,99.0,False,True
3,3,19,2.0,49.5,8.3,1,2,2,1,63.9,True,True
4,4,23,7.65,86.9,9.6,2,0,2,2,100.0,True,True


In [27]:
initial_model = XGBRegressor()

initial_model.fit(labeled_data.drop('exam_score', axis= 1), labeled_data['exam_score'])

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [28]:
pseudo_labels = initial_model.predict(unlabeled_data)

In [29]:
combined_features = pd.concat([
    labeled_data.drop('exam_score', axis=1),
    unlabeled_data.drop('id', axis=1)
], axis=0)

In [32]:
combined_labels = pd.concat([
    labeled_data['exam_score'],
    pd.Series(pseudo_labels, name='exam_score')
], axis=0)

In [33]:
final_model = XGBRegressor()
final_model.fit(combined_features, combined_labels)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [35]:
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Suggest hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10)
    }
    
    # Train final model with pseudo-labels
    model = XGBRegressor(**params, random_state=42, n_jobs=-1)
    
    # Use cross-validation on combined data
    scores = -cross_val_score(
        model, combined_features, combined_labels,
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1
    )
    
    return scores.mean()

# Run Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Train final model with best params
best_model = XGBRegressor(**study.best_params, random_state=42)
best_model.fit(combined_features, combined_labels)

[I 2026-01-24 15:13:33,188] A new study created in memory with name: no-name-fc4012da-665e-43ec-9c32-921a7604593a
[I 2026-01-24 15:14:10,061] Trial 0 finished with value: 55.67408260621046 and parameters: {'n_estimators': 159, 'max_depth': 6, 'learning_rate': 0.03330980815851708, 'subsample': 0.6768113702766471, 'colsample_bytree': 0.7081459904129741, 'min_child_weight': 4, 'reg_alpha': 1.8374306731111123, 'reg_lambda': 9.090638267861873}. Best is trial 0 with value: 55.67408260621046.
[I 2026-01-24 15:16:31,180] Trial 1 finished with value: 54.660036703350364 and parameters: {'n_estimators': 853, 'max_depth': 6, 'learning_rate': 0.01987527698277017, 'subsample': 0.9221923831130754, 'colsample_bytree': 0.572381408393989, 'min_child_weight': 9, 'reg_alpha': 3.871255999705844, 'reg_lambda': 5.55745395303393}. Best is trial 1 with value: 54.660036703350364.
[I 2026-01-24 15:20:42,242] Trial 2 finished with value: 58.51561737721355 and parameters: {'n_estimators': 812, 'max_depth': 10, 'le

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6621330439151326
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# X_submission = test_df_encoded.copy()

# y_pred = best_model.predict(X_submission)

# submission_df = pd.DataFrame({
#     'id': test_df_encoded['id'],
#     'exam_score': y_pred
# })

# submission_df.to_csv('submission9.csv', index=False)