In [1]:
#libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [2]:
data1 = pd.read_csv('train.csv')
data2 = pd.read_csv('test.csv')
data3 = pd.read_csv('Exam_Score_Prediction.csv')

train_df = data1.copy()
test_df = data2.copy()
previous_df = data3.copy()

print("First 5 rows of train data: \n", train_df.head())
print("First 5 rows of test data: \n", test_df.head())

First 5 rows of train data: 
    id  age  gender   course  study_hours  class_attendance internet_access  \
0   0   21  female     b.sc         7.91              98.8              no   
1   1   18   other  diploma         4.95              94.8             yes   
2   2   20  female     b.sc         4.68              92.6             yes   
3   3   19    male     b.sc         2.00              49.5             yes   
4   4   23    male      bca         7.65              86.9             yes   

   sleep_hours sleep_quality   study_method facility_rating exam_difficulty  \
0          4.9       average  online videos             low            easy   
1          4.7          poor     self-study          medium        moderate   
2          5.8          poor       coaching            high        moderate   
3          8.3       average    group study            high        moderate   
4          9.6          good     self-study            high            easy   

   exam_score  
0        7

In [3]:
sleep_map = {
    "poor": 0.5,
    "average": 1,
    "good": 1.5

}

facility_map = {
    "low": 0,
    "medium": 1,
    "high": 2
}

difficulty_map = {
    "hard": 0,
    "moderate": 1,
    "easy": 2
}

method_map = {
    "self-study": 0,
    "online videos": 1,
    "group study": 2,
    "mixed": 3,
    "coaching": 4
}

train_df['sleep_quality'] = train_df['sleep_quality'].map(sleep_map)
test_df['sleep_quality'] = test_df['sleep_quality'].map(sleep_map)

train_df['facility_rating'] = train_df['facility_rating'].map(facility_map)
test_df['facility_rating'] = test_df['facility_rating'].map(facility_map)

train_df['exam_difficulty'] = train_df['exam_difficulty'].map(difficulty_map)
test_df['exam_difficulty'] = test_df['exam_difficulty'].map(difficulty_map)

train_df['study_method'] = train_df['study_method'].map(method_map)
test_df['study_method'] = test_df['study_method'].map(method_map)

In [4]:
#dropping course

train_df = train_df.drop(columns='course')
test_df = test_df.drop(columns='course')

In [5]:
gender_map = {
    "other": "female",
    "male": "male",
    "female": "female"
}

train_df['gender'] = train_df['gender'].map(gender_map)
test_df['gender'] = test_df['gender'].map(gender_map)

In [6]:
train_df_encoded = pd.get_dummies(train_df, columns=['gender', 'internet_access'], drop_first= True)
test_df_encoded = pd.get_dummies(test_df, columns=['gender', 'internet_access'], drop_first= True)

In [7]:
train_df_encoded.head()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score,gender_male,internet_access_yes
0,0,21,7.91,98.8,4.9,1.0,1,0,2,78.3,False,False
1,1,18,4.95,94.8,4.7,0.5,0,1,1,46.7,False,True
2,2,20,4.68,92.6,5.8,0.5,4,2,1,99.0,False,True
3,3,19,2.0,49.5,8.3,1.0,2,2,1,63.9,True,True
4,4,23,7.65,86.9,9.6,1.5,0,2,2,100.0,True,True


In [8]:
def find_inefficient_studiers_correct(df):
    """Find students who study a lot but score poorly compared to OTHER high-study students"""
    
    # 1. identify high study
    study_75th = df['study_hours'].quantile(0.75)
    high_study_mask = df['study_hours'] > study_75th
    high_study_df = df[high_study_mask].copy()
    
    print(f"High-study students (>{study_75th:.1f} hours): {len(high_study_df)}")
    print(f"Their score distribution:")
    print(high_study_df['exam_score'].describe())
    
    # 2. test score for the group
    high_study_25th = high_study_df['exam_score'].quantile(0.25)
    print(f"\n25th percentile AMONG HIGH-STUDY students: {high_study_25th:.1f}")
    
    # 3. creating the mask
    suspicious_mask = high_study_mask & (df['exam_score'] < high_study_25th)
    
    print(f"\nHigh-study, low-score students: {suspicious_mask.sum()}")
    
    if suspicious_mask.sum() > 0:
        print("\nThese students study a lot but score poorly (compared to other high-study students):")
        print(df[suspicious_mask][['study_hours', 'exam_score']].head())
    
    return suspicious_mask

In [9]:
corrected_mask = find_inefficient_studiers_correct(train_df_encoded)

High-study students (>6.0 hours): 156730
Their score distribution:
count    156730.000000
mean         81.177432
std          12.169366
min          21.600000
25%          72.600000
50%          81.500000
75%          90.800000
max         100.000000
Name: exam_score, dtype: float64

25th percentile AMONG HIGH-STUDY students: 72.6

High-study, low-score students: 39115

These students study a lot but score poorly (compared to other high-study students):
     study_hours  exam_score
56          7.91        70.9
96          7.69        60.7
108         7.45        62.9
110         6.38        68.4
145         7.85        69.3


In [10]:
study_hours = 'study_hours'

train_df_encoded[f'{study_hours}_adjusted'] = np.where(
    corrected_mask,
    train_df_encoded[study_hours] * 0.7,  # Reducing by 30%
    train_df_encoded[study_hours]
)

print(f"Adjusted study hours for {corrected_mask.sum()} students")
print("\nBefore adjustment stats:")
print(train_df_encoded[study_hours].describe())
print("\nAfter adjustment stats:")
print(train_df_encoded[f'{study_hours}_adjusted'].describe())

Adjusted study hours for 39115 students

Before adjustment stats:
count    630000.000000
mean          4.002337
std           2.359880
min           0.080000
25%           1.970000
50%           4.000000
75%           6.050000
max           7.910000
Name: study_hours, dtype: float64

After adjustment stats:
count    630000.000000
mean          3.874047
std           2.251152
min           0.080000
25%           1.970000
50%           4.000000
75%           5.580000
max           7.910000
Name: study_hours_adjusted, dtype: float64


In [11]:
train_df_encoded['study_hours'] = train_df_encoded['study_hours_adjusted']

In [12]:
train_df_encoded = train_df_encoded.drop(columns='study_hours_adjusted')

In [13]:
#compressing class_attendancy

train_df_encoded['class_attendance'] = np.where(
    train_df_encoded['class_attendance'] > 98,
    98 + (train_df_encoded['class_attendance'] - 98)* 0.3,
    train_df_encoded['class_attendance']
)

test_df_encoded['class_attendance'] = np.where(
    test_df_encoded['class_attendance'] > 98,
    98 + (test_df_encoded['class_attendance'] - 98)* 0.3,
    test_df_encoded['class_attendance']
)

In [14]:
def find_inefficient_attendance_correct(df):
    """Find students who study a lot but score poorly compared to OTHER high-study students"""
    
    # 1. identify high study
    attendance_75th = df['class_attendance'].quantile(0.75)
    high_attendance_mask = df['class_attendance'] > attendance_75th
    high_attendance_df = df[high_attendance_mask].copy()
    
    print(f"High-attendance students (>{attendance_75th:.1f} attendance): {len(high_attendance_df)}")
    print(f"Their score distribution:")
    print(high_attendance_df['exam_score'].describe())
    
    # 2. test score for the group
    high_study_25th = high_attendance_df['exam_score'].quantile(0.25)
    print(f"\n25th percentile AMONG HIGH-ATTENDANCE students: {high_study_25th:.1f}")
    
    # 3. creating the mask
    suspicious_mask = high_attendance_mask & (df['exam_score'] < high_study_25th)
    
    print(f"\nHigh-attendance, low-score students: {suspicious_mask.sum()}")
    
    if suspicious_mask.sum() > 0:
        print("\nThese students attend a lot but score poorly (compared to other high-attendance students):")
        print(df[suspicious_mask][['class_attendance', 'exam_score']].head())
    
    return suspicious_mask

In [15]:
corrected_mask = find_inefficient_attendance_correct(train_df_encoded)

High-attendance students (>87.2 attendance): 157221
Their score distribution:
count    157221.000000
mean         71.086344
std          17.271960
min          19.599000
25%          58.800000
50%          71.300000
75%          84.500000
max         100.000000
Name: exam_score, dtype: float64

25th percentile AMONG HIGH-ATTENDANCE students: 58.8

High-attendance, low-score students: 39154

These students attend a lot but score poorly (compared to other high-attendance students):
    class_attendance  exam_score
1              94.80        46.7
8              98.09        46.7
43             92.60        57.7
53             95.50        36.9
59             94.10        44.2


In [16]:
class_attendance = 'class_attendance'

train_df_encoded[f'{class_attendance}_adjusted'] = np.where(
    corrected_mask,
    train_df_encoded[class_attendance] * 0.7,  # Reducing by 30%
    train_df_encoded[class_attendance]
)

print(f"Adjusted study hours for {corrected_mask.sum()} students")
print("\nBefore adjustment stats:")
print(train_df_encoded[class_attendance].describe())
print("\nAfter adjustment stats:")
print(train_df_encoded[f'{class_attendance}_adjusted'].describe())

Adjusted study hours for 39154 students

Before adjustment stats:
count    630000.000000
mean         71.949874
std          17.372469
min          40.600000
25%          57.000000
50%          72.600000
75%          87.200000
max          98.420000
Name: class_attendance, dtype: float64

After adjustment stats:
count    630000.000000
mean         70.205754
std          16.491343
min          40.600000
25%          57.000000
50%          68.900000
75%          83.600000
max          98.420000
Name: class_attendance_adjusted, dtype: float64


In [17]:
train_df_encoded['class_attendance'] = train_df_encoded['class_attendance_adjusted']

train_df_encoded = train_df_encoded.drop(columns='class_attendance_adjusted')

In [18]:
#train test split

from sklearn.model_selection import train_test_split

X = train_df_encoded.drop(columns='exam_score')
y = train_df_encoded['exam_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=42)

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
# #regression issues

# def train_and_eval_models(X_train, X_test, y_train, y_test):

#     #models to test
#     models = {
#         "Linear Regression" : LinearRegression(),
#         "Random Forest" : RandomForestRegressor(),
#         "XGB" : XGBRegressor()
#     }

#     print("Starting the training process.")

#     #list to store results

#     results = []

#     #scaling data just for LogisticRegression

#     for name, model in models.items():
#         print(f"Training model: {name}")

#         model.fit(X_train, y_train)

#         y_train_predict = model.predict(X_train)
#         y_test_predict = model.predict(X_test)

#         results.append({
#             "Model": name,
#             "Train MAE": mean_absolute_error(y_train, y_train_predict),
#             "Test MAE": mean_absolute_error(y_test, y_test_predict),
#             "Train RMSE": np.sqrt(mean_squared_error(y_train, y_train_predict)),
#             "Test RMSE": np.sqrt(mean_squared_error(y_test, y_test_predict)),
#             "Train R2": r2_score(y_train, y_train_predict),
#             "Test R2": r2_score(y_test, y_test_predict)
#         })

#     # Converting results into a DataFrame
#     results_df = pd.DataFrame(results)

#     print("\nModel Performance Summary:")
#     print(results_df)

#     return results_df, model

In [21]:
# results_df, best_model = train_and_eval_models(X_train, X_test, y_train, y_test)

# print(results_df)

In [22]:
# #plotting the results

# plt.figure(figsize=(10, 6))
# sns.barplot(x="Model", y="Train R2", data=results_df.sort_values(by='Train R2', ascending=False), color='skyblue')
# plt.title("Model Accuracy Comparison (Train Data)")
# plt.xticks(rotation=45)
# plt.show()

# plt.figure(figsize=(10, 6))
# sns.barplot(x="Model", y="Test R2", data=results_df.sort_values(by='Test R2', ascending=False), color='skyblue')
# plt.title("Model Accuracy Comparison (Test Data)")
# plt.xticks(rotation=45)
# plt.show()

In [23]:
# X_submission = test_df_encoded.copy()

# y_pred = best_model.predict(X_submission)

# submission_df = pd.DataFrame({
#     'id': test_df_encoded['id'],
#     'exam_score': y_pred
# })

# submission_df.to_csv('submission6.csv', index=False)

In [24]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

In [25]:
def objective(trial):
    """Optuna objective function for XGBRegressor hyperparameter optimization"""
    
    params = {
        # Core Parameters
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        
        # Tree Structure
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'max_leaves': trial.suggest_int('max_leaves', 0, 256),
        
        # Regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        
        # Sampling
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        
        # Learning Task
        'objective': 'reg:squarederror',
        
        # Fixed Parameters
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0,
        'tree_method': 'hist',  # Faster than 'exact'
        'enable_categorical': False,
    }
    
    model = XGBRegressor(**params)
    
    # Use cross-validation
    cv_scores = cross_val_score(
        model, X, y, 
        cv=5, 
        scoring='neg_root_mean_squared_error',  # Or 'neg_mean_squared_error'
        n_jobs=-1
    )
    
    return np.mean(cv_scores)

In [26]:
study = optuna.create_study(direction='maximize')  # Maximize negative RMSE = minimize RMSE
study.optimize(objective, n_trials=50, timeout=3600)

print("Best trial:")
trial = study.best_trial
print(f"  Score (negative RMSE): {trial.value:.4f}")
print("  Best parameters:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2026-01-19 21:52:21,310] A new study created in memory with name: no-name-56d1fd05-0a7e-490c-9658-00c730a6360d
[I 2026-01-19 21:52:43,503] Trial 0 finished with value: -8.282637116433273 and parameters: {'n_estimators': 314, 'learning_rate': 0.13247237093699318, 'max_depth': 3, 'min_child_weight': 1, 'gamma': 3.393626479027682, 'max_delta_step': 9, 'max_leaves': 241, 'reg_alpha': 0.4386639901185709, 'reg_lambda': 0.001305440074929859, 'subsample': 0.5932049153329272, 'colsample_bytree': 0.7110953026200781, 'colsample_bylevel': 0.6283737481476068, 'colsample_bynode': 0.6500092666693689}. Best is trial 0 with value: -8.282637116433273.
[I 2026-01-19 21:55:58,981] Trial 1 finished with value: -8.185046423505067 and parameters: {'n_estimators': 1922, 'learning_rate': 0.05166487454716408, 'max_depth': 7, 'min_child_weight': 12, 'gamma': 4.325451945474183, 'max_delta_step': 4, 'max_leaves': 96, 'reg_alpha': 0.3231439834639901, 'reg_lambda': 1.073153785651199, 'subsample': 0.54597084883383

Best trial:
  Score (negative RMSE): -8.1167
  Best parameters:
    n_estimators: 1873
    learning_rate: 0.015241621622032714
    max_depth: 11
    min_child_weight: 10
    gamma: 1.869927571554772
    max_delta_step: 9
    max_leaves: 77
    reg_alpha: 5.7907662697663855e-05
    reg_lambda: 6.159928740051305e-05
    subsample: 0.7067728629782135
    colsample_bytree: 0.9987824258657209
    colsample_bylevel: 0.7316529327387755
    colsample_bynode: 0.9418467092907825


In [27]:
study.best_params

{'n_estimators': 1873,
 'learning_rate': 0.015241621622032714,
 'max_depth': 11,
 'min_child_weight': 10,
 'gamma': 1.869927571554772,
 'max_delta_step': 9,
 'max_leaves': 77,
 'reg_alpha': 5.7907662697663855e-05,
 'reg_lambda': 6.159928740051305e-05,
 'subsample': 0.7067728629782135,
 'colsample_bytree': 0.9987824258657209,
 'colsample_bylevel': 0.7316529327387755,
 'colsample_bynode': 0.9418467092907825}

In [28]:
xgb_params = {
    'n_estimators': 1873,
 'learning_rate': 0.015241621622032714,
 'max_depth': 11,
 'min_child_weight': 10,
 'gamma': 1.869927571554772,
 'max_delta_step': 9,
 'max_leaves': 77,
 'reg_alpha': 5.7907662697663855e-05,
 'reg_lambda': 6.159928740051305e-05,
 'subsample': 0.7067728629782135,
 'colsample_bytree': 0.9987824258657209,
 'colsample_bylevel': 0.7316529327387755,
 'colsample_bynode': 0.9418467092907825
}

xgb_model = XGBRegressor(**xgb_params)
cv_res = cross_val_score(xgb_model, X_train, y_train, cv=3, scoring='neg_log_loss')

print(f"Final CV scores: {cv_res}")
print(f"Mean CV score: {cv_res.mean():.4f} ± {cv_res.std():.4f}")

xgb_model.fit(X_train, y_train)

Final CV scores: [nan nan nan]
Mean CV score: nan ± nan


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,0.7316529327387755
,colsample_bynode,0.9418467092907825
,colsample_bytree,0.9987824258657209
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# X_submission = test_df_encoded

# y_pred = xgb_model.predict(X_submission)

# submission_df = pd.DataFrame({
#     'id': test_df_encoded['id'],
#     'exam_score': y_pred
# })

# submission_df.to_csv('submission7.csv', index=False)