In [1]:
#libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_row', None)

In [2]:
data1 = pd.read_csv('train.csv')
data2 = pd.read_csv('test.csv')
data3 = pd.read_csv('Exam_Score_Prediction.csv')

train_df = data1.copy()
test_df = data2.copy()
previous_df = data3.copy()

print("First 5 rows of train data: \n", train_df.head())
print("First 5 rows of test data: \n", test_df.head())

First 5 rows of train data: 
    id  age  gender   course  study_hours  class_attendance internet_access  \
0   0   21  female     b.sc         7.91              98.8              no   
1   1   18   other  diploma         4.95              94.8             yes   
2   2   20  female     b.sc         4.68              92.6             yes   
3   3   19    male     b.sc         2.00              49.5             yes   
4   4   23    male      bca         7.65              86.9             yes   

   sleep_hours sleep_quality   study_method facility_rating exam_difficulty  \
0          4.9       average  online videos             low            easy   
1          4.7          poor     self-study          medium        moderate   
2          5.8          poor       coaching            high        moderate   
3          8.3       average    group study            high        moderate   
4          9.6          good     self-study            high            easy   

   exam_score  
0        7

In [3]:
sleep_map = {
    "poor": 0,
    "average": 1,
    "good": 2

}

facility_map = {
    "low": 0,
    "medium": 1,
    "high": 2
}

difficulty_map = {
    "hard": 0,
    "moderate": 1,
    "easy": 2
}

method_map = {
    "self-study": 0,
    "online videos": 1,
    "group study": 2,
    "mixed": 3,
    "coaching": 4
}

train_df['sleep_quality'] = train_df['sleep_quality'].map(sleep_map)
test_df['sleep_quality'] = test_df['sleep_quality'].map(sleep_map)

train_df['facility_rating'] = train_df['facility_rating'].map(facility_map)
test_df['facility_rating'] = test_df['facility_rating'].map(facility_map)

train_df['exam_difficulty'] = train_df['exam_difficulty'].map(difficulty_map)
test_df['exam_difficulty'] = test_df['exam_difficulty'].map(difficulty_map)

train_df['study_method'] = train_df['study_method'].map(method_map)
test_df['study_method'] = test_df['study_method'].map(method_map)

In [4]:
gender_map = {
    "other": "female",
    "male": "male",
    "female": "female"
}

train_df['gender'] = train_df['gender'].map(gender_map)
test_df['gender'] = test_df['gender'].map(gender_map)

In [5]:
#dropping course

train_df = train_df.drop(columns='course')
test_df = test_df.drop(columns='course')

In [6]:
train_df_encoded = pd.get_dummies(train_df, columns=['gender', 'internet_access'], drop_first= True)
test_df_encoded = pd.get_dummies(test_df, columns=['gender', 'internet_access'], drop_first= True)

In [7]:
#target to the right

target_placeholder = train_df.pop('exam_score')

train_df_encoded['exam_score'] = target_placeholder

In [8]:
from xgboost import XGBRegressor

labeled_data = train_df_encoded
unlabeled_data = test_df_encoded.copy()


In [9]:
train_df_encoded.head()

Unnamed: 0,id,age,study_hours,class_attendance,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score,gender_male,internet_access_yes
0,0,21,7.91,98.8,4.9,1,1,0,2,78.3,False,False
1,1,18,4.95,94.8,4.7,0,0,1,1,46.7,False,True
2,2,20,4.68,92.6,5.8,0,4,2,1,99.0,False,True
3,3,19,2.0,49.5,8.3,1,2,2,1,63.9,True,True
4,4,23,7.65,86.9,9.6,2,0,2,2,100.0,True,True


In [10]:
initial_model = XGBRegressor()

initial_model.fit(labeled_data.drop('exam_score', axis= 1), labeled_data['exam_score'])

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
pseudo_labels = initial_model.predict(unlabeled_data)

In [12]:
combined_features = pd.concat([
    labeled_data.drop('exam_score', axis=1),
    unlabeled_data.drop('id', axis=1)
], axis=0)

In [13]:
combined_labels = pd.concat([
    labeled_data['exam_score'],
    pd.Series(pseudo_labels, name='exam_score')
], axis=0)

In [14]:
final_model = XGBRegressor()
final_model.fit(combined_features, combined_labels)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def objective(trial):
    # Suggest hyperparameters
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10)
    }
    
    # Train final model with pseudo-labels
    model = XGBRegressor(**params, random_state=42, n_jobs=-1)
    
    # Use cross-validation on combined data
    scores = -cross_val_score(
        model, combined_features, combined_labels,
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1
    )
    
    return scores.mean()

# Run Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Train final model with best params
best_model = XGBRegressor(**study.best_params, random_state=42)
best_model.fit(combined_features, combined_labels)

In [17]:
X_labeled = train_df_encoded.drop(columns=['id', 'exam_score'])
y_labeled = train_df_encoded['exam_score']
X_unlabeled = test_df_encoded.drop(columns='id')

In [None]:
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# base models for pseudo labeling
base_models = [
    ('xgb', XGBRegressor(random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('ridge', Ridge(alpha=1.0)),
    ('svr', SVR(C=1.0, kernel='rbf'))
]

# stacked pseudolabeling model
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=XGBRegressor(random_state=42, n_estimators=200),
    cv=5,
    n_jobs=-1
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
import xgboost as xgb
import lightgbm as lgb

def advanced_pseudo_labeling_with_stacking(X_labeled, y_labeled, X_unlabeled, n_folds=5):
    """
    Advanced pseudo-labeling with model stacking and diversity
    """
    # storage fod out of fold predictions
    oof_predictions_labeled = np.zeros(len(X_labeled))
    pseudo_predictions_all_models = []
    
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    # models
    model_configs = {
        'xgb': lambda: xgb.XGBRegressor(
            n_estimators=300, learning_rate=0.05, max_depth=6,
            subsample=0.8, colsample_bytree=0.8, random_state=42
        ),
        'lgb': lambda: lgb.LGBMRegressor(
            n_estimators=300, learning_rate=0.05, max_depth=6,
            subsample=0.8, colsample_bytree=0.8, random_state=42
        ),
        'rf': lambda: RandomForestRegressor(
            n_estimators=200, max_depth=10, min_samples_split=5,
            random_state=42, n_jobs=-1
        ),
        'ridge': lambda: Ridge(alpha=10.0)
    }
    
    # pseudolabeling with different models
    for model_name, model_fn in model_configs.items():
        model_pseudo_labels = np.zeros(len(X_unlabeled))
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_labeled)):
            X_train, X_val = X_labeled.iloc[train_idx], X_labeled.iloc[val_idx]
            y_train, y_val = y_labeled.iloc[train_idx], y_labeled.iloc[val_idx]
            
            # training model
            model = model_fn()
            model.fit(X_train, y_train)
            
            # out of fold predictions
            if model_name == 'xgb': 
                oof_predictions_labeled[val_idx] = model.predict(X_val)
            
            # Generate pseudo-labels on unlabeled data
            model_pseudo_labels += model.predict(X_unlabeled) / n_folds
        
        pseudo_predictions_all_models.append(model_pseudo_labels)
    
    # ensemble of pseudo labeling
    pseudo_labels_weighted = np.zeros(len(X_unlabeled))
    
    # weighting schema
    weights = [0.35, 0.35, 0.2, 0.1]  # heavier weights on models performing better
    for w, preds in zip(weights, pseudo_predictions_all_models):
        pseudo_labels_weighted += w * preds
    
    # residuals on labeled data
    residuals = y_labeled - oof_predictions_labeled
    
    # training correction model
    residual_model = xgb.XGBRegressor(
        n_estimators=150, learning_rate=0.03, max_depth=4,
        random_state=42
    )
    residual_model.fit(X_labeled, residuals)
    
    # pseudolabels corrected with residuals
    residual_corrections = residual_model.predict(X_unlabeled)
    final_pseudo_labels = pseudo_labels_weighted + residual_corrections * 0.5  # Dampen correction
    
    # Combining data
    X_combined = pd.concat([X_labeled, X_unlabeled], axis=0)
    y_combined = pd.concat([y_labeled, pd.Series(final_pseudo_labels)], axis=0)
    
    # final stacked model
    meta_models = [
        ('xgb', xgb.XGBRegressor(n_estimators=500, learning_rate=0.02, max_depth=7, random_state=42)),
        ('lgb', lgb.LGBMRegressor(n_estimators=500, learning_rate=0.02, max_depth=7, random_state=42)),
    ]
    
    final_stacked = StackingRegressor(
        estimators=meta_models,
        final_estimator=Ridge(alpha=1.0),
        cv=3,
        n_jobs=-1
    )
    
    final_stacked.fit(X_combined, y_combined)
    
    return final_stacked, final_pseudo_labels

In [22]:
final_model, final_pseudo_labels = advanced_pseudo_labeling_with_stacking(
    X_labeled=X_labeled,
    y_labeled=y_labeled,
    X_unlabeled=X_unlabeled,
    n_folds=5
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011218 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 10
[LightGBM] [Info] Start training from score 62.482335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 588
[LightGBM] [Info] Number of data points in the train set: 504000, number of used features: 10
[LightGBM] [Info] Start training from score 62.502155
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [23]:
test_predictions = final_model.predict(X_unlabeled)

In [24]:
submission_df = pd.DataFrame({
    'id': test_df_encoded['id'],
    'exam_score': test_predictions
})