In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import optuna
import pickle
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw dataset from CSV into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer="file_path/dataset.csv")

In [None]:
# Overview of the freshly loaded frame: sample rows, dimensions, dtypes.
print("First 2 Rows:")
display(train_df.head(n=2))

print(f"\n\nDataset Shape: {train_df.shape}\n\n")

# DataFrame.info() writes its report to stdout itself, so no print() wrapper.
print("Dataset Information:")
train_df.info()

In [None]:
# Number of missing values in each column.
train_df.isna().sum()

In [None]:
# Numeric columns: report cardinality and the ten most frequent values of each.
num_cols = train_df.select_dtypes(include=["float64", "int64"]).columns

print(f"Found {len(num_cols)} number columns: {list(num_cols)}\n")

for col in num_cols:
    print(f"--- Inspecting column: '{col}' ---")
    print(f"Unique values count: {train_df[col].nunique()}")
    # value_counts() is sorted by frequency, so the first ten rows are the top ten.
    top_values = train_df[col].value_counts().iloc[:10]
    print(top_values)
    print("\n")

In [None]:
# Object (string) columns: report cardinality and the ten most frequent labels.
object_cols = train_df.select_dtypes(include=["object"]).columns

print(f"Found {len(object_cols)} object columns: {list(object_cols)}\n")

for col in object_cols:
    print(f"--- Inspecting column: '{col}' ---")
    print(f"Unique values count: {train_df[col].nunique()}")
    # value_counts() is sorted by frequency, so the first ten rows are the top ten.
    top_labels = train_df[col].value_counts().iloc[:10]
    print(top_labels)
    print("\n")

In [None]:
# Explicit label -> integer code tables for every categorical column, written
# out by hand so the encoding is identical on every run.
# NOTE(review): gender, course and study_method look nominal; their integer
# codes will be read as ordered by linear models - confirm this is acceptable.
gender_map = {
    "other": 0,
    "male": 1,
    "female": 2
}

course_map = {
    "diploma": 0,
    "ba": 1,
    "b.sc": 2,
    "b.com": 3,
    "bba": 4,
    "bca": 5,
    "b.tech": 6
}

internet_access_map = {
    "no": 0,
    "yes": 1
}

sleep_quality_map = {
    "poor": 0,
    "average": 1,
    "good": 2
}

study_method_map = {
    "self-study": 0,
    "group study": 1,
    "online videos": 2,
    "mixed": 3,
    "coaching": 4
}

facility_rating_map = {
    "low": 0,
    "medium": 1,
    "high": 2
}

exam_difficulty_map = {
    "easy": 0,
    "moderate": 1,
    "hard": 2
}

# Column name -> code table; one entry per column that gets encoded.
categorical_encodings = {
    "gender": gender_map,
    "course": course_map,
    "internet_access": internet_access_map,
    "sleep_quality": sleep_quality_map,
    "study_method": study_method_map,
    "facility_rating": facility_rating_map,
    "exam_difficulty": exam_difficulty_map,
}


def _encode_categoricals(frame, encodings):
    """Return a copy of ``frame`` with the listed columns replaced by integer codes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing every column named in ``encodings``.
    encodings : dict[str, dict]
        Maps a column name to its ``{label: code}`` table.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    ValueError
        If a column holds a non-null label that its code table does not cover.
        ``Series.map`` would otherwise turn such labels into NaN silently.
    """
    encoded_frame = frame.copy()
    for column, mapping in encodings.items():
        raw = encoded_frame[column]
        observed = set(raw.dropna().unique())

        # Cell re-run: the column already holds codes.  Mapping it again would
        # find no matching keys and wipe every value to NaN, so leave it as is.
        if observed and observed <= set(mapping.values()):
            continue

        encoded = raw.map(mapping)

        # Values that were present before mapping but are NaN afterwards are
        # labels missing from the table (typos, casing, stray whitespace...).
        unmapped = raw[encoded.isna() & raw.notna()].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Column '{column}' contains labels missing from its mapping: "
                f"{sorted(map(str, unmapped))}"
            )

        encoded_frame[column] = encoded
    return encoded_frame


train_df = _encode_categoricals(train_df, categorical_encodings)

In [None]:
# Same overview as after loading, repeated to inspect the frame after the
# categorical columns have been mapped to integer codes.
print("First 2 Rows:")
display(train_df.head(n=2))

print(f"\n\nDataset Shape: {train_df.shape}\n\n")

# DataFrame.info() writes its report to stdout itself, so no print() wrapper.
print("Dataset Information:")
train_df.info()

In [None]:
# Remove the row identifier so it does not end up among the model features.
# Reassigning with errors="ignore" (instead of an in-place drop) keeps this
# cell re-runnable: a second execution is a no-op rather than a KeyError.
train_df = train_df.drop(columns=["id"], errors="ignore")

print(f"\n\nDataset Shape: {train_df.shape}\n\n")

In [None]:
# Per-column minimum and maximum, as a quick range check.
train_df.aggregate(["min", "max"])

In [None]:
# Correlation of every column with the target, strongest positive first.
train_df.corr().loc[:, "exam_score"].sort_values(ascending=False)

In [None]:
# Target vector, and the feature matrix made of every remaining column.
y = train_df["exam_score"]
X = train_df.drop(columns="exam_score")

In [None]:
# Hold out 20% of the rows as a validation split; the fixed seed makes the
# partition reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

print("Train shape:", X_train.shape)
# Labelled "Validation" to match the variable name and the metrics reported
# later in the notebook (this split was previously printed as "Test").
print("Validation shape:", X_valid.shape)

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def calculate_performance(y_true, y_pred):
    """Return ``(rmse, r2)`` for the given targets and predictions."""
    error = rmse(y_true, y_pred)
    explained = r2_score(y_true, y_pred)
    return error, explained

# RMSE scorer for scikit-learn model-selection utilities.
# greater_is_better=False makes the scorer return the negated RMSE, because
# scikit-learn always maximises scorer output.
# Built on the local `rmse` helper rather than forwarding `squared=False` to
# mean_squared_error: that keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6, where the scorer would fail with a TypeError on first use.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [None]:
# Baseline candidates with default hyperparameters.  Seeds are fixed where the
# estimator is stochastic; n_jobs=-1 uses every available core.
model_dict = dict(
    XGB=XGBRegressor(objective="reg:squarederror", random_state=42, n_jobs=-1),
    LGBM=LGBMRegressor(random_state=42, n_jobs=-1),
    RF=RandomForestRegressor(random_state=42, n_jobs=-1),
    LR=LinearRegression(),
)

In [None]:
# Fit each baseline on the training split and score it on the validation split.
results = []

for name, model in model_dict.items():
    # fit() returns the estimator itself, so predict() can be chained onto it.
    preds = model.fit(X_train, y_train).predict(X_valid)
    rmse_val, r2_val = calculate_performance(y_valid, preds)
    results.append({"Model": name, "RMSE": rmse_val, "R² Score": r2_val})

# One row per model, best (lowest RMSE) first.
results_df = pd.DataFrame(results).sort_values(by="RMSE")

print("Model Performance Comparison (RMSE)")
print(results_df)

In [None]:
def objective(trial):
    """Optuna objective: validation RMSE of one sampled LightGBM configuration.

    Samples a set of LGBMRegressor hyperparameters from ``trial``, fits the
    model on ``X_train`` / ``y_train`` and scores it on ``X_valid`` / ``y_valid``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Root mean squared error on the validation split (lower is better).
    """
    # The order of the suggest_* calls is kept stable on purpose: the seeded
    # sampler draws values in call order, so reordering would change the trials.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),

        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 14),

        "min_child_samples": trial.suggest_int("min_child_samples", 10, 150),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),

        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 0, 10),

        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "feature_fraction_bynode": trial.suggest_float(
            "feature_fraction_bynode", 0.5, 1.0
        ),

        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),

        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),

        "max_bin": trial.suggest_int("max_bin", 128, 512),

        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["gbdt", "dart"]
        ),

        "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),

        # Fixed settings, not tuned.
        "objective": "regression",
        "metric": "rmse",
        "random_state": 42,
        "n_jobs": -1
    }

    model = LGBMRegressor(**params)

    # No eval_set here: without an early-stopping callback it only made
    # LightGBM score the validation split on every boosting round, which is
    # wasted work across all trials.  The score is computed once below instead.
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)

    # Use the shared helper rather than a local variable that shadows it.
    return rmse(y_valid, preds)

In [None]:
# Seeded TPE sampler so the sequence of sampled trials is reproducible.
tpe_sampler = optuna.samplers.TPESampler(seed=42)

# Minimise the validation RMSE returned by `objective` over 100 trials.
study = optuna.create_study(sampler=tpe_sampler, direction="minimize")
study.optimize(objective, n_trials=100)

In [None]:
# Best sampled hyperparameters plus the fixed settings that `objective` used
# but that are not part of the search space.
best_params = {
    **study.best_params,
    "objective": "regression",
    "metric": "rmse",
    "random_state": 42,
    "n_jobs": -1,
}

final_model = LGBMRegressor(**best_params)

# The final model is trained on the full dataset (training + validation rows).
final_model.fit(X, y, eval_metric="rmse")

In [None]:
# `final_model` was fitted on the full dataset (X, y), which contains every row
# of X_valid, so scoring it on X_valid would report in-sample error.  Refit the
# same configuration on the training split only and score that model instead.
holdout_model = LGBMRegressor(**best_params)
holdout_model.fit(X_train, y_train)

y_pred = holdout_model.predict(X_valid)

rmse_val, r2_val = calculate_performance(y_valid, y_pred)

# NOTE: the hyperparameters were tuned against this same validation split, so
# these figures are still optimistic compared with truly unseen data.
print(f"Validation RMSE: {rmse_val:.4f}")
print(f"Validation R²: {r2_val:.4f}")

In [None]:
# Importance of each feature according to the fitted final model, highest first.
feature_importance = pd.DataFrame(
    dict(
        feature=X_train.columns,
        importance=final_model.feature_importances_,
    )
).sort_values(by="importance", ascending=False)

display(feature_importance)

In [None]:
# Serialise the fitted final model to disk with pickle.
with open("lgbm_model.pkl", mode="wb") as model_file:
    pickle.dump(final_model, model_file)

print("Model saved to lgbm_model.pkl")