In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, make_scorer
import xgboost as xgb
import optuna

# Load data
train_data = pd.read_csv("/kaggle/input/challenge-1/train.csv")
test_data = pd.read_csv("/kaggle/input/challenge-1/Test_final.csv")

# Store test IDs
test_ids = test_data['ID']

# Drop ID column if present
train_data.drop(columns=['ID'], errors='ignore', inplace=True)
test_data.drop(columns=['ID'], errors='ignore', inplace=True)

# Convert 'Session start time' to numeric and apply sqrt transformation
train_data['Session start time'] = pd.to_numeric(train_data['Session start time'], errors='coerce')
test_data['Session start time'] = pd.to_numeric(test_data['Session start time'], errors='coerce')

train_data['Session start time'] = np.sqrt(train_data['Session start time'])
test_data['Session start time'] = np.sqrt(test_data['Session start time'])

# One-hot encoding for categorical variables
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Align test data columns with train data
missing_cols = set(train_data.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0

test_data = test_data[train_data.columns.drop('Absenteeism time in hours', errors='ignore')]

# Handle missing values
train_data.fillna(train_data.median(numeric_only=True), inplace=True)
test_data.fillna(test_data.median(numeric_only=True), inplace=True)

# Extract target variable
target_column = 'Absenteeism time in hours'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])

# Scale features using RobustScaler
feature_scaler = RobustScaler()
X_scaled = feature_scaler.fit_transform(X)
test_scaled = feature_scaler.transform(test_data)

# Scale target variable
target_scaler = RobustScaler()
y_scaled = target_scaler.fit_transform(y.values.reshape(-1, 1)).flatten()

# Define objective function for Optuna
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 20),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
    }
    
    model = xgb.XGBRegressor(**params)
    scores = cross_val_score(model, X_scaled, y_scaled, cv=kf, scoring=make_scorer(r2_score))
    return np.mean(scores)

# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=60)

# Best hyperparameters and R² score
best_r2_score = study.best_value
best_params = study.best_params

print(f"Best R² Score: {best_r2_score}")
print(f"Best Hyperparameters: {best_params}")

# Train final model with best hyperparameters
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_scaled, y_scaled)

# Predict on test set
test_predictions_scaled = final_model.predict(test_scaled)
test_predictions = target_scaler.inverse_transform(test_predictions_scaled.reshape(-1, 1)).flatten()

# Create submission file
submission = pd.DataFrame({
    "ID": test_ids,
    "Absenteeism time in hours": np.round(test_predictions)
})

# Save submission file
submission.to_csv("chal1.csv", index=False)

print("Submission file saved as 'chal1.csv'.")
print(submission.head())     

  self.center_ = np.nanmedian(X, axis=0)
  return _nanquantile_unchecked(
  constant_mask = scale < 10 * np.finfo(scale.dtype).eps
[I 2025-04-03 08:33:57,216] A new study created in memory with name: no-name-fcc98666-4cdd-40b4-b00f-350d448f6614
[I 2025-04-03 08:33:58,596] Trial 0 finished with value: -0.022712711815498034 and parameters: {'n_estimators': 984, 'learning_rate': 0.18949188622639765, 'max_depth': 10, 'subsample': 0.749760310621294, 'colsample_bytree': 0.9307405132592013, 'gamma': 0.11040662654495992, 'reg_lambda': 5.138541614828178, 'reg_alpha': 8.52010259385279, 'min_child_weight': 8}. Best is trial 0 with value: -0.022712711815498034.
[I 2025-04-03 08:33:59,274] Trial 1 finished with value: -0.09609570527863143 and parameters: {'n_estimators': 661, 'learning_rate': 0.24965997344320495, 'max_depth': 6, 'subsample': 0.6313764135238635, 'colsample_bytree': 0.8534552665121224, 'gamma': 0.4982844067685468, 'reg_lambda': 2.7522206806518126, 'reg_alpha': 2.9685812656415367, 'mi

Best R² Score: 0.04143527958091884
Best Hyperparameters: {'n_estimators': 572, 'learning_rate': 0.03530631114258355, 'max_depth': 11, 'subsample': 0.940961664830391, 'colsample_bytree': 0.5885573541428406, 'gamma': 0.4490469750545841, 'reg_lambda': 6.619493941443242, 'reg_alpha': 5.408621590313078, 'min_child_weight': 9}
Submission file saved as 'chal1.csv'.
        ID  Absenteeism time in hours
0  2201048                        3.0
1  2302569                        6.0
2  2101843                        9.0
3  2101729                        6.0
4  2301185                        5.0
