# Hyperparameter Tuning

For selected features:

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load dataset
df = pd.read_csv('selected_features.csv')

# Split data
X = df.drop(columns=['outcome'])
y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid 
xgb_param_grid = {
    'n_estimators': [50, 100, 200, 300],  
    'learning_rate': [0.05, 0.1, 0.15], 
    'max_depth': [2, 3, 4],             
    'subsample': [0.6, 0.7, 0.8],       
    'colsample_bytree': [0.9, 1.0]        
}

# Run Grid Search for XGBoost
xgb = XGBRegressor(random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_param_grid, cv=5, scoring='r2', verbose=2, n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
best_xgb = xgb_grid_search.best_estimator_

# Print best parameters
print("\nBest XGBoost Parameters:", xgb_grid_search.best_params_)

# Evaluate optimized model
xgb_y_pred = best_xgb.predict(X_test)
train_r2 = r2_score(y_train, best_xgb.predict(X_train))
test_r2 = r2_score(y_test, xgb_y_pred)

# Print optimized results
print(f"\nOptimized XGBoost: R² Train = {train_r2:.4f}, R² Test = {test_r2:.4f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.7; total time=   1.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   1.2s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.7; total time=   1.1s
[CV] END colsamp

In [None]:
For the processed features, the model overfitted the training data and hence showed a significantly lower test accuracy due to its inability to generalise.

For top processed features:

In [None]:
# Load dataset
df = pd.read_csv('processed_features.csv')

# Split data
X = df.drop(columns=['outcome'])
y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid 
xgb_param_grid = {
    'n_estimators': [50, 100, 200, 300],  
    'learning_rate': [0.05, 0.1, 0.15], 
    'max_depth': [2, 3, 4],             
    'subsample': [0.6, 0.7, 0.8],       
    'colsample_bytree': [0.9, 1.0]        
}

# Run Grid Search for XGBoost
xgb = XGBRegressor(random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_param_grid, cv=5, scoring='r2', verbose=2, n_jobs=-1)
xgb_grid_search.fit(X_train, y_train)
best_xgb = xgb_grid_search.best_estimator_

# Print best parameters
print("\nBest XGBoost Parameters:", xgb_grid_search.best_params_)

# Evaluate optimized model
xgb_y_pred = best_xgb.predict(X_test)
train_r2 = r2_score(y_train, best_xgb.predict(X_train))
test_r2 = r2_score(y_test, xgb_y_pred)

# Print optimized results
print(f"\nOptimized XGBoost: R² Train = {train_r2:.4f}, R² Test = {test_r2:.4f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.7; total time=   2.6s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   2.8s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   2.9s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.6; total time=   3.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.8; total time=   3.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.7; total time=   3.2s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.7; total time=   3.2s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=2, n_estimators=50, subsample=0.7; total time=   3.1s
[CV] END colsamp

# Final Model

In [7]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

# Load dataset with selected features
selected_features_df = pd.read_csv("selected_features.csv")
X_train = selected_features_df.drop(columns=["outcome"])
y_train = selected_features_df["outcome"]

# Load test data and encode categorical features
test_df = pd.read_csv("CW1_test.csv")
categorical_cols = ['cut', 'color', 'clarity']  # Replace with actual categorical column names
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Ensure test data contains the same features as the selected features dataset
selected_features = X_train.columns  # Extract selected feature names from training data
test_df = test_df[selected_features]  # Apply same feature selection to test data

# Best hyperparameters found from Grid Search
best_params = {
    'colsample_bytree': 0.9,
    'learning_rate': 0.05,
    'max_depth': 4,
    'n_estimators': 200,
    'subsample': 0.8,
    'random_state': 42
}

# Initialize and train the model
best_xgb_model = XGBRegressor(**best_params)
best_xgb_model.fit(X_train, y_train)

# Make predictions on the test set
test_predictions = best_xgb_model.predict(test_df)

# Save predictions to CSV
submission = pd.DataFrame(test_predictions, columns=["outcome"])
submission.to_csv("CW1_submission_k23080165.csv", index=False)