In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb # Already imported, keeping for context if you still use LGBM later
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor # Make sure you have xgboost installed (pip install xgboost)
from sklearn.linear_model import LinearRegression
import joblib # To save models

In [2]:
try:
    X_tabular_only = pd.read_csv('processed_diamond_features_X.csv')
    y_tabular_only = pd.read_csv('diamond_target_y.csv').squeeze() # .squeeze() to ensure it's a Series
    print(f"Loaded X_tabular_only shape: {X_tabular_only.shape}")
    print(f"Loaded y_tabular_only shape: {y_tabular_only.shape}")
except FileNotFoundError as e:
    print(f"ERROR: Tabular-only files not found. Ensure 'processed_diamond_features_X_tabular_only.csv' and 'diamond_target_y_tabular_only.csv' exist.")
    print(e)
    exit()

Loaded X_tabular_only shape: (5952, 26)
Loaded y_tabular_only shape: (5952,)


In [3]:
X_tabular_only.head()

Unnamed: 0,Weight,X,Y,Z,Cut,Polish,Symmetry,Clarity,Colour,Fluorescence_F,...,Shape_CUSHION,Shape_EMERALD,Shape_HEART,Shape_MARQUISE,Shape_OVAL,Shape_PEAR,Shape_RADIANT,Shape_ROUND,Colour_IsFancy_0,Colour_IsFancy_1
0,-0.151866,-0.630345,-0.264321,-0.218893,3.0,3.0,2.0,3.0,7.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.197973,-0.495109,-0.208121,0.345727,3.0,3.0,2.0,5.0,3.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.120231,-0.248501,-0.418872,0.119879,3.0,3.0,2.0,3.0,7.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.120231,-0.526929,-0.081671,0.368312,2.0,3.0,1.0,4.0,7.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.120231,-0.184861,-0.362671,0.142464,3.0,3.0,2.0,6.0,7.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
print("\n--- Step 3: Split Data into Training and Testing Sets ---")

# Split the combined data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_tabular_only, y_tabular_only, test_size=0.2, random_state=37 # 20% for testing, use random_state for reproducibility
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")



--- Step 3: Split Data into Training and Testing Sets ---
X_train shape: (4761, 26)
X_test shape: (1191, 26)
y_train shape: (4761,)
y_test shape: (1191,)


In [6]:
print("\n--- 4. Defining Models and Hyperparameter Grids ---")
param_grids = {
    "Decision Tree": {
        "max_depth": [5, 20, None],
        "min_samples_split": [2, 10],
        "min_samples_leaf": [1, 5]
    },
    "Random Forest": {
        "n_estimators": [50, 200], # Added 200 as a common value
        "max_depth": [10, None], # Added 20
        "min_samples_split": [2, 10], # Added 10
        "min_samples_leaf": [2, 4] # Added 4
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300], # Adjusted values
        "learning_rate": [0.01, 0.05, 0.2], # Adjusted values
        "max_depth": [3, 5, 10], # Adjusted values
        "subsample": [0.6, 0.8, 1.0], # Adjusted values
        "colsample_bytree": [0.6, 0.8, 1.0] # Added colsample_bytree for more comprehensive search
    },
    "LightGBM": { # Adding LightGBM as a candidate too!
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "num_leaves": [31, 63, 127],
        "max_depth": [-1, 7, 15], # -1 means no limit
        "reg_alpha": [0, 0.1, 0.5],
        "reg_lambda": [0, 0.1, 0.5]
    }
}

models = {
    "Decision Tree": DecisionTreeRegressor(random_state=37),
    "Random Forest": RandomForestRegressor(random_state=37),
    "XGBoost": XGBRegressor(random_state=37, objective='reg:squarederror', eval_metric='rmse'), # Default objective for regression, eval_metric for consistency
    "LightGBM": lgb.LGBMRegressor(random_state=37, objective='regression_l1') # Using MAE objective like before
}

best_models = {}
evaluation_results = {} # To store test set results for comparison


--- 4. Defining Models and Hyperparameter Grids ---


In [7]:
print("\n--- 4. Performing Hyperparameter Tuning with RandomizedSearchCV ---")
for model_name, model in models.items():
    print(f"\nTuning {model_name}...")
    grid_search = RandomizedSearchCV(model, param_grids[model_name], cv=3, scoring='r2', n_jobs=-1, verbose=1, n_iter=10, random_state=37)

    try:
        grid_search.fit(X_train, y_train)

        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best R2 score on validation sets for {model_name}: {grid_search.best_score_:.4f}")

        # Evaluate the best estimator on the test set
        y_pred = best_models[model_name].predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        evaluation_results[model_name] = {'RMSE': rmse, 'R2': r2}
        print(f"Test Set Evaluation for {model_name}: RMSE = {rmse:.2f}, R2 = {r2:.4f}")

    except Exception as e:
        print(f"Error tuning {model_name}: {e}")
        print("Skipping this model and moving to the next.")



--- 4. Performing Hyperparameter Tuning with RandomizedSearchCV ---

Tuning Decision Tree...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Decision Tree: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5}
Best R2 score on validation sets for Decision Tree: 0.8376
Test Set Evaluation for Decision Tree: RMSE = 604.15, R2 = 0.8571

Tuning Random Forest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}
Best R2 score on validation sets for Random Forest: 0.8747
Test Set Evaluation for Random Forest: RMSE = 460.00, R2 = 0.9172

Tuning XGBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for XGBoost: {'subsample': 0.6, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
Best R2 score on validation sets for XGBoost: 0.9037
Test Set Evaluation 

In [8]:
print("\n--- 5. Adding Linear Regression (No Hyperparameter Tuning) ---")
lr = LinearRegression()
lr.fit(X_train, y_train)
best_models["Linear Regression"] = lr

y_pred_lr = lr.predict(X_test)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)
evaluation_results["Linear Regression"] = {'RMSE': rmse_lr, 'R2': r2_lr}
print(f"Test Set Evaluation for Linear Regression: RMSE = {rmse_lr:.2f}, R2 = {r2_lr:.4f}")



--- 5. Adding Linear Regression (No Hyperparameter Tuning) ---
Test Set Evaluation for Linear Regression: RMSE = 671.33, R2 = 0.8236


In [9]:
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge

In [10]:
print("\n--- 6. Implementing and Evaluating Stacking Regressor ---")

estimators = [
    ('dt', best_models["Decision Tree"]),
    ('rf', best_models["Random Forest"]),
    ('xgb', best_models["XGBoost"]),
    ('lgbm', best_models["LightGBM"])
]

final_estimator = Ridge(alpha=1.0)

stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
    n_jobs=-1,
    verbose=1
)

print("Starting Stacking Regressor training...")
stacking_regressor.fit(X_train, y_train)
print("Stacking Regressor training complete.")

y_pred_stack = stacking_regressor.predict(X_test)
rmse_stack = np.sqrt(mean_squared_error(y_test, y_pred_stack))
r2_stack = r2_score(y_test, y_pred_stack)

best_models["Stacking Regressor"] = stacking_regressor
evaluation_results["Stacking Regressor"] = {'RMSE': rmse_stack, 'R2': r2_stack}
print(f"Test Set Evaluation for Stacking Regressor: RMSE = {rmse_stack:.2f}, R2 = {r2_stack:.4f}")



--- 6. Implementing and Evaluating Stacking Regressor ---
Starting Stacking Regressor training...
Stacking Regressor training complete.
Test Set Evaluation for Stacking Regressor: RMSE = 363.39, R2 = 0.9483


In [11]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam # Using Adam optimizer
from keras.callbacks import EarlyStopping 



In [12]:
print("\n--- 7. Implementing and Evaluating Deep Neural Network (DNN) ---")

# Define the DNN model architecture
def build_dnn_model(input_shape):
    model = Sequential([
        # Input layer and first hidden layer
        Dense(256, activation='relu', input_shape=(input_shape,)),
        Dropout(0.4), # Dropout for regularization
        # Second hidden layer
        Dense(128, activation='relu'),
        Dropout(0.3),
        # Third hidden layer
        Dense(64, activation='relu'),
        Dropout(0.4),
        # Output layer for regression (single neuron, no activation)
        Dense(1)
    ])

    # Compile the model
    # Using Adam optimizer with a custom learning rate
    # Loss: Mean Squared Error (MSE) is common for regression
    # Metrics: RMSE and MAE (Mean Absolute Error) are good to monitor
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])
    return model

# Get the input shape from our training data
input_dim = X_train.shape[1]
dnn_model = build_dnn_model(input_dim)

# Print model summary
dnn_model.summary()

early_stopping = EarlyStopping(
    monitor='val_loss',         # Metric to monitor (validation loss)
    patience=3,                 # Number of epochs with no improvement after which training will be stopped
    mode='min',                 # 'min' because we want to minimize the loss
    restore_best_weights=True,  # Restores model weights from the epoch with the best value of the monitored metric.
    verbose=1                   # Show messages when stopping
)
# --- End Early Stopping Callback ---

# Train the DNN model
print("\nStarting DNN model training...")
# Using 50 epochs, a batch size of 32, and validating on the test set
history = dnn_model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1, # Show training progress
    #callbacks = [early_stopping]
)
print("DNN model training complete.")



--- 7. Implementing and Evaluating Deep Neural Network (DNN) ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Starting DNN model training...
Epoch 1/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 4891626.5000 - mae: 1408.4932 - root_mean_squared_error: 2187.7485 - val_loss: 1903812.8750 - val_mae: 546.5878 - val_root_mean_squared_error: 1379.7872
Epoch 2/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1389782.7500 - mae: 518.2339 - root_mean_squared_error: 1172.9293 - val_loss: 766638.0625 - val_mae: 357.9086 - val_root_mean_squared_error: 875.5787
Epoch 3/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 845584.7500 - mae: 428.6539 - root_mean_squared_error: 917.7016 - val_loss: 515574.0625 - val_mae: 309.1140 - val_root_mean_squared_error: 718.0349
Epoch 4/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 591727.5000 - mae: 385.3958 - root_mean_squared_error: 768.2875 - val_loss: 449923.5938 - val_mae: 310.1934 - val_root_mean_squared_error: 67

In [13]:
try:
    history = dnn_model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1, # Show training progress
        #callbacks = [early_stopping]
    )
    print("DNN model training complete.")

    print("\nEvaluating DNN model on test set...")
    dnn_eval_results = dnn_model.evaluate(X_test, y_test, verbose=0)
    dnn_loss = dnn_eval_results[0]
    dnn_rmse = dnn_eval_results[1]
    dnn_mae = dnn_eval_results[2]

    y_pred_dnn = dnn_model.predict(X_test).flatten()
    r2_dnn = r2_score(y_test, y_pred_dnn)

    best_models["Deep Neural Network"] = dnn_model
    evaluation_results["Deep Neural Network"] = {'RMSE': dnn_rmse, 'R2': r2_dnn, 'MAE': dnn_mae}
    print(f"Test Set Evaluation for Deep Neural Network: RMSE = {dnn_rmse:.2f}, R2 = {r2_dnn:.4f}, MAE = {dnn_mae:.2f}")

except Exception as e:
    print(f"Error training DNN: {e}")
    print("Skipping DNN model.")


Epoch 1/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 421665.5312 - mae: 317.7915 - root_mean_squared_error: 648.3856 - val_loss: 251833.5312 - val_mae: 192.4664 - val_root_mean_squared_error: 501.8302
Epoch 2/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 325189.8438 - mae: 308.7541 - root_mean_squared_error: 568.4695 - val_loss: 258557.2812 - val_mae: 208.0235 - val_root_mean_squared_error: 508.4853
Epoch 3/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 413472.6250 - mae: 326.1227 - root_mean_squared_error: 640.3846 - val_loss: 277777.8438 - val_mae: 206.3626 - val_root_mean_squared_error: 527.0463
Epoch 4/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 324448.2188 - mae: 305.2728 - root_mean_squared_error: 565.0142 - val_loss: 252354.7344 - val_mae: 204.7196 - val_root_mean_squared_error: 502.3492
Epoch 5/50
[1m149/149[0m [32m

In [14]:
print("\n--- 8. Summary of All Best Models and Test Set Performance ---")
sorted_results = sorted(evaluation_results.items(), key=lambda item: item[1]['R2'], reverse=True)

for model_name, metrics in sorted_results:
    print(f"Model: {model_name}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R2: {metrics['R2']:.4f}")
    if 'MAE' in metrics:
        print(f"  MAE: {metrics['MAE']:.2f}")
    print("---")




--- 8. Summary of All Best Models and Test Set Performance ---
Model: XGBoost
  RMSE: 352.56
  R2: 0.9513
---
Model: Stacking Regressor
  RMSE: 363.39
  R2: 0.9483
---
Model: LightGBM
  RMSE: 397.25
  R2: 0.9382
---
Model: Random Forest
  RMSE: 460.00
  R2: 0.9172
---
Model: Deep Neural Network
  RMSE: 514.73
  R2: 0.8963
  MAE: 192.11
---
Model: Decision Tree
  RMSE: 604.15
  R2: 0.8571
---
Model: Linear Regression
  RMSE: 671.33
  R2: 0.8236
---


In [15]:
print("\n--- 9. Saving Only the XGBoost Model ---")

model_name_to_save = "XGBoost"

if model_name_to_save in best_models:
    xgboost_model = best_models[model_name_to_save]
    safe_model_name = model_name_to_save.replace(" ", "_").replace(".", "")
    model_filename = f"Tabular_{safe_model_name}_model.joblib"

    try:
        joblib.dump(xgboost_model, model_filename)
        print(f"Successfully saved {model_name_to_save} model to: {model_filename}")
    except Exception as e:
        print(f"Error saving {model_name_to_save} model: {e}")
else:
    print(f"Error: {model_name_to_save} model not found in best_models. Make sure it was trained successfully.")




--- 9. Saving Only the XGBoost Model ---
Successfully saved XGBoost model to: Tabular_XGBoost_model.joblib


In [16]:
import sklearn
print("Scikit-learn version:", sklearn.__version__)

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

import numpy
print("NumPy version:", numpy.__version__)

import pandas
print("Pandas version:", pandas.__version__)

import xgboost
print("XGBoost version:", xgboost.__version__)

Scikit-learn version: 1.6.1
TensorFlow version: 2.19.0
NumPy version: 2.2.5
Pandas version: 2.2.3
XGBoost version: 3.0.2
