In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb # Already imported, keeping for context if you still use LGBM later
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor # Make sure you have xgboost installed (pip install xgboost)
from sklearn.linear_model import LinearRegression
import joblib # To save models

In [2]:
print("--- Step 1: Load All Preprocessed Data ---")

# Load multi-modal data (from df_hybrid)
try:
    X_multi_modal = pd.read_csv('processed_diamond_features_X_multi_modal.csv')
    y_multi_modal = pd.read_csv('diamond_target_y_multi_model.csv').squeeze() # .squeeze() to ensure it's a Series
    print(f"Loaded X_multi_modal shape: {X_multi_modal.shape}")
    print(f"Loaded y_multi_modal shape: {y_multi_modal.shape}")
except FileNotFoundError as e:
    print(f"ERROR: Multi-modal files not found. Ensure 'processed_diamond_features_X_multi_modal.csv' and 'diamond_target_y.csv' exist.")
    print(e)
    exit()

# Load tabular-only data (from df_tabular_only_strategy)
try:
    X_tabular_only = pd.read_csv('processed_diamond_features_X.csv')
    y_tabular_only = pd.read_csv('diamond_target_y.csv').squeeze() # .squeeze() to ensure it's a Series
    print(f"Loaded X_tabular_only shape: {X_tabular_only.shape}")
    print(f"Loaded y_tabular_only shape: {y_tabular_only.shape}")
except FileNotFoundError as e:
    print(f"ERROR: Tabular-only files not found. Ensure 'processed_diamond_features_X_tabular_only.csv' and 'diamond_target_y_tabular_only.csv' exist.")
    print(e)
    exit()

--- Step 1: Load All Preprocessed Data ---
Loaded X_multi_modal shape: (4359, 2074)
Loaded y_multi_modal shape: (4359,)
Loaded X_tabular_only shape: (5952, 26)
Loaded y_tabular_only shape: (5952,)


In [18]:
# Added section to describe target variable
print("\n--- 1.1. Descriptive Statistics of Diamond Prices (Target Variable) ---")
print(y_final.describe())




--- 1.1. Descriptive Statistics of Diamond Prices (Target Variable) ---
count    10311.000000
mean      1694.120854
std       1592.725457
min        512.460000
25%        940.740000
50%       1319.660000
75%       2042.450000
max      16751.620000
Name: Price, dtype: float64


In [3]:
print("\n--- Step 2: Concatenate Features (X) and Targets (y) ---")

# Ensure columns are aligned before concatenation.
# This is crucial if one-hot encoding produced different columns due to varying categories in subsets.
# We'll use reindex to make sure all X dataframes have the same columns, filling missing with 0.
all_columns = pd.Index(X_multi_modal.columns).union(X_tabular_only.columns)

X_multi_modal_aligned = X_multi_modal.reindex(columns=all_columns, fill_value=0)
X_tabular_only_aligned = X_tabular_only.reindex(columns=all_columns, fill_value=0)


# Concatenate the feature DataFrames vertically
X_final = pd.concat([X_multi_modal_aligned, X_tabular_only_aligned], ignore_index=True)

# Concatenate the target Series vertically (order must match X)
y_final = pd.concat([y_multi_modal, y_tabular_only], ignore_index=True)

print(f"Final combined X_final shape: {X_final.shape}")
print(f"Final combined y_final shape: {y_final.shape}")
print("Sample of final X_final (head):")
print(X_final.head())
print("\nSample of final X_final (tail, to see tabular-only data):")
print(X_final.tail())



--- Step 2: Concatenate Features (X) and Targets (y) ---
Final combined X_final shape: (10311, 2074)
Final combined y_final shape: (10311,)
Sample of final X_final (head):
   Clarity  Colour  Colour_IsFancy_0  Colour_IsFancy_1  Cut  Fluorescence_F  \
0      5.0     3.0               1.0               0.0  3.0             0.0   
1      3.0     7.0               1.0               0.0  3.0             1.0   
2      4.0     7.0               1.0               0.0  2.0             0.0   
3      6.0     7.0               1.0               0.0  3.0             1.0   
4      4.0     2.0               1.0               0.0  2.0             0.0   

   Fluorescence_M  Fluorescence_N  Fluorescence_SL  Fluorescence_ST  ...  \
0             1.0             0.0              0.0              0.0  ...   
1             0.0             0.0              0.0              0.0  ...   
2             0.0             1.0              0.0              0.0  ...   
3             0.0             0.0              0

In [4]:
print("\n--- Step 3: Split Data into Training and Testing Sets ---")

# Split the combined data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=37 # 20% for testing, use random_state for reproducibility
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")



--- Step 3: Split Data into Training and Testing Sets ---
X_train shape: (8248, 2074)
X_test shape: (2063, 2074)
y_train shape: (8248,)
y_test shape: (2063,)


In [5]:
print("\n--- 4. Defining Models and Hyperparameter Grids ---")
param_grids = {
    "Decision Tree": {
        "max_depth": [5, 20, None],
        "min_samples_split": [2, 10],
        "min_samples_leaf": [1, 5]
    },
    "Random Forest": {
        "n_estimators": [50, 200], # Added 200 as a common value
        "max_depth": [10, None], # Added 20
        "min_samples_split": [2, 10], # Added 10
        "min_samples_leaf": [2, 4] # Added 4
    },
    "XGBoost": {
        "n_estimators": [100, 200, 300], # Adjusted values
        "learning_rate": [0.01, 0.05, 0.2], # Adjusted values
        "max_depth": [3, 5, 10], # Adjusted values
        "subsample": [0.6, 0.8, 1.0], # Adjusted values
        "colsample_bytree": [0.6, 0.8, 1.0] # Added colsample_bytree for more comprehensive search
    },
    "LightGBM": { # Adding LightGBM as a candidate too!
        "n_estimators": [100, 200, 300],
        "learning_rate": [0.01, 0.1, 0.2],
        "num_leaves": [31, 63, 127],
        "max_depth": [-1, 7, 15], # -1 means no limit
        "reg_alpha": [0, 0.1, 0.5],
        "reg_lambda": [0, 0.1, 0.5]
    }
}

models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, objective='reg:squarederror', eval_metric='rmse'), # Default objective for regression, eval_metric for consistency
    "LightGBM": lgb.LGBMRegressor(random_state=42, objective='regression_l1') # Using MAE objective like before
}

best_models = {}
evaluation_results = {} # To store test set results for comparison


--- 4. Defining Models and Hyperparameter Grids ---


In [6]:
print("\n--- 4. Performing Hyperparameter Tuning with RandomizedSearchCV ---")
for model_name, model in models.items():
    print(f"\nTuning {model_name}...")
    grid_search = RandomizedSearchCV(model, param_grids[model_name], cv=3, scoring='r2', n_jobs=-1, verbose=1, n_iter=10, random_state=37)

    try:
        grid_search.fit(X_train, y_train)

        best_models[model_name] = grid_search.best_estimator_
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best R2 score on validation sets for {model_name}: {grid_search.best_score_:.4f}")

        # Evaluate the best estimator on the test set
        y_pred = best_models[model_name].predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        evaluation_results[model_name] = {'RMSE': rmse, 'R2': r2}
        print(f"Test Set Evaluation for {model_name}: RMSE = {rmse:.2f}, R2 = {r2:.4f}")

    except Exception as e:
        print(f"Error tuning {model_name}: {e}")
        print("Skipping this model and moving to the next.")



--- 4. Performing Hyperparameter Tuning with RandomizedSearchCV ---

Tuning Decision Tree...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Decision Tree: {'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 20}
Best R2 score on validation sets for Decision Tree: 0.8670
Test Set Evaluation for Decision Tree: RMSE = 558.02, R2 = 0.8781

Tuning Random Forest...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None}
Best R2 score on validation sets for Random Forest: 0.9187
Test Set Evaluation for Random Forest: RMSE = 374.84, R2 = 0.9450

Tuning XGBoost...
Fitting 3 folds for each of 10 candidates, totalling 30 fits


3 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\2XIN\anaconda3\envs\nlp_env\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\2XIN\anaconda3\envs\nlp_env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\2XIN\anaconda3\envs\nlp_env\lib\site-packages\xgboost\sklearn.py", line 1247, in fit
    self._Booster = train(
  File "C:\Users\2XIN\anaconda3\envs\nlp_env\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\2XIN

Best parameters for XGBoost: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.2, 'colsample_bytree': 0.8}
Best R2 score on validation sets for XGBoost: 0.9277
Test Set Evaluation for XGBoost: RMSE = 348.82, R2 = 0.9524

Tuning LightGBM...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.128547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 517055
[LightGBM] [Info] Number of data points in the train set: 8248, number of used features: 2071
[LightGBM] [Info] Start training from score 1318.959961
Best parameters for LightGBM: {'reg_lambda': 0.1, 'reg_alpha': 0.5, 'num_leaves': 127, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2}
Best R2 score on validation sets for LightGBM: 0.9108
Test Set Evaluation for LightGBM: RMSE = 444.16, R2 = 0.9228


In [7]:
print("\n--- 5. Adding Linear Regression (No Hyperparameter Tuning) ---")
lr = LinearRegression()
lr.fit(X_train, y_train)
best_models["Linear Regression"] = lr

y_pred_lr = lr.predict(X_test)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr = r2_score(y_test, y_pred_lr)
evaluation_results["Linear Regression"] = {'RMSE': rmse_lr, 'R2': r2_lr}
print(f"Test Set Evaluation for Linear Regression: RMSE = {rmse_lr:.2f}, R2 = {r2_lr:.4f}")



--- 5. Adding Linear Regression (No Hyperparameter Tuning) ---
Test Set Evaluation for Linear Regression: RMSE = 824.68, R2 = 0.7338


In [11]:
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge

In [12]:
print("\n--- 6. Implementing and Evaluating Stacking Regressor ---")

estimators = [
    ('dt', best_models["Decision Tree"]),
    ('rf', best_models["Random Forest"]),
    ('xgb', best_models["XGBoost"]),
    ('lgbm', best_models["LightGBM"])
]

final_estimator = Ridge(alpha=1.0)

stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
    n_jobs=-1,
    verbose=1
)

print("Starting Stacking Regressor training...")
stacking_regressor.fit(X_train, y_train)
print("Stacking Regressor training complete.")

y_pred_stack = stacking_regressor.predict(X_test)
rmse_stack = np.sqrt(mean_squared_error(y_test, y_pred_stack))
r2_stack = r2_score(y_test, y_pred_stack)

best_models["Stacking Regressor"] = stacking_regressor
evaluation_results["Stacking Regressor"] = {'RMSE': rmse_stack, 'R2': r2_stack}
print(f"Test Set Evaluation for Stacking Regressor: RMSE = {rmse_stack:.2f}, R2 = {r2_stack:.4f}")



--- 6. Implementing and Evaluating Stacking Regressor ---
Starting Stacking Regressor training...
Stacking Regressor training complete.
Test Set Evaluation for Stacking Regressor: RMSE = 350.36, R2 = 0.9520


In [None]:
mae_lr = mean_absolute_error(y_test, y_pred_lr)

In [22]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam # Using Adam optimizer
from keras.callbacks import EarlyStopping 

In [36]:
print("\n--- 7. Implementing and Evaluating Deep Neural Network (DNN) ---")

# Define the DNN model architecture
def build_dnn_model(input_shape):
    model = Sequential([
        # Input layer and first hidden layer
        Dense(256, activation='relu', input_shape=(input_shape,)),
        Dropout(0.4), # Dropout for regularization
        # Second hidden layer
        Dense(128, activation='relu'),
        Dropout(0.3),
        # Third hidden layer
        Dense(64, activation='relu'),
        Dropout(0.4),
        # Output layer for regression (single neuron, no activation)
        Dense(1)
    ])

    # Compile the model
    # Using Adam optimizer with a custom learning rate
    # Loss: Mean Squared Error (MSE) is common for regression
    # Metrics: RMSE and MAE (Mean Absolute Error) are good to monitor
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])
    return model

# Get the input shape from our training data
input_dim = X_train.shape[1]
dnn_model = build_dnn_model(input_dim)

# Print model summary
dnn_model.summary()

early_stopping = EarlyStopping(
    monitor='val_loss',         # Metric to monitor (validation loss)
    patience=3,                 # Number of epochs with no improvement after which training will be stopped
    mode='min',                 # 'min' because we want to minimize the loss
    restore_best_weights=True,  # Restores model weights from the epoch with the best value of the monitored metric.
    verbose=1                   # Show messages when stopping
)
# --- End Early Stopping Callback ---

# Train the DNN model
print("\nStarting DNN model training...")
# Using 50 epochs, a batch size of 32, and validating on the test set
history = dnn_model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1, # Show training progress
    #callbacks = [early_stopping]
)
print("DNN model training complete.")



--- 7. Implementing and Evaluating Deep Neural Network (DNN) ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Starting DNN model training...
Epoch 1/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 3958536.7500 - mae: 1216.3431 - root_mean_squared_error: 1974.6749 - val_loss: 1338988.7500 - val_mae: 470.0140 - val_root_mean_squared_error: 1157.1469
Epoch 2/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1324443.1250 - mae: 551.7347 - root_mean_squared_error: 1146.8818 - val_loss: 699733.4375 - val_mae: 390.6495 - val_root_mean_squared_error: 836.5007
Epoch 3/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 902255.5000 - mae: 487.3810 - root_mean_squared_error: 947.4752 - val_loss: 567863.4375 - val_mae: 356.8448 - val_root_mean_squared_error: 753.5671
Epoch 4/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 732304.3125 - mae: 460.1651 - root_mean_squared_error: 854.7729 - val_loss: 471080.7812 - val_mae: 315.2399 - val_root_mean_squared_error: 68

In [37]:
try:
    history = dnn_model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1, # Show training progress
        #callbacks = [early_stopping]
    )
    print("DNN model training complete.")

    print("\nEvaluating DNN model on test set...")
    dnn_eval_results = dnn_model.evaluate(X_test, y_test, verbose=0)
    dnn_loss = dnn_eval_results[0]
    dnn_rmse = dnn_eval_results[1]
    dnn_mae = dnn_eval_results[2]

    y_pred_dnn = dnn_model.predict(X_test).flatten()
    r2_dnn = r2_score(y_test, y_pred_dnn)

    best_models["Deep Neural Network"] = dnn_model
    evaluation_results["Deep Neural Network"] = {'RMSE': dnn_rmse, 'R2': r2_dnn, 'MAE': dnn_mae}
    print(f"Test Set Evaluation for Deep Neural Network: RMSE = {dnn_rmse:.2f}, R2 = {r2_dnn:.4f}, MAE = {dnn_mae:.2f}")

except Exception as e:
    print(f"Error training DNN: {e}")
    print("Skipping DNN model.")


Epoch 1/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 358939.1250 - mae: 319.8984 - root_mean_squared_error: 597.7661 - val_loss: 440320.1875 - val_mae: 261.5034 - val_root_mean_squared_error: 663.5663
Epoch 2/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 366992.4375 - mae: 326.6016 - root_mean_squared_error: 604.4610 - val_loss: 332709.5312 - val_mae: 290.6947 - val_root_mean_squared_error: 576.8098
Epoch 3/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 410767.6562 - mae: 340.9216 - root_mean_squared_error: 640.2349 - val_loss: 407683.1562 - val_mae: 298.4518 - val_root_mean_squared_error: 638.5007
Epoch 4/50
[1m258/258[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 375527.2500 - mae: 329.1502 - root_mean_squared_error: 611.6431 - val_loss: 388770.1250 - val_mae: 335.3676 - val_root_mean_squared_error: 623.5143
Epoch 5/50
[1m258/258[0m [32m

In [17]:
print("\n--- 8. Summary of All Best Models and Test Set Performance ---")
sorted_results = sorted(evaluation_results.items(), key=lambda item: item[1]['R2'], reverse=True)

for model_name, metrics in sorted_results:
    print(f"Model: {model_name}")
    print(f"  RMSE: {metrics['RMSE']:.2f}")
    print(f"  R2: {metrics['R2']:.4f}")
    if 'MAE' in metrics:
        print(f"  MAE: {metrics['MAE']:.2f}")
    print("---")




--- 8. Summary of All Best Models and Test Set Performance ---
Model: XGBoost
  RMSE: 348.82
  R2: 0.9524
---
Model: Stacking Regressor
  RMSE: 350.36
  R2: 0.9520
---
Model: Random Forest
  RMSE: 374.84
  R2: 0.9450
---
Model: LightGBM
  RMSE: 444.16
  R2: 0.9228
---
Model: Decision Tree
  RMSE: 558.02
  R2: 0.8781
---
Model: Deep Neural Network
  RMSE: 560.46
  R2: 0.8771
  MAE: 215.34
---
Model: Linear Regression
  RMSE: 824.68
  R2: 0.7338
---


In [44]:
print("\n--- 9. Saving Only the XGBoost Model ---")

model_name_to_save = "XGBoost"

if model_name_to_save in best_models:
    xgboost_model = best_models[model_name_to_save]
    safe_model_name = model_name_to_save.replace(" ", "_").replace(".", "")
    model_filename = f"{safe_model_name}_model.joblib"

    try:
        joblib.dump(xgboost_model, model_filename)
        print(f"Successfully saved {model_name_to_save} model to: {model_filename}")
    except Exception as e:
        print(f"Error saving {model_name_to_save} model: {e}")
else:
    print(f"Error: {model_name_to_save} model not found in best_models. Make sure it was trained successfully.")




--- 9. Saving Only the XGBoost Model ---
Successfully saved XGBoost model to: XGBoost_model.joblib
