# **Course: Data Science for Decision Support**
**Assignment 2**

**Author:** MADHUMITHA, KESAVAN: 537467

**Regression dashboard**

In [5]:
!pip install catboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import lightgbm as lgb
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import drive
drive.mount('/content/drive')

# Read the diamonds dataset from the mounted Google Drive.
df = pd.read_csv("/content/drive/My Drive/diamonds.csv")

# Integer-encode each categorical column into a new "<name>_encoded" column.
# The fitted encoder for each column is kept in `label_encoders`.
# NOTE(review): LabelEncoder presumably assigns codes by sorted label order,
# not by quality grade -- confirm that is acceptable for these features.
categorical_columns = ['cut', 'color', 'clarity']
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, le in label_encoders.items():
    df[column + '_encoded'] = le.fit_transform(df[column])

# The raw string columns are no longer needed once the encoded ones exist.
df.drop(columns=categorical_columns, inplace=True)

# Feature matrix and target; `Y` stays a one-column DataFrame until after the split.
feature_columns = [
    'carat',
    'cut_encoded',
    'color_encoded',
    'clarity_encoded',
    'depth',
    'table',
    'x',
    'y',
    'z',
]
X = df[feature_columns]
Y = df[['price']]

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Flatten the single-column targets into 1-D arrays.
y_train, y_test = y_train.values.ravel(), y_test.values.ravel()

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Candidate regressors, keyed by the label offered in the model dropdown.
# Insertion order matters: the first entry is the dropdown's initial value.
models = {
    "Random Forest": RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        min_samples_split=5,
        random_state=42,
    ),
    "XGBoost": xgb.XGBRegressor(
        learning_rate=0.1,
        max_depth=6,
        n_estimators=200,
        objective='reg:squarederror',
        random_state=42,
    ),
    "CatBoost": CatBoostRegressor(
        depth=10,
        iterations=300,
        learning_rate=0.1,
        verbose=0,
        random_state=42,
    ),
    "LightGBM": lgb.LGBMRegressor(
        learning_rate=0.05,
        n_estimators=300,
        num_leaves=31,
        random_state=42,
    ),
}

# Fit every model on the scaled training split and score it on the held-out
# test split.
#   model_results[name]   -> dict with the "R2", "MSE", "RMSE" and "MAE" scores
#   model_forecasts[name] -> per-row true/forecast/error table, worst rows first
model_results = {}
model_forecasts = {}
for model_name, model in models.items():
    model.fit(x_train_scaled, y_train)
    # Only test-set predictions are used below, so no prediction pass is run
    # over the training split.
    y_test_pred = model.predict(x_test_scaled)

    # Compute MSE once and derive RMSE from it instead of rescoring.
    mse = mean_squared_error(y_test, y_test_pred)
    model_results[model_name] = {
        "R2": r2_score(y_test, y_test_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "MAE": mean_absolute_error(y_test, y_test_pred),
    }

    # Forecast table for the "Forecasts" tab, sorted by largest error first.
    forecast_df = pd.DataFrame({
        "True Value": y_test,
        "Forecasted Value": y_test_pred,
        "Absolute Error": np.abs(y_test - y_test_pred),
    }).sort_values(by="Absolute Error", ascending=False)
    model_forecasts[model_name] = forecast_df

# Input controls: a single-choice model picker and a multi-choice metric picker.
model_names = list(models.keys())
model_dropdown = widgets.Dropdown(
    options=model_names,
    value=model_names[0],
    description="Model:",
)
metrics_select = widgets.SelectMultiple(
    options=["R2", "MSE", "RMSE", "MAE"],
    value=["R2"],
    description="Metrics:",
)

# One output area per tab; each is filled in by the update callback.
output_tab1, output_tab2, output_tab3 = (widgets.Output() for _ in range(3))
tabs = widgets.Tab(children=[output_tab1, output_tab2, output_tab3])
tab_titles = ["Data & Hyperparameters", "Performance", "Forecasts"]
for index, title in enumerate(tab_titles):
    tabs.set_title(index, title)

# Author names, joined into the dashboard header label.
names = ("Madhumitha", "Kesavan")

# Callback Function
def update_tabs(*args) -> None:
    """Redraw all three dashboard tabs for the current widget selection.

    Used as the observer for both the model dropdown and the metrics
    selector, and also called once with no arguments for the initial render.
    Any positional arguments (the widget change payload) are ignored; the
    current selection is read directly from the widgets instead.
    """
    selected_model = model_dropdown.value
    selected_metrics = metrics_select.value

    # Tab 1: a preview of the prepared data and the model's hyperparameters.
    with output_tab1:
        clear_output()
        display(df.head())
        print("\nHyperparameters:")
        print(models[selected_model].get_params())

    # Tab 2: table and bar chart of the selected performance metrics.
    with output_tab2:
        clear_output()
        if not selected_metrics:
            # Every metric can be deselected in the multi-select; a frame
            # with no columns cannot be plotted, so show a hint instead.
            print("Select at least one metric to display.")
        else:
            model_metrics = model_results[selected_model]
            filtered_metrics = {
                metric: model_metrics[metric] for metric in selected_metrics
            }
            filtered_metrics_df = pd.DataFrame(
                filtered_metrics, index=[selected_model]
            )
            display(filtered_metrics_df)
            # Transpose so each metric becomes one bar on the x-axis.
            filtered_metrics_df.T.plot(kind="bar", legend=False)
            plt.title(f"{selected_model} - Selected Metrics")
            plt.ylabel("Values")
            plt.show()

    # Tab 3: per-row forecasts for the selected model.
    with output_tab3:
        clear_output()
        forecast_df = model_forecasts[selected_model]
        display(forecast_df)

# Re-render the tabs whenever either control's value changes.
for control in (model_dropdown, metrics_select):
    control.observe(update_tabs, names="value")

# Populate the tabs once so they are not empty on first display.
update_tabs()

# Stack the header, the row of controls, and the tabs, then render.
header = widgets.Label(value=f"Diamond Price Prediction Dashboard by {', '.join(names)}")
controls = widgets.HBox([model_dropdown, metrics_select])
dashboard = widgets.VBox([header, controls, tabs])
display(dashboard)


Mounted at /content/drive
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1273
[LightGBM] [Info] Number of data points in the train set: 43152, number of used features: 9
[LightGBM] [Info] Start training from score 3939.490707


VBox(children=(Label(value='Diamond Price Prediction Dashboard by Madhumitha, Kesavan'), HBox(children=(Dropdo…