# About Dataset


**price**: price in US dollars (\$326--\$18,823)

**carat**: weight of the diamond (0.2--5.01)

**cut**: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

**color**: diamond colour, from J (worst) to D (best)

**clarity**: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

**x**: length in mm (0--10.74)

**y**: width in mm (0--58.9)

**z**: depth in mm (0--31.8)

**depth**: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

**table**: width of top of diamond relative to widest point (43--95)

**Link to dataset:**

https://www.kaggle.com/datasets/shivam2503/diamonds

# Import

In [1]:
import pandas as pd
import math
import numpy as np

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import lightgbm as lgb

from sklearn import metrics

import matplotlib.pyplot as plt

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Load, examine and prepare data

In [None]:
df = pd.read_csv('diamonds.csv')
df.reset_index(drop=True)
# df = df.dropna()
# df = df.drop(df.iloc[:, :0], axis=1)


df.head()
# df.tail()
# df.describe()
# df.info()
# df.shape

# df.columns


In [None]:
num_cols = len(df.columns)
num_rows = math.ceil(num_cols / 2)  # 2 histograms per row (adjustable)

# Create subplots with flexible rows and columns
fig, axes = plt.subplots(num_rows, 2, figsize=(12, num_rows * 4))  # Dynamically adjust height
axes = axes.flatten()  # Flatten the grid for easy iteration

# Plot histograms
for i, column in enumerate(df.columns):
    df[column].hist(grid=False, edgecolor='black', ax=axes[i])
    axes[i].set_title(f'Distribution of {column}')
    axes[i].set_xlabel(f'{column} Values')
    axes[i].set_ylabel('Frequency')

# Hide unused subplots
for i in range(len(df.columns), len(axes)):
    axes[i].set_visible(False)

plt.tight_layout()
plt.show()





# Regression

In [68]:
X = df.drop('price', axis=1)
y = df['price']


# One-hot encode cut, color and clarity
X = pd.get_dummies(X, columns=["cut", "color", "clarity"], drop_first=True)

X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

# X.drop("Unnamed_0", axis=1, inplace=True)

# X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=42)


scaler = StandardScaler()
# scaler = MinMaxScaler()

for data in [X_train, X_test]:
    data['volume'] = data['x'] * data['y'] * data['z']


scaler = StandardScaler()
X_train[['carat', 'depth', 'table', 'volume']] = scaler.fit_transform(
    X_train[['carat', 'depth', 'table', 'volume']]
)
X_test[['carat', 'depth', 'table', 'volume']] = scaler.transform(
    X_test[['carat', 'depth', 'table', 'volume']]
)

# LGBM Regressor

X_train_lgmb = scaler.fit_transform(X_train)
X_test_lgmb = scaler.transform(X_test)

lgbm = lgb.LGBMRegressor()
lgbm.fit(X_train, y_train)

y_pred_lgbm = lgbm.predict(X_test)

# Linear Regression

X_train_lr = scaler.fit_transform(X_train)
X_test_lr = scaler.transform(X_test)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1804
[LightGBM] [Info] Number of data points in the train set: 42073, number of used features: 25
[LightGBM] [Info] Start training from score 3942.695101


In [69]:
# Evaluation of LGBM
mae = metrics.mean_absolute_error(y_test, y_pred_lgbm)
mse = metrics.mean_squared_error(y_test, y_pred_lgbm)
rmse = metrics.mean_squared_error(y_test, y_pred_lgbm)


print("LGBM regressor:")
print("")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

# Get R-squared score for the test set
score_train = lgbm.score(X_train, y_train)
print(f'Train score: {score_train:.5f}')


score_test = lgbm.score(X_test, y_test)
print(f'Test score: {score_test:.5f}')

print("""____""")
print("")
# Evaluation of Linear Regression
print("Linear Regression")

mae = metrics.mean_absolute_error(y_test, y_pred_lr)
mse = metrics.mean_squared_error(y_test, y_pred_lr)
rmse = metrics.mean_squared_error(y_test, y_pred_lr)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')

# Get R-squared score for the test set
score_train = lr.score(X_train, y_train)
print(f'Train score: {score_train:.5f}')


score_test = lr.score(X_test, y_test)
print(f'Test score: {score_test:.5f}')

LGBM regressor:

MAE: 30.512075361175544
MSE: 3686.878496998431
RMSE: 3686.878496998431
Train score: 0.99985
Test score: 0.99977
____

Linear Regression
MAE: 730.863395499346
MSE: 1259032.126197795
RMSE: 1259032.126197795
Train score: 0.92055
Test score: 0.92029


# Neural network

In [86]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.activations import relu
from sklearn.metrics import r2_score

import pandas as pd

# Load and prepare data

In [81]:
# Load dataset
data = pd.read_csv('diamonds.csv')

# Preprocessing
X = data.drop(columns=['price'])
y = data['price']

# Feature engineering, create volume
X['volume'] = X['x'] * X['y'] * X['z']

# One-hot encoding
X = pd.get_dummies(X, columns=['cut', 'color', 'clarity'], drop_first=True)

# Train and test NN

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.22, random_state=42)

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Functional API Model
input_layer = Input(shape=(X_train.shape[1],))  # Input layer

# First hidden layer
x = Dense(128)(input_layer)
x = BatchNormalization()(x)
x = tf.keras.activations.relu(x)
x = Dropout(0.3)(x)

# Second hidden layer
x = Dense(64)(x)
x = BatchNormalization()(x)
x = tf.keras.activations.relu(x)
x = Dropout(0.3)(x)

output_layer = Dense(1, activation='linear')(x)

model = Model(inputs=input_layer, outputs=output_layer)

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Callback to reduce learning rate
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6
)

# Training the model
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[reduce_lr]
)

# Evaluate on test data


test_loss, test_mae = model.evaluate(X_test, y_test, verbose=2)
print(f"Test Loss: {test_loss}")
print(f"Test MAE: {test_mae}")
# print(f"Test R^2: {r2}")

In [None]:
from tensorflow.keras.utils import plot_model

# Save model architecture to an image file
plot_model(model, to_file='model_architecture.png', show_shapes=True)