In [91]:
# Downloading Dataset directly from Kaggle

import opendatasets as od

od.download("https://www.kaggle.com/datasets/natedir/diamonds")

Skipping, found downloaded files in ".\diamonds" (use force=True to force download)


In [162]:
## To get the version name of teh module
# pip show pandas
import numpy as np
print(pd.__version__)
print(np.__version__)

1.5.3
1.23.5


In [150]:
import pandas as pd
from xgboost import XGBRegressor
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [107]:
# Carat- Carat weight of the diamond
# Cut - The cut rating of the diamond
# Color - The color rating of the diamond
# Clarity - The clarity rating of the diamond
# Table - The table width of the diamond
# Depth- The percentage of depth of the diamond
# Price - The price (in USD) of the diamond
# X- X dimension of the diamond
# Y- Y dimension of the diamond
# Z- Z dimension of the diamond

diamonds = pd.read_csv('C:\\Users\\12392\\Untitled Folder\\diamonds\\diamonds.csv', index_col=0)

In [108]:
diamonds.head(10)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
10,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [109]:
# dataset features and size
diamonds.shape

(53940, 10)

In [110]:
# null values check 
diamonds.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [111]:
# datatype check
diamonds.dtypes

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
price        int64
x          float64
y          float64
z          float64
dtype: object

In [112]:
# Data Preparation
# cut, color, clarity are of object type, i.e. ordinal categorial values

# scale is Ideal (High)>Premium>Very Good> Good>Fair
diamonds.cut.value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

In [113]:
diamonds.color.value_counts()

G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: color, dtype: int64

In [102]:
diamonds.clarity.value_counts()

SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: clarity, dtype: int64

In [114]:
# Data conversion from Ordinal categorial to numeric form

# Technique: Mapping or Ordinal Encoding (sci-kit) or one hot encoding (sci-kit) 

# Encoding cut variable
cut_mapping = {'Fair':0, 'Good':1, 'Very Good':2, 'Premium':3, 'Ideal':4}
diamonds.cut = diamonds.cut.map(cut_mapping)

# Encoding color variable
color_mapping = {'J':0, 'I':1, 'D':2, 'H':3, 'F':4, 'E':5,'G':6}
diamonds.color = diamonds.color.map(color_mapping)

# Encoding clarity variable
clarity_mapping = {'I1':0, 'IF':7, 'VVS1':6, 'VVS2':5, 'VS1':4, 'SI2':1,'VS2':3, 'SI1':2}
diamonds.clarity = diamonds.clarity.map(clarity_mapping)

In [115]:
# Outlier detection and dropping in x,y,z measurements axis in data 

diamonds = diamonds.drop(diamonds[diamonds["x"]==0].index)
diamonds = diamonds.drop(diamonds[diamonds["y"]==0].index)
diamonds = diamonds.drop(diamonds[diamonds["z"]==0].index)

diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,4,5,1,61.5,55.0,326,3.95,3.98,2.43
2,0.21,3,5,2,59.8,61.0,326,3.89,3.84,2.31
3,0.23,1,5,4,56.9,65.0,327,4.05,4.07,2.31
4,0.29,3,1,3,62.4,58.0,334,4.20,4.23,2.63
5,0.31,1,0,1,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53936,0.72,4,2,2,60.8,57.0,2757,5.75,5.76,3.50
53937,0.72,1,2,2,63.1,55.0,2757,5.69,5.75,3.61
53938,0.70,2,2,2,62.8,60.0,2757,5.66,5.68,3.56
53939,0.86,3,3,1,61.0,58.0,2757,6.15,6.12,3.74


In [116]:
# more detailed approach might be required in real-world scenerio. cut the dataset down to the 99th percentile based on 
# a few different variables to get rid of the most egregious outliers

diamonds = diamonds[diamonds['depth'] < diamonds['depth'].quantile(0.99)]
diamonds = diamonds[diamonds['table'] < diamonds['table'].quantile(0.99)]
diamonds = diamonds[diamonds['x'] < diamonds['x'].quantile(0.99)]
diamonds = diamonds[diamonds['y'] < diamonds['x'].quantile(0.99)]
diamonds = diamonds[diamonds['z'] < diamonds['x'].quantile(0.99)]

In [57]:
# Corr heatmap

In [117]:
# Train and Test data creation

dia_model = diamonds.copy()
X = dia_model.drop(['price'], axis=1)
y = dia_model['price']

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=0)


In [121]:
# model creation

xgb_reg = XGBRegressor()

# 
#  Defining parameters for gridsearch to find optimal hyperparameters
# parameters = {
#               'objective':['reg:squarederror'],
#               'learning_rate': [.0001, 0.001, .01],
#               'max_depth': [3, 5, 7],
#               'min_child_weight': [3,5,7],
#               'subsample': [0.1,0.5,1.0],
#               'colsample_bytree': [0.1, 0.5, 1.0],
#               'n_estimators': [500]
#              }

# xgb_grid = GridSearchCV(xgb1, parameters, cv = 3, n_jobs = -1, verbose=0)


xgb_reg.fit(X_train, y_train)

eval_set = [(X_train, y_train),(X_val, y_val)]

fit_model = xgb_reg.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric='mae',
    early_stopping_rounds=50,
    verbose=False)

In [139]:
# Model Test

print("Test_MAE:USD", mean_absolute_error(y_val, fit_model.predict(X_val)))
print("Test_MSE:", mean_squared_error(y_val, fit_model.predict(X_val)))
print("Test_R2:", r2_score(y_val, fit_model.predict(X_val)))

Test_MAE:USD 255.1934632403444
Test_MSE: 243867.05271914965
Test_R2: 0.9817215475705299


In [138]:
# Model validation

print("Val_AE:USD", mean_absolute_error(y_test, fit_model.predict(X_test)))
print("Val_MSE:", mean_squared_error(y_test, fit_model.predict(X_test)))
print("Val_R2:", r2_score(y_test, fit_model.predict(X_test)))

Val_AE:USD 261.1462956573404
Val_MSE: 258524.12918965024
Val_R2: 0.9810199710243127


In [141]:
# Saving the model

fit_model.save_model('xgb_diamonds_model.json')