# CodeML 2023 Model

## Import

In [1]:
# Data Libraries
import pandas as pd
import numpy as np

# Data Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb


# Data Visualization Libraries
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")

## Data Cleaning and Processing

In [None]:
# NOTE(review): placeholder cell — this binds the function object `pd.read_csv`
# to `df` without calling it, so no file is read and `df` is not a DataFrame here.
# TODO: call `pd.read_csv(...)` with the competition dataset path once it is known.
# `df` is reassigned by the `sns.load_dataset` cell in the next section.
df = pd.read_csv

## Modeling

In [3]:
# Fetch the 'diamonds' sample dataset through seaborn.
df = sns.load_dataset(name='diamonds')
# Preview the first ten rows, followed by the (rows, columns) shape.
print(df.head(n=10), df.shape, sep="\n")

   carat        cut color clarity  depth  table  price     x     y     z
0   0.23      Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21    Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23       Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29    Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31       Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75
5   0.24  Very Good     J    VVS2   62.8   57.0    336  3.94  3.96  2.48
6   0.24  Very Good     I    VVS1   62.3   57.0    336  3.95  3.98  2.47
7   0.26  Very Good     H     SI1   61.9   55.0    337  4.07  4.11  2.53
8   0.22       Fair     E     VS2   65.1   61.0    337  3.87  3.78  2.49
9   0.23  Very Good     H     VS1   59.4   61.0    338  4.00  4.05  2.39
(53940, 10)


In [4]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [5]:
# Summary of the non-numeric columns: count, number of unique values, most frequent value and its frequency.
df.describe(exclude=[np.number])

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


### Training and Test Datasets

Splitting Target and Explanatory Variables

In [6]:
# Target: the 'price' column. Features: every other column of df.
y = df['price']
X = df.drop(columns=['price'])

Encoding Categorical Variables with the pandas `category` dtype

In [7]:
# Names of all non-numeric feature columns.
cats = list(X.select_dtypes(exclude=[np.number]).columns)
# Cast those columns to the pandas 'category' dtype in one pass; the DMatrix
# construction later in the notebook is called with enable_categorical=True.
X = X.astype(dict.fromkeys(cats, 'category'))
X.dtypes

carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
x           float64
y           float64
z           float64
dtype: object

Creating Train and Test Datasets

In [8]:
# Seeded train/test split; no test_size is passed, so scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=17)
# Wrap each split in an XGBoost DMatrix, with categorical feature support switched on.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)


Defining Starting Hyperparameters

In [9]:
# Baseline booster configuration: squared-error regression objective only.
# GPU training is left disabled; the original option was "tree_method": "gpu_hist".
params = dict(objective="reg:squarederror")

Defining and Training Model

In [10]:
# Number of boosting rounds for the baseline model.
n = 100
# Fit the booster on the training DMatrix with the baseline parameters.
model = xgb.train(params, dtrain_reg, num_boost_round=n)

Predicting on the Test Dataset

In [11]:
# Score the held-out test set with the trained booster.
preds = model.predict(dtest_reg)
# RMSE is computed as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and later removed, so the old call raises a TypeError on
# current releases. This form works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 544.353


## Validation

### Model 1

### Model 2