In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
# Load the labelled training data (the features plus the `price` column).
diamonds_df = pd.read_csv(filepath_or_buffer='../data/diamonds_train.csv')

In [3]:
# Preview the training frame; pandas elides the middle rows of a long frame.
diamonds_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1.21,Premium,J,VS2,62.4,58.0,4268,6.83,6.79,4.25
1,0.32,Very Good,H,VS2,63.0,57.0,505,4.35,4.38,2.75
2,0.71,Fair,G,VS1,65.5,55.0,2686,5.62,5.53,3.65
3,0.41,Good,D,SI1,63.8,56.0,738,4.68,4.72,3.00
4,1.02,Ideal,G,SI1,60.5,59.0,4882,6.55,6.51,3.95
...,...,...,...,...,...,...,...,...,...,...
40450,1.34,Ideal,G,VS1,62.7,57.0,10070,7.10,7.04,4.43
40451,2.02,Good,F,SI2,57.1,60.0,12615,8.31,8.25,4.73
40452,1.01,Ideal,H,SI1,62.7,56.0,5457,6.37,6.42,4.01
40453,0.33,Ideal,J,VS1,61.9,54.3,456,4.45,4.47,2.76


In [4]:
# Load the rows to be scored for the submission; they are only used for
# prediction further down, never for fitting.
diamonds_predict = pd.read_csv(filepath_or_buffer='../data/diamonds_test.csv')

In [5]:
# Column dtypes and non-null counts: shows which columns are numeric vs. text
# and whether any values are missing.
diamonds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    40455 non-null  float64
 1   cut      40455 non-null  object 
 2   color    40455 non-null  object 
 3   clarity  40455 non-null  object 
 4   depth    40455 non-null  float64
 5   table    40455 non-null  float64
 6   price    40455 non-null  int64  
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.1+ MB


In [6]:
# (rows, columns) of the training frame.
diamonds_df.shape

(40455, 10)

In [7]:
# Column roles shared by the preprocessing, training and prediction cells.
TARGET = 'price'                                         # value to predict
CAT_FEATS = ['cut', 'color', 'clarity']                  # text columns
NUM_FEATS = ['carat', 'depth', 'table', 'x', 'y', 'z']   # numeric columns
# Model inputs: numeric columns first, then the categorical ones.
FEATS = [*NUM_FEATS, *CAT_FEATS]

In [9]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [10]:
# Categorical branch: replace gaps with the literal label 'missing', then
# one-hot encode. handle_unknown='ignore' means a category not seen during
# fit does not raise at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [11]:
# Route each column group through its own branch. Only the columns named in
# NUM_FEATS and CAT_FEATS are passed to the transformers.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATS),
        ('cat', categorical_transformer, CAT_FEATS),
    ]
)

In [13]:
# Sanity check of the preprocessing output: scaled numeric columns followed
# by the one-hot indicator columns. Note this fits `preprocessor` on the
# whole training frame; the model pipeline re-fits it later.
(
    pd.DataFrame(preprocessor.fit_transform(diamonds_df))
    .head(n=5)
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out part of the labelled data for a quick local evaluation
# (train_test_split's default test fraction is used).
# random_state pins the shuffle so the split -- and every metric computed
# from it below -- is identical after Restart & Run All.
diamonds_train, diamonds_test = train_test_split(diamonds_df, random_state=42)

In [16]:
# Shapes of the training split and the hold-out split, one per line.
print(diamonds_train.shape, diamonds_test.shape, sep='\n')

(30341, 10)
(10114, 10)


In [17]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# End-to-end estimator: column preprocessing followed by a random forest.
# random_state pins the forest's bootstrap/feature sampling so that repeated
# runs of the notebook yield the same fitted model and the same scores.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))])

In [18]:
# Fit preprocessing and regressor on the training split only.
# The trailing semicolon keeps the fitted pipeline's repr out of the output.
model.fit(X=diamonds_train[FEATS], y=diamonds_train[TARGET]);

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
# Despite their names, `y_test` and `y_train` hold the model's *predictions*
# for the hold-out and training splits; the true prices stay in
# diamonds_test[TARGET] / diamonds_train[TARGET].
y_test, y_train = (
    model.predict(split[FEATS]) for split in (diamonds_test, diamonds_train)
)

In [21]:
# RMSE on the hold-out split and on the training split.
# The square root is taken explicitly instead of passing `squared=False`:
# that flag is deprecated since scikit-learn 1.4 and slated for removal, while
# sqrt(MSE) yields the same number on every version.
test_rmse = np.sqrt(mean_squared_error(y_true=diamonds_test[TARGET], y_pred=y_test))
train_rmse = np.sqrt(mean_squared_error(y_true=diamonds_train[TARGET], y_pred=y_train))
print(f"test error: {test_rmse}")
print(f"train error: {train_rmse}")

test error: 553.5731667495935
train error: 212.88185768142282


In [22]:
from sklearn.model_selection import cross_val_score

In [24]:
# 10-fold cross-validation of the full pipeline on all labelled rows.
# The scorer returns *negated* RMSE (higher is better), one value per fold;
# n_jobs=-1 runs the folds on all available cores.
scores = cross_val_score(
    estimator=model,
    X=diamonds_df[FEATS],
    y=diamonds_df[TARGET],
    cv=10,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [25]:
# Flip the sign of the negated-RMSE fold scores and average them to get the
# mean cross-validated RMSE.
(-scores).mean()

553.1488490888206

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [30]:
# Search space, addressed through the pipeline's nested step names:
# 2 imputation strategies x 6 forest sizes x 4 depths = 48 combinations.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

# Despite the variable name this is a *randomized* search: n_iter=32 candidates
# are sampled from the 48 combinations and each is scored with 5-fold CV on
# negated RMSE. random_state pins which candidates get sampled, so that
# best_params_ / best_score_ are reproducible across notebook runs.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=5,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=32,
                                 random_state=42)

# Run the search on all labelled rows; as the last expression of the cell,
# the fitted search object's repr is displayed.
grid_search.fit(diamonds_df[FEATS], diamonds_df[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=16
[CV 1/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=16; total time=   0.7s
[CV 2/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=16
[CV 2/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=16; total time=   0.5s
[CV 3/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=16
[CV 3/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=16; total time=   0.5s
[CV 4/5; 1/32] START preprocessor__num__imputer__strategy=mean, regressor__max_depth=2, regressor__n_estimators=16
[CV 4/5; 1/32] END preprocessor__num__imputer__strategy=mean, regressor__max_de

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('preprocessor',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['carat',
                                                                                'depth',
                                                                                'table',
                                                                                'x',
              

In [31]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'regressor__n_estimators': 256,
 'regressor__max_depth': 16,
 'preprocessor__num__imputer__strategy': 'mean'}

In [32]:
# Best mean CV score. It is negative because the search was configured with
# scoring='neg_root_mean_squared_error'; flip the sign to read it as an RMSE.
grid_search.best_score_

-557.6523938774229

In [41]:
# Predict prices for the submission rows with the fitted search object, using
# the same feature columns the model was trained on.
y_pred = grid_search.predict(X=diamonds_predict[FEATS])

In [42]:
# Two-column submission table: row identifier plus predicted price.
# NOTE(review): assumes diamonds_test.csv carries an `id` column -- the
# training frame shown above has none, so confirm against the test file.
submission_df = pd.DataFrame(
    data={
        'id': diamonds_predict['id'],
        'price': y_pred,
    }
)

In [45]:
# Write the submission file; index=False leaves the DataFrame index out so
# the CSV contains only the `id` and `price` columns.
submission_df.to_csv(path_or_buf='../data/diamonds_submission.csv', index=False)