In [1]:
import pandas as pd
from pathlib import Path

# Single place to point the notebook at the folder holding the CSV files;
# change this one value instead of editing every path.
DATA_DIR = Path('/Users/jc/Downloads')

# Frame that carries the 'price' target used for fitting and evaluation below.
diamonds = pd.read_csv(DATA_DIR / 'diamonds_train_.csv')
# Frame that is scored at the end of the notebook to build the submission.
diamonds_predict = pd.read_csv(DATA_DIR / 'diamonds_predict.csv')



from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder




# Column groups consumed by the preprocessing ColumnTransformer and the model.
NUM_FEATS = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]
CAT_FEATS = [
    'cut',
    'color',
    'clarity',
]
# All model inputs: numeric columns first, then categorical ones.
FEATS = [*NUM_FEATS, *CAT_FEATS]
# Column the regressor is trained to predict.
TARGET = 'price'


# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' is set so that categories not seen while
# fitting do not raise when transforming new data.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, NUM_FEATS),
    ('cat', categorical_transformer, CAT_FEATS),
])


# A bare name is only rendered when it is the last expression of a cell, so
# this line shows nothing when followed by the preview below.
preprocessor

# Fit the preprocessor on the whole labelled frame and peek at the first rows
# of the transformed matrix (columns are positional, not named).
transformed_preview = preprocessor.fit_transform(diamonds)
pd.DataFrame(data=transformed_preview).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.867006,0.452019,0.247981,0.978807,0.921985,1.022657,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.004557,0.871099,-0.199745,-1.226738,-1.179816,-1.129259,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.184434,2.617265,-1.095198,-0.097286,-0.176882,0.161891,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.815298,1.429872,-0.647472,-0.933258,-0.883296,-0.770607,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.467458,-0.875068,0.695707,0.729794,0.677793,0.592274,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [2]:
from sklearn.model_selection import train_test_split
# Hold out part of the labelled data for evaluation. The seed is fixed so the
# split, and therefore every metric reported below, is the same on each
# Restart & Run All; without it the rows in each part change on every run.
diamonds_train, diamonds_test = train_test_split(diamonds, random_state=42)
print(diamonds_train.shape)
print(diamonds_test.shape)
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn import tree


# NOTE(review): grid_parameters is not used anywhere in the visible notebook,
# and its keys ('alpha', 'l1_ratio') look like ElasticNet hyperparameters
# rather than RandomForestRegressor ones. Kept in case another cell needs it.
grid_parameters = {'alpha': [0.1, 1, 1.5, 5, 10], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9] }

# Full pipeline: preprocessing followed by a random forest. The forest is
# seeded so repeated runs build the same trees and report the same errors.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=1000,
                                                            random_state=42))])


# Fit on the training part of the split only; the held-out part is scored later.
model.fit(diamonds_train[FEATS], diamonds_train[TARGET]);


from sklearn.metrics import mean_squared_error



def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    Computed as the square root of mean_squared_error instead of passing
    squared=False, which is deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    return mean_squared_error(y_true=y_true, y_pred=y_pred) ** 0.5


# Despite their names, y_test and y_train hold the model's *predictions* for
# the held-out and training rows; the true values are diamonds_*[TARGET].
y_test = model.predict(diamonds_test[FEATS])
y_train = model.predict(diamonds_train[FEATS])


print(f"test error: {_rmse(y_true=diamonds_test[TARGET], y_pred=y_test)}")
print(f"train error: {_rmse(y_true=diamonds_train[TARGET], y_pred=y_train)}")



(30341, 10)
(10114, 10)
test error: 551.2296947315239
train error: 208.73330160808183


In [4]:
from sklearn.model_selection import cross_val_score

# 8-fold cross-validation of the full pipeline on all labelled rows.
# The scorer returns *negated* RMSE, one value per fold.
scores = cross_val_score(
    estimator=model,
    X=diamonds[FEATS],
    y=diamonds[TARGET],
    scoring='neg_root_mean_squared_error',
    cv=8,
    n_jobs=-1,
    verbose=1,
)

import numpy as np
# Flip the sign back so the cell displays the average RMSE across folds.
np.negative(scores).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  9.2min finished


552.9308782339244

In [5]:
# Run the fitted pipeline on the feature columns of the frame to be submitted.
# NOTE(review): the only explicit model.fit call in this notebook uses
# diamonds_train; consider refitting on all of `diamonds` before predicting.
predict_features = diamonds_predict[FEATS]
y_pred = model.predict(predict_features)

In [6]:
# Two-column submission frame: the original 'id' plus the predicted 'price'.
submission_df = diamonds_predict[['id']].assign(price=y_pred)

In [7]:
# Quick look at the first five rows before writing the file.
submission_df.head(n=5)

Unnamed: 0,id,price
0,0,2986.532
1,1,5417.385
2,2,9042.289
3,3,4258.816
4,4,1692.215


In [8]:
# Bound predictions to [0, 20000] before export. The clipped column is
# assigned back explicitly: calling clip(inplace=True) on `submission_df.price`
# acts on a temporary Series under pandas Copy-on-Write, which would leave the
# frame (and the written file) unclipped.
submission_df['price'] = submission_df['price'].clip(lower=0, upper=20000)
submission_df.to_csv('/Users/jc/Downloads/diamonds_rf.csv', index=False)
