In [2]:
import pandas as pd

## Import train

In [12]:
# Load the training set; the first CSV column becomes the row index.
df = pd.read_csv(
    "./diamonds/train.csv",
    index_col=0,
)

In [13]:
# (rows, columns) of the training frame as loaded from disk.
df.shape

(40455, 10)

In [14]:
# Preview the first rows to inspect column names and value formats.
df.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.3,Ideal,F,SI1,61.4,55.0,4.31,4.34,2.65,630.99
1,0.41,Ideal,G,SI1,61.6,57.0,4.77,4.81,2.95,701.21
2,0.3,Very Good,H,SI1,62.2,60.0,4.24,4.28,2.65,604.26
3,1.04,Premium,F,VVS2,62.5,58.0,6.46,6.37,4.01,909.9
4,0.84,Fair,G,VS1,55.6,64.0,6.42,6.32,3.54,790.99


In [15]:
# One-hot encode the non-numeric columns; numeric columns pass through unchanged.
# Note: this rebinds `df`, so the raw frame is no longer available afterwards.
df = pd.get_dummies(data=df)

In [16]:
# Engineered feature: product of the three dimension columns x, y and z.
df["vol"] = df["x"] * df["y"] * df["z"]

In [17]:
# Engineered feature: sum of the three pairwise products of x, y and z
# (half the surface area of an x-by-y-by-z box).
df["surf"] = (
    df["x"] * df["y"]
    + df["z"] * df["y"]
    + df["x"] * df["z"]
)

In [18]:
# Re-check the shape after one-hot encoding and adding the `vol` / `surf` columns.
df.shape

(40455, 29)

## Build model

In [19]:
from sklearn.ensemble import GradientBoostingRegressor

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.metrics import mean_squared_error

In [22]:
from sklearn.model_selection import GridSearchCV

In [86]:
# Exhaustive search over tree depth, per-split feature fraction and minimum
# leaf size for a 400-tree gradient-boosted regressor: 2 * 3 * 3 = 18 candidates,
# each evaluated with 3-fold cross-validation.
gs = GridSearchCV(
    GradientBoostingRegressor(),
    {
        "n_estimators": [400],
        "max_depth": [7, 9],
        "max_features": [0.5, 0.6, 0.7],
        "min_samples_leaf": [5, 10, 20],
    },
    # Scores are negated MSE, so values closer to zero are better.
    scoring="neg_mean_squared_error",
    cv=3,
    # Keep train-fold scores too, so over-fitting can be inspected afterwards.
    return_train_score=True,
    # Use every available core and log progress of each fit.
    n_jobs=-1,
    verbose=3,
)

In [23]:
# Split the engineered training frame into predictors (X) and target (y).
# `df` is used here because no visible cell defines `dff`; referencing it
# raises NameError on a fresh kernel.
# NOTE(review): if `dff` was meant to be a filtered copy of `df` created in a
# since-deleted cell, restore that step explicitly instead.
X = df.drop("price", axis=1)
y = df.price

In [24]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=666,
)

In [89]:
# Run the full grid search on the training split (54 fits in total; the
# recorded run took about 3.4 minutes on 12 workers).
gs.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done  50 out of  54 | elapsed:  3.4min remaining:   16.1s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  3.4min finished


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'max_depth': [7, 9], 'max_features': [0.5, 0.6, 0.7],
                         'min_samples_leaf': [5, 10, 20],
                         'n_estimators': [400]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=3)

In [90]:
# Show the best-scoring estimator found by the grid search.
gs.best_estimator_

GradientBoostingRegressor(max_depth=7, max_features=0.5, min_samples_leaf=5,
                          n_estimators=400)

In [25]:
# Stand-alone regressor with the hyper-parameters hard-coded to match the
# estimator reported by `gs.best_estimator_`.
gbr = GradientBoostingRegressor(
    n_estimators=400,
    max_depth=7,
    max_features=0.5,
    min_samples_leaf=5,
)

In [26]:
# Train the hand-configured regressor on the training split.
gbr.fit(X=X_train, y=y_train)

GradientBoostingRegressor(max_depth=7, max_features=0.5, min_samples_leaf=5,
                          n_estimators=400)

In [91]:
# Put the per-candidate cross-validation results into a DataFrame for inspection.
results_df = pd.DataFrame(data=gs.cv_results_)

In [92]:
# Mean train vs. validation score next to every parameter column, sorted
# ascending by validation score (negated MSE, so the best candidate comes last).
results_df[
    ["mean_train_score", "mean_test_score"]
    + [col for col in results_df.columns if "param" in col]
].sort_values(by="mean_test_score")

Unnamed: 0,mean_train_score,mean_test_score,param_max_depth,param_max_features,param_min_samples_leaf,param_n_estimators,params
5,-51.742942,-81.193573,7,0.6,20,400,"{'max_depth': 7, 'max_features': 0.6, 'min_sam..."
11,-42.858918,-80.408943,9,0.5,20,400,"{'max_depth': 9, 'max_features': 0.5, 'min_sam..."
2,-52.287046,-80.38627,7,0.5,20,400,"{'max_depth': 7, 'max_features': 0.5, 'min_sam..."
8,-50.41072,-79.807887,7,0.7,20,400,"{'max_depth': 7, 'max_features': 0.7, 'min_sam..."
15,-21.066322,-79.784525,9,0.7,5,400,"{'max_depth': 9, 'max_features': 0.7, 'min_sam..."
14,-41.864273,-79.724754,9,0.6,20,400,"{'max_depth': 9, 'max_features': 0.6, 'min_sam..."
9,-22.040688,-79.700071,9,0.5,5,400,"{'max_depth': 9, 'max_features': 0.5, 'min_sam..."
10,-32.912062,-79.681634,9,0.5,10,400,"{'max_depth': 9, 'max_features': 0.5, 'min_sam..."
7,-44.265546,-79.661054,7,0.7,10,400,"{'max_depth': 7, 'max_features': 0.7, 'min_sam..."
3,-37.937598,-79.64733,7,0.6,5,400,"{'max_depth': 7, 'max_features': 0.6, 'min_sam..."


In [28]:
# MSE on the rows the model was fitted on (optimistic; for comparison only).
mean_squared_error(y_true=y_train, y_pred=gbr.predict(X_train))

44.73980952840783

In [29]:
# MSE on the 10% hold-out split, i.e. rows the model did not see during fitting.
mean_squared_error(y_true=y_test, y_pred=gbr.predict(X_test))

76.65663545482904

## Predict test

In [12]:
# Load the competition test set; the first CSV column becomes the row index.
test = pd.read_csv(
    "./diamonds/test.csv",
    index_col=0,
)

In [13]:
# (rows, columns) of the test frame as loaded from disk.
test.shape

(13485, 9)

In [14]:
# Preview the first test rows.
test.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.7,Very Good,H,SI1,61.0,58.0,5.67,5.7,3.47
1,1.52,Ideal,H,VS2,61.8,54.0,7.42,7.43,4.59
2,1.03,Ideal,E,SI1,62.0,57.0,6.44,6.47,4.0
3,0.7,Very Good,G,VVS1,63.3,57.0,5.59,5.63,3.55
4,1.28,Ideal,H,SI1,62.4,56.0,6.93,6.91,4.32


In [15]:
# Baseline prediction: every test row receives the same constant price.
# `price_mean` is computed here from the training frame because no visible
# cell defines it; without this the cell raises NameError on a fresh kernel.
# NOTE(review): presumably the original value was the mean training price —
# confirm. The fitted `gbr` model is not used for this submission.
price_mean = df["price"].mean()
test["price"] = price_mean

In [16]:
# Confirm the new `price` column is present on the test frame.
test.head()

Unnamed: 0_level_0,carat,cut,color,clarity,depth,table,x,y,z,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.7,Very Good,H,SI1,61.0,58.0,5.67,5.7,3.47,778.527384
1,1.52,Ideal,H,VS2,61.8,54.0,7.42,7.43,4.59,778.527384
2,1.03,Ideal,E,SI1,62.0,57.0,6.44,6.47,4.0,778.527384
3,0.7,Very Good,G,VVS1,63.3,57.0,5.59,5.63,3.55,778.527384
4,1.28,Ideal,H,SI1,62.4,56.0,6.93,6.91,4.32,778.527384


## Prepare submission

In [17]:
# The submission is just the `price` column, keyed by the frame's index.
my_submission = test.loc[:, "price"]

In [18]:
# Display the submission series for a quick visual check.
my_submission

id
0        778.527384
1        778.527384
2        778.527384
3        778.527384
4        778.527384
            ...    
13480    778.527384
13481    778.527384
13482    778.527384
13483    778.527384
13484    778.527384
Name: price, Length: 13485, dtype: float64

In [19]:
# Write the submission to disk; the file name carries a day/time stamp.
my_submission.to_csv(
    path_or_buf="./diamonds/my_submission_day16_16h30.csv",
)

## Check submission format

In [20]:
!head -n 5 ./diamonds/my_submission_day16_13h08.csv

id,price
0,778.5273842541095
1,778.5273842541095
2,778.5273842541095
3,778.5273842541095


## Go to Kaggle and submit!