In [115]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [116]:
diamonds_train = pd.read_csv('data/diamonds_train.csv')

In [117]:
diamonds_predict = pd.read_csv('data/diamonds_test.csv')

## GETTING RID OF 0 IN xyz

In [118]:
diamonds_train = diamonds_train[(diamonds_train['x'] != 0) & (diamonds_train['y'] != 0) & (diamonds_train['z'] != 0)]
diamonds_train.shape

(40439, 10)

In [119]:
# Since we cannot drop the rows of our predict csv, we gonna fill it with the mean value
diamonds_predict['x'] = diamonds_predict['x'].apply(lambda x: diamonds_predict['x'].mean() if x==0 else x)
diamonds_predict['y'] = diamonds_predict['y'].apply(lambda x: diamonds_predict['y'].mean() if x==0 else x)
diamonds_predict['z'] = diamonds_predict['z'].apply(lambda x: diamonds_predict['z'].mean() if x==0 else x)

In [120]:
diamonds_train = diamonds_train[(diamonds_train["depth"]<71) & (diamonds_train["depth"]>54)]

diamonds_train = diamonds_train[(diamonds_train["table"]<70) & (diamonds_train["table"]>52)]

diamonds_train = diamonds_train[(diamonds_train["x"]<10) & (diamonds_train['y']<10)] 
#no dropeamos nada de z porque hay bastantes valores en torno a 6

diamonds_train = diamonds_train[diamonds_train['carat'] < 2.67]

## ENCODE CATEGORIES

In [121]:
cut_num = {'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1}
color_num = {'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1}
clarity_num = {'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I1': 1}

In [122]:
diamonds_train['cut_num'] = diamonds_train['cut'].replace(cut_num)
diamonds_train['color_num'] = diamonds_train['color'].replace(color_num)
diamonds_train['clarity_num'] = diamonds_train['clarity'].replace(clarity_num)

diamonds_predict['cut_num'] = diamonds_predict['cut'].replace(cut_num)
diamonds_predict['color_num'] = diamonds_predict['color'].replace(color_num)
diamonds_predict['clarity_num'] = diamonds_predict['clarity'].replace(clarity_num)

## Volume

In [123]:
diamonds_train['volume'] = diamonds_train['x'] * diamonds_train['y'] * diamonds_train['z']
diamonds_predict['volume'] = diamonds_predict['x'] * diamonds_predict['y'] * diamonds_predict['z']

In [124]:
diamonds_train.describe()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_num,color_num,clarity_num,volume
count,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0
mean,0.794862,61.751546,57.447435,3915.694602,5.725494,5.727689,3.535797,3.909125,4.402873,4.053538,129.360916
std,0.469421,1.405603,2.196913,3976.722032,1.117091,1.109045,0.689854,1.114014,1.700711,1.646859,75.794517
min,0.2,54.2,52.4,326.0,3.77,3.72,1.07,1.0,1.0,1.0,32.262375
25%,0.4,61.0,56.0,945.0,4.71,4.72,2.91,3.0,3.0,3.0,64.887315
50%,0.7,61.8,57.0,2393.0,5.69,5.7,3.52,4.0,4.0,4.0,114.64446
75%,1.04,62.5,59.0,5316.25,6.54,6.54,4.03,5.0,6.0,5.0,170.726151
max,2.66,70.8,69.0,18823.0,9.05,8.94,6.16,5.0,7.0,8.0,432.825408


In [125]:
diamonds_train[diamonds_train['carat']>2.65]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_num,color_num,clarity_num,volume
5988,2.66,Good,H,SI2,63.8,57.0,16239,8.71,8.65,5.54,2,3,2,417.39191
20676,2.66,Good,H,SI2,63.8,57.0,16239,8.71,8.65,5.54,2,3,2,417.39191


## l/w Ratio

In [126]:
diamonds_train['l/w ratio'] = diamonds_train.x/diamonds_train.y
diamonds_predict['l/w ratio'] = diamonds_predict.x/diamonds_predict.y

## Carat length Ratio

In [127]:
diamonds_train['carat/length ratio'] = diamonds_train['carat']/diamonds_train['x']
diamonds_predict['carat/length ratio'] = diamonds_predict['carat']/diamonds_predict['x']

# Width carat ratio

In [128]:
diamonds_train['carat/width ratio'] = diamonds_train['carat']/diamonds_train['y']
diamonds_predict['carat/width ratio'] = diamonds_predict['carat']/diamonds_predict['y']

## Carat depth Ratio

In [129]:
diamonds_train['carat/depth ratio'] = diamonds_train['carat']/diamonds_train['z']
diamonds_predict['carat/depth ratio'] = diamonds_predict['carat']/diamonds_predict['z']

In [130]:
diamonds_train['density'] = diamonds_train['carat']/diamonds_train['volume']
diamonds_predict['density'] = diamonds_predict['carat']/diamonds_predict['volume']

# ESCALAR

In [180]:
diamonds_train = diamonds_train.drop(['cut', 'color', 'clarity'], axis=1)
diamonds_predict = diamonds_predict.drop(['cut', 'color', 'clarity'], axis=1)

In [184]:
features = ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_num', 'color_num', 'clarity_num']
target = 'price'

In [132]:
from sklearn.preprocessing import StandardScaler

In [182]:
scaler = StandardScaler()

In [185]:
X = scaler.fit_transform(diamonds_train[features])
y = diamonds_train[target]

In [186]:
X_pred = scaler.fit_transform(diamonds_predict[features])

In [136]:
from sklearn.model_selection import train_test_split 

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [138]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, RandomizedSearchCV


In [188]:
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=8, random_state=0, loss='ls',verbose = 1)
#max_depth 1, mal, 2, mal, 4 ok (553 rmse), 8 ok (542)

In [189]:
gbr_model.fit(X_train , y_train)
y_pred = gbr_model.predict(X_test)


      Iter       Train Loss   Remaining Time 
         1    12938276.6729           11.60s
         2    10556917.9328            9.34s
         3     8627325.5127            8.92s
         4     7058615.8410            8.52s
         5     5784660.5470            8.10s
         6     4751417.7110            8.00s
         7     3908321.3646            7.83s
         8     3224708.3522            7.60s
         9     2668297.9502            7.47s
        10     2215279.2375            7.54s
        20      477126.2843            6.29s
        30      224997.6441            5.38s
        40      171703.0663            4.68s
        50      151097.6157            3.92s
        60      136149.1878            3.14s
        70      123186.2440            2.36s
        80      113783.4332            1.58s
        90      107135.0424            0.79s
       100      102691.5994            0.00s


In [190]:
cv_score = cross_val_score(estimator=gbr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % gbr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test, y_pred)
print("mae: %f" %(mae))
r2 = gbr_model.score(X_test,y_pred)
print("R2: %f" %(r2))
mse = mean_squared_error(y_test, y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test, y_pred))**0.5
print("rmse: %f" %(rmse))

      Iter       Train Loss   Remaining Time 
         1    13024146.4744            8.74s
         2    10625804.5889            7.12s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


         3     8679132.0306            7.41s
         4     7099452.9354            7.26s
         5     5816131.6440            6.83s
         6     4770792.9031            6.54s
         7     3924723.4902            6.47s
         8     3235699.8172            6.26s
         9     2676263.7082            6.11s
        10     2222191.3045            5.95s
        20      468836.7143            5.00s
        30      217988.6985            4.28s
        40      164082.8527            3.64s
        50      143267.9616            3.02s
        60      127094.7009            2.42s
        70      114964.0715            1.83s
        80      102009.2786            1.22s
        90       94268.2911            0.61s
       100       89625.4615            0.00s
      Iter       Train Loss   Remaining Time 
         1    12988884.2398            5.69s
         2    10598033.4269            5.63s
         3     8658194.0402            5.58s
         4     7083376.8720            5.86s
         

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   30.5s finished


In [191]:
rfr_model = RandomForestRegressor(max_depth=30, min_samples_split=5, n_estimators=200 )
rfr_model.fit(X_train, y_train)
y_pred = rfr_model.predict(X_test)

In [144]:
#random_grid = {'bootstrap': [True, False],
               'max_depth': [10, 30, 70],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [2, 4, 6],
               'min_samples_split': [2, 5],
               'n_estimators': [100, 200]}
#rfr_random = RandomizedSearchCV(scoring="neg_mean_squared_error", estimator = rfr_model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [145]:
#model = rfr_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [146]:
print(model.best_estimator_.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 30, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 4, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [192]:
cv_score = cross_val_score(estimator=rfr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % rfr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test,y_pred)
print("mae: %f" %(mae))
r2 = rfr_model.score(X_test,y_pred)
print("R2: %f" %(r2))
mse = mean_squared_error(y_test,y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test,y_pred))**0.5
print("rmse: %f" %(rmse))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


Cross validation score : 0.9819
[0.98076046 0.98153496 0.98098354 0.97906926 0.97955158]
mae: 264.378208
R2: 1.000000
mse: 283749.617611
rmse: 532.681535


In [None]:
Cross validation score : 0.9826
[0.98149232 0.98167895 0.98121593 0.97904049 0.98002653]
mae: 259.216508
R2: 1.000000
mse: 271734.577583
rmse: 521.281668

In [193]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

In [194]:
hgbr_model = HistGradientBoostingRegressor(max_depth=10, max_iter=200)

In [195]:
hgbr_model.fit(X_train, y_train)
y_pred = hgbr_model.predict(X_test)

In [196]:
cv_score = cross_val_score(estimator=hgbr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % hgbr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test, y_pred)
print("mae: %f" %(mae))
r2 = hgbr_model.score(X_test,y_pred)
print("R2: %.4f" %(r2))
mse = mean_squared_error(y_test, y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test, y_pred))**0.5
print("rmse: %f" %(rmse))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross validation score : 0.9823
[0.98142979 0.98218929 0.98289987 0.98047688 0.98044596]
mae: 273.640740
R2: 1.0000
mse: 276887.827974
rmse: 526.201319


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.4s finished


In [197]:
from sklearn.ensemble import ExtraTreesRegressor

In [198]:
etr_model = ExtraTreesRegressor()
etr_model.fit(X_train, y_train)
y_pred = etr_model.predict(X_test)

In [199]:
cv_score = cross_val_score(estimator=etr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % etr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test, y_pred)
print("mae: %f" %(mae))
r2 = etr_model.score(X_test,y_pred)
print("R2: %.4f" %(r2))
mse = mean_squared_error(y_test, y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test, y_pred))**0.5
print("rmse: %f" %(rmse))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   28.0s finished


Cross validation score : 0.9816
[0.98122523 0.98117258 0.98215499 0.98005559 0.97937691]
mae: 263.126575
R2: 1.0000
mse: 288442.871140
rmse: 537.068777


In [200]:
X_predict = X_pred # diamonds_predict[features] if not scaled
predictions = hgbr_model.predict(X_predict)
diamonds_id = diamonds_predict['id']

In [201]:
submission = pd.DataFrame({'id': diamonds_id, 'price': predictions})

In [202]:
submission.shape

(13485, 2)

In [203]:
submission.to_csv('hgbr(526).csv', index=False)