In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
diamonds_train = pd.read_csv('data/diamonds_train.csv')

In [3]:
diamonds_predict = pd.read_csv('data/diamonds_test.csv')

## GETTING RID OF 0 IN xyz

In [4]:
diamonds_train = diamonds_train[(diamonds_train['x'] != 0) & (diamonds_train['y'] != 0) & (diamonds_train['z'] != 0)]
diamonds_train.shape

(40439, 10)

In [5]:
# Since we cannot drop the rows of our predict csv, we gonna fill it with the mean value
diamonds_predict['x'] = diamonds_predict['x'].apply(lambda x: diamonds_predict['x'].mean() if x==0 else x)
diamonds_predict['y'] = diamonds_predict['y'].apply(lambda x: diamonds_predict['y'].mean() if x==0 else x)
diamonds_predict['z'] = diamonds_predict['z'].apply(lambda x: diamonds_predict['z'].mean() if x==0 else x)

In [6]:
diamonds_train = diamonds_train[(diamonds_train["depth"]<71) & (diamonds_train["depth"]>54)]

diamonds_train = diamonds_train[(diamonds_train["table"]<70) & (diamonds_train["table"]>52)]

diamonds_train = diamonds_train[(diamonds_train["x"]<10) & (diamonds_train['y']<10)] 
#no dropeamos nada de z porque hay bastantes valores en torno a 6

diamonds_train = diamonds_train[diamonds_train['carat'] < 2.67]

In [7]:
diamonds_train.shape

(40308, 10)

## ENCODE CATEGORIES

In [8]:
cut_num = {'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1}
color_num = {'D': 7, 'E': 6, 'F': 5, 'G': 4, 'H': 3, 'I': 2, 'J': 1}
clarity_num = {'IF': 8, 'VVS1': 7, 'VVS2': 6, 'VS1': 5, 'VS2': 4, 'SI1': 3, 'SI2': 2, 'I1': 1}

In [9]:
diamonds_train['cut_num'] = diamonds_train['cut'].replace(cut_num)
diamonds_train['color_num'] = diamonds_train['color'].replace(color_num)
diamonds_train['clarity_num'] = diamonds_train['clarity'].replace(clarity_num)

diamonds_predict['cut_num'] = diamonds_predict['cut'].replace(cut_num)
diamonds_predict['color_num'] = diamonds_predict['color'].replace(color_num)
diamonds_predict['clarity_num'] = diamonds_predict['clarity'].replace(clarity_num)

## Volume

In [10]:
#diamonds_train['volume'] = diamonds_train['x'] * diamonds_train['y'] * diamonds_train['z']
#diamonds_predict['volume'] = diamonds_predict['x'] * diamonds_predict['y'] * diamonds_predict['z']

In [11]:
diamonds_train['volume'] = (((diamonds_train['x']**2) * diamonds_train['z'])/3)
diamonds_predict['volume'] = (((diamonds_predict['x']**2) * diamonds_predict['z'])/3)

In [12]:
diamonds_train.describe()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_num,color_num,clarity_num,volume
count,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0,40308.0
mean,0.794862,61.751546,57.447435,3915.694602,5.725494,5.727689,3.535797,3.909125,4.402873,4.053538,43.129883
std,0.469421,1.405603,2.196913,3976.722032,1.117091,1.109045,0.689854,1.114014,1.700711,1.646859,25.344971
min,0.2,54.2,52.4,326.0,3.77,3.72,1.07,1.0,1.0,1.0,10.820955
25%,0.4,61.0,56.0,945.0,4.71,4.72,2.91,3.0,3.0,3.0,21.592524
50%,0.7,61.8,57.0,2393.0,5.69,5.7,3.52,4.0,4.0,4.0,38.122364
75%,1.04,62.5,59.0,5316.25,6.54,6.54,4.03,5.0,6.0,5.0,56.930601
max,2.66,70.8,69.0,18823.0,9.05,8.94,6.16,5.0,7.0,8.0,145.092096


In [13]:
diamonds_train[diamonds_train['carat']>2.65]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,cut_num,color_num,clarity_num,volume
5988,2.66,Good,H,SI2,63.8,57.0,16239,8.71,8.65,5.54,2,3,2,140.095705
20676,2.66,Good,H,SI2,63.8,57.0,16239,8.71,8.65,5.54,2,3,2,140.095705


## l/w Ratio

In [14]:
diamonds_train['l/w ratio'] = diamonds_train.x/diamonds_train.y
diamonds_predict['l/w ratio'] = diamonds_predict.x/diamonds_predict.y

## Carat length Ratio

In [15]:
diamonds_train['carat/length ratio'] = diamonds_train['carat']/diamonds_train['x']
diamonds_predict['carat/length ratio'] = diamonds_predict['carat']/diamonds_predict['x']

# Width carat ratio

In [16]:
diamonds_train['carat/width ratio'] = diamonds_train['carat']/diamonds_train['y']
diamonds_predict['carat/width ratio'] = diamonds_predict['carat']/diamonds_predict['y']

## Carat depth Ratio

In [17]:
diamonds_train['carat/depth ratio'] = diamonds_train['carat']/diamonds_train['z']
diamonds_predict['carat/depth ratio'] = diamonds_predict['carat']/diamonds_predict['z']

In [18]:
diamonds_train['density'] = diamonds_train['carat']/diamonds_train['volume']
diamonds_predict['density'] = diamonds_predict['carat']/diamonds_predict['volume']

In [19]:
diamonds_train['carat log'] = diamonds_train['carat'].apply(lambda x: np.log(x))
diamonds_predict['carat log'] = diamonds_predict['carat'].apply(lambda x: np.log(x))

# ESCALAR

In [20]:
diamonds_train = diamonds_train.drop(['cut', 'color', 'clarity'], axis=1)
diamonds_predict = diamonds_predict.drop(['cut', 'color', 'clarity'], axis=1)

In [21]:
features = ['carat log', 'volume', 'table', 'depth', 'l/w ratio', 'cut_num', 'color_num', 'clarity_num']
target = 'price'

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
scaler = StandardScaler()

In [24]:
X = scaler.fit_transform(diamonds_train[features])
y = diamonds_train[target]

In [25]:
X_pred = scaler.fit_transform(diamonds_predict[features])

In [26]:
from sklearn.model_selection import train_test_split 

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [28]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, RandomizedSearchCV


In [29]:
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=8, random_state=0, loss='ls',verbose = 1)
#max_depth 1, mal, 2, mal, 4 ok (553 rmse), 8 ok (542)

In [30]:
gbr_model.fit(X_train , y_train)
y_pred = gbr_model.predict(X_test)


      Iter       Train Loss   Remaining Time 
         1    12935415.6867           10.37s
         2    10556401.5839            8.97s
         3     8625419.1327            9.22s
         4     7056198.8014            8.75s
         5     5783984.4479            8.43s
         6     4748274.4919            8.29s
         7     3906676.8169            8.19s
         8     3223079.0373            8.01s
         9     2665888.9629            8.00s
        10     2213126.4525            8.10s
        20      470971.1869            6.87s
        30      219818.6240            5.92s
        40      166771.4000            5.13s
        50      144014.0163            4.29s
        60      130106.3615            3.80s
        70      120367.6997            2.97s
        80      110903.8565            2.02s
        90      103581.2849            1.03s
       100       99688.8314            0.00s


In [31]:
cv_score = cross_val_score(estimator=gbr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % gbr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test, y_pred)
print("mae: %f" %(mae))
r2 = gbr_model.score(X_test,y_pred)
print("R2: %f" %(r2))
mse = mean_squared_error(y_test, y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test, y_pred))**0.5
print("rmse: %f" %(rmse))

      Iter       Train Loss   Remaining Time 
         1    13022259.3993            8.06s
         2    10621256.1813            7.13s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


         3     8675463.7646            7.10s
         4     7093692.6956            7.46s
         5     5811151.0998            7.18s
         6     4767676.2163            7.02s
         7     3920750.1217            6.98s
         8     3232300.1922            6.83s
         9     2672924.3011            6.90s
        10     2217100.3545            6.79s
        20      464603.9501            6.21s
        30      211258.1098            5.37s
        40      155101.5886            4.79s
        50      132847.6815            3.91s
        60      118116.4399            3.18s
        70      108922.5384            2.42s
        80       99227.8027            1.59s
        90       92106.9953            0.78s
       100       86483.0238            0.00s
      Iter       Train Loss   Remaining Time 
         1    12988627.1345            6.22s
         2    10598601.9141            6.16s
         3     8656781.3175            6.27s
         4     7083836.3683            6.49s
         

KeyboardInterrupt: 

In [None]:
rfr_model = RandomForestRegressor(max_depth=30, min_samples_split=5, n_estimators=200 )
rfr_model.fit(X_train, y_train)
y_pred = rfr_model.predict(X_test)

In [None]:
#random_grid = {'bootstrap': [True, False],
               'max_depth': [10, 30, 70],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [2, 4, 6],
               'min_samples_split': [2, 5],
               'n_estimators': [100, 200]}
#rfr_random = RandomizedSearchCV(scoring="neg_mean_squared_error", estimator = rfr_model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [None]:
#model = rfr_random.fit(X_train, y_train)

In [None]:
print(model.best_estimator_.get_params())

In [None]:
cv_score = cross_val_score(estimator=rfr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % rfr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test,y_pred)
print("mae: %f" %(mae))
r2 = rfr_model.score(X_test,y_pred)
print("R2: %f" %(r2))
mse = mean_squared_error(y_test,y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test,y_pred))**0.5
print("rmse: %f" %(rmse))

In [None]:
Cross validation score : 0.9826
[0.98149232 0.98167895 0.98121593 0.97904049 0.98002653]
mae: 259.216508
R2: 1.000000
mse: 271734.577583
rmse: 521.281668

In [30]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

In [31]:
hgbr_model = HistGradientBoostingRegressor(min_samples_leaf=20, max_iter=150, max_depth=32) #max_depth=10, max_iter=200

In [32]:
hgbr_model.fit(X_train, y_train)
y_pred = hgbr_model.predict(X_test)

In [33]:
cv_score = cross_val_score(estimator=hgbr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % hgbr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test, y_pred)
print("mae: %f" %(mae))
r2 = hgbr_model.score(X_test,y_pred)
print("R2: %.4f" %(r2))
mse = mean_squared_error(y_test, y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test, y_pred))**0.5
print("rmse: %f" %(rmse))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross validation score : 0.9840
[0.98253883 0.98327129 0.98351369 0.98144077 0.98122652]
mae: 259.097282
R2: 1.0000
mse: 250185.109564
rmse: 500.185075


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.7s finished


In [None]:
Cross validation score : 0.9841
[0.982491   0.98297602 0.98372245 0.98146505 0.98093746]
mae: 260.185927
R2: 1.0000
mse: 248861.919918
rmse: 498.860622

In [65]:
hgbr_grid = {'loss': ['least_squares', 'least_absolute_deviation', 'poisson'],
    'max_iter': [100, 150, 250],
    'max_depth': [8, 16, 32],
    'min_samples_leaf': [20, 30, 40]}
hgbr_random = RandomizedSearchCV(scoring="neg_mean_squared_error", estimator = hgbr_model, param_distributions = hgbr_grid, n_iter = 100, cv = 3, verbose=1, random_state=42, n_jobs = -1)

In [66]:
hgbr_random_model = hgbr_random.fit(X_train, y_train)



Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [67]:
print(hgbr_random_model.best_estimator_.get_params())

{'categorical_features': None, 'early_stopping': 'auto', 'l2_regularization': 0.0, 'learning_rate': 0.1, 'loss': 'poisson', 'max_bins': 255, 'max_depth': 8, 'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 40, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': None, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [64]:
print(hgbr_random_model.best_estimator_.get_params())

{'categorical_features': None, 'early_stopping': 'auto', 'l2_regularization': 0.0, 'learning_rate': 0.1, 'loss': 'least_squares', 'max_bins': 255, 'max_depth': 8, 'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': None, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [47]:
print(hgbr_random_model.best_estimator_.get_params())

{'categorical_features': None, 'early_stopping': 'auto', 'l2_regularization': 0.0, 'learning_rate': 0.1, 'loss': 'least_squares', 'max_bins': 255, 'max_depth': None, 'max_iter': 150, 'max_leaf_nodes': 31, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': None, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [44]:
print(hgbr_random_model.best_estimator_.get_params())

{'categorical_features': None, 'early_stopping': 'auto', 'l2_regularization': 0.0, 'learning_rate': 0.1, 'loss': 'least_squares', 'max_bins': 255, 'max_depth': 8, 'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': None, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [None]:
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
etr_model = ExtraTreesRegressor()
etr_model.fit(X_train, y_train)
y_pred = etr_model.predict(X_test)

In [None]:
cv_score = cross_val_score(estimator=etr_model, X=X_train, y=y_train, cv=5,verbose = 1)
print('Cross validation score : %.4f' % etr_model.score(X_test, y_test))
print(cv_score)
mae = mean_absolute_error(y_test, y_pred)
print("mae: %f" %(mae))
r2 = etr_model.score(X_test,y_pred)
print("R2: %.4f" %(r2))
mse = mean_squared_error(y_test, y_pred)
print("mse: %f" %(mse))
rmse = (mean_squared_error(y_test, y_pred))**0.5
print("rmse: %f" %(rmse))

In [None]:
X_predict = X_pred # diamonds_predict[features] if not scaled
predictions = gbr_model.predict(X_predict)
diamonds_id = diamonds_predict['id']

In [None]:
submission = pd.DataFrame({'id': diamonds_id, 'price': predictions})

In [None]:
submission.shape

In [None]:
submission.to_csv('gbr(503).csv', index=False)