In [9]:
import pandas as pd

In [10]:
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import axes3d
from matplotlib import cm
%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

import seaborn as sns
sns.set_context('poster')
sns.set(rc={'figure.figsize': (16., 9.)})
sns.set_style('whitegrid')

# Modeling libraries
import statsmodels.formula.api as smf # welcome!!
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import set_config
# Render scikit-learn estimators as HTML diagrams in cell output.
set_config(display='diagram')

# Seed NumPy's global RNG. scikit-learn splitters/estimators that are given no
# explicit random_state fall back to this global state.
np.random.seed(123)

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas copy warnings and library deprecation notices.
warnings.filterwarnings('ignore')

In [11]:
# Load the training set from the project-relative data folder.
df = pd.read_csv('data/train.csv')

In [12]:
# Display the raw training frame.
df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.53,Very Good,G,SI1,63.4,54.0,5.09,5.13,3.24,7.057
1,1,0.41,Ideal,D,SI1,63.0,56.0,4.80,4.75,3.01,6.824
2,2,0.32,Ideal,I,VS2,61.6,56.0,4.37,4.39,2.70,6.107
3,3,0.31,Ideal,H,VVS2,61.2,56.0,4.34,4.37,2.66,6.390
4,4,1.35,Premium,J,VS2,60.5,56.0,7.19,7.12,4.33,8.741
...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.52,Premium,D,VS2,61.2,58.0,5.16,5.20,3.17,7.508
40451,40451,0.52,Ideal,F,SI1,62.0,55.0,5.14,5.17,3.19,7.232
40452,40452,0.73,Very Good,D,VS2,63.5,58.0,5.68,5.72,3.62,8.065
40453,40453,0.31,Fair,F,VVS2,56.9,59.0,4.45,4.48,2.54,6.629


In [13]:
# Column dtypes and non-null counts of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   cut      40455 non-null  object 
 3   color    40455 non-null  object 
 4   clarity  40455 non-null  object 
 5   depth    40455 non-null  float64
 6   table    40455 non-null  float64
 7   x        40455 non-null  float64
 8   y        40455 non-null  float64
 9   z        40455 non-null  float64
 10  price    40455 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 3.4+ MB


In [14]:
# (rows, columns) of the training frame.
df.shape

(40455, 11)

In [15]:
# Count missing values per column.
df.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [16]:
# NOTE(review): `isnull` is an alias of `isna`, so this repeats the previous check.
df.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [17]:
# Distinct levels of the categorical `cut` column.
df.cut.unique()

array(['Very Good', 'Ideal', 'Premium', 'Good', 'Fair'], dtype=object)

In [18]:
# Distinct levels of the categorical `clarity` column.
df.clarity.unique()

array(['SI1', 'VS2', 'VVS2', 'SI2', 'VVS1', 'VS1', 'I1', 'IF'],
      dtype=object)

In [19]:
# Distinct levels of the categorical `color` column.
df.color.unique()

array(['G', 'D', 'I', 'H', 'J', 'F', 'E'], dtype=object)

In [20]:
# Pairwise correlation of the numeric columns only. `cut`, `color` and
# `clarity` are strings; recent pandas versions raise on them instead of
# silently dropping them, so select the numeric columns explicitly.
df.select_dtypes(include='number').corr()

Unnamed: 0,id,carat,depth,table,x,y,z,price
id,1.0,-0.006186,-0.002316,-0.01084,-0.005675,-0.004038,-0.005649,-0.00404
carat,-0.006186,1.0,0.025261,0.182888,0.974744,0.950639,0.948895,0.920846
depth,-0.002316,0.025261,1.0,-0.29838,-0.028188,-0.032829,0.091738,-0.000533
table,-0.01084,0.182888,-0.29838,1.0,0.197226,0.18548,0.150606,0.15995
x,-0.005675,0.974744,-0.028188,0.197226,1.0,0.973473,0.966374,0.957934
y,-0.004038,0.950639,-0.032829,0.18548,0.973473,1.0,0.946517,0.934832
z,-0.005649,0.948895,0.091738,0.150606,0.966374,0.946517,1.0,0.931147
price,-0.00404,0.920846,-0.000533,0.15995,0.957934,0.934832,0.931147,1.0


In [21]:
df_ = pd.get_dummies(df, columns=["cut", "color", "clarity"], drop_first=True)

In [22]:
df_

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_Good,cut_Ideal,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.53,63.4,54.0,5.09,5.13,3.24,7.057,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0.41,63.0,56.0,4.80,4.75,3.01,6.824,0,1,...,0,0,0,0,1,0,0,0,0,0
2,2,0.32,61.6,56.0,4.37,4.39,2.70,6.107,0,1,...,0,1,0,0,0,0,0,1,0,0
3,3,0.31,61.2,56.0,4.34,4.37,2.66,6.390,0,1,...,1,0,0,0,0,0,0,0,0,1
4,4,1.35,60.5,56.0,7.19,7.12,4.33,8.741,0,0,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.52,61.2,58.0,5.16,5.20,3.17,7.508,0,0,...,0,0,0,0,0,0,0,1,0,0
40451,40451,0.52,62.0,55.0,5.14,5.17,3.19,7.232,0,1,...,0,0,0,0,1,0,0,0,0,0
40452,40452,0.73,63.5,58.0,5.68,5.72,3.62,8.065,0,0,...,0,0,0,0,0,0,0,1,0,0
40453,40453,0.31,56.9,59.0,4.45,4.48,2.54,6.629,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X = df_.drop("price", axis=1)

In [25]:
y = df_.price

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    test_size=0.2
)

In [20]:
from sklearn.linear_model import LinearRegression

In [21]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_log_error

In [22]:
# Ordinary least-squares baseline with default settings.
lr = LinearRegression()

In [23]:
lr.fit(X_train, y_train)

In [24]:
# In-sample predictions, for eyeballing against y_train.
lr.predict(X_train)

array([6.51303195, 7.6195622 , 8.93752929, ..., 7.81036365, 6.82104344,
       6.76458503])

In [25]:
# True training targets, for comparison with the predictions above.
y_train

28177    6.654
5281     7.652
28254    8.632
17279    8.742
19854    6.400
         ...  
7763     9.109
15377    6.299
17730    7.773
28030    6.703
15725    6.581
Name: price, Length: 32364, dtype: float64

In [26]:
# Predictions of the linear baseline on the held-out rows.
lr.predict(X_test)

array([9.65886525, 6.66422873, 9.95098103, ..., 6.50509058, 7.48598022,
       8.56204845])

In [27]:
# Training-set MSE of the linear baseline.
mean_squared_error(y_train, lr.predict(X_train))

0.03250720743134256

In [28]:
# Held-out MSE of the linear baseline.
mean_squared_error(y_test, lr.predict(X_test))

0.03150454051154626

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR

In [30]:
from sklearn.preprocessing import StandardScaler

In [32]:
scaler = StandardScaler()

In [34]:
# NOTE(review): fitted on the whole encoded frame, i.e. including the target
# `price`, the `id` column, the dummy columns and the rows later held out for
# testing. If the scaled data is ever used for modelling, fit on the training
# features only.
scaler.fit(df_)

In [35]:
# Standardised copy of df_ with the original column names restored.
df2 = pd.DataFrame(scaler.transform(df_), columns=df_.columns)

In [41]:
# NOTE(review): df2 is not referenced by any later cell visible here; the
# models below are trained on the unscaled df_.
df2

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_Good,cut_Ideal,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,-1.732008,-0.561813,1.148125,-1.547175,-0.566863,-0.523303,-0.416948,-0.714240,-0.315531,-0.815320,...,-0.425274,-0.335914,-0.231275,-0.182574,1.760690,-0.449738,-0.421701,-0.544184,-0.26885,-0.324217
1,-1.731922,-0.816215,0.869501,-0.650557,-0.825883,-0.856043,-0.742014,-0.944014,-0.315531,1.226513,...,-0.425274,-0.335914,-0.231275,-0.182574,1.760690,-0.449738,-0.421701,-0.544184,-0.26885,-0.324217
2,-1.731837,-1.007017,-0.105683,-0.650557,-1.209948,-1.171270,-1.180146,-1.651088,-0.315531,1.226513,...,-0.425274,2.976955,-0.231275,-0.182574,-0.567959,-0.449738,-0.421701,1.837614,-0.26885,-0.324217
3,-1.731751,-1.028217,-0.384307,-0.650557,-1.236743,-1.188783,-1.236679,-1.372006,-0.315531,1.226513,...,2.351428,-0.335914,-0.231275,-0.182574,-0.567959,-0.449738,-0.421701,-0.544184,-0.26885,3.084356
4,-1.731665,1.176603,-0.871899,-0.650557,1.308803,1.219204,1.123581,0.946447,-0.315531,-0.815320,...,-0.425274,-0.335914,4.323854,-0.182574,-0.567959,-0.449738,-0.421701,1.837614,-0.26885,-0.324217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,1.731665,-0.583013,-0.384307,0.246062,-0.504340,-0.462009,-0.515881,-0.269483,-0.315531,-0.815320,...,-0.425274,-0.335914,-0.231275,-0.182574,-0.567959,-0.449738,-0.421701,1.837614,-0.26885,-0.324217
40451,1.731751,-0.583013,0.172941,-1.098866,-0.522204,-0.488278,-0.487614,-0.541662,-0.315531,1.226513,...,-0.425274,-0.335914,-0.231275,-0.182574,1.760690,-0.449738,-0.421701,-0.544184,-0.26885,-0.324217
40452,1.731837,-0.137809,1.217781,0.246062,-0.039890,-0.006680,0.120117,0.279806,-0.315531,-0.815320,...,-0.425274,-0.335914,-0.231275,-0.182574,-0.567959,-0.449738,-0.421701,1.837614,-0.26885,-0.324217
40453,1.731922,-1.028217,-3.379515,0.694371,-1.138494,-1.092463,-1.406278,-1.136314,-0.315531,-0.815320,...,-0.425274,-0.335914,-0.231275,-0.182574,-0.567959,-0.449738,-0.421701,-0.544184,-0.26885,3.084356


In [2]:
# Rebuild the feature matrix from the encoded (unscaled) frame.
X = df_.drop(columns="price")

NameError: name 'df_' is not defined

In [125]:
y = df_.price

In [126]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    test_size=0.2
)

## RandomForest

In [42]:
from sklearn.ensemble import RandomForestRegressor

In [43]:
from sklearn.model_selection import learning_curve, GridSearchCV

In [44]:
# Exhaustive 5-fold search over 3 * 3 * 4 * 3 = 108 random-forest settings.
model = RandomForestRegressor()
parameter_space = dict(
    n_estimators=[100, 300, 1000],
    max_features=['sqrt', 0.5, None],
    max_depth=[None, 10, 30, 100],
    min_samples_leaf=[1, 3, 10],
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameter_space,
    cv=5,
    n_jobs=-1,  # use every available core
    verbose=1,
)

grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 42.3min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 56.9min finished


In [45]:
# Mean cross-validated score of the best candidate (the estimator's default scorer).
grid_search.best_score_

0.9881923427134527

In [46]:
# Hyper-parameter combination that achieved the best score.
grid_search.best_params_

{'max_depth': 30,
 'max_features': None,
 'min_samples_leaf': 1,
 'n_estimators': 1000}

In [60]:
# Best random-forest settings, copied from the grid search result so the
# search itself does not have to be re-run.
params = dict(
    max_depth=30,
    max_features=None,
    min_samples_leaf=1,
    n_estimators=1000,
)

In [127]:
# Random forest with library defaults, as a reference point for the tuned one.
model = RandomForestRegressor()

In [128]:
model.fit(X_train, y_train)

In [129]:
model.predict(X_test)

array([8.37183, 7.9824 , 7.23776, ..., 8.47319, 6.35168, 9.17953])

In [130]:
# Held-out MSE of the default random forest.
mean_squared_error(y_test, model.predict(X_test))

0.011725468261475708

In [131]:
# Random forest built from the settings held in `params`.
# NOTE(review): `params` is reassigned by the boosting cells further down, so
# this cell only builds the intended forest when run before them.
model_ = RandomForestRegressor(**params)

In [132]:
model_.fit(X_train, y_train)

In [28]:
model_.predict(X_test)

NameError: name 'model_' is not defined

In [None]:
# Held-out MSE of the tuned random forest.
mean_squared_error(y_test, model_.predict(X_test))

In [88]:
modelo = RandomForestRegressor(
 n_estimators      = 1000, # cantidad de arboles a crear
 max_features = None,   
 min_samples_leaf  = 1,   
 )

In [90]:
modelo.fit(X_train, y_train)

In [91]:
# Held-out MSE of `modelo`.
mean_squared_error(y_test, modelo.predict(X_test))

0.011182070313433249

In [146]:
# Load the unlabeled competition test set (no `price` column).
data_test = pd.read_csv('data/test.csv')

In [147]:
data_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
1,1,0.41,Ideal,E,VS2,61.8,54.0,4.79,4.76,2.95
2,2,0.91,Very Good,E,SI2,62.5,59.0,6.16,6.23,3.87
3,3,0.42,Very Good,G,VS2,62.6,57.0,4.76,4.80,2.99
4,4,0.54,Ideal,G,IF,61.5,56.0,5.28,5.25,3.24
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,Ideal,F,SI1,61.7,56.4,5.26,5.30,3.25
13481,13481,1.12,Premium,H,VS2,60.6,59.0,6.77,6.70,4.08
13482,13482,0.37,Ideal,D,SI1,61.5,57.0,4.63,4.60,2.84
13483,13483,0.54,Good,E,SI1,59.9,63.0,5.25,5.30,3.16


In [148]:
data_test = pd.get_dummies(data_test, columns=["cut", "color", "clarity"], drop_first=True)

In [149]:
X_true_test = data_test

In [150]:
model_.predict(X_true_test)

array([6.752471, 6.905179, 8.251927, ..., 6.707207, 7.249386, 8.145099])

In [151]:
# Sanity check: number of predictions produced for the test file.
len(model_.predict(X_true_test))

13485

In [152]:
# Attach the random-forest predictions as a `price` column.
data_test['price'] = model_.predict(X_true_test)

In [154]:
data_test

Unnamed: 0,id,carat,depth,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,price
0,0,0.33,61.9,55.0,4.44,4.42,2.74,0,1,0,...,0,0,1,0,0,0,0,0,0,6.752471
1,1,0.41,61.8,54.0,4.79,4.76,2.95,0,1,0,...,0,0,0,0,0,0,1,0,0,6.905179
2,2,0.91,62.5,59.0,6.16,6.23,3.87,0,0,0,...,0,0,0,0,1,0,0,0,0,8.251927
3,3,0.42,62.6,57.0,4.76,4.80,2.99,0,0,0,...,0,0,0,0,0,0,1,0,0,6.728499
4,4,0.54,61.5,56.0,5.28,5.25,3.24,0,1,0,...,0,0,1,0,0,0,0,0,0,7.727131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,61.7,56.4,5.26,5.30,3.25,0,1,0,...,0,0,0,1,0,0,0,0,0,7.285636
13481,13481,1.12,60.6,59.0,6.77,6.70,4.08,0,0,1,...,0,0,0,0,0,0,1,0,0,8.674264
13482,13482,0.37,61.5,57.0,4.63,4.60,2.84,0,1,0,...,0,0,0,1,0,0,0,0,0,6.707207
13483,13483,0.54,59.9,63.0,5.25,5.30,3.16,1,0,0,...,0,0,0,1,0,0,0,0,0,7.249386


In [163]:
predictions = data_test[['id', 'price']]

In [164]:
predictions.set_index('id', inplace=True)

In [165]:
predictions

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,6.752471
1,6.905179
2,8.251927
3,6.728499
4,7.727131
...,...
13480,7.285636
13481,8.674264
13482,6.707207
13483,7.249386


In [171]:
# Write the random-forest submission; `id` is the index and becomes the first CSV column.
predictions.to_csv("data/predictions_randomforest.csv")

## Boosting

In [94]:
from sklearn.ensemble import GradientBoostingRegressor

In [27]:
# First boosting search: 4 * 2 * 4 * 3 = 96 candidates, 2-fold CV, 3000 trees each.
model = GradientBoostingRegressor(n_estimators=3000)

params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
          'max_depth': [4, 6],
          'min_samples_leaf': [3, 5, 9, 17],
          # Floats are fractions of the feature count. The integer 1 used
          # previously means "a single feature per split", not "all features",
          # so the full-feature option is written as 1.0.
          'max_features': [1.0, 0.3, 0.1]}
grid_search = GridSearchCV(model,
                           param_grid=params,
                           cv=2,
                           n_jobs=3,
                           verbose=1)
grid_search.fit(X_train, y_train)


Fitting 2 folds for each of 96 candidates, totalling 192 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  4.9min
[Parallel(n_jobs=3)]: Done 192 out of 192 | elapsed: 21.2min finished


In [29]:
# Best combination found by the first boosting search.
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 4,
 'max_features': 0.3,
 'min_samples_leaf': 9}

In [33]:
# Winning settings of the first boosting search plus the fixed tree count.
params = dict(
    learning_rate=0.1,
    max_depth=4,
    max_features=0.3,
    min_samples_leaf=9,
    n_estimators=3000,
)

In [36]:
# Gradient-boosting model built from the settings currently held in `params`.
modelo_boosting = GradientBoostingRegressor(**params)

In [68]:
modelo_boosting.fit(X_train, y_train)

In [69]:
modelo_boosting.predict(X_test)

array([9.55844722, 6.81204767, 9.69007716, ..., 6.42140169, 7.52874255,
       8.54729749])

In [39]:
# Held-out MSE of the first boosting model.
mean_squared_error(y_test, modelo_boosting.predict(X_test))

0.008321370192800616

In [73]:
# Reload the raw test file so the encoding below starts from the original columns.
data_test = pd.read_csv('data/test.csv')

In [74]:
data_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.33,Ideal,H,IF,61.9,55.0,4.44,4.42,2.74
1,1,0.41,Ideal,E,VS2,61.8,54.0,4.79,4.76,2.95
2,2,0.91,Very Good,E,SI2,62.5,59.0,6.16,6.23,3.87
3,3,0.42,Very Good,G,VS2,62.6,57.0,4.76,4.80,2.99
4,4,0.54,Ideal,G,IF,61.5,56.0,5.28,5.25,3.24
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,Ideal,F,SI1,61.7,56.4,5.26,5.30,3.25
13481,13481,1.12,Premium,H,VS2,60.6,59.0,6.77,6.70,4.08
13482,13482,0.37,Ideal,D,SI1,61.5,57.0,4.63,4.60,2.84
13483,13483,0.54,Good,E,SI1,59.9,63.0,5.25,5.30,3.16


In [75]:
data_test_1 = pd.get_dummies(data_test, columns=["cut", "color", "clarity"], drop_first=True)

In [76]:
X_true_test_1 = data_test_1

In [77]:
modelo_boosting.predict(X_true_test_1)

array([6.88558462, 6.95888865, 8.28359196, ..., 6.73201594, 7.26863952,
       8.11700025])

In [78]:
data_test_1['price'] = modelo_boosting.predict(X_true_test_1)

In [79]:
predictions_1 = data_test_1[['id', 'price']]

In [51]:
predictions_1.set_index('id', inplace=True)

In [52]:
predictions_1.to_csv("data/predictions_Boosting.csv")

In [53]:
# Second boosting search, narrowed around the previous winner:
# 4 * 2 * 3 * 3 = 72 candidates, 2-fold CV, 3000 trees each.
# NOTE(review): this rebinds `model_`, which earlier cells use for the tuned
# random forest.
model_ = GradientBoostingRegressor(n_estimators=3000)

params = dict(
    learning_rate=[0.1, 0.05, 0.02, 0.01],
    max_depth=[2, 4],
    min_samples_leaf=[7, 9, 13],
    max_features=[0.6, 0.3, 0.2],
)
grid_search = GridSearchCV(
    estimator=model_,
    param_grid=params,
    cv=2,
    n_jobs=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)


Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  6.2min
[Parallel(n_jobs=3)]: Done 144 out of 144 | elapsed: 20.5min finished


In [54]:
# Best combination found by the second boosting search.
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 4,
 'max_features': 0.6,
 'min_samples_leaf': 13}

In [56]:
# Winning settings of the second boosting search plus the fixed tree count.
params = dict(
    learning_rate=0.1,
    max_depth=4,
    max_features=0.6,
    min_samples_leaf=13,
    n_estimators=3000,
)

In [59]:
# Gradient-boosting model built from the settings currently held in `params`.
modelo_boosting_1 = GradientBoostingRegressor(**params)

In [60]:
modelo_boosting_1.fit(X_train, y_train)

In [61]:
modelo_boosting_1.predict(X_test)

array([9.53588359, 6.80593865, 9.69526227, ..., 6.43242364, 7.52736011,
       8.55442178])

In [63]:
# Held-out MSE of the second boosting model.
mean_squared_error(y_test, modelo_boosting_1.predict(X_test))

0.008159870166992267

In [64]:
# Re-inspect the encoded test frame; it already carries a `price` column from an earlier cell.
data_test_1

Unnamed: 0,id,carat,depth,table,x,y,z,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,price
0,0,0.33,61.9,55.0,4.44,4.42,2.74,0,1,0,...,0,0,1,0,0,0,0,0,0,6.866463
1,1,0.41,61.8,54.0,4.79,4.76,2.95,0,1,0,...,0,0,0,0,0,0,1,0,0,6.920238
2,2,0.91,62.5,59.0,6.16,6.23,3.87,0,0,0,...,0,0,0,0,1,0,0,0,0,8.223529
3,3,0.42,62.6,57.0,4.76,4.80,2.99,0,0,0,...,0,0,0,0,0,0,1,0,0,6.729049
4,4,0.54,61.5,56.0,5.28,5.25,3.24,0,1,0,...,0,0,1,0,0,0,0,0,0,7.722652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.55,61.7,56.4,5.26,5.30,3.25,0,1,0,...,0,0,0,1,0,0,0,0,0,7.303870
13481,13481,1.12,60.6,59.0,6.77,6.70,4.08,0,0,1,...,0,0,0,0,0,0,1,0,0,8.659523
13482,13482,0.37,61.5,57.0,4.63,4.60,2.84,0,1,0,...,0,0,0,1,0,0,0,0,0,6.700961
13483,13483,0.54,59.9,63.0,5.25,5.30,3.16,1,0,0,...,0,0,0,1,0,0,0,0,0,7.285426


In [84]:
data_test_2 = pd.read_csv('data/test.csv')submission

In [85]:
data_test_2 = pd.get_dummies(data_test_2, columns=["cut", "color", "clarity"], drop_first=True)

In [86]:
X_true_test_2 = data_test_2

In [87]:
modelo_boosting_1.predict(X_true_test_2)

array([6.86750902, 6.93632392, 8.26587875, ..., 6.73083083, 7.25628548,
       8.11530517])

In [88]:
data_test_2['price'] = modelo_boosting_1.predict(X_true_test_2)

In [89]:
predictions_2 = data_test_2[['id', 'price']]

In [91]:
predictions_2.set_index('id', inplace=True)

In [92]:
predictions_2.to_csv("data/predictions_Boosting_2.csv")

In [96]:
# Third boosting search, this time also varying the tree count:
# 4 * 3 * 3 * 3 * 3 = 324 candidates, 2-fold CV.
modelazo = GradientBoostingRegressor()

params = dict(
    learning_rate=[0.2, 0.15, 0.1, 0.08],
    max_depth=[2, 3, 4],
    min_samples_leaf=[11, 13, 15],
    max_features=[0.8, 0.6, 0.5],
    n_estimators=[100, 3000, 7000],
)
grid_search = GridSearchCV(
    estimator=modelazo,
    param_grid=params,
    cv=2,
    n_jobs=3,
    verbose=1,
)
grid_search.fit(X_train, y_train)


Fitting 2 folds for each of 324 candidates, totalling 648 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  7.6min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed: 46.3min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed: 107.9min
[Parallel(n_jobs=3)]: Done 648 out of 648 | elapsed: 162.1min finished


In [102]:
# Best combination found by the third boosting search.
grid_search.best_params_

{'learning_rate': 0.08,
 'max_depth': 3,
 'max_features': 0.5,
 'min_samples_leaf': 15,
 'n_estimators': 7000}

In [103]:
# Winning settings of the third boosting search.
params = dict(
    learning_rate=0.08,
    max_depth=3,
    max_features=0.5,
    min_samples_leaf=15,
    n_estimators=7000,
)

In [104]:
# Gradient-boosting model built from the settings currently held in `params`.
modelo_boosting_2 = GradientBoostingRegressor(**params)

In [105]:
modelo_boosting_2.fit(X_train, y_train)

In [110]:
modelo_boosting_2.predict(X_test)

array([9.56511622, 6.80872601, 9.71817443, ..., 6.41658557, 7.53049118,
       8.54957029])

In [111]:
# Held-out MSE of the third boosting model.
mean_squared_error(y_test, modelo_boosting_2.predict(X_test))

0.00810759609509102

In [113]:
data_test_3 = pd.read_csv('data/test.csv')

In [114]:
data_test_3 = pd.get_dummies(data_test_3, columns=["cut", "color", "clarity"], drop_first=True)

In [115]:
X_true_test_3 = data_test_3

In [116]:
modelo_boosting_2.predict(X_true_test_3)

array([6.84947356, 6.93956402, 8.2722343 , ..., 6.70850356, 7.25572761,
       8.11176372])

In [117]:
data_test_3['price'] = modelo_boosting_2.predict(X_true_test_3)

In [118]:
predictions_3 = data_test_3[['id', 'price']]

In [119]:
predictions_3.set_index('id', inplace=True)

In [120]:
predictions_3.to_csv("data/predictions_Boosting_3.csv")

In [205]:
# Hand-picked settings close to the last grid-search winner.
params = dict(
    learning_rate=0.09,
    max_depth=3,
    max_features=0.55,
    min_samples_leaf=16,
    n_estimators=9000,
)

In [206]:
# Gradient-boosting model built from the settings currently held in `params`.
modelo_boosting_3 = GradientBoostingRegressor(**params)

In [207]:
modelo_boosting_3.fit(X_train, y_train)

In [208]:
modelo_boosting_3.predict(X_test)

array([9.54353587, 6.80411674, 9.73997383, ..., 6.4304802 , 7.53669796,
       8.54810771])

In [209]:
# Held-out MSE of the fourth boosting model.
mean_squared_error(y_test, modelo_boosting_3.predict(X_test))

0.008090131839023544

In [210]:
data_test_4 = pd.read_csv('data/test.csv')

In [211]:
data_test_4 = pd.get_dummies(data_test_4, columns=["cut", "color", "clarity"], drop_first=True)

In [212]:
X_true_test_4 = data_test_4

In [213]:
data_test_4['price'] = modelo_boosting_3.predict(X_true_test_4)

In [214]:
predictions_4 = data_test_4[['id', 'price']]

In [215]:
predictions_4.set_index('id', inplace=True)

In [216]:
predictions_4.to_csv("data/predictions_Boosting_6.csv")