In [81]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xg
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
import statsmodels.formula.api as smf

In [31]:
diamonds_train = pd.read_csv('/home/julian/Cursos/Ironhack/Proyectos/Proyecto5/Diamond-Price-Prediction/train.csv')

In [118]:
diamonds_test = pd.read_csv('/home/julian/Cursos/Ironhack/Proyectos/Proyecto5/Diamond-Price-Prediction/test.csv')

In [33]:
diamonds_train.shape

(40455, 11)

In [34]:
diamonds_train.head(3)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,Ideal,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983


#### CATEGORICAL FEATURES

In [35]:
diamonds_train.cut.value_counts()

Ideal        16230
Premium      10338
Very Good     9003
Good          3682
Fair          1202
Name: cut, dtype: int64

In [36]:
dic_cut = {'Ideal': '1', 'Premium': '2', 'Very Good': '3', 'Good': '4', 'Fair': '5'}

In [37]:
diamonds_train.cut = diamonds_train.cut.map(dic_cut)

#### COLOR IS MORE IMPORTANT THAN CLARITY (ojo)

In [39]:
diamonds_train.color.value_counts()

G    8410
E    7368
F    7182
H    6203
D    5100
I    4070
J    2122
Name: color, dtype: int64

In [40]:
#THE COLOR SCALE STARTS AT D, WICH IS NOT TOP QUALITY

In [41]:
dic_color = {'D': '1', 'E': '2', 'F': '3', 'G': '4', 'H': '5', 'I': '6', 'J': '7'}

In [42]:
diamonds_train.color = diamonds_train.color.map(dic_color)

In [43]:
diamonds_train.color.value_counts()

4    8410
2    7368
3    7182
5    6203
1    5100
6    4070
7    2122
Name: color, dtype: int64

In [44]:
diamonds_train.clarity.unique()

array(['SI2', 'VVS2', 'VS2', 'VS1', 'SI1', 'VVS1', 'IF', 'I1'],
      dtype=object)

In [45]:
dic_clarity = {'I1': '1', 'SI2': '2', 'SI1': '3', 'VS2': '4', 'VS1': '5', 'VVS2': '6', 'VVS1': '7', 'IF': '8'}

In [46]:
diamonds_train.clarity = diamonds_train.clarity.map(dic_clarity)

In [48]:
X = diamonds_train.drop('price', axis=1)

In [49]:
y = diamonds_train.price

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### RANDOM FOREST

In [58]:
rs = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions={
        'bootstrap': [True, False],
        'max_depth': [10, 11, 12, 13, 14, 15, 16, None],
        'max_features': [5, 6, 7, 8, 9],
        'min_samples_split': [15, 30, 50],
        'n_estimators': [50, 100, 150]
    },
    n_iter=100,
    cv=10,
    verbose=2,
    scoring="neg_root_mean_squared_error",
    return_train_score=True
)

In [59]:
%%time
rs.fit(X_train, y_train)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.0s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.0s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.0s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.1s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.1s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.1s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.2s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=50, n_estimators=150; total time=   6.2s
[CV] END bootstrap=True

[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   6.8s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   7.3s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   7.6s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   7.1s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   6.9s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   6.9s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   7.2s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_samples_split=50, n_estimators=100; total time=   6.8s
[CV] END bootstrap=False, max_depth=None, max_features=6, min_sa

[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   3.1s
[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=13, max_features=6, min_samples_split=15, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=None, max_features=8, min_samples_split=15, n_estimators=100;

[CV] END bootstrap=True, max_depth=12, max_features=9, min_samples_split=30, n_estimators=150; total time=   9.7s
[CV] END bootstrap=True, max_depth=12, max_features=9, min_samples_split=30, n_estimators=150; total time=   9.5s
[CV] END bootstrap=True, max_depth=12, max_features=9, min_samples_split=30, n_estimators=150; total time=   9.5s
[CV] END bootstrap=True, max_depth=12, max_features=9, min_samples_split=30, n_estimators=150; total time=   9.2s
[CV] END bootstrap=True, max_depth=12, max_features=9, min_samples_split=30, n_estimators=150; total time=   9.5s
[CV] END bootstrap=True, max_depth=12, max_features=9, min_samples_split=30, n_estimators=150; total time=   9.0s
[CV] END bootstrap=True, max_depth=15, max_features=6, min_samples_split=50, n_estimators=50; total time=   2.1s
[CV] END bootstrap=True, max_depth=15, max_features=6, min_samples_split=50, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=15, max_features=6, min_samples_split=50, n_estimators=

[CV] END bootstrap=True, max_depth=None, max_features=7, min_samples_split=15, n_estimators=100; total time=   6.1s
[CV] END bootstrap=True, max_depth=None, max_features=7, min_samples_split=15, n_estimators=100; total time=   6.4s
[CV] END bootstrap=True, max_depth=None, max_features=7, min_samples_split=15, n_estimators=100; total time=   5.8s
[CV] END bootstrap=True, max_depth=None, max_features=7, min_samples_split=15, n_estimators=100; total time=   6.4s
[CV] END bootstrap=True, max_depth=13, max_features=9, min_samples_split=15, n_estimators=100; total time=   7.2s
[CV] END bootstrap=True, max_depth=13, max_features=9, min_samples_split=15, n_estimators=100; total time=   6.8s
[CV] END bootstrap=True, max_depth=13, max_features=9, min_samples_split=15, n_estimators=100; total time=   7.4s
[CV] END bootstrap=True, max_depth=13, max_features=9, min_samples_split=15, n_estimators=100; total time=   7.2s
[CV] END bootstrap=True, max_depth=13, max_features=9, min_samples_split=15, n_e

[CV] END bootstrap=False, max_depth=16, max_features=8, min_samples_split=15, n_estimators=150; total time=  16.8s
[CV] END bootstrap=False, max_depth=16, max_features=8, min_samples_split=15, n_estimators=150; total time=  16.0s
[CV] END bootstrap=True, max_depth=12, max_features=8, min_samples_split=50, n_estimators=150; total time=   8.4s
[CV] END bootstrap=True, max_depth=12, max_features=8, min_samples_split=50, n_estimators=150; total time=   8.0s
[CV] END bootstrap=True, max_depth=12, max_features=8, min_samples_split=50, n_estimators=150; total time=   8.3s
[CV] END bootstrap=True, max_depth=12, max_features=8, min_samples_split=50, n_estimators=150; total time=   8.1s
[CV] END bootstrap=True, max_depth=12, max_features=8, min_samples_split=50, n_estimators=150; total time=   7.6s
[CV] END bootstrap=True, max_depth=12, max_features=8, min_samples_split=50, n_estimators=150; total time=   7.7s
[CV] END bootstrap=True, max_depth=12, max_features=8, min_samples_split=50, n_estimat

[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   8.2s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   8.1s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   7.6s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   8.0s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   8.2s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   7.8s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   8.9s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimators=150; total time=   7.8s
[CV] END bootstrap=True, max_depth=14, max_features=7, min_samples_split=30, n_estimator

[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   9.2s
[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   8.6s
[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   8.5s
[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   9.2s
[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   9.2s
[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   8.6s
[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   8.5s
[CV] END bootstrap=False, max_depth=15, max_features=8, min_samples_split=50, n_estimators=100; total time=   9.2s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_

[CV] END bootstrap=True, max_depth=16, max_features=9, min_samples_split=50, n_estimators=150; total time=   9.6s
[CV] END bootstrap=True, max_depth=16, max_features=9, min_samples_split=50, n_estimators=150; total time=   9.2s
[CV] END bootstrap=True, max_depth=16, max_features=9, min_samples_split=50, n_estimators=150; total time=   9.1s
[CV] END bootstrap=True, max_depth=16, max_features=9, min_samples_split=50, n_estimators=150; total time=   9.0s
[CV] END bootstrap=True, max_depth=16, max_features=9, min_samples_split=50, n_estimators=150; total time=   9.6s
[CV] END bootstrap=True, max_depth=16, max_features=9, min_samples_split=50, n_estimators=150; total time=   9.5s
[CV] END bootstrap=False, max_depth=15, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.6s
[CV] END bootstrap=False, max_depth=15, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.6s
[CV] END bootstrap=False, max_depth=15, max_features=5, min_samples_split=50, n_estima

[CV] END bootstrap=False, max_depth=None, max_features=7, min_samples_split=50, n_estimators=150; total time=  11.3s
[CV] END bootstrap=False, max_depth=None, max_features=7, min_samples_split=50, n_estimators=150; total time=  11.4s
[CV] END bootstrap=False, max_depth=None, max_features=7, min_samples_split=50, n_estimators=150; total time=  11.3s
[CV] END bootstrap=False, max_depth=None, max_features=7, min_samples_split=50, n_estimators=150; total time=  11.3s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=30, n_estimators=50; total time=   3.0s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=30, n_estimators=50; total time=   3.0s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=30, n_estimators=50; total time=   3.0s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=30, n_estimators=50; total time=   3.0s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=30

[CV] END bootstrap=False, max_depth=13, max_features=5, min_samples_split=15, n_estimators=50; total time=   2.9s
[CV] END bootstrap=False, max_depth=13, max_features=5, min_samples_split=15, n_estimators=50; total time=   2.9s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.1s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.1s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.1s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.1s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.1s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_estimators=100; total time=   5.1s
[CV] END bootstrap=False, max_depth=12, max_features=5, min_samples_split=50, n_es

[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.5s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; total time=   2.4s
[CV] END bootstrap=True, max_depth=12, max_features=7, min_samples_split=15, n_estimators=50; to

[CV] END bootstrap=False, max_depth=10, max_features=6, min_samples_split=50, n_estimators=50; total time=   2.8s
[CV] END bootstrap=False, max_depth=10, max_features=6, min_samples_split=50, n_estimators=50; total time=   2.8s
[CV] END bootstrap=False, max_depth=10, max_features=6, min_samples_split=50, n_estimators=50; total time=   2.8s
[CV] END bootstrap=False, max_depth=10, max_features=6, min_samples_split=50, n_estimators=50; total time=   2.9s
[CV] END bootstrap=False, max_depth=10, max_features=6, min_samples_split=50, n_estimators=50; total time=   3.1s
[CV] END bootstrap=False, max_depth=10, max_features=6, min_samples_split=50, n_estimators=50; total time=   3.3s
[CV] END bootstrap=False, max_depth=10, max_features=6, min_samples_split=50, n_estimators=50; total time=   4.0s
[CV] END bootstrap=True, max_depth=10, max_features=7, min_samples_split=30, n_estimators=100; total time=   5.1s
[CV] END bootstrap=True, max_depth=10, max_features=7, min_samples_split=30, n_estimator

[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=30, n_estimators=50; total time=   2.2s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=30, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=30, n_estimators=50; total time=   2.9s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=30, n_estimators=50; total time=   2.3s
[CV] END bootstrap=True, max_depth=16, max_features=6, min_samples_split=30, n_estimators=50; total time=   2.2s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=50, n_estimators=100; total time=   6.0s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=50, n_estimators=100; total time=   5.9s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=50, n_estimators=100; total time=   5.9s
[CV] END bootstrap=False, max_depth=11, max_features=6, min_samples_split=50, n_estimators

RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(), n_iter=100,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 11, 12, 13, 14, 15,
                                                      16, None],
                                        'max_features': [5, 6, 7, 8, 9],
                                        'min_samples_split': [15, 30, 50],
                                        'n_estimators': [50, 100, 150]},
                   return_train_score=True,
                   scoring='neg_root_mean_squared_error', verbose=2)

In [60]:
best_ranf = rs.best_estimator_
best_ranf

RandomForestRegressor(bootstrap=False, max_depth=16, max_features=5,
                      min_samples_split=15, n_estimators=150)

#### RANDOM FOREST (BASADO EN EL random SEARCH ANTERIOR)

In [134]:
random_forest2 = RandomForestRegressor(bootstrap=False, max_depth=16, max_features=5, min_samples_split=15, n_estimators=150)

In [135]:
random_forest2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_depth=16, max_features=5,
                      min_samples_split=15, n_estimators=150)

In [136]:
y_test_pred = random_forest2.predict(X_test)

In [137]:
(mean_squared_error(y_test, y_test_pred)) ** 0.5

0.09170707818201139

#### XGBOOST !

In [None]:
train_dmatrix = xg.DMatrix(data=X_train, enable_categorical=False, label=y_train)

In [None]:
xgb_model = XGBRegressor(n_estimators=100, max_depth=16, eta=0.1, subsample=0.7, colsample_bytree=0.8, enable_categorical=True)

In [None]:
xgb_model.fit(X_train, y_train)

In [119]:
diamonds_test.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,Ideal,I,SI1,60.5,58.0,4.43,4.49,2.7
1,1,1.24,Premium,I,SI1,62.9,60.0,6.8,6.74,4.26
2,2,1.66,Premium,D,SI1,62.0,59.0,7.55,7.6,4.7
3,3,0.75,Premium,D,SI2,60.6,56.0,5.94,5.9,3.59
4,4,1.5,Fair,E,SI2,64.8,55.0,7.26,7.15,4.67


In [120]:
diamonds_test.clarity = diamonds_test.clarity.map(dic_clarity)

In [121]:
diamonds_test.color = diamonds_test.color.map(dic_color)

In [122]:
diamonds_test.cut = diamonds_test.cut.map(dic_cut)

In [123]:
diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,1,6,3,60.5,58.0,4.43,4.49,2.70
1,1,1.24,2,6,3,62.9,60.0,6.80,6.74,4.26
2,2,1.66,2,1,3,62.0,59.0,7.55,7.60,4.70
3,3,0.75,2,1,2,60.6,56.0,5.94,5.90,3.59
4,4,1.50,5,2,2,64.8,55.0,7.26,7.15,4.67
...,...,...,...,...,...,...,...,...,...,...
13480,13480,1.10,2,4,3,59.6,60.0,6.74,6.70,4.00
13481,13481,0.90,3,1,3,62.1,60.0,6.14,6.20,3.83
13482,13482,0.30,1,3,4,62.1,53.3,4.30,4.32,2.68
13483,13483,1.25,1,6,3,59.6,59.0,7.01,7.09,4.20


In [124]:
y_price_pred = random_forest2.predict(diamonds_test)

In [125]:
y_price_pred

array([6.10912657, 8.58592504, 9.48619284, ..., 6.42835573, 8.71117311,
       8.00742412])

In [126]:
diamonds_test['price'] = y_price_pred

In [127]:
diamonds_test.drop(columns=['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z'], inplace=True)

In [128]:
diamonds_test

Unnamed: 0,id,price
0,0,6.109127
1,1,8.585925
2,2,9.486193
3,3,7.834822
4,4,8.939083
...,...,...
13480,13480,8.557133
13481,13481,8.378571
13482,13482,6.428356
13483,13483,8.711173


In [129]:
diamonds_test.to_csv('diamonds_sub2.csv', index = False)