## Libraries import

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#outliers
from sklearn.datasets import load_boston
#train test split
from sklearn.model_selection import train_test_split
#models
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
#error
from sklearn.metrics import mean_squared_error,r2_score

## Data import

In [13]:
# Impor train data
df_diamonds_train=pd.read_csv('../data/diamonds_train.csv')
df_diamonds_train.pop("Unnamed: 0")
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


In [14]:
# Import test data
df_diamonds_test=pd.read_csv('../data/diamonds_test.csv')
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## Data prep -  training model

In [15]:
#1.Excluding x y z and city
# Defining numerical and categorical features
num_features_list=['x','y','z','depth','table','carat']
cat_features_list=['cut','color','clarity']
features_list=['x','y','z','depth','table','carat','cut','color','clarity']

In [16]:
#2.Checking if 0 values
if 0 in df_diamonds_train[num_features_list]:
    print('0 values')
else:
    print('No 0 values')

No 0 values


In [17]:
#2.Checking if null values
df_diamonds_train[num_features_list].isna().sum()

x        0
y        0
z        0
depth    0
table    0
carat    0
dtype: int64

In [18]:
#3.remove outliers
def remove_outliers(df,feature):
    # IQR
    Q1 = np.percentile(df[feature], 25,
                   interpolation = 'midpoint')
    Q3 = np.percentile(df[feature], 75,
                   interpolation = 'midpoint')
    IQR = Q3 - Q1
    # Upper and lower
    upper_limit=Q3+1.5*IQR
    lower_limit=Q1-1.5*IQR
    # Removing the Outliers
    return df[(df[feature]>=lower_limit) & (df[feature]<=upper_limit)]

In [19]:
df_diamonds_train=remove_outliers(df_diamonds_train,'x')
df_diamonds_train=remove_outliers(df_diamonds_train,'y')
df_diamonds_train=remove_outliers(df_diamonds_train,'z')
df_diamonds_train=remove_outliers(df_diamonds_train,'depth')
df_diamonds_train=remove_outliers(df_diamonds_train,'table')
df_diamonds_train=remove_outliers(df_diamonds_train,'carat')

In [20]:
#4. Change colums to categoric
df_diamonds_train['cut'] = df_diamonds_train['cut'].astype('category')
df_diamonds_train['color'] = df_diamonds_train['color'].astype('category')
df_diamonds_train['clarity'] = df_diamonds_train['clarity'].astype('category')

In [21]:
df_diamonds_train.shape

(37264, 12)

In [22]:
# Adapting categorical features for training model
X=df_diamonds_train[features_list]
    # One-hot encoding for categorical variables
X=pd.get_dummies(X,columns=cat_features_list)
    #Target
y=df_diamonds_train['price']
    # Splitting train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Data prep - validation model

In [23]:
# Adapting categorical features for validation model
X_test=df_diamonds_test[features_list]
    # One-hot encoding for categorical variables
X_test=pd.get_dummies(X_test,columns=cat_features_list)

In [24]:
df_diamonds_test['cut'] = df_diamonds_test['cut'].astype('category')
df_diamonds_test['color'] = df_diamonds_test['color'].astype('category')
df_diamonds_test['clarity'] = df_diamonds_test['clarity'].astype('category')

## Model definition

Tiene varios arboles en el que le va metiendo distintas samples de datos y distintas combinaciones de las features
si dejas crecer al arbol se sobreajusta
los arboles intentan reducir la varianza del error
- error bias: el modelo es muy simple y no 
- varianza en el error: 
- bootstrap = method for sampling data points: TRUE
    - with - bagging: reemplazando los que quitan 
    - pasting -  without replacement (): sin reemplazarlos
- ccp_alpha =
- criterion =
- max_depth = max number of levels in each decision tree
- max_features = max number of features considered for splitting a node
- max_leaf_nodes = maximo número de nodos de solución
- max_samples = 
- min_impurity_decrease
- min_samples_leaf = min number of data points allowed in a leaf node
- min_samples_split = min number of data points placed in a node before the node is split
- min_weight_fraction_leaf
- n_estimators = number of trees in the foreset
- n_jobs 
- oob_score 
- random_state
- verbose
- warm_start

- bootstrap = TRUE
- n_estimators = number of trees in the foreset
- max_depth = max number of levels in each decision tree
- max_features = max number of features considered for splitting a node

In [25]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [26]:
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [28]:
# 1. RandomForestRegressor 
model = RandomForestRegressor()
    
#bootstrap=True,max_depth=70,min_samples_leaf=4,min_samples_split=10,n_estimators=400)
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'sklearn.ensemble._forest.RandomForestRegressor'> 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 



In [27]:
# 1. XGBRegressor 
#model = XGBRegressor()
#hyperparameters = model.get_params()
#print(type(model), '\n')
#print('Model hyperparameters:', hyperparameters, '\n')

## Model training with validation

In [29]:
%%time
# Model training
model.fit(X_train, y_train)
print('Model:', model, '\n')
print('Model hyperparameters:', hyperparameters, '\n')

Model: RandomForestRegressor() 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 

CPU times: user 11.6 s, sys: 145 ms, total: 11.8 s
Wall time: 11.9 s


In [30]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)
print(type(y_pred_val))

<class 'numpy.ndarray'>
CPU times: user 223 ms, sys: 5.98 ms, total: 229 ms
Wall time: 231 ms


## Train validation

In [31]:
%%time
# Model predictions
y_pred_train = model.predict(X_train)
print(type(y_pred_train))

<class 'numpy.ndarray'>
CPU times: user 669 ms, sys: 11.9 ms, total: 681 ms
Wall time: 687 ms


In [32]:
%%time
# Model predictions
rmse_train = mean_squared_error(y_train, y_pred_train)**0.5
rmse_train

CPU times: user 1.5 ms, sys: 894 µs, total: 2.39 ms
Wall time: 1.6 ms


180.55304149260567

In [33]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.983639531093109

## Model validation

In [34]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)
print(type(y_pred_val))

<class 'numpy.ndarray'>
CPU times: user 203 ms, sys: 4.99 ms, total: 208 ms
Wall time: 208 ms


In [35]:
#444 con ciudad
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

442.5404068100221

In [36]:
r2r = r2_score(y_val, y_pred_val)
r2r

0.983639531093109

## Model training without validation

In [37]:
# Model training
model.fit(X, y)
print('Model:', model, '\n')
print('Model hyperparameters:', hyperparameters, '\n')

Model: RandomForestRegressor() 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 



## Test Preditions

In [38]:
predictions = model.predict(X_test)

In [39]:
predictions=pd.DataFrame(predictions)

In [40]:
predictions.reset_index(inplace=True)

In [41]:
predictions=predictions.rename({0: 'price','index': 'id'}, axis=1)

## Save Preditions

In [42]:
predictions.to_csv('../data/diamonds_predictions_RandomForestRegressor_with_depht_table.csv',index=False)