## Libraries import

In [141]:
import warnings
warnings.filterwarnings('ignore')

In [142]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#outliers
from sklearn.datasets import load_boston
#train test split
from sklearn.model_selection import train_test_split
#models
from sklearn.ensemble import RandomForestRegressor
#error
from sklearn.metrics import mean_squared_error

## Data import

In [143]:
# Impor train data
df_diamonds_train=pd.read_csv('../data/diamonds_train.csv')
df_diamonds_train.pop("Unnamed: 0")
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


In [144]:
# Import test data
df_diamonds_test=pd.read_csv('../data/diamonds_test.csv')
df_diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


## Data prep -  training model

In [145]:
#1.Excluding x y z and city
# Defining numerical and categorical features
num_features_list=['x','y','z','carat']
cat_features_list=['cut','color','clarity']
features_list=['x','y','z','carat','cut','color','clarity']

In [146]:
#2.Checking if 0 values
if 0 in df_diamonds_train[num_features_list]:
    print('0 values')
else:
    print('No 0 values')

No 0 values


In [147]:
#2.Checking if null values
df_diamonds_train[num_features_list].isna().sum()

x        0
y        0
z        0
carat    0
dtype: int64

In [148]:
#3.remove outliers
def remove_outliers(df,feature):
    # IQR
    Q1 = np.percentile(df[feature], 25,
                   interpolation = 'midpoint')
    Q3 = np.percentile(df[feature], 75,
                   interpolation = 'midpoint')
    IQR = Q3 - Q1
    # Upper and lower
    upper_limit=Q3+1.5*IQR
    lower_limit=Q1-1.5*IQR
    # Removing the Outliers
    return df[(df[feature]>=lower_limit) & (df[feature]<=upper_limit)]

In [149]:
df_diamonds_train=remove_outliers(df_diamonds_train,'x')
df_diamonds_train=remove_outliers(df_diamonds_train,'y')
df_diamonds_train=remove_outliers(df_diamonds_train,'z')
df_diamonds_train=remove_outliers(df_diamonds_train,'carat')

In [150]:
df_diamonds_train.shape

(39002, 12)

In [151]:
#4.convert to log scale
def convertlog(x):
    return np.log(x)
df_diamonds_train['x']=df_diamonds_train['x'].apply(convertlog)
df_diamonds_train['y']=df_diamonds_train['y'].apply(convertlog)
df_diamonds_train['z']=df_diamonds_train['z'].apply(convertlog)
df_diamonds_train['carat']=df_diamonds_train['carat'].apply(convertlog)

In [152]:
# Adapting categorical features for training model
X=df_diamonds_train[features_list]
    # One-hot encoding for categorical variables
X=pd.get_dummies(X,columns=cat_features_list)
    #Target
y=df_diamonds_train['price']
    # Splitting train and test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Data prep - validation model

In [153]:
# Adapting categorical features for validation model
#4.convert to log scale
df_diamonds_test['x']=df_diamonds_test['x'].apply(convertlog)
df_diamonds_test['y']=df_diamonds_test['y'].apply(convertlog)
df_diamonds_test['z']=df_diamonds_test['z'].apply(convertlog)
df_diamonds_test['carat']=df_diamonds_test['carat'].apply(convertlog)
X_test=df_diamonds_test[features_list]
    # One-hot encoding for categorical variables
X_test=pd.get_dummies(X_test,columns=cat_features_list)

## Model definition

In [154]:
# 1. RandomForestRegressor 
model = RandomForestRegressor()
hyperparameters = model.get_params()
print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'sklearn.ensemble._forest.RandomForestRegressor'> 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 



## Model training with validation

In [155]:
%%time
# Model training
model.fit(X_train, y_train)
print('Model:', model, '\n')
print('Model hyperparameters:', hyperparameters, '\n')

Model: RandomForestRegressor() 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 

CPU times: user 9.37 s, sys: 154 ms, total: 9.53 s
Wall time: 9.69 s


## Model validation

In [156]:
%%time
# Model predictions
y_pred_val = model.predict(X_val)
print(type(y_pred_val))

<class 'numpy.ndarray'>
CPU times: user 215 ms, sys: 4.54 ms, total: 220 ms
Wall time: 220 ms


In [157]:
rmse_val = mean_squared_error(y_val, y_pred_val)**0.5
rmse_val

458.7595942011633

## Model training without validation

In [158]:
# Model training
model.fit(X, y)
print('Model:', model, '\n')
print('Model hyperparameters:', hyperparameters, '\n')

Model: RandomForestRegressor() 

Model hyperparameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False} 



In [165]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   x              13485 non-null  float64
 1   y              13485 non-null  float64
 2   z              13485 non-null  float64
 3   carat          13485 non-null  float64
 4   cut_Fair       13485 non-null  uint8  
 5   cut_Good       13485 non-null  uint8  
 6   cut_Ideal      13485 non-null  uint8  
 7   cut_Premium    13485 non-null  uint8  
 8   cut_Very Good  13485 non-null  uint8  
 9   color_D        13485 non-null  uint8  
 10  color_E        13485 non-null  uint8  
 11  color_F        13485 non-null  uint8  
 12  color_G        13485 non-null  uint8  
 13  color_H        13485 non-null  uint8  
 14  color_I        13485 non-null  uint8  
 15  color_J        13485 non-null  uint8  
 16  clarity_I1     13485 non-null  uint8  
 17  clarity_IF     13485 non-null  uint8  
 18  clarit

## Test Preditions

In [159]:
predictions = model.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [63]:
predictions=pd.DataFrame(predictions)

In [64]:
predictions.reset_index(inplace=True)

In [65]:
predictions=predictions.rename({0: 'price','index': 'id'}, axis=1)

In [66]:
predictions['price'].min()

369.0

In [67]:
predictions['price'].max()

18158.92

## Save Preditions

In [207]:
predictions.to_csv('../data/diamonds_predictions_RandomForestRegressor.csv',index=False)