# imports

In [18]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

# leo diamantes.db

In [3]:
diamonds_train_sorted = pd.read_csv('../data/diamonds_train_sorted.csv')
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,x,y,z,depth,table
0,326,0.21,4,1,2,3.89,3.84,2.31,59.8,61.0
1,326,0.23,0,1,1,3.95,3.98,2.43,61.5,55.0
2,327,0.23,1,1,4,4.05,4.07,2.31,56.9,65.0
3,334,0.29,4,5,3,4.2,4.23,2.63,62.4,58.0
4,335,0.31,1,6,1,4.34,4.35,2.75,63.3,58.0


In [4]:
X = diamonds_train_sorted[['carat', 
                           'cut',
                           'color', 
                           'clarity', 
                           'x', 
                           'y', 
                           'z', 
                           'depth', 
                           'table']]

y = diamonds_train_sorted['price']

In [5]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (32364, 9), X_test: (8091, 9), y_train: (32364,), y_test: (8091,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


In [6]:
# Scaling standard
scaler = StandardScaler()
scaling_X_train = scaler.fit_transform(X_train)
scaling_X_test = scaler.fit_transform(X_test)
scaled_X_train = pd.DataFrame(scaling_X_train)
scaled_X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.849732,0.683603,1.417490,0.576792,0.937885,0.875555,0.983299,0.453617,-0.646673
1,-0.877712,-1.075585,0.241191,1.184174,-0.969548,-0.921217,-0.942680,0.104369,-0.825001
2,0.870798,-0.489189,0.829341,0.576792,0.884406,0.901595,1.055164,1.082265,0.244965
3,-1.046243,0.683603,-1.523256,0.576792,-1.317165,-1.268419,-1.273258,0.313918,0.244965
4,-0.182521,-1.075585,-0.935107,-0.637973,-0.006918,0.033589,0.034683,0.174219,-0.646673
...,...,...,...,...,...,...,...,...,...
32359,-0.814513,-1.075585,0.829341,-0.030591,-0.844763,-0.799696,-0.842069,-0.175030,-1.092492
32360,-0.814513,0.683603,-1.523256,0.576792,-0.862589,-0.808376,-0.842069,-0.035331,0.244965
32361,2.556109,-1.075585,1.417490,-0.637973,2.087694,1.969242,2.176256,0.802866,-1.092492
32362,-1.130509,-0.489189,0.829341,1.184174,-1.504343,-1.433340,-1.388242,1.082265,0.244965


In [7]:
scaled_X_test = pd.DataFrame(scaling_X_test)
scaled_X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.588782,-1.091792,-1.546628,-0.027480,-0.481229,-0.523623,-0.515923,-0.182890,-0.650885
1,-0.797596,-1.091792,0.213231,1.785053,-0.763314,-0.817053,-0.829217,-0.462234,-0.650885
2,-1.006411,-1.091792,-0.373389,-0.027480,-1.177627,-1.154942,-1.170991,-0.182890,-1.106511
3,2.564319,1.254629,-0.373389,-1.235835,2.128056,2.108352,2.004665,-0.601907,0.715994
4,-0.129390,1.254629,-0.373389,-0.631657,0.065310,0.072128,0.039460,-0.252726,0.715994
...,...,...,...,...,...,...,...,...,...
8086,-0.943767,0.668024,0.799850,-0.631657,-1.080660,-1.137158,-1.014345,0.934490,0.260367
8087,0.559699,1.254629,0.213231,-1.235835,0.823413,0.774581,0.623325,-1.300269,0.715994
8088,-0.567900,-1.091792,-0.960009,-0.027480,-0.498860,-0.470272,-0.544405,-0.601907,0.715994
8089,3.399577,1.254629,0.213231,-1.840013,2.471847,2.446241,2.431883,-0.043217,1.171620


In [8]:
# Scaling robust
scaler_r = RobustScaler()
scaling_X_train_r = scaler_r.fit_transform(X_train)
scaling_X_test_r = scaler_r.fit_transform(X_test)
scaled_X_train_r = pd.DataFrame(scaling_X_train_r)
scaled_X_train_r

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.781250,0.25,0.666667,0.5,0.595628,0.574586,0.625000,0.400000,-0.333333
1,-0.500000,-0.50,0.000000,1.0,-0.573770,-0.569061,-0.571429,0.066667,-0.466667
2,0.796875,-0.25,0.333333,0.5,0.562842,0.591160,0.669643,1.000000,0.333333
3,-0.625000,0.25,-1.000000,0.5,-0.786885,-0.790055,-0.776786,0.266667,0.333333
4,0.015625,-0.50,-0.666667,-0.5,0.016393,0.038674,0.035714,0.133333,-0.333333
...,...,...,...,...,...,...,...,...,...
32359,-0.453125,-0.50,0.333333,0.0,-0.497268,-0.491713,-0.508929,-0.200000,-0.666667
32360,-0.453125,0.25,-1.000000,0.5,-0.508197,-0.497238,-0.508929,-0.066667,0.333333
32361,2.046875,-0.50,0.666667,-0.5,1.300546,1.270718,1.366071,0.733333,-0.666667
32362,-0.687500,-0.25,0.333333,1.0,-0.901639,-0.895028,-0.848214,1.000000,0.333333


In [9]:
scaled_X_test_r = pd.DataFrame(scaling_X_test_r)
scaled_X_test_r

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.292308,-0.50,-1.000000,0.0,-0.275676,-0.309783,-0.304348,-0.214286,-0.333333
1,-0.446154,-0.50,0.000000,1.5,-0.448649,-0.489130,-0.495652,-0.500000,-0.333333
2,-0.600000,-0.50,-0.333333,0.0,-0.702703,-0.695652,-0.704348,-0.214286,-0.666667
3,2.030769,0.50,-0.333333,-1.0,1.324324,1.298913,1.234783,-0.642857,0.666667
4,0.046154,0.50,-0.333333,-0.5,0.059459,0.054348,0.034783,-0.285714,0.666667
...,...,...,...,...,...,...,...,...,...
8086,-0.553846,0.25,0.333333,-0.5,-0.643243,-0.684783,-0.608696,0.928571,0.333333
8087,0.553846,0.50,0.000000,-1.0,0.524324,0.483696,0.391304,-1.357143,0.666667
8088,-0.276923,-0.50,-0.666667,0.0,-0.286486,-0.277174,-0.321739,-0.642857,0.666667
8089,2.646154,0.50,0.000000,-1.5,1.535135,1.505435,1.495652,-0.071429,1.000000


In [21]:
# rf model
regressor = RandomForestRegressor()

hyperparameters = regressor.get_params()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [22]:
# MSE calculation
mean_squared_error(y_test, y_pred)

303982.08648635395

In [23]:
# RMSE calculation
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

551.3457050584088

In [24]:
# rf model Scaling standard
regressor = RandomForestRegressor()

hyperparameters = regressor.get_params()
regressor.fit(scaled_X_train, y_train)
y_pred = regressor.predict(scaled_X_test)

In [25]:
# MSE calculation
mean_squared_error(y_test, y_pred)

333329.9744454908

In [26]:
# RMSE calculation
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

577.3473603000976

In [27]:
# rf model Scaling robust train
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(scaled_X_train_r, y_train)

In [None]:
#Prediction
y_pred = regressor.predict(scaled_X_test_r)

In [28]:
# MSE calculation
mean_squared_error(y_test, y_pred)

359491.92175674625

In [29]:
# RMSE calculation
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

599.5764519698437