# Imports

In [None]:
#!pip install catboost

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
import catboost as cb

# Read diamonds

In [2]:
diamonds_train_sorted = pd.read_csv('../data/diamonds_train_sorted.csv')
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,4,1,2,59.8,61.0,3.89,3.84,2.31
1,326,0.23,0,1,1,61.5,55.0,3.95,3.98,2.43
2,327,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,334,0.29,4,5,3,62.4,58.0,4.2,4.23,2.63
4,335,0.31,1,6,1,63.3,58.0,4.34,4.35,2.75


# Feature engineering

In [3]:
#diamonds_train_sorted['table_xy'] = (diamonds_train_sorted['table'].mean()*(diamonds_train_sorted['x']*diamonds_train_sorted['y']).mean()-diamonds_train_sorted['table']*(diamonds_train_sorted['x']*diamonds_train_sorted['y']))
#diamonds_train_sorted['depth_z'] = (diamonds_train_sorted['depth'].mean()*diamonds_train_sorted['z'].mean()-diamonds_train_sorted['depth']*diamonds_train_sorted['z'])
diamonds_train_sorted['volume'] = diamonds_train_sorted['x']*diamonds_train_sorted['y']*diamonds_train_sorted['z']
diamonds_train_sorted['super_feature'] = diamonds_train_sorted['carat'] / diamonds_train_sorted['table'] * diamonds_train_sorted['depth']
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z,volume,super_feature
0,326,0.21,4,1,2,59.8,61.0,3.89,3.84,2.31,34.505856,0.205869
1,326,0.23,0,1,1,61.5,55.0,3.95,3.98,2.43,38.20203,0.257182
2,327,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31,38.076885,0.201338
3,334,0.29,4,5,3,62.4,58.0,4.2,4.23,2.63,46.72458,0.312
4,335,0.31,1,6,1,63.3,58.0,4.34,4.35,2.75,51.91725,0.338328


In [4]:
diamonds_train = diamonds_train_sorted[['price', 'carat', 'cut', 'color', 
                                        'clarity', 'depth', 'table', 'volume', 'super_feature']]
diamonds_train

Unnamed: 0,price,carat,cut,color,clarity,depth,table,volume,super_feature
0,326,0.21,4,1,2,59.8,61.0,34.505856,0.205869
1,326,0.23,0,1,1,61.5,55.0,38.202030,0.257182
2,327,0.23,1,1,4,56.9,65.0,38.076885,0.201338
3,334,0.29,4,5,3,62.4,58.0,46.724580,0.312000
4,335,0.31,1,6,1,63.3,58.0,51.917250,0.338328
...,...,...,...,...,...,...,...,...,...
40426,18795,2.04,4,4,2,58.1,60.0,335.429424,1.975400
40427,18797,2.29,4,5,2,61.8,59.0,377.248560,2.398678
40428,18806,1.51,0,3,7,61.7,55.0,249.029352,1.693945
40429,18818,2.00,3,3,2,63.5,56.0,317.333520,2.267857


# Model train split

In [5]:
X = diamonds_train_sorted[['carat', 
                           'cut',
                           'color', 
                           'clarity',
                           'depth', 
                           'table',
                          'volume',
                          'super_feature']]

y = diamonds_train_sorted['price']

In [6]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (32344, 8), X_test: (8087, 8), y_train: (32344,), y_test: (8087,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


# Scaling robust

In [7]:
# Scaling robust
scaler_r = RobustScaler()
scaling_X_train_r = scaler_r.fit_transform(X_train)
scaling_X_test_r = scaler_r.fit_transform(X_test)
scaled_X_train_r = pd.DataFrame(scaling_X_train_r)
scaled_X_train_r

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.000000,-0.50,-1.000000,-1.0,-1.000000,-0.333333,0.000373,0.002643
1,0.312500,-0.50,-0.333333,0.0,-0.866667,-1.000000,0.276189,0.360183
2,0.062500,-0.50,-0.333333,-1.0,0.133333,-0.333333,0.060630,0.094682
3,0.640625,-0.50,0.666667,0.0,0.266667,-0.333333,0.648825,0.675557
4,0.000000,-0.50,0.666667,-0.5,0.066667,0.333333,-0.017422,-0.006740
...,...,...,...,...,...,...,...,...
32339,-0.453125,-0.50,0.333333,0.0,-0.200000,-0.666667,-0.442785,-0.412073
32340,-0.453125,0.25,-1.000000,0.5,-0.066667,0.333333,-0.446794,-0.443391
32341,1.250000,-0.50,-0.333333,0.0,-1.000000,0.000000,1.203275,1.172648
32342,-0.687500,-0.25,0.333333,1.0,1.000000,0.333333,-0.683396,-0.657418


In [8]:
scaled_X_test_r = pd.DataFrame(scaling_X_test_r)
scaled_X_test_r

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.578125,0.25,-0.333333,-1.0,-1.600000,1.000000,0.613341,0.425316
1,2.062500,-0.25,0.333333,-1.0,-1.066667,-0.666667,1.973366,2.036399
2,0.312500,0.50,-0.333333,-1.0,-1.133333,0.666667,0.316861,0.225945
3,-0.453125,-0.50,-0.333333,1.5,0.133333,-0.666667,-0.451980,-0.409990
4,1.312500,-0.50,0.000000,1.0,0.466667,0.000000,1.236733,1.305636
...,...,...,...,...,...,...,...,...
8082,0.015625,0.25,0.000000,0.0,0.200000,1.666667,-0.008740,-0.061716
8083,-0.453125,0.25,-0.666667,-0.5,-1.200000,0.333333,-0.443036,-0.463210
8084,0.859375,0.50,0.666667,0.5,-0.933333,0.666667,0.847989,0.733578
8085,0.531250,-0.50,0.000000,0.5,-0.400000,0.333333,0.513908,0.478585


# Train model RandomForestRegressor

In [9]:
# rf model Scaling robust train
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(scaled_X_train_r, y_train)

# Predict model

In [10]:
#Prediction
y_pred = regressor.predict(scaled_X_test_r)

# RMSE calculation

In [11]:
# RMSE calculation
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

541.2616710772165

# Model train 100 %

In [12]:
# Scaling robust 100 % model 
scaler_rs = RobustScaler()
scaling_X_rs = scaler_rs.fit_transform(X)
scaled_X_rs = pd.DataFrame(scaling_X_rs)
scaled_X_rs

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.765625,0.50,-0.666667,-0.5,-1.333333,1.333333,-0.757398,-0.766940
1,-0.734375,-0.50,-0.666667,-1.0,-0.200000,-0.666667,-0.722497,-0.694947
2,-0.734375,-0.25,-0.666667,0.5,-3.266667,2.666667,-0.723678,-0.773296
3,-0.640625,0.50,0.666667,0.0,0.400000,0.333333,-0.642022,-0.618035
4,-0.609375,-0.25,1.000000,-1.0,1.000000,0.333333,-0.592990,-0.581097
...,...,...,...,...,...,...,...,...
40426,2.093750,0.50,0.333333,-0.5,-2.466667,1.000000,2.084093,1.715766
40427,2.484375,0.50,0.666667,-0.5,0.000000,0.666667,2.478973,2.309638
40428,1.265625,-0.50,0.000000,2.0,-0.066667,-0.666667,1.268255,1.320877
40429,2.031250,0.25,0.000000,-0.5,1.133333,-0.333333,1.913221,2.126092


In [13]:
# Train 100% model
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(scaled_X_rs, y)

In [None]:
"""
model = cb.CatBoostRegressor(loss_function='RMSE')
"""

In [None]:
"""
# rf model train
grid_b = {'iterations': [100, 150, 500],
        'learning_rate': [0.03, 0.1, 0.3],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid_b, train_dataset_100)
"""

# Read diamonds_test_processed

In [14]:
# Read diamonds_test_processed
diamonds_test_basic = pd.read_csv('../data/diamonds_test_processed.csv')
diamonds_test_basic.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.79,3,2,2,62.7,60.0,5.82,5.89,3.67
1,1.2,0,6,4,61.0,57.0,6.81,6.89,4.18
2,1.57,4,4,2,62.2,61.0,7.38,7.32,4.57
3,0.9,3,2,2,63.8,54.0,6.09,6.13,3.9
4,0.5,3,2,4,62.9,58.0,5.05,5.09,3.19


# Feature engineering

In [15]:
#diamonds_test_basic['table_xy'] = (diamonds_test_basic['table'].mean()*(diamonds_test_basic['x']*diamonds_test_basic['y']).mean()-diamonds_test_basic['table']*(diamonds_test_basic['x']*diamonds_test_basic['y']))
#diamonds_test_basic['depth_z'] = (diamonds_test_basic['depth'].mean()*diamonds_test_basic['z'].mean()-diamonds_test_basic['depth']*diamonds_test_basic['z'])
diamonds_test_basic['volume'] = diamonds_test_basic['x']*diamonds_test_basic['y']*diamonds_test_basic['z']
diamonds_test_basic['super_feature'] = diamonds_test_basic['carat'] / diamonds_test_basic['table'] * diamonds_test_basic['depth']
diamonds_test_basic.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,volume,super_feature
0,0.79,3,2,2,62.7,60.0,5.82,5.89,3.67,125.806866,0.82555
1,1.2,0,6,4,61.0,57.0,6.81,6.89,4.18,196.129362,1.284211
2,1.57,4,4,2,62.2,61.0,7.38,7.32,4.57,246.878712,1.600885
3,0.9,3,2,2,63.8,54.0,6.09,6.13,3.9,145.59363,1.063333
4,0.5,3,2,4,62.9,58.0,5.05,5.09,3.19,81.997355,0.542241


# Predict diamonds_test_scaled_r

In [16]:
# Define fearures
X_dtfe_2 = diamonds_test_basic[['carat', 
                           'cut',
                           'color', 
                           'clarity',
                           'depth', 
                           'table',
                          'volume',
                          'super_feature']]

# Scaling diamonds_test_scaled_sr

In [17]:
# Scaling model 
scaler_dtsr = RobustScaler()
scaling_X_dtsr = scaler_dtsr.fit_transform(X_dtfe_2)
scaled_X_dtsr = pd.DataFrame(scaling_X_dtsr)
scaled_X_dtsr

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.140625,0.25,-0.333333,-0.5,0.533333,1.000000,0.101505,0.097553
1,0.781250,-0.50,1.000000,0.5,-0.600000,0.000000,0.770379,0.745457
2,1.359375,0.50,0.333333,-0.5,0.200000,1.333333,1.253082,1.192792
3,0.312500,0.25,-0.333333,-0.5,1.266667,-1.000000,0.289708,0.433446
4,-0.312500,0.25,-0.333333,0.5,0.666667,0.333333,-0.315189,-0.302650
...,...,...,...,...,...,...,...,...
13480,-0.203125,-0.50,-0.666667,-0.5,0.000000,-0.333333,-0.201743,-0.178606
13481,0.015625,-0.50,0.666667,0.0,0.200000,-0.666667,0.012766,0.065621
13482,0.000000,-0.50,-0.333333,0.5,-0.200000,-0.666667,0.007264,0.038859
13483,0.000000,0.25,-0.333333,-1.0,-2.066667,0.000000,0.035572,-0.048574


In [18]:
# predict
y_pred_dtsr = regressor.predict(scaled_X_dtsr)
y_pred_dtsr

array([2963.98      , 5487.88833333, 9429.99333333, ..., 3236.52      ,
       2146.90333333,  949.73533333])

# Create sample_submission

In [19]:
predictions_fe_2 = pd.DataFrame(y_pred_dtsr).rename(columns = {0:'price'})
predictions_fe_2.index.names = ['id']
predictions_fe_2

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,2963.980000
1,5487.888333
2,9429.993333
3,4031.575000
4,1703.956667
...,...
13480,1594.870000
13481,2342.560000
13482,3236.520000
13483,2146.903333


In [20]:
predictions_fe_2.to_csv('../predictions/predictions_RandomForest_sr.csv')