# Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

# Read diamonds

In [2]:
diamonds_train_sorted = pd.read_csv('../data/diamonds_train_sorted.csv')
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,4,1,2,59.8,61.0,3.89,3.84,2.31
1,326,0.23,0,1,1,61.5,55.0,3.95,3.98,2.43
2,327,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,334,0.29,4,5,3,62.4,58.0,4.2,4.23,2.63
4,335,0.31,1,6,1,63.3,58.0,4.34,4.35,2.75


# Feature engineering

In [3]:
diamonds_train_sorted['table_xy'] = (diamonds_train_sorted['table'].mean()*(diamonds_train_sorted['x']*diamonds_train_sorted['y']).mean()-diamonds_train_sorted['table']*(diamonds_train_sorted['x']*diamonds_train_sorted['y']))
diamonds_train_sorted['depth_z'] = (diamonds_train_sorted['depth'].mean()*diamonds_train_sorted['z'].mean()-diamonds_train_sorted['depth']*diamonds_train_sorted['z'])
diamonds_train_sorted['volume'] = diamonds_train_sorted['x']*diamonds_train_sorted['y']*diamonds_train_sorted['z']
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z,table_xy,depth_z,volume
0,326,0.21,4,1,2,59.8,61.0,3.89,3.84,2.31,1047.776468,80.291287,34.505856
1,326,0.23,0,1,1,61.5,55.0,3.95,3.98,2.43,1094.315068,68.984287,38.20203
2,327,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31,887.542568,86.990287,38.076885
3,334,0.29,4,5,3,62.4,58.0,4.2,4.23,2.63,928.542068,54.317287,46.72458
4,335,0.31,1,6,1,63.3,58.0,4.34,4.35,2.75,863.988068,44.354287,51.91725


# Model train split

In [4]:
X = diamonds_train_sorted[['carat', 
                           'cut',
                           'color', 
                           'clarity',
                           'table_xy', 
                           'depth_z',
                          'volume']]

y = diamonds_train_sorted['price']

In [5]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (32364, 7), X_test: (8091, 7), y_train: (32364,), y_test: (8091,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


# Train model

In [6]:
# rf model train
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(X_train, y_train)

# Predict model

In [7]:
# rf model predict
y_pred = regressor.predict(X_test)

# RMSE calculation

In [8]:
# RMSE calculation
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

551.8293055826337

# Model train 100 %

In [9]:
# rf model train
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(X, y)

# Read diamonds_test_processed

In [10]:
# Read diamonds_test_processed
diamonds_test_basic = pd.read_csv('../data/diamonds_test_processed.csv')
diamonds_test_basic.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.79,3,2,2,62.7,60.0,5.82,5.89,3.67
1,1.2,0,6,4,61.0,57.0,6.81,6.89,4.18
2,1.57,4,4,2,62.2,61.0,7.38,7.32,4.57
3,0.9,3,2,2,63.8,54.0,6.09,6.13,3.9
4,0.5,3,2,4,62.9,58.0,5.05,5.09,3.19


# Feature engineering

In [12]:
diamonds_test_basic['table_xy'] = (diamonds_test_basic['table'].mean()*(diamonds_test_basic['x']*diamonds_test_basic['y']).mean()-diamonds_test_basic['table']*(diamonds_test_basic['x']*diamonds_test_basic['y']))
diamonds_test_basic['depth_z'] = (diamonds_test_basic['depth'].mean()*diamonds_test_basic['z'].mean()-diamonds_test_basic['depth']*diamonds_test_basic['z'])
diamonds_test_basic['volume'] = diamonds_test_basic['x']*diamonds_test_basic['y']*diamonds_test_basic['z']
diamonds_test_basic.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,table_xy,depth_z,volume
0,0.79,3,2,2,62.7,60.0,5.82,5.89,3.67,-93.26473,-11.338107,125.806866
1,1.2,0,6,4,61.0,57.0,6.81,6.89,4.18,-710.96803,-36.209107,196.129362
2,1.57,4,4,2,62.2,61.0,7.38,7.32,4.57,-1331.79433,-65.483107,246.878712
3,0.9,3,2,2,63.8,54.0,6.09,6.13,3.9,-52.38853,-30.049107,145.59363
4,0.5,3,2,4,62.9,58.0,5.05,5.09,3.19,472.66227,18.119893,81.997355


# Predict diamonds_test_basic

In [13]:
# Define fearures
X_dtfe_2 = diamonds_test_basic[['carat', 
                           'cut',
                           'color', 
                           'clarity',
                           'table_xy', 
                           'depth_z',
                          'volume']]

In [14]:
# predict
y_pred_dtfe_2 = regressor.predict(X_dtfe_2)
y_pred_dtfe_2

array([2899.03      , 5469.34      , 9549.22      , ..., 3219.73833333,
       2147.69      ,  901.563     ])

# Create sample_submission

In [15]:
predictions_fe_2 = pd.DataFrame(y_pred_dtfe_2).rename(columns = {0:'price'})
predictions_fe_2.index.names = ['id']
predictions_fe_2

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,2899.030000
1,5469.340000
2,9549.220000
3,4072.063333
4,1716.790833
...,...
13480,1668.220000
13481,2393.230000
13482,3219.738333
13483,2147.690000


In [16]:
predictions_fe_2.to_csv('../predictions/predictions_fe_2.csv')