# imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

# Read diamonds

In [2]:
diamonds_train_sorted = pd.read_csv('../data/diamonds_train_sorted.csv')
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,4,1,2,59.8,61.0,3.89,3.84,2.31
1,326,0.23,0,1,1,61.5,55.0,3.95,3.98,2.43
2,327,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,334,0.29,4,5,3,62.4,58.0,4.2,4.23,2.63
4,335,0.31,1,6,1,63.3,58.0,4.34,4.35,2.75


# Model train split

In [3]:
X = diamonds_train_sorted[['carat', 
                           'cut',
                           'color', 
                           'clarity', 
                           'x', 
                           'y', 
                           'z', 
                           'depth', 
                           'table']]

y = diamonds_train_sorted['price']

In [4]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (32364, 9), X_test: (8091, 9), y_train: (32364,), y_test: (8091,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


# Scaling robust

In [5]:
# Scaling robust
scaler_r = RobustScaler()
scaling_X_train_r = scaler_r.fit_transform(X_train)
scaling_X_test_r = scaler_r.fit_transform(X_test)
scaled_X_train_r = pd.DataFrame(scaling_X_train_r)
scaled_X_train_r

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.781250,0.25,0.666667,0.5,0.595628,0.574586,0.625000,0.400000,-0.333333
1,-0.500000,-0.50,0.000000,1.0,-0.573770,-0.569061,-0.571429,0.066667,-0.466667
2,0.796875,-0.25,0.333333,0.5,0.562842,0.591160,0.669643,1.000000,0.333333
3,-0.625000,0.25,-1.000000,0.5,-0.786885,-0.790055,-0.776786,0.266667,0.333333
4,0.015625,-0.50,-0.666667,-0.5,0.016393,0.038674,0.035714,0.133333,-0.333333
...,...,...,...,...,...,...,...,...,...
32359,-0.453125,-0.50,0.333333,0.0,-0.497268,-0.491713,-0.508929,-0.200000,-0.666667
32360,-0.453125,0.25,-1.000000,0.5,-0.508197,-0.497238,-0.508929,-0.066667,0.333333
32361,2.046875,-0.50,0.666667,-0.5,1.300546,1.270718,1.366071,0.733333,-0.666667
32362,-0.687500,-0.25,0.333333,1.0,-0.901639,-0.895028,-0.848214,1.000000,0.333333


In [6]:
scaled_X_test_r = pd.DataFrame(scaling_X_test_r)
scaled_X_test_r

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.292308,-0.50,-1.000000,0.0,-0.275676,-0.309783,-0.304348,-0.214286,-0.333333
1,-0.446154,-0.50,0.000000,1.5,-0.448649,-0.489130,-0.495652,-0.500000,-0.333333
2,-0.600000,-0.50,-0.333333,0.0,-0.702703,-0.695652,-0.704348,-0.214286,-0.666667
3,2.030769,0.50,-0.333333,-1.0,1.324324,1.298913,1.234783,-0.642857,0.666667
4,0.046154,0.50,-0.333333,-0.5,0.059459,0.054348,0.034783,-0.285714,0.666667
...,...,...,...,...,...,...,...,...,...
8086,-0.553846,0.25,0.333333,-0.5,-0.643243,-0.684783,-0.608696,0.928571,0.333333
8087,0.553846,0.50,0.000000,-1.0,0.524324,0.483696,0.391304,-1.357143,0.666667
8088,-0.276923,-0.50,-0.666667,0.0,-0.286486,-0.277174,-0.321739,-0.642857,0.666667
8089,2.646154,0.50,0.000000,-1.5,1.535135,1.505435,1.495652,-0.071429,1.000000


# Train model

In [7]:
# rf model Scaling robust train
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(scaled_X_train_r, y_train)

# Predict model

In [8]:
#Prediction
y_pred = regressor.predict(scaled_X_test_r)

# RMSE calculation

In [11]:
# RMSE calculation
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse

609.2489291033345

# Model train 100%

In [12]:
# Scaling robust 100 % model 
scaler_rs = RobustScaler()
scaling_X_rs = scaler_rs.fit_transform(X)
scaled_X_rs = pd.DataFrame(scaling_X_rs)
scaled_X_rs

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.765625,0.50,-0.666667,-0.5,-0.983607,-1.027473,-1.075556,-1.333333,1.333333
1,-0.734375,-0.50,-0.666667,-1.0,-0.950820,-0.950549,-0.968889,-0.200000,-0.666667
2,-0.734375,-0.25,-0.666667,0.5,-0.896175,-0.901099,-1.075556,-3.266667,2.666667
3,-0.640625,0.50,0.666667,0.0,-0.814208,-0.813187,-0.791111,0.400000,0.333333
4,-0.609375,-0.25,1.000000,-1.0,-0.737705,-0.747253,-0.684444,1.000000,0.333333
...,...,...,...,...,...,...,...,...,...
40450,2.093750,0.50,0.333333,-0.5,1.464481,1.412088,1.173333,-2.466667,1.000000
40451,2.484375,0.50,0.666667,-0.5,1.546448,1.505495,1.528889,0.000000,0.666667
40452,1.265625,-0.50,0.000000,2.0,0.918033,0.934066,0.924444,-0.066667,-0.666667
40453,2.031250,0.25,0.000000,-0.5,1.207650,1.241758,1.351111,1.133333,-0.333333


In [13]:
# Train 100% model
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(X, y)

# Read diamonds_test_processed

In [14]:
# Read diamonds_test_processed
diamonds_test_basic = pd.read_csv('../data/diamonds_test_processed.csv')
diamonds_test_basic.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.79,3,2,2,62.7,60.0,5.82,5.89,3.67
1,1.2,0,6,4,61.0,57.0,6.81,6.89,4.18
2,1.57,4,4,2,62.2,61.0,7.38,7.32,4.57
3,0.9,3,2,2,63.8,54.0,6.09,6.13,3.9
4,0.5,3,2,4,62.9,58.0,5.05,5.09,3.19


# Predict diamonds_test_scaled_r

In [15]:
# Define fearures
X_dtsr = diamonds_test_basic[["carat", 
                         "cut", 
                         "color", 
                         "clarity", 
                         "depth", 
                         "table", 
                         "x", 
                         "y", 
                         "z"]]

# Scaling diamonds_test_scaled_s

In [18]:
# Scaling model 
scaler_dtsr = RobustScaler()
scaling_X_dtsr = scaler_dtsr.fit_transform(X_dtsr)
scaled_X_dtsr = pd.DataFrame(scaling_X_dtsr)
scaled_X_dtsr

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.140625,0.25,-0.333333,-0.5,0.533333,1.000000,0.066667,0.094444,0.125000
1,0.781250,-0.50,1.000000,0.5,-0.600000,0.000000,0.616667,0.650000,0.580357
2,1.359375,0.50,0.333333,-0.5,0.200000,1.333333,0.933333,0.888889,0.928571
3,0.312500,0.25,-0.333333,-0.5,1.266667,-1.000000,0.216667,0.227778,0.330357
4,-0.312500,0.25,-0.333333,0.5,0.666667,0.333333,-0.361111,-0.350000,-0.303571
...,...,...,...,...,...,...,...,...,...
13480,-0.203125,-0.50,-0.666667,-0.5,0.000000,-0.333333,-0.194444,-0.222222,-0.205357
13481,0.015625,-0.50,0.666667,0.0,0.200000,-0.666667,0.005556,0.005556,0.026786
13482,0.000000,-0.50,-0.333333,0.5,-0.200000,-0.666667,0.027778,-0.005556,0.000000
13483,0.000000,0.25,-0.333333,-1.0,-2.066667,0.000000,0.083333,0.094444,-0.071429


In [19]:
# predict
y_pred_dtsr = regressor.predict(scaled_X_dtsr)
y_pred_dtsr



array([ 654.49, 1691.97, 5056.87, ...,  654.49,  654.49,  654.08])

# Create sample_submission

In [20]:
predictions_sr = pd.DataFrame(y_pred_dtsr).rename(columns = {0:'price'})
predictions_sr.index.names = ['id']
predictions_sr

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,654.49
1,1691.97
2,5056.87
3,750.96
4,654.49
...,...
13480,654.49
13481,654.08
13482,654.49
13483,654.49


In [21]:
predictions_sr.to_csv('../predictions/predictions_sr.csv')