# imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

# Read diamonds

In [2]:
diamonds_train_sorted = pd.read_csv('../data/diamonds_train_sorted.csv')
diamonds_train_sorted.head()

Unnamed: 0,price,carat,cut,color,clarity,depth,table,x,y,z
0,326,0.21,4,1,2,59.8,61.0,3.89,3.84,2.31
1,326,0.23,0,1,1,61.5,55.0,3.95,3.98,2.43
2,327,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31
3,334,0.29,4,5,3,62.4,58.0,4.2,4.23,2.63
4,335,0.31,1,6,1,63.3,58.0,4.34,4.35,2.75


# Model train split

In [3]:
X = diamonds_train_sorted[['carat', 
                           'cut',
                           'color', 
                           'clarity', 
                           'x', 
                           'y', 
                           'z', 
                           'depth', 
                           'table']]

y = diamonds_train_sorted['price']

In [4]:
# Train + test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
print(f"X_train: {X_trainaa.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"X_train: {type(X_train)}, X_test: {type(X_test)}, y_train: {type(y_train)}, y_test: {type(y_test)}")

X_train: (32364, 9), X_test: (8091, 9), y_train: (32364,), y_test: (8091,)
X_train: <class 'pandas.core.frame.DataFrame'>, X_test: <class 'pandas.core.frame.DataFrame'>, y_train: <class 'pandas.core.series.Series'>, y_test: <class 'pandas.core.series.Series'>


# Scaling standard

In [9]:
# Scaling standard
scaler = StandardScaler()
scaling_X_train = scaler.fit_transform(X_train)
scaling_X_test = scaler.fit_transform(X_test)
scaled_X_train = pd.DataFrame(scaling_X_train)
scaled_X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.849732,0.683603,1.417490,0.576792,0.937885,0.875555,0.983299,0.453617,-0.646673
1,-0.877712,-1.075585,0.241191,1.184174,-0.969548,-0.921217,-0.942680,0.104369,-0.825001
2,0.870798,-0.489189,0.829341,0.576792,0.884406,0.901595,1.055164,1.082265,0.244965
3,-1.046243,0.683603,-1.523256,0.576792,-1.317165,-1.268419,-1.273258,0.313918,0.244965
4,-0.182521,-1.075585,-0.935107,-0.637973,-0.006918,0.033589,0.034683,0.174219,-0.646673
...,...,...,...,...,...,...,...,...,...
32359,-0.814513,-1.075585,0.829341,-0.030591,-0.844763,-0.799696,-0.842069,-0.175030,-1.092492
32360,-0.814513,0.683603,-1.523256,0.576792,-0.862589,-0.808376,-0.842069,-0.035331,0.244965
32361,2.556109,-1.075585,1.417490,-0.637973,2.087694,1.969242,2.176256,0.802866,-1.092492
32362,-1.130509,-0.489189,0.829341,1.184174,-1.504343,-1.433340,-1.388242,1.082265,0.244965


In [10]:
scaled_X_test = pd.DataFrame(scaling_X_test)
scaled_X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.588782,-1.091792,-1.546628,-0.027480,-0.481229,-0.523623,-0.515923,-0.182890,-0.650885
1,-0.797596,-1.091792,0.213231,1.785053,-0.763314,-0.817053,-0.829217,-0.462234,-0.650885
2,-1.006411,-1.091792,-0.373389,-0.027480,-1.177627,-1.154942,-1.170991,-0.182890,-1.106511
3,2.564319,1.254629,-0.373389,-1.235835,2.128056,2.108352,2.004665,-0.601907,0.715994
4,-0.129390,1.254629,-0.373389,-0.631657,0.065310,0.072128,0.039460,-0.252726,0.715994
...,...,...,...,...,...,...,...,...,...
8086,-0.943767,0.668024,0.799850,-0.631657,-1.080660,-1.137158,-1.014345,0.934490,0.260367
8087,0.559699,1.254629,0.213231,-1.235835,0.823413,0.774581,0.623325,-1.300269,0.715994
8088,-0.567900,-1.091792,-0.960009,-0.027480,-0.498860,-0.470272,-0.544405,-0.601907,0.715994
8089,3.399577,1.254629,0.213231,-1.840013,2.471847,2.446241,2.431883,-0.043217,1.171620


# Train model

In [11]:
# rf model
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(scaled_X_train, y_train)

# Predict model

In [12]:
#Predict model
y_pred = regressor.predict(scaled_X_test)

# RMSE calculation

In [13]:
# RMSE calculation
rmse = mean_squared_error(y_test, y_pred, squared = False)
rmse

573.9043279585525

# Model train 100%

In [19]:
# Scaling standard 100 % model 
scaler = StandardScaler()
scaling_X = scaler.fit_transform(X)
scaled_X = pd.DataFrame(scaling_X)
scaled_X

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-1.235874,1.266901,-0.940041,-0.636703,-1.635831,-1.650759,-1.760488,-1.363994,1.591160
1,-1.193817,-1.078804,-0.940041,-1.243440,-1.582471,-1.528662,-1.588335,-0.176601,-1.095198
2,-1.193817,-0.492378,-0.940041,0.576771,-1.493538,-1.450172,-1.760488,-3.389548,3.382065
3,-1.067644,1.266901,1.411187,-0.029966,-1.360138,-1.310633,-1.301413,0.452019,0.247981
4,-1.025586,-0.492378,1.998994,-1.243440,-1.235631,-1.205979,-1.129259,1.080639,0.247981
...,...,...,...,...,...,...,...,...,...
40450,2.612396,1.266901,0.823380,-0.636703,2.348379,2.221439,1.869078,-2.551388,1.143433
40451,3.138116,1.266901,1.411187,-0.636703,2.481779,2.369699,2.442922,0.032939,0.695707
40452,1.497870,-1.078804,0.235573,2.396982,1.459046,1.462698,1.467387,-0.036908,-1.095198
40453,2.528281,0.680475,0.235573,-0.636703,1.930393,1.951083,2.156000,1.220332,-0.647472


In [20]:
# Train 100% model
regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(X, y)

# Read diamonds_test_processed

In [21]:
# Read diamonds_test_processed
diamonds_test_basic = pd.read_csv('../data/diamonds_test_processed.csv')
diamonds_test_basic.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.79,3,2,2,62.7,60.0,5.82,5.89,3.67
1,1.2,0,6,4,61.0,57.0,6.81,6.89,4.18
2,1.57,4,4,2,62.2,61.0,7.38,7.32,4.57
3,0.9,3,2,2,63.8,54.0,6.09,6.13,3.9
4,0.5,3,2,4,62.9,58.0,5.05,5.09,3.19


# Predict diamonds_test_scaled_s

In [24]:
# Define fearures
X_dtss = diamonds_test_basic[["carat", 
                         "cut", 
                         "color", 
                         "clarity", 
                         "depth", 
                         "table", 
                         "x", 
                         "y", 
                         "z"]]

# Scaling diamonds_test_scaled_s

In [26]:
# Scaling model 
scaler_dtss = StandardScaler()
scaling_X_dtss = scaler_dtss.fit_transform(X_dtss)
scaled_X_dtss = pd.DataFrame(scaling_X_dtss)
scaled_X_dtss

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,-0.018412,0.665359,-0.340530,-0.642288,0.669500,1.121874,0.075022,0.133236,0.173091
1,0.855078,-1.089226,2.011650,0.574266,-0.514957,-0.219192,0.964007,1.019395,0.870787
2,1.643349,1.250220,0.835560,-0.642288,0.321131,1.568896,1.475847,1.400444,1.404319
3,0.215939,0.665359,-0.340530,-0.642288,1.435914,-1.560258,0.317472,0.345914,0.487738
4,-0.636246,0.665359,-0.340530,0.574266,0.808848,0.227830,-0.616411,-0.575691,-0.483564
...,...,...,...,...,...,...,...,...,...
13480,-0.487114,-1.089226,-0.928574,-0.642288,0.112109,-0.666214,-0.347022,-0.371875,-0.333080
13481,-0.188849,-1.089226,1.423605,-0.034011,0.321131,-1.113236,-0.023755,-0.008549,0.022608
13482,-0.210153,-1.089226,-0.340530,0.574266,-0.096913,-1.113236,0.012164,-0.026273,-0.018433
13483,-0.210153,0.665359,-0.340530,-1.250565,-2.047784,-0.219192,0.101960,0.133236,-0.127876


In [27]:
# predict
y_pred_dtss = regressor.predict(scaled_X_dtss)
y_pred_dtss



array([ 555.69, 2201.96, 4943.45, ...,  555.91,  555.69,  553.62])

# Create sample_submission

In [28]:
predictions_ss = pd.DataFrame(y_pred_dtss).rename(columns = {0:'price'})
predictions_ss.index.names = ['id']
predictions_ss

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,555.69
1,2201.96
2,4943.45
3,560.30
4,555.69
...,...
13480,555.91
13481,553.62
13482,555.91
13483,555.69


In [29]:
predictions_ss.to_csv('../predictions/predictions_ss.csv')