### _Imports_

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model  import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

### Create train-test sets

In [2]:
# Read the already one-hot-encoded training table and preview its first rows.
baseline_train = pd.read_csv(filepath_or_buffer='../data/baseline_train.csv')
baseline_train.head(n=5)

Unnamed: 0,price,carat,depth,table,x,y,z,city_Antwerp,city_Dubai,city_Kimberly,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,4268,1.21,62.4,58.0,6.83,6.79,4.25,0,1,0,...,0,0,1,0,0,0,0,1,0,0
1,505,0.32,63.0,57.0,4.35,4.38,2.75,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,2686,0.71,65.5,55.0,5.62,5.53,3.65,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,738,0.41,63.8,56.0,4.68,4.72,3.0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,4882,1.02,60.5,59.0,6.55,6.51,3.95,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [3]:
# Features are every column except the target.
X = baseline_train.drop(columns=['price'])
# Double brackets keep the target as a one-column DataFrame rather than a Series.
y = baseline_train[['price']]

In [4]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Display the training feature matrix produced by train_test_split.
X_train

Unnamed: 0,carat,depth,table,x,y,z,city_Antwerp,city_Dubai,city_Kimberly,city_Las Vegas,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
32121,0.52,63.2,58.0,5.12,5.10,3.23,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9831,1.59,59.9,59.0,7.60,7.52,4.53,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
33128,0.66,61.7,55.0,5.64,5.60,3.47,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
6199,0.38,61.2,55.1,4.69,4.73,2.88,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
19661,0.70,61.8,58.0,5.67,5.63,3.49,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.71,62.0,57.0,5.71,5.75,3.55,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
11284,0.35,59.5,58.0,4.62,4.59,2.74,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
38158,0.23,59.4,59.0,4.03,4.08,2.41,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
860,1.00,58.0,58.0,6.56,6.62,3.82,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0


In [6]:
# Display the held-out feature matrix produced by train_test_split.
X_test

Unnamed: 0,carat,depth,table,x,y,z,city_Antwerp,city_Dubai,city_Kimberly,city_Las Vegas,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
17775,0.70,63.8,58.0,5.58,5.61,3.57,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
13506,0.61,61.3,54.0,5.53,5.50,3.38,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4325,0.33,61.6,55.0,4.46,4.47,2.75,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
37870,1.00,58.6,61.0,6.53,6.50,3.82,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
21321,0.29,62.7,61.0,4.20,4.22,2.64,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3781,1.03,63.1,57.0,6.45,6.41,4.06,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
26959,0.32,62.7,54.0,4.39,4.35,2.74,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
15529,0.71,61.3,57.0,5.74,5.78,3.53,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
36333,0.90,59.6,63.0,6.24,6.17,3.70,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


### Select Model

In [7]:
# Fit a plain linear regression on the training split (fit() returns the
# estimator itself), then predict prices for the held-out split.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

### Visual check

In [8]:
# Give the true prices a fresh 0..n-1 index so they line up row-by-row with the
# prediction frame, whose index is also positional.
ground_truth = y_test.reset_index(drop=True)
predictions_df = pd.DataFrame(data=predictions)

In [9]:
# Put truth and prediction side by side; the prediction column is labelled 0
# because predictions_df was built without column names.
check = pd.concat([ground_truth, predictions_df], axis='columns')
# Signed error: actual price minus predicted price.
check['difference'] = check['price'].sub(check[0])
check

Unnamed: 0,price,0,difference
0,2970,3592.966083,-622.966083
1,3004,3180.816135,-176.816135
2,838,1303.984829,-465.984829
3,6468,6132.377051,335.622949
4,633,801.154705,-168.154705
...,...,...,...
8086,4764,5485.230912,-721.230912
8087,756,360.328256,395.671744
8088,2690,3346.974806,-656.974806
8089,3992,4535.724524,-543.724524


### Error evaluation

In [10]:
# Score the validation predictions against the held-out prices.
rmse = root_mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# '\n' is the newline escape; the previous '/n' was printed literally after the
# RMSE value. The trailing newline leaves a blank line between the two metrics.
print(f'rmse: {rmse}\n')
print(f'r2: {r2}')

rmse: 1124.8553834499119/n
r2: 0.9223017804520173


## Prediction

In [11]:
# Read the raw (not yet encoded) test table and preview its first rows.
diamonds_test = pd.read_csv(filepath_or_buffer='../data/diamonds_test.csv')
diamonds_test.head(n=5)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.2,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.9,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.9,Kimberly
4,4,0.5,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam


### Encoding

In [12]:
# Categorical columns that are expanded into 0/1 indicator columns.
cat_var_lst = ["city", "cut", "color", "clarity"]

# Split the frame into the part left untouched and the part to encode.
non_cat_vars = diamonds_test.drop(columns=cat_var_lst)
cat_vars = diamonds_test[cat_var_lst]

# drop_first=True omits the first level of each variable.
# NOTE(review): the test set is encoded on its own here; this presumably relies
# on it containing the same category levels as the training data so the dummy
# columns match what the model was fitted on -- confirm.
cat_vars_encoded = pd.get_dummies(data=cat_vars, drop_first=True, dtype=int)

# Re-attach the encoded indicators to the untouched columns.
baseline_test = pd.concat(objs=[non_cat_vars, cat_vars_encoded], axis='columns')

In [13]:
# Remove the row identifier so it is not passed to the model as a feature.
# errors='ignore' keeps this self-referential cell re-runnable: without it a
# second execution raises KeyError because 'id' has already been dropped.
baseline_test = baseline_test.drop(columns='id', errors='ignore')

In [14]:
# Display the encoded test features after the 'id' column has been dropped.
baseline_test

Unnamed: 0,carat,depth,table,x,y,z,city_Antwerp,city_Dubai,city_Kimberly,city_Las Vegas,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.79,62.7,60.0,5.82,5.89,3.67,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1.20,61.0,57.0,6.81,6.89,4.18,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,1.57,62.2,61.0,7.38,7.32,4.57,0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
3,0.90,63.8,54.0,6.09,6.13,3.90,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0.50,62.9,58.0,5.05,5.09,3.19,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,0.57,61.9,56.0,5.35,5.32,3.30,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13481,0.71,62.2,55.0,5.71,5.73,3.56,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
13482,0.70,61.6,55.0,5.75,5.71,3.53,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
13483,0.70,58.8,57.0,5.85,5.89,3.45,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Predict

In [15]:
# Predict prices for the encoded test set with the fitted model.
# Note: this rebinds `predictions`, replacing the validation-split predictions
# computed when the model was fitted.
predictions = model.predict(X=baseline_test)

In [16]:
# Display the array returned by model.predict for baseline_test.
predictions

array([[ 3663.15318434],
       [ 6301.51625872],
       [10100.28314186],
       ...,
       [ 3902.53069612],
       [ 1936.88218113],
       [  713.46238336]])

### Save

In [17]:
# Wrap the prediction array in a DataFrame; with no names given, its single column is labelled 0.
predictions_df = pd.DataFrame(data=predictions)

In [18]:
# Pair each test-set id with its predicted price, aligned on the row index.
pred_baseline = pd.concat(
    objs=[diamonds_test['id'], predictions_df],
    axis='columns',
)

In [19]:
# Give the prediction column (labelled 0 by the DataFrame constructor) its
# proper name. Assigning the returned frame replaces the in-place mutation.
pred_baseline = pred_baseline.rename(columns={0: 'price'})

In [20]:
# Display the id / predicted-price table before it is written to disk.
pred_baseline

Unnamed: 0,id,price
0,0,3663.153184
1,1,6301.516259
2,2,10100.283142
3,3,4662.073851
4,4,2146.382485
...,...,...
13480,13480,2001.048670
13481,13481,2503.543035
13482,13482,3902.530696
13483,13483,1936.882181


In [21]:
# Write the id/price table to CSV; index=False leaves the row index out of the file.
pred_baseline.to_csv(
    path_or_buf='../predictions/pred_baseline.csv',
    index=False,
)