In [69]:
# imports 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [52]:
# Import the training data and remove the "Unnamed: 0" column
# (presumably a saved row index from an earlier export -- confirm against the CSV)
df_diamonds_train = pd.read_csv('../data/diamonds_train.csv').drop("Unnamed: 0", axis=1)
df_diamonds_train

Unnamed: 0,index_id,depth,table,x,y,z,price,carat,cut,color,clarity,city
0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...,62.4,58.0,6.83,6.79,4.25,4268,1.21,Premium,J,VS2,Dubai
1,6b86b273ff34fce19d6b804eff5a3f5747ada4eaa22f1d...,63.0,57.0,4.35,4.38,2.75,505,0.32,Very Good,H,VS2,Kimberly
2,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,65.5,55.0,5.62,5.53,3.65,2686,0.71,Fair,G,VS1,Las Vegas
3,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,63.8,56.0,4.68,4.72,3.00,738,0.41,Good,D,SI1,Kimberly
4,4b227777d4dd1fc61c6f884f48641d02b4d121d3fd328c...,60.5,59.0,6.55,6.51,3.95,4882,1.02,Ideal,G,SI1,Dubai
...,...,...,...,...,...,...,...,...,...,...,...,...
40450,f0bc79169405ebeb24e308055156b946ffd819db9b4f75...,62.7,57.0,7.10,7.04,4.43,10070,1.34,Ideal,G,VS1,Antwerp
40451,339916a23bf22b052b54cb2a9b36ee8418c1c68b46acad...,57.1,60.0,8.31,8.25,4.73,12615,2.02,Good,F,SI2,Madrid
40452,46957922b99954654c1deb8d854c3f069bf118b2ce9415...,62.7,56.0,6.37,6.42,4.01,5457,1.01,Ideal,H,SI1,Kimberly
40453,9d733392d362d5c6f1d9b9659b601c7d4b5a1c1c8df579...,61.9,54.3,4.45,4.47,2.76,456,0.33,Ideal,J,VS1,Kimberly


In [53]:
# Null check: evaluates to True if at least one cell in the training frame is missing
df_diamonds_train.isna().values.any()

False

In [54]:
# Feature groups: numerical columns and categorical columns
# (the categorical ones are one-hot encoded further down)
num_features_list = ['depth', 'table', 'x', 'y', 'z', 'carat']
cat_features_list = ['cut', 'color', 'clarity']
# Full feature set: numerical columns first, then the categorical ones
features_list = num_features_list + cat_features_list

In [55]:
# Feature matrix: pick the model inputs and one-hot encode the categorical columns in one step
X = pd.get_dummies(df_diamonds_train[features_list], columns=cat_features_list)
X

Unnamed: 0,depth,table,x,y,z,carat,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,62.4,58.0,6.83,6.79,4.25,1.21,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
1,63.0,57.0,4.35,4.38,2.75,0.32,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,65.5,55.0,5.62,5.53,3.65,0.71,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,63.8,56.0,4.68,4.72,3.00,0.41,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,60.5,59.0,6.55,6.51,3.95,1.02,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,62.7,57.0,7.10,7.04,4.43,1.34,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
40451,57.1,60.0,8.31,8.25,4.73,2.02,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
40452,62.7,56.0,6.37,6.42,4.01,1.01,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
40453,61.9,54.3,4.45,4.47,2.76,0.33,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0


In [56]:
# Target: the 'price' column of the training frame
y = df_diamonds_train.loc[:, 'price']

In [58]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
shapes_report = f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}"
print(shapes_report)

X_train: (32364, 26), X_test: (8091, 26), y_train: (32364,), y_test: (8091,)


In [64]:
# Model definition: Lasso regression built with no arguments (library defaults)
model = Lasso()
# Snapshot of the estimator parameters; reused by the training cell's report
hyperparameters = model.get_params()
model_type = type(model)
print(model_type, '\n')
print('Model hyperparameters:', hyperparameters, '\n')

<class 'sklearn.linear_model.coordinate_descent.Lasso'> 

Model hyperparameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'normalize': False, 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False} 



In [65]:
%%time

# Model training: fit on the training split, then print a short report
model.fit(X_train, y_train)

# Each entry is printed as "<label> <value>" followed by a blank line.
# `hyperparameters` is the dict captured when the model was defined.
for label, value in (
    ('Model:', model),
    ('Model hyperparameters:', hyperparameters),
    ('Model coefficients:', model.coef_),
):
    print(label, value, '\n')

Model: Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False) 

Model hyperparameters: {'alpha': 1.0, 'copy_X': True, 'fit_intercept': True, 'max_iter': 1000, 'normalize': False, 'positive': False, 'precompute': False, 'random_state': None, 'selection': 'cyclic', 'tol': 0.0001, 'warm_start': False} 

Model coefficients: [-6.71628864e+01 -2.80641546e+01 -9.62023873e+02 -4.50364736e+00
 -0.00000000e+00  1.10786622e+04 -6.85631224e+02 -1.50978481e+02
  9.03434249e+01  2.15252915e+01  0.00000000e+00  4.78123184e+02
  2.73446316e+02  2.04006825e+02  0.00000000e+00 -4.93770884e+02
 -9.33980118e+02 -1.84444280e+03 -4.40355013e+03  9.31531759e+02
 -6.92754167e+02 -1.64273528e+03  2.06301034e+02 -8.15385702e+01
  6.51066388e+02  5.74993059e+02] 

CPU times: user 1.49 s, sys: 68.4 ms, total: 1.56 s
Wall time: 1.48 s


In [72]:
%%time

# Model predictions: score the held-out split (X_test) with the fitted model

y_pred = model.predict(X_test)

# Show the container type returned by predict()
print(type(y_pred))

<class 'numpy.ndarray'>
CPU times: user 3.31 ms, sys: 2.79 ms, total: 6.1 ms
Wall time: 2.36 ms


In [73]:
# Visual check: true prices, hold-out predictions and their difference, side by side.
# The difference must use y_pred (the predictions for X_test). `predictions` is only
# created later in the notebook, so referencing it here fails on a fresh kernel and,
# with leftover state, would subtract predictions for a different dataset.

check = pd.DataFrame({
    'Ground truth': y_test,
    'Predictions': y_pred,
    'Diff': y_test - y_pred,
})
check

Unnamed: 0,Ground truth,Predictions,Diff
17775,2970,3577.206026,-607.206026
13506,3004,3219.648257,-215.648257
4325,838,1274.733169,-436.733169
37870,6468,6132.592463,335.407537
21321,633,796.625780,-163.625780
...,...,...,...
3781,4764,5515.595483,-751.595483
26959,756,320.316265,435.683735
15529,2690,3345.657642,-655.657642
36333,3992,4549.167289,-557.167289


In [75]:
# Root mean squared error on the held-out split: square root of the MSE
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
rmse

1125.2210972929079

In [77]:
# Import the dataset that final predictions are generated for
test_csv_path = '../data/diamonds_test.csv'
df_diamonds_predition = pd.read_csv(test_csv_path)
df_diamonds_predition

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [81]:
# Preparing the prediction dataframe
# Features: the same input columns that were selected for training
X_pred = df_diamonds_predition[features_list]
# One-hot encoding for categorical variables
X_pred = pd.get_dummies(X_pred, columns=cat_features_list)
# Align with the training matrix X: get_dummies only creates columns for categories
# present in this particular file, so a category missing here (or one unseen during
# training) would change the column set/order the model was fitted on. Missing
# dummy columns are filled with 0; columns unknown to the model are dropped.
X_pred = X_pred.reindex(columns=X.columns, fill_value=0)
X_pred

Unnamed: 0,depth,table,x,y,z,carat,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,62.7,60.0,5.82,5.89,3.67,0.79,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,61.0,57.0,6.81,6.89,4.18,1.20,0,0,1,0,...,0,1,0,0,0,0,1,0,0,0
2,62.2,61.0,7.38,7.32,4.57,1.57,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,63.8,54.0,6.09,6.13,3.90,0.90,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,62.9,58.0,5.05,5.09,3.19,0.50,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,61.9,56.0,5.35,5.32,3.30,0.57,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
13481,62.2,55.0,5.71,5.73,3.56,0.71,0,0,1,0,...,1,0,0,0,0,0,0,1,0,0
13482,61.6,55.0,5.75,5.71,3.53,0.70,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
13483,58.8,57.0,5.85,5.89,3.45,0.70,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [83]:
# Get predictions: run the fitted model on the prepared prediction features
predictions = model.predict(X_pred)
predictions

array([ 3611.81301527,  6336.47559721, 10075.23691812, ...,
        3886.48432199,  1982.01931051,   754.39908637])

In [85]:
# Wrap the predictions in a DataFrame so index/column operations can be applied to them
predictions = pd.DataFrame(data=predictions)
predictions

Unnamed: 0,0
0,3611.813015
1,6336.475597
2,10075.236918
3,4664.144285
4,2085.113220
...,...
13480,1954.995482
13481,2569.537546
13482,3886.484322
13483,1982.019311


In [88]:
# Move the row index into a regular column named 'index'.
# Python's boolean literal is `True`; the lowercase `true` raised a NameError.
# Reassigning the returned frame removes the need for inplace mutation.
predictions = predictions.reset_index()

In [90]:
# Final column names: column 0 becomes 'price', column 'index' becomes 'id'
column_names = {0: 'price', 'index': 'id'}
predictions = predictions.rename(columns=column_names)

In [91]:
# Display the predictions frame to inspect its current columns
predictions

Unnamed: 0,id,price
0,0,3611.813015
1,1,6336.475597
2,2,10075.236918
3,3,4664.144285
4,4,2085.113220
...,...,...
13480,13480,1954.995482
13481,13481,2569.537546
13482,13482,3886.484322
13483,13483,1982.019311


In [92]:
# Write the predictions to CSV; index=False leaves the DataFrame index out of the file
predictions_csv_path = '../data/diamonds_predictions_1.csv'
predictions.to_csv(predictions_csv_path, index=False)