# Diamonds

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import os
from sys import platform

## Import CSV files

In [2]:
train_df = pd.read_csv("../long_lab/train.csv")
test_df = pd.read_csv("../long_lab/test.csv")

In [4]:
train_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.30,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,Ideal,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,Very Good,G,SI2,63.2,57.0,6.54,6.50,4.12,8.371
4,4,0.36,Premium,G,VS1,62.3,59.0,4.50,4.55,2.82,6.588
...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,Premium,D,SI2,62.1,59.0,4.78,4.82,2.98,6.551
40451,40451,0.53,Premium,G,VS2,62.0,58.0,5.21,5.18,3.22,7.382
40452,40452,0.80,Good,G,SI2,62.8,58.0,5.86,5.90,3.69,7.768
40453,40453,1.01,Very Good,F,VS2,61.5,57.0,6.40,6.48,3.96,8.726


In [5]:
print(train_df.shape[0])
train_df.sample()

40455


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
16953,16953,0.51,Ideal,D,SI2,61.1,56.0,5.18,5.16,3.16,7.215


## Cleaning and processing

In [6]:
train_df.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price      float64
dtype: object

In [7]:
print(test_df.shape[0])
test_df.sample()

13485


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
9894,9894,1.21,Very Good,H,VS2,62.8,57.0,6.77,6.82,4.27


### Transform 'cut'

In [8]:
cut_list = list(train_df["cut"].unique())
cut_list

['Premium', 'Ideal', 'Very Good', 'Fair', 'Good']

In [9]:
cut_dictionary = {
    
    
}

for i in cut_list:
    cut_dictionary[i] = 0


In [10]:
cut_dictionary

{'Premium': 0, 'Ideal': 0, 'Very Good': 0, 'Fair': 0, 'Good': 0}

In [11]:
cut_dictionary = {
    'Premium': 4, 
    'Ideal': 3, 
    'Very Good': 2, 
    'Fair': 0, 
    'Good': 1}

In [12]:
train_df["cut"] = train_df.cut.map(cut_dictionary)
train_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.30,4,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,3,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,3,F,VS2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,2,G,SI2,63.2,57.0,6.54,6.50,4.12,8.371
4,4,0.36,4,G,VS1,62.3,59.0,4.50,4.55,2.82,6.588
...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,4,D,SI2,62.1,59.0,4.78,4.82,2.98,6.551
40451,40451,0.53,4,G,VS2,62.0,58.0,5.21,5.18,3.22,7.382
40452,40452,0.80,1,G,SI2,62.8,58.0,5.86,5.90,3.69,7.768
40453,40453,1.01,2,F,VS2,61.5,57.0,6.40,6.48,3.96,8.726


### Transform 'color'

In [13]:
color_list = list(train_df["color"].unique())
color_list

['D', 'E', 'F', 'G', 'H', 'I', 'J']

In [14]:
color_dictionary = {
    
    
}

for i in color_list:
    color_dictionary[i] = 0


In [15]:
color_dictionary

{'D': 0, 'E': 0, 'F': 0, 'G': 0, 'H': 0, 'I': 0, 'J': 0}

In [16]:
color_dictionary = {'D': 6,
                    'E': 5, 
                    'F': 4, 
                    'G': 3, 
                    'H': 2, 
                    'I': 1, 
                    'J': 0}

In [17]:
train_df["color"] = train_df.color.map(color_dictionary)
train_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.30,4,6,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,3,5,VVS2,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,3,4,VS2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,2,3,SI2,63.2,57.0,6.54,6.50,4.12,8.371
4,4,0.36,4,3,VS1,62.3,59.0,4.50,4.55,2.82,6.588
...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,4,6,SI2,62.1,59.0,4.78,4.82,2.98,6.551
40451,40451,0.53,4,3,VS2,62.0,58.0,5.21,5.18,3.22,7.382
40452,40452,0.80,1,3,SI2,62.8,58.0,5.86,5.90,3.69,7.768
40453,40453,1.01,2,4,VS2,61.5,57.0,6.40,6.48,3.96,8.726


### Transform 'clarity'

In [18]:
clarity_list = list(train_df["clarity"].unique())
clarity_list

['SI2', 'VVS2', 'VS2', 'VS1', 'SI1', 'VVS1', 'IF', 'I1']

In [19]:
clarity_dictionary = {
    
    
}

for i in clarity_list:
    clarity_dictionary[i] = 0


In [20]:
clarity_dictionary

{'SI2': 0,
 'VVS2': 0,
 'VS2': 0,
 'VS1': 0,
 'SI1': 0,
 'VVS1': 0,
 'IF': 0,
 'I1': 0}

In [21]:
clarity_dictionary = {'SI2': 0,
 'VVS2': 4,
 'VS2': 2,
 'VS1': 3,
 'SI1': 1,
 'VVS1': 5,
 'IF': 6,
 'I1': 0}

In [22]:
train_df["clarity"] = train_df.clarity.map(clarity_dictionary)
train_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.30,4,6,0,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,3,5,4,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,3,4,2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,2,3,0,63.2,57.0,6.54,6.50,4.12,8.371
4,4,0.36,4,3,3,62.3,59.0,4.50,4.55,2.82,6.588
...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,4,6,0,62.1,59.0,4.78,4.82,2.98,6.551
40451,40451,0.53,4,3,2,62.0,58.0,5.21,5.18,3.22,7.382
40452,40452,0.80,1,3,0,62.8,58.0,5.86,5.90,3.69,7.768
40453,40453,1.01,2,4,2,61.5,57.0,6.40,6.48,3.96,8.726


In [23]:
train_df_cleaned = train_df

In [24]:
train_df_cleaned 

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.30,4,6,0,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,3,5,4,62.7,56.0,6.42,6.46,4.04,9.183
2,2,0.72,3,4,2,61.8,59.0,5.71,5.74,3.54,7.983
3,3,1.08,2,3,0,63.2,57.0,6.54,6.50,4.12,8.371
4,4,0.36,4,3,3,62.3,59.0,4.50,4.55,2.82,6.588
...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,0.42,4,6,0,62.1,59.0,4.78,4.82,2.98,6.551
40451,40451,0.53,4,3,2,62.0,58.0,5.21,5.18,3.22,7.382
40452,40452,0.80,1,3,0,62.8,58.0,5.86,5.90,3.69,7.768
40453,40453,1.01,2,4,2,61.5,57.0,6.40,6.48,3.96,8.726


In [25]:
train_df_cleaned.drop('id', inplace=True, axis=1)

In [26]:
X = train_df_cleaned.iloc[:,:-1]
y = train_df_cleaned['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [27]:
train_df_cleaned

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.30,4,6,0,62.4,58.0,4.31,4.28,2.68,6.353
1,1.01,3,5,4,62.7,56.0,6.42,6.46,4.04,9.183
2,0.72,3,4,2,61.8,59.0,5.71,5.74,3.54,7.983
3,1.08,2,3,0,63.2,57.0,6.54,6.50,4.12,8.371
4,0.36,4,3,3,62.3,59.0,4.50,4.55,2.82,6.588
...,...,...,...,...,...,...,...,...,...,...
40450,0.42,4,6,0,62.1,59.0,4.78,4.82,2.98,6.551
40451,0.53,4,3,2,62.0,58.0,5.21,5.18,3.22,7.382
40452,0.80,1,3,0,62.8,58.0,5.86,5.90,3.69,7.768
40453,1.01,2,4,2,61.5,57.0,6.40,6.48,3.96,8.726


In [28]:
X_train.shape

(32364, 9)

In [29]:
 y_train.shape

(32364,)

In [30]:
X_test.shape

(8091, 9)

In [31]:
y_test.shape

(8091,)

In [32]:
from sklearn.linear_model import Ridge, Lasso

from sklearn.linear_model import SGDRegressor

from sklearn.neighbors import KNeighborsRegressor

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.svm import SVR

from sklearn.linear_model import LinearRegression as LinReg
from sklearn import metrics
import numpy as np

In [33]:
models = {
    "lr": LinReg(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "sgd": SGDRegressor(),
    "knn": KNeighborsRegressor(),
    "grad": GradientBoostingRegressor(),
    "svr": SVR()
}

In [34]:
for name, model in models.items():
    print("Training 🏋️‍:", name)
    model.fit(X_train, y_train)

Training 🏋️‍: lr
Training 🏋️‍: ridge
Training 🏋️‍: lasso
Training 🏋️‍: sgd
Training 🏋️‍: knn
Training 🏋️‍: grad
Training 🏋️‍: svr


In [35]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"------{name}------")
    print('MAE - ', metrics.mean_absolute_error(y_test, y_pred))
    print('MSE - ', metrics.mean_squared_error(y_test, y_pred))
    print('RMSE - ', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('R2 - ', metrics.r2_score(y_test, y_pred))

------lr------
MAE -  0.12350917740179786
MSE -  0.05667308084017822
RMSE -  0.23806108636267756
R2 -  0.9452236489756796
------ridge------
MAE -  0.12355840496989731
MSE -  0.056613727195871454
RMSE -  0.2379363931723591
R2 -  0.945281016177302
------lasso------
MAE -  0.8059624416146931
MSE -  0.8768670119083612
RMSE -  0.9364117747595666
R2 -  0.1524798981479133
------sgd------
MAE -  129001386.935006
MSE -  2.8069042653087e+16
RMSE -  167538182.67215088
R2 -  -2.7129630337514604e+16
------knn------
MAE -  0.13249687306884195
MSE -  0.032817169185514763
RMSE -  0.18115509704536267
R2 -  0.9682811530221971
------grad------
MAE -  0.08501490644683976
MSE -  0.013217747844364991
RMSE -  0.11496846456470136
R2 -  0.9872246226084713
------svr------
MAE -  0.12540663307131225
MSE -  0.04564067059582811
RMSE -  0.2136367725739839
R2 -  0.9558868274588299


## Clean 'test' csv

In [36]:
test_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,Ideal,I,SI1,60.5,58.0,4.43,4.49,2.70
1,1,1.24,Premium,I,SI1,62.9,60.0,6.80,6.74,4.26
2,2,1.66,Premium,D,SI1,62.0,59.0,7.55,7.60,4.70
3,3,0.75,Premium,D,SI2,60.6,56.0,5.94,5.90,3.59
4,4,1.50,Fair,E,SI2,64.8,55.0,7.26,7.15,4.67
...,...,...,...,...,...,...,...,...,...,...
13480,13480,1.10,Premium,G,SI1,59.6,60.0,6.74,6.70,4.00
13481,13481,0.90,Very Good,D,SI1,62.1,60.0,6.14,6.20,3.83
13482,13482,0.30,Ideal,F,VS2,62.1,53.3,4.30,4.32,2.68
13483,13483,1.25,Ideal,I,SI1,59.6,59.0,7.01,7.09,4.20


## Transform cut

In [37]:
test_cut_list = list(test_df["cut"].unique())
test_cut_list

['Ideal', 'Premium', 'Fair', 'Good', 'Very Good']

In [38]:
test_cut_dictionary = {
    
    
}

for i in test_cut_list:
    test_cut_dictionary[i] = 0


In [39]:
test_cut_dictionary

{'Ideal': 0, 'Premium': 0, 'Fair': 0, 'Good': 0, 'Very Good': 0}

In [40]:
test_cut_dictionary = {
    'Premium': 4, 
    'Ideal': 3, 
    'Very Good': 2, 
    'Fair': 0, 
    'Good': 1}

In [41]:
test_df["cut"] = test_df.cut.map(test_cut_dictionary)
test_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,3,I,SI1,60.5,58.0,4.43,4.49,2.70
1,1,1.24,4,I,SI1,62.9,60.0,6.80,6.74,4.26
2,2,1.66,4,D,SI1,62.0,59.0,7.55,7.60,4.70
3,3,0.75,4,D,SI2,60.6,56.0,5.94,5.90,3.59
4,4,1.50,0,E,SI2,64.8,55.0,7.26,7.15,4.67
...,...,...,...,...,...,...,...,...,...,...
13480,13480,1.10,4,G,SI1,59.6,60.0,6.74,6.70,4.00
13481,13481,0.90,2,D,SI1,62.1,60.0,6.14,6.20,3.83
13482,13482,0.30,3,F,VS2,62.1,53.3,4.30,4.32,2.68
13483,13483,1.25,3,I,SI1,59.6,59.0,7.01,7.09,4.20


### Transform 'color'

In [42]:
test_color_list = list(test_df["color"].unique())
test_color_list

['I', 'D', 'E', 'H', 'G', 'F', 'J']

In [43]:
test_color_dictionary = {
    
    
}

for i in test_color_list:
    test_color_dictionary[i] = 0


In [44]:
test_color_dictionary

{'I': 0, 'D': 0, 'E': 0, 'H': 0, 'G': 0, 'F': 0, 'J': 0}

In [45]:
test_color_dictionary = {'D': 6,
                    'E': 5, 
                    'F': 4, 
                    'G': 3, 
                    'H': 2, 
                    'I': 1, 
                    'J': 0}

In [46]:
test_df["color"] = test_df.color.map(test_color_dictionary)
test_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,3,1,SI1,60.5,58.0,4.43,4.49,2.70
1,1,1.24,4,1,SI1,62.9,60.0,6.80,6.74,4.26
2,2,1.66,4,6,SI1,62.0,59.0,7.55,7.60,4.70
3,3,0.75,4,6,SI2,60.6,56.0,5.94,5.90,3.59
4,4,1.50,0,5,SI2,64.8,55.0,7.26,7.15,4.67
...,...,...,...,...,...,...,...,...,...,...
13480,13480,1.10,4,3,SI1,59.6,60.0,6.74,6.70,4.00
13481,13481,0.90,2,6,SI1,62.1,60.0,6.14,6.20,3.83
13482,13482,0.30,3,4,VS2,62.1,53.3,4.30,4.32,2.68
13483,13483,1.25,3,1,SI1,59.6,59.0,7.01,7.09,4.20


### Transform 'clarity'

In [47]:
test_clarity_list = list(test_df["clarity"].unique())
test_clarity_list

['SI1', 'SI2', 'VS2', 'VS1', 'IF', 'VVS1', 'VVS2', 'I1']

In [48]:
test_clarity_dictionary = {
    
    
}

for i in test_clarity_list:
    test_clarity_dictionary[i] = 0


In [49]:
test_clarity_dictionary

{'SI1': 0,
 'SI2': 0,
 'VS2': 0,
 'VS1': 0,
 'IF': 0,
 'VVS1': 0,
 'VVS2': 0,
 'I1': 0}

In [50]:
test_clarity_dictionary = {'SI2': 0,
 'VVS2': 4,
 'VS2': 2,
 'VS1': 3,
 'SI1': 1,
 'VVS1': 5,
 'IF': 6,
 'I1': 0}

In [51]:
test_df["clarity"] = test_df.clarity.map(test_clarity_dictionary)
test_df

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,0.32,3,1,1,60.5,58.0,4.43,4.49,2.70
1,1,1.24,4,1,1,62.9,60.0,6.80,6.74,4.26
2,2,1.66,4,6,1,62.0,59.0,7.55,7.60,4.70
3,3,0.75,4,6,0,60.6,56.0,5.94,5.90,3.59
4,4,1.50,0,5,0,64.8,55.0,7.26,7.15,4.67
...,...,...,...,...,...,...,...,...,...,...
13480,13480,1.10,4,3,1,59.6,60.0,6.74,6.70,4.00
13481,13481,0.90,2,6,1,62.1,60.0,6.14,6.20,3.83
13482,13482,0.30,3,4,2,62.1,53.3,4.30,4.32,2.68
13483,13483,1.25,3,1,1,59.6,59.0,7.01,7.09,4.20


In [53]:
test_df_cleaned.drop('id', inplace=True, axis=1)

In [54]:
test_df_cleaned

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.32,3,1,1,60.5,58.0,4.43,4.49,2.70
1,1.24,4,1,1,62.9,60.0,6.80,6.74,4.26
2,1.66,4,6,1,62.0,59.0,7.55,7.60,4.70
3,0.75,4,6,0,60.6,56.0,5.94,5.90,3.59
4,1.50,0,5,0,64.8,55.0,7.26,7.15,4.67
...,...,...,...,...,...,...,...,...,...
13480,1.10,4,3,1,59.6,60.0,6.74,6.70,4.00
13481,0.90,2,6,1,62.1,60.0,6.14,6.20,3.83
13482,0.30,3,4,2,62.1,53.3,4.30,4.32,2.68
13483,1.25,3,1,1,59.6,59.0,7.01,7.09,4.20


In [55]:
lr = LinReg()

In [58]:
lr.fit(X_train, y_train)

In [59]:
y_pred = lr.predict(test_df_cleaned)

In [60]:
y_pred

array([6.11264007, 8.5777327 , 9.56727608, ..., 6.38911417, 8.66270771,
       7.99926818])

In [61]:
test_df_cleaned['price'] = y_pred

In [62]:
test_df_cleaned

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.32,3,1,1,60.5,58.0,4.43,4.49,2.70,6.112640
1,1.24,4,1,1,62.9,60.0,6.80,6.74,4.26,8.577733
2,1.66,4,6,1,62.0,59.0,7.55,7.60,4.70,9.567276
3,0.75,4,6,0,60.6,56.0,5.94,5.90,3.59,8.005530
4,1.50,0,5,0,64.8,55.0,7.26,7.15,4.67,9.193845
...,...,...,...,...,...,...,...,...,...,...
13480,1.10,4,3,1,59.6,60.0,6.74,6.70,4.00,8.600257
13481,0.90,2,6,1,62.1,60.0,6.14,6.20,3.83,8.318402
13482,0.30,3,4,2,62.1,53.3,4.30,4.32,2.68,6.389114
13483,1.25,3,1,1,59.6,59.0,7.01,7.09,4.20,8.662708


In [63]:
df_price = test_df_cleaned [['price']]
df_price

Unnamed: 0,price
0,6.112640
1,8.577733
2,9.567276
3,8.005530
4,9.193845
...,...
13480,8.600257
13481,8.318402
13482,6.389114
13483,8.662708


In [65]:
df_id = pd.read_csv("../long_lab/test.csv")
df_id = df_id[['id']]
df_id

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
...,...
13480,13480
13481,13481
13482,13482
13483,13483


In [66]:
df_for_submission = pd.concat([df_id,df_price], axis=1)
df_for_submission

Unnamed: 0,id,price
0,0,6.112640
1,1,8.577733
2,2,9.567276
3,3,8.005530
4,4,9.193845
...,...,...
13480,13480,8.600257
13481,13481,8.318402
13482,13482,6.389114
13483,13483,8.662708


In [67]:
df_for_submission.to_csv('../long_lab/df_for_submission.csv', index=False)