In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [242]:
diamonds = pd.read_csv('diamonds0819/data.csv')

In [243]:
display(diamonds.head())
print(diamonds.shape)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,2.26,Ideal,G,SI2,61.9,57.0,8.44,8.36,5.2,12831
1,2.43,Very Good,H,SI2,63.2,57.0,8.56,8.5,5.39,16170
2,0.8,Premium,F,SI2,61.0,57.0,6.03,6.01,3.67,2797
3,0.4,Ideal,F,I1,63.3,60.0,4.68,4.64,2.95,630
4,0.31,Ideal,G,VS2,61.6,55.0,4.39,4.37,2.7,698


(40455, 10)


In [244]:
print("Check null values:")
print(diamonds.isnull().sum())
print("\nColumns type:")
print(diamonds.dtypes)

Check null values:
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

Columns type:
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object


In [245]:
print("Diamonds describe table")
display(diamonds.describe())

Diamonds describe table


Unnamed: 0,carat,depth,table,x,y,z,price
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,0.798385,61.747932,57.459085,5.732041,5.735939,3.539537,3939.242813
std,0.474353,1.432322,2.231152,1.123169,1.152802,0.709709,4000.344155
min,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,0.4,61.0,56.0,4.71,4.72,2.91,950.0
50%,0.7,61.8,57.0,5.7,5.71,3.53,2409.0
75%,1.04,62.5,59.0,6.54,6.54,4.04,5331.0
max,5.01,79.0,79.0,10.74,58.9,31.8,18823.0


In [246]:
print("Cut value counts:")
print(diamonds["cut"].value_counts(),"\n")
print("Color value counts:")
print(diamonds["color"].value_counts(),"\n")
print("Clarity value counts:")
print(diamonds["clarity"].value_counts())

Cut value counts:
Ideal        16139
Premium      10377
Very Good     9101
Good          3650
Fair          1188
Name: cut, dtype: int64 

Color value counts:
G    8492
E    7343
F    7183
H    6230
D    5046
I    4046
J    2115
Name: color, dtype: int64 

Clarity value counts:
SI1     9767
VS2     9147
SI2     6909
VS1     6157
VVS2    3822
VVS1    2740
IF      1356
I1       557
Name: clarity, dtype: int64


In [247]:
cut = {"Ideal":5,"Premium":4,"Very Good":3,"Good":2,"Fair":1}
color = {"D":7,"E":6,"F":5,"G":4,"H":3,"I":2,"J":1}
clarity = {"I1":1,"SI2":2,"SI1":3,"VS2":4, "VS1":5,"VVS2":6,"VVS1":7,"IF":8}

In [248]:
def assignNumber(dc,dictionary):
    cat = []
    for e in dc:
        for l,v in dictionary.items():
            if e == l:
                cat.append(v)
    return cat

In [249]:
diamonds["cut"] = assignNumber(diamonds["cut"],cut)
diamonds["color"] = assignNumber(diamonds["color"],color)
diamonds["clarity"] = assignNumber(diamonds["clarity"],clarity)

In [250]:
print("Cut value counts:")
print(diamonds["cut"].value_counts(),"\n")
print("Color value counts:")
print(diamonds["color"].value_counts(),"\n")
print("Clarity value counts:")
print(diamonds["clarity"].value_counts())

Cut value counts:
5    16139
4    10377
3     9101
2     3650
1     1188
Name: cut, dtype: int64 

Color value counts:
4    8492
6    7343
5    7183
3    6230
7    5046
2    4046
1    2115
Name: color, dtype: int64 

Clarity value counts:
3    9767
4    9147
2    6909
5    6157
6    3822
7    2740
8    1356
1     557
Name: clarity, dtype: int64


In [251]:
print(diamonds.dtypes)
display(diamonds.head())

carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,2.26,5,4,2,61.9,57.0,8.44,8.36,5.2,12831
1,2.43,3,3,2,63.2,57.0,8.56,8.5,5.39,16170
2,0.8,4,5,2,61.0,57.0,6.03,6.01,3.67,2797
3,0.4,5,5,1,63.3,60.0,4.68,4.64,2.95,630
4,0.31,5,4,4,61.6,55.0,4.39,4.37,2.7,698


In [252]:
X_diamonds = diamonds[['carat','cut','color','clarity','depth','table','x','y','z']]
y_diamonds = diamonds['price']

In [293]:
X_train, X_test, y_train, y_test = train_test_split(X_diamonds,y_diamonds,test_size=0.05)

In [294]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [295]:
X_submission = pd.read_csv('diamonds0819/test.csv')

submission_id = X_submission['id']
X_submission = X_submission.drop(columns=['id'],axis=1)

X_submission["cut"] = assignNumber(X_submission["cut"],cut)
X_submission["color"] = assignNumber(X_submission["color"],color)
X_submission["clarity"] = assignNumber(X_submission["clarity"],clarity)

In [296]:
preds_train = {}

In [299]:
models = {
    "modelLinearRG": LinearRegression(),
    "decisionTree": DecisionTreeRegressor(random_state=0),
    'randomForest': RandomForestRegressor(random_state=0,n_estimators=75,criterion="mae"),
    'BayesianRidge': BayesianRidge()
}

for label, model in models.items():
    print("Training {}".format(label))
    model.fit(X_train,y_train)
    y = model.predict(X_test)
    preds_train[label] = y
    
print("Training finished")

Training modelLinearRG
Training decisionTree
Training randomForest
Training BayesianRidge


In [300]:
from sklearn.metrics import mean_squared_error
from math import sqrt

for lab,pred in preds_train.items():
    print(lab)
    y = pred
    y_pred = []
    for e in y:
        y_pred.append(int(e))
    rms = sqrt(mean_squared_error(y_test,y_pred))
    print(rms)

modelLinearRG
1314.403981842696
decisionTree
757.4897326085778
randomForest
582.5176840231577
BayesianRidge
1314.3838810079362


In [282]:
preds = {}

In [283]:
for label, model in models.items():
    print("Training {}".format(label))
    y = model.predict(X_submission)
    preds[label] = y

Training modelLinearRG
Training decisionTree
Training randomForest
Training BayesianRidge


In [284]:
for label, pred in preds.items():
    submission = pd.DataFrame({
        'Id': submission_id,
        'price': pred
    })
    display(submission.head())
    submission.to_csv('submission_{}.csv'.format(label), index=False)

Unnamed: 0,Id,price
0,0,1139.407031
1,1,5941.657683
2,2,1817.230222
3,3,4558.39082
4,4,2102.239955


Unnamed: 0,Id,price
0,0,1216.0
1,1,6350.0
2,2,1746.0
3,3,4619.0
4,4,1760.0


Unnamed: 0,Id,price
0,0,1299.493333
1,1,6308.213333
2,2,1717.186667
3,3,4052.946667
4,4,1725.746667


Unnamed: 0,Id,price
0,0,1139.646389
1,1,5941.94067
2,2,1817.302182
3,3,4558.660115
4,4,2102.398402
