In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../datasets/diamonds.csv', index_col=0)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [3]:
# replacing categorical data with numerical values
cut_dict = {'Ideal': 5, 'Premium': 4, 'Very Good': 3, 'Good': 2, 'Fair': 1}
clarity_dict = {"I3": 1, "I2": 2, "I1": 3, "SI2": 4, "SI1": 5, "VS2": 6, "VS1": 7, "VVS2": 8, "VVS1": 9, "IF": 10, "FL": 11}
color_dict = {"J": 1,"I": 2,"H": 3,"G": 4,"F": 5,"E": 6,"D": 7}

df['cut'] = df['cut'].map(cut_dict)
df['clarity'] = df['clarity'].map(clarity_dict)
df['color'] = df['color'].map(color_dict)

In [4]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,5,6,4,61.5,55.0,326,3.95,3.98,2.43
2,0.21,4,6,5,59.8,61.0,326,3.89,3.84,2.31
3,0.23,2,6,7,56.9,65.0,327,4.05,4.07,2.31
4,0.29,4,2,6,62.4,58.0,334,4.2,4.23,2.63
5,0.31,2,1,4,63.3,58.0,335,4.34,4.35,2.75


In [16]:
from sklearn.utils import shuffle
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# shuffling our dataset
df = shuffle(df)

X = df.drop('price', axis=1).values
y = df['price'].values

X_scaler = StandardScaler()
X = X_scaler.fit_transform(X)

test_size = 200
X_train = X[:-test_size]
X_test = X[-test_size:]
y_train = y[:-test_size]
y_test = y[-test_size:]

clf = SVR(kernel='linear')

In [19]:
%%time

clf.fit(X_train, y_train)

CPU times: user 2min 29s, sys: 164 ms, total: 2min 29s
Wall time: 2min 29s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [20]:
clf.score(X_test, y_test)

0.9086199005329154

In [21]:
for X,y in zip(X_test, y_test):
    print(f'Model: {clf.predict([X])[0]}, Actual: {y}')

Model: 781.3835564732053, Actual: 737
Model: 592.7217986651053, Actual: 671
Model: 11745.689906118832, Actual: 17760
Model: 104.86672494572758, Actual: 515
Model: 1801.8123079453733, Actual: 1656
Model: 8953.835355536561, Actual: 10471
Model: 7668.0811835735485, Actual: 7438
Model: 1736.174579617487, Actual: 1209
Model: 911.4258752444507, Actual: 776
Model: 476.2198690147002, Actual: 702
Model: 2641.5526821888616, Actual: 2366
Model: 1233.1723660606767, Actual: 863
Model: 2836.416410793698, Actual: 2142
Model: 2170.0595400399625, Actual: 2454
Model: 3144.3163377194787, Actual: 2201
Model: 686.0587972767562, Actual: 891
Model: -574.194435069287, Actual: 447
Model: 12694.478390343713, Actual: 14394
Model: 5234.34428468232, Actual: 6389
Model: -315.08132112821204, Actual: 489
Model: 696.8362229668983, Actual: 678
Model: 232.81506509329938, Actual: 666
Model: 3039.623384993314, Actual: 2641
Model: 604.1704973675878, Actual: 830
Model: 1549.783493058727, Actual: 1659
Model: -234.18071847854

In [23]:
%%time
# testing rbf kernel
clf = SVR(kernel='rbf')
clf.fit(X_train, y_train)

CPU times: user 3min 49s, sys: 292 ms, total: 3min 49s
Wall time: 3min 50s


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [24]:
clf.score(X_test, y_test)

0.6592775037731957

In [25]:
for X,y in zip(X_test, y_test):
    print(f'Model: {clf.predict([X])[0]}, Actual: {y}')

Model: 961.3261586692597, Actual: 737
Model: 1136.2626155437392, Actual: 671
Model: 6622.2978509070435, Actual: 17760
Model: 1464.6177136236572, Actual: 515
Model: 1688.167124659331, Actual: 1656
Model: 6982.937183972883, Actual: 10471
Model: 5797.5461127055205, Actual: 7438
Model: 1417.5236918904097, Actual: 1209
Model: 1187.175384997, Actual: 776
Model: 662.0718488691423, Actual: 702
Model: 2442.824355282434, Actual: 2366
Model: 1464.3949559992477, Actual: 863
Model: 2599.1721365383446, Actual: 2142
Model: 1906.8462564455538, Actual: 2454
Model: 3032.880471833799, Actual: 2201
Model: 1213.7327004815374, Actual: 891
Model: 1130.2336386794777, Actual: 447
Model: 6152.788576958652, Actual: 14394
Model: 4402.782394580495, Actual: 6389
Model: 911.3339748156536, Actual: 489
Model: 665.3514320949912, Actual: 678
Model: 382.77093969392627, Actual: 666
Model: 2904.9934692914176, Actual: 2641
Model: 1086.6225957266633, Actual: 830
Model: 1342.065664676642, Actual: 1659
Model: 1133.976911202903