In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

df = pd.read_csv('diamonds.csv', na_values = ['unknown'])
col = df.columns.values.tolist()[1:]

In [3]:
# Columns with NaN content > null_threshold will not be used as features
null_threshold = 0.0001

corrupt_col = []
for i in col:
  if (df[i].isna().sum()/len(df) > null_threshold):
    corrupt_col.append(i)

for i in corrupt_col:
  df = df.drop(i, axis=1)

col = df.columns.values.tolist()[1:]

In [5]:
# feature data frames with one-hot encoding to concatinate once encoded
cut = pd.get_dummies(df.cut, prefix='cut')
carat_weight = pd.DataFrame(df['carat_weight'])
lab = pd.get_dummies(df.lab, prefix='lab')
depth_percent = pd.DataFrame(df['depth_percent'])
table_percent = pd.DataFrame(df['table_percent'])
meas_length = pd.DataFrame(df['meas_length'])
meas_width = pd.DataFrame(df['meas_width'])
meas_depth = pd.DataFrame(df['meas_depth'])
total_sales_price = pd.DataFrame(df['total_sales_price'])

# convert clarity to int 0-10
clarity = pd.DataFrame(df['clarity'])
for i in range(len(clarity)):
  grade = ['I3', 'I2', 'I1', 'IF', 'SI3', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1']
  clarity['clarity'][i] = grade.index(clarity['clarity'][i])

# convert symmetry to int 0-4
symmetry = pd.DataFrame(df['symmetry'])
for i in range(len(symmetry)):
  sym = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
  symmetry['symmetry'][i] = sym.index(symmetry['symmetry'][i])

# convert polish to int 0-4
polish = pd.DataFrame(df['polish'])
for i in range(len(polish)):
  pol = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
  polish['polish'][i] = pol.index(polish['polish'][i])

In [6]:
# combining features into cleaned dataframe for regression
frames = [clarity, carat_weight, lab, symmetry, polish, depth_percent, table_percent, meas_length, meas_width, meas_depth, total_sales_price]
clean_regression = pd.concat(frames, axis=1)

# update list of column names
col = clean_regression.columns.values.tolist()[1:]

In [7]:
# updating total_sales_price for classiffication(i.e. 1 if price > x, otherwise 0 for some threshold x)
price_threshold = 2000
#print(clean_regression.loc[[150000]])
clean_classification = clean_regression.copy()

In [8]:

for i in range(len(clean_classification)):
  if clean_classification['total_sales_price'][i] > price_threshold:
    clean_classification['total_sales_price'][i] = 1
  else:
    clean_classification['total_sales_price'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_classification['total_sales_price'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_classification['total_sales_price'][i] = 1


In [16]:
clean_regression.head(10)

Unnamed: 0,clarity,carat_weight,lab_GIA,lab_HRD,lab_IGI,symmetry,polish,depth_percent,table_percent,meas_length,meas_width,meas_depth,total_sales_price
0,9,0.09,0,0,1,3,3,62.7,59.0,2.85,2.87,1.79,200
1,9,0.09,0,0,1,3,3,61.9,59.0,2.84,2.89,1.78,200
2,9,0.09,0,0,1,3,3,61.1,59.0,2.88,2.9,1.77,200
3,9,0.09,0,0,1,3,3,62.0,59.0,2.86,2.88,1.78,200
4,9,0.09,0,0,1,3,4,64.9,58.5,2.79,2.83,1.82,200
5,9,0.09,0,0,1,3,3,60.8,57.0,2.95,2.99,1.81,200
6,9,0.09,0,0,1,3,3,64.0,57.0,2.85,2.88,1.84,200
7,9,0.09,0,0,1,3,3,62.1,59.5,2.86,2.89,1.78,200
8,9,0.09,0,0,1,3,3,63.5,59.5,2.89,2.92,1.85,200
9,9,0.09,0,0,1,3,3,63.2,57.0,2.83,2.87,1.8,200


In [24]:
clean_regression.tail(10)

Unnamed: 0,clarity,carat_weight,lab_GIA,lab_HRD,lab_IGI,symmetry,polish,depth_percent,table_percent,meas_length,meas_width,meas_depth,total_sales_price
219693,3,12.52,1,0,0,4,4,62.1,58.0,14.84,14.91,9.24,1121792
219694,3,12.52,0,1,0,4,4,62.1,58.0,14.84,14.91,9.24,1121792
219695,10,10.0,1,0,0,3,4,78.1,60.0,11.78,10.84,8.47,1124122
219696,7,18.88,1,0,0,4,4,57.1,55.0,21.34,15.23,8.69,1132347
219697,8,10.04,1,0,0,4,4,72.8,55.0,14.13,10.86,7.9,1161102
219698,8,10.65,1,0,0,4,4,61.3,58.0,14.06,14.18,8.66,1210692
219699,7,5.17,1,0,0,3,3,64.8,65.0,11.55,8.81,5.71,1292500
219700,8,18.07,1,0,0,4,4,60.2,59.0,17.06,17.1,10.2,1315496
219701,5,0.9,1,0,0,2,2,70.8,72.0,5.22,4.9,3.47,1350000
219702,9,10.03,1,0,0,3,4,66.7,65.0,15.61,11.09,7.39,1449881


In [17]:
clean_classification.head(10)

Unnamed: 0,clarity,carat_weight,lab_GIA,lab_HRD,lab_IGI,symmetry,polish,depth_percent,table_percent,meas_length,meas_width,meas_depth,total_sales_price
0,9,0.09,0,0,1,3,3,62.7,59.0,2.85,2.87,1.79,0
1,9,0.09,0,0,1,3,3,61.9,59.0,2.84,2.89,1.78,0
2,9,0.09,0,0,1,3,3,61.1,59.0,2.88,2.9,1.77,0
3,9,0.09,0,0,1,3,3,62.0,59.0,2.86,2.88,1.78,0
4,9,0.09,0,0,1,3,4,64.9,58.5,2.79,2.83,1.82,0
5,9,0.09,0,0,1,3,3,60.8,57.0,2.95,2.99,1.81,0
6,9,0.09,0,0,1,3,3,64.0,57.0,2.85,2.88,1.84,0
7,9,0.09,0,0,1,3,3,62.1,59.5,2.86,2.89,1.78,0
8,9,0.09,0,0,1,3,3,63.5,59.5,2.89,2.92,1.85,0
9,9,0.09,0,0,1,3,3,63.2,57.0,2.83,2.87,1.8,0


In [18]:
clean_classification.tail(10)

Unnamed: 0,clarity,carat_weight,lab_GIA,lab_HRD,lab_IGI,symmetry,polish,depth_percent,table_percent,meas_length,meas_width,meas_depth,total_sales_price
219693,3,12.52,1,0,0,4,4,62.1,58.0,14.84,14.91,9.24,1
219694,3,12.52,0,1,0,4,4,62.1,58.0,14.84,14.91,9.24,1
219695,10,10.0,1,0,0,3,4,78.1,60.0,11.78,10.84,8.47,1
219696,7,18.88,1,0,0,4,4,57.1,55.0,21.34,15.23,8.69,1
219697,8,10.04,1,0,0,4,4,72.8,55.0,14.13,10.86,7.9,1
219698,8,10.65,1,0,0,4,4,61.3,58.0,14.06,14.18,8.66,1
219699,7,5.17,1,0,0,3,3,64.8,65.0,11.55,8.81,5.71,1
219700,8,18.07,1,0,0,4,4,60.2,59.0,17.06,17.1,10.2,1
219701,5,0.9,1,0,0,2,2,70.8,72.0,5.22,4.9,3.47,1
219702,9,10.03,1,0,0,3,4,66.7,65.0,15.61,11.09,7.39,1


In [16]:
#SVM
X = clean_classification[["clarity","carat_weight", "lab_GIA","lab_HRD","symmetry","polish","depth_percent",
                          "table_percent","meas_length","meas_width","meas_depth"]].to_numpy()
y = clean_classification["total_sales_price"]
#splitting up the data 
X_train, X_test, y_train, y_test = train_test_split(
    X, y,   
    test_size = 0.1, random_state=10, shuffle=True
)

In [17]:
print('X_train:' + str(X_train.shape))
print('y_train:' + str(y_train.shape))
print('X_test: '  + str(X_test.shape))
print('y_test: '  + str(y_test.shape))

X_train:(197732, 11)
y_train:(197732,)
X_test: (21971, 11)
y_test: (21971,)


In [18]:
y_2d_train = y_train.values.reshape((y_train.shape[0], 1))
y_2d_test = y_test.values.reshape((y_test.shape[0], 1))
print(y_2d_train.shape)
print(y_2d_test.shape)

(197732, 1)
(21971, 1)
