## Import Libraries

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

## Load Dataset

1. **carat** is a measure of diamond weight. One carat is equivalent to 0.2 grams.

2. **clarity** refers to how clear a diamond is. Diamonds often contain imperfections like cracks or mineral deposits. The fewer and less noticeable a diamond’s imperfections, the better its clarity. clarity contains 8 ordered levels, from (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)).

3. **color** refers to the color of the diamond. Colorless diamonds are considered better than diamonds with a yellow tint. diamonds contains diamonds of 7 different colors, represented by different letters. “D” - “F” diamonds are considered colorless, while “G” - “J” diamonds have a very faint color.

4. **cut **refers to how a rough diamond is shaped into a finished diamond. Better cuts create more symmetrical and luminous diamonds. cut has 5 ordered levels: “Fair,” “Good,” “Very Good,” “Premium,” “Ideal.”

5. **x, y, z, depth, and table** are various measures of a diamond’s size, in millimeters.



In [4]:
df = pd.read_csv('diamonds.csv')

In [5]:
df

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [6]:
df.shape

(53940, 11)

## Data Cleaning

In [7]:
df.drop('Unnamed: 0',inplace=True,axis=1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [9]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

## Data Transformation

# One-hot encoding

In [10]:
df_new=pd.get_dummies(df,drop_first=True)

In [11]:
df_new.shape

(53940, 24)

In [12]:
df_new

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Good,cut_Ideal,cut_Premium,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.23,61.5,55.0,326,3.95,3.98,2.43,False,True,False,...,False,False,False,False,False,True,False,False,False,False
1,0.21,59.8,61.0,326,3.89,3.84,2.31,False,False,True,...,False,False,False,False,True,False,False,False,False,False
2,0.23,56.9,65.0,327,4.05,4.07,2.31,True,False,False,...,False,False,False,False,False,False,True,False,False,False
3,0.29,62.4,58.0,334,4.20,4.23,2.63,False,False,True,...,False,True,False,False,False,False,False,True,False,False
4,0.31,63.3,58.0,335,4.34,4.35,2.75,True,False,False,...,False,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,60.8,57.0,2757,5.75,5.76,3.50,False,True,False,...,False,False,False,False,True,False,False,False,False,False
53936,0.72,63.1,55.0,2757,5.69,5.75,3.61,True,False,False,...,False,False,False,False,True,False,False,False,False,False
53937,0.70,62.8,60.0,2757,5.66,5.68,3.56,False,False,False,...,False,False,False,False,True,False,False,False,False,False
53938,0.86,61.0,58.0,2757,6.15,6.12,3.74,False,False,True,...,True,False,False,False,False,True,False,False,False,False


## Data Spliting

In [13]:
X=df_new.drop('price',axis=1)
Y=df_new['price']

In [14]:
train_x,test_x,train_y,test_y=train_test_split(X,Y,test_size=0.2,random_state=100)

## Data Scaling

In [15]:
train_y=train_y.to_numpy().reshape(-1,1)

test_y=test_y.to_numpy().reshape(-1,1)

In [16]:
scale_x = MinMaxScaler().fit(train_x)
scale_y = MinMaxScaler().fit(train_y)

train_x = scale_x.transform(train_x)
train_y = scale_y.transform(train_y)

In [17]:
tran_x = StandardScaler().fit(train_x)
tran_y = StandardScaler().fit(train_y)

train_x = tran_x.transform(train_x)
train_y = tran_y.transform(train_y)

In [18]:
test_x=scale_x.transform(test_x)
test_x=tran_x.transform(test_x)

In [19]:
test_y=scale_y.transform(test_y)

test_y=tran_y.transform(test_y)

## Model

In [20]:
para = {
    'n_neighbors':[3,5,7,12],
    'weights' : ['uniform', 'distance'],
    'metric':['minkowski','manhattan','euclidean']
}

In [21]:
dia_reg=GridSearchCV(KNeighborsRegressor(),para,cv=10)

In [22]:
dia_reg.fit(train_x,train_y)

[WinError 2] The system cannot find the file specified
  File "c:\Users\Neil\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\Neil\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Neil\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Neil\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [23]:
dia_reg.best_score_

0.9691363520598696

In [24]:
dia_reg.best_params_

{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}

In [25]:
reg = KNeighborsRegressor(n_neighbors=5,weights='distance',metric='manhattan')

In [26]:
reg.fit(train_x,train_y)

In [27]:
pred=reg.predict(test_x)

## Evalution

In [28]:
r2_score(pred,test_y)

0.9637714519943994

In [29]:
mean_absolute_error(pred,test_y)

0.08567421379887234

In [30]:
mean_squared_error(pred,test_y)

0.033703446910935325

In [31]:
np.sqrt(mean_squared_error(pred,test_y))

0.1835849855269633