In [None]:
import numpy as np
import pandas as pd

In [None]:
# Location of the raw dataset; the /content/drive prefix is presumably a
# Google Drive mount inside Colab — adjust when running elsewhere.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Machine Learning/diamonds.csv'

df = pd.read_csv(DATA_PATH)
df.shape

(53940, 10)

In [None]:
# Preview the first rows to sanity-check how the columns were parsed.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [None]:
# Number of rows flagged by pandas as repeats of an earlier row.
df.duplicated().sum()

np.int64(146)

In [None]:
# Keep the first occurrence of each repeated row; the original index labels
# are retained (no reset), so gaps in the index are expected afterwards.
df = df.drop_duplicates()

In [None]:
# Column dtypes and non-null counts after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53794 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53794 non-null  float64
 1   cut      53794 non-null  object 
 2   color    53794 non-null  object 
 3   clarity  53794 non-null  object 
 4   depth    53794 non-null  float64
 5   table    53794 non-null  float64
 6   price    53794 non-null  int64  
 7   x        53794 non-null  float64
 8   y        53794 non-null  float64
 9   z        53794 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.5+ MB


In [None]:
# Frequency of each cut grade.
df['cut'].value_counts()

Unnamed: 0_level_0,count
cut,Unnamed: 1_level_1
Ideal,21488
Premium,13748
Very Good,12069
Good,4891
Fair,1598


In [None]:
# Frequency of each color grade.
df['color'].value_counts()

Unnamed: 0_level_0,count
color,Unnamed: 1_level_1
G,11262
E,9776
F,9520
H,8272
D,6755
I,5407
J,2802


In [None]:
# Frequency of each clarity grade.
df['clarity'].value_counts()

Unnamed: 0_level_0,count
clarity,Unnamed: 1_level_1
SI1,13032
VS2,12229
SI2,9150
VS1,8156
VVS2,5056
VVS1,3647
IF,1784
I1,740


## Step - 3: Split the data - Test and Train (recommended 75:25 split)

In [None]:
# Predictors are every column except the target; the target is the price column.
X = df.drop(columns='price')
Y = df['price']
tuple(part.shape for part in (X, Y))

((53794, 9), (53794,))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((40345, 9), (13449, 9), (40345,), (13449,))

## Step - 4: Data Preprocessing on X_train

In [None]:
# Distinct clarity labels, needed to spell out the ordinal category order.
df['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [None]:
# Distinct color labels, needed to spell out the ordinal category order.
df['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [None]:
# Distinct cut labels, needed to spell out the ordinal category order.
df['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [None]:
# Category orders for ordinal encoding. The position of a label in its list
# becomes its integer code, so the order must follow the real grading scale.

# cut: worst -> best
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']

# color: D is the best grade and J the worst in this dataset, so this list runs
# best -> worst (opposite to cut and clarity). Only the spacing between codes
# matters for distance-based models, so the direction is left unchanged.
color_order = ['D', 'E', 'F', 'G', 'H', 'I', 'J']

# clarity: worst -> best. Within each SI / VS / VVS pair the grade numbered 1
# is the better one, so SI2 precedes SI1, VS2 precedes VS1 and VVS2 precedes VVS1.
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# One explicit category list per encoded column, given in the order
# cut, color, clarity; each label is mapped to its position in its list.
oe = OrdinalEncoder(categories=[cut_order, color_order, clarity_order])
oe.set_output(transform='pandas')  # configures the estimator and returns it
oe

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler for the numeric columns, configured to return DataFrames.
mms = MinMaxScaler()
mms.set_output(transform='pandas')  # configures the estimator and returns it
mms

In [None]:
from sklearn.compose import ColumnTransformer

# Route the categorical columns through the ordinal encoder and the numeric
# columns through the min-max scaler; any other column passes through untouched.
# NOTE(review): the ordinal codes keep their raw integer range while the numeric
# columns are scaled, so the categorical columns carry more weight in Euclidean
# distances — confirm this is intended for KNN.
ct = ColumnTransformer(
    transformers=[
        ('Ordinal Encoding', oe, ['cut', 'color', 'clarity']),
        ('Scaling', mms, ['carat', 'depth', 'table', 'x', 'y', 'z']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
ct.set_output(transform='pandas')  # configures the transformer and returns it
ct

In [None]:
# Fit the encoder and scaler on the training split only, then transform that split.
x_train_transformed= ct.fit_transform(X_train)
x_train_transformed

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
12820,4.0,3.0,4.0,0.170478,0.538889,0.269231,0.602421,0.107980,0.496278
19997,2.0,2.0,6.0,0.176715,0.508333,0.307692,0.603352,0.111375,0.496278
6099,3.0,3.0,1.0,0.147609,0.544444,0.288462,0.574488,0.104244,0.477667
37984,4.0,0.0,6.0,0.024948,0.497222,0.269231,0.408752,0.075552,0.334988
24865,3.0,3.0,4.0,0.274428,0.525000,0.250000,0.688082,0.123599,0.563275
...,...,...,...,...,...,...,...,...,...
11311,3.0,4.0,4.0,0.166320,0.483333,0.288462,0.606145,0.109677,0.486352
44869,4.0,3.0,1.0,0.089397,0.519444,0.211538,0.513966,0.094397,0.424318
38271,4.0,6.0,1.0,0.024948,0.522222,0.228846,0.408752,0.075042,0.337469
860,3.0,6.0,1.0,0.145530,0.550000,0.307692,0.570764,0.102377,0.473945


## Step - 5: Data Preprocessing on X_test

In [None]:
# Apply the transformer already fitted on X_train (transform only, no refit),
# so the test split does not influence the preprocessing parameters.
x_test_transformed= ct.transform(X_test)
x_test_transformed

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
43657,0.0,6.0,4.0,0.106029,0.608333,0.211538,0.524209,0.093888,0.449132
4274,2.0,1.0,2.0,0.145530,0.500000,0.307692,0.571695,0.104924,0.466501
47412,4.0,3.0,3.0,0.076923,0.530556,0.250000,0.493482,0.090492,0.409429
44437,3.0,1.0,4.0,0.064449,0.541667,0.326923,0.472998,0.086587,0.394541
13975,4.0,5.0,1.0,0.205821,0.536111,0.230769,0.633147,0.115789,0.526055
...,...,...,...,...,...,...,...,...,...
43980,2.0,4.0,2.0,0.024948,0.483333,0.307692,0.408752,0.075042,0.330025
1115,2.0,4.0,3.0,0.110187,0.494444,0.269231,0.540037,0.098981,0.439206
48829,4.0,3.0,5.0,0.066528,0.527778,0.230769,0.479516,0.087946,0.397022
42876,0.0,1.0,1.0,0.074844,0.597222,0.346154,0.486034,0.086418,0.413151


## Step - 6: Build the model and predict on X_test (SCRATCH IMPLEMENTATION)
Implement the KNN algorithm from scratch and use it to predict on the test data. Do not use scikit-learn's KNN implementation in this step; write the complete implementation yourself.

In [None]:
def manual_knn(x_train,y_train,x_test,k):
  """Predict regression targets with a brute-force k-nearest-neighbours search.

  For every row of ``x_test`` the Euclidean distance to every row of
  ``x_train`` is computed, the ``k`` closest training rows are selected, and
  the prediction is the unweighted mean of their targets.

  Parameters
  ----------
  x_train : DataFrame or array-like, shape (n_train, n_features)
      Numeric training features.
  y_train : Series or array-like, shape (n_train,)
      Training targets, aligned with ``x_train`` by position (not by label).
  x_test : DataFrame or array-like, shape (n_test, n_features)
      Numeric query features, columns in the same order as ``x_train``.
  k : int
      Number of neighbours to average. Values larger than ``n_train`` use
      every training row.

  Returns
  -------
  numpy.ndarray
      One prediction per row of ``x_test``.

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1, the training set is empty, the inputs are
      not two-dimensional, or the shapes of the inputs are inconsistent.
  """
  if k < 1:
    raise ValueError(f"k must be a positive integer, got {k!r}")

  # Convert to plain arrays once, outside the loop, so the per-row work is
  # pure NumPy arithmetic and ndarray inputs are accepted as well.
  train = np.asarray(x_train, dtype=float)
  queries = np.asarray(x_test, dtype=float)
  targets = np.asarray(y_train)

  if train.ndim != 2 or queries.ndim != 2:
    raise ValueError("x_train and x_test must both be two-dimensional")
  if train.shape[0] == 0:
    raise ValueError("x_train must contain at least one row")
  if train.shape[1] != queries.shape[1]:
    raise ValueError(
      f"feature count mismatch: x_train has {train.shape[1]}, "
      f"x_test has {queries.shape[1]}"
    )
  if targets.shape[0] != train.shape[0]:
    raise ValueError(
      f"x_train has {train.shape[0]} rows but y_train has {targets.shape[0]}"
    )

  # Asking for more neighbours than training rows falls back to all rows.
  n_neighbors = min(k, train.shape[0])

  y_pred = []
  for query in queries:
    dist = np.sqrt(((train - query) ** 2).sum(axis=1))
    # Stable sort: equidistant neighbours are taken in training-row order,
    # which keeps the result deterministic.
    nearest = np.argsort(dist, kind="stable")[:n_neighbors]
    y_pred.append(targets[nearest].mean(axis=0))
  return np.array(y_pred)

In [None]:
# From-scratch KNN predictions for the test split, averaging 5 neighbours.
y_pred = manual_knn(
    x_train_transformed,
    Y_train,
    x_test_transformed,
    k=5,
)
y_pred

array([2567.4, 3777.6, 1786.8, ..., 2017. , 1376.4, 2814.4])

## Step - 7: Evaluate your model

In [None]:
from sklearn.metrics import r2_score

# R² of the from-scratch predictions against the held-out test targets.
r2_score(y_true=Y_test, y_pred=y_pred)

0.957424235511545

## Step - 8: Train a model using sklearn KNN Algorithm and compare the results with your scratch implementation

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Same neighbour count (5) as the from-scratch run so the results are comparable.
knn= KNeighborsRegressor(n_neighbors=5)
knn

In [None]:
# Fit on the same preprocessed training data that the from-scratch version used.
knn.fit(x_train_transformed,Y_train)

In [None]:
# Score on the training data itself; expect this to be optimistic compared
# with the test-set score, since the model has already seen these rows.
knn.score(x_train_transformed,Y_train)

0.975925007978189

In [None]:
# scikit-learn predictions for the test split.
# NOTE: this rebinds y_pred, replacing the from-scratch predictions stored earlier.
y_pred= knn.predict(x_test_transformed)
y_pred

array([2567.4, 3777.6, 1786.8, ..., 2017. , 1376.4, 2814.4])

In [None]:
# R² of the scikit-learn predictions against the held-out test targets.
r2_score(y_true=Y_test, y_pred=y_pred)

0.957424235511545

The from-scratch implementation and scikit-learn's `KNeighborsRegressor` produce identical predictions on the test set, and therefore the same R² score of approximately 0.957.