## Write the KNN code from scratch and make it work for diamond dataset ?

## Step - 1: Load the data

In [1]:
import pandas as pd
import numpy as np

In [3]:
data=pd.read_csv("diamonds.csv")

In [4]:
data

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [5]:
data.shape

(53940, 10)

In [6]:
data.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

## Step - 2: Identify input and output variables

In [7]:
x=data.drop("price",axis=1)
y=data["price"]
x.shape,y.shape

((53940, 9), (53940,))

In [8]:
x.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75


In [9]:
y.head()

0    326
1    326
2    327
3    334
4    335
Name: price, dtype: int64

## Step - 3: Split the data - Test and Train (recommended 75:25 split)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

In [12]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((40455, 9), (13485, 9), (40455,), (13485,))

## Step - 4: Data Preprocessing on X_train (You can use sklearn for data preprocessing)

### Categorical Data Encoding

In [13]:
x_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
35965,0.25,Good,E,VVS2,64.9,58.0,3.95,3.97,2.57
52281,0.84,Ideal,J,SI1,61.8,56.0,6.04,6.07,3.74
6957,1.05,Premium,J,VS2,61.1,58.0,6.56,6.51,3.99
9163,1.02,Ideal,F,SI2,60.7,56.0,6.53,6.50,3.95
50598,0.61,Ideal,F,VS1,61.8,57.0,5.43,5.47,3.37
...,...,...,...,...,...,...,...,...,...
11284,1.05,Very Good,I,VS2,62.4,59.0,6.48,6.51,4.05
44732,0.47,Ideal,D,VS1,61.0,55.0,5.03,5.01,3.06
38158,0.33,Very Good,F,IF,60.3,58.0,4.49,4.46,2.70
860,0.90,Premium,J,SI1,62.8,59.0,6.13,6.03,3.82


In [14]:
from sklearn.preprocessing import OrdinalEncoder

In [15]:
data.color.unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [16]:
data.clarity.unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

In [17]:
ordinal_encoder=OrdinalEncoder(categories=[["Fair","Good","Very Good","Premium","Ideal"],["J","I","H","G","F","E","D"],['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']]).set_output(transform="pandas")

In [18]:
ordinal_encoder

### Numerical Data Rescaling

In [19]:
from sklearn.preprocessing import MinMaxScaler

In [20]:
minmax_scaler=MinMaxScaler().set_output(transform="pandas")

In [21]:
from sklearn.compose import ColumnTransformer

In [22]:
ct=ColumnTransformer([('encoder',ordinal_encoder,["cut","color","clarity"]),
 ('scaler',minmax_scaler,['carat', 'depth', 'table', 'x', 'y',
       'z'])],remainder="passthrough",verbose_feature_names_out=False).set_output(transform="pandas")

In [23]:
data.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [24]:
x_train_transformed=ct.fit_transform(x_train)

In [25]:
x_train_transformed

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
35965,1.0,5.0,5.0,0.010395,0.608333,0.288462,0.367784,0.067402,0.080818
52281,4.0,0.0,2.0,0.133056,0.522222,0.250000,0.562384,0.103056,0.117610
6957,3.0,0.0,3.0,0.176715,0.502778,0.288462,0.610801,0.110526,0.125472
9163,4.0,4.0,1.0,0.170478,0.491667,0.250000,0.608007,0.110357,0.124214
50598,4.0,4.0,4.0,0.085239,0.522222,0.269231,0.505587,0.092869,0.105975
...,...,...,...,...,...,...,...,...,...
11284,2.0,1.0,3.0,0.176715,0.538889,0.307692,0.603352,0.110526,0.127358
44732,4.0,6.0,4.0,0.056133,0.500000,0.230769,0.468343,0.085059,0.096226
38158,2.0,4.0,7.0,0.027027,0.480556,0.288462,0.418063,0.075722,0.084906
860,3.0,0.0,2.0,0.145530,0.550000,0.307692,0.570764,0.102377,0.120126


# Step - 5: Data Preprocessing on X_test

In [26]:
x_test_transformed=ct.transform(x_test)

In [27]:
x_test_transformed.values

array([[4.        , 3.        , 6.        , ..., 0.36964618, 0.06791171,
        0.07767296],
       [2.        , 4.        , 5.        , ..., 0.50651769, 0.09202037,
        0.10251572],
       [4.        , 5.        , 5.        , ..., 0.44320298, 0.08047538,
        0.0927673 ],
       ...,
       [4.        , 5.        , 3.        , ..., 0.63873371, 0.11544992,
        0.13396226],
       [3.        , 4.        , 1.        , ..., 0.59683426, 0.10950764,
        0.12578616],
       [3.        , 3.        , 1.        , ..., 0.60242086, 0.1089983 ,
        0.12735849]])

## Step - 6: Build the model and predict on X_test (FROM SCRATCH)
### Implement the KNN algorithm from scratch and generate predictions for the test data. Do not use the sklearn KNN algorithm. Write the complete KNN implementation manually.

In [28]:
x_train_transformed

Unnamed: 0,cut,color,clarity,carat,depth,table,x,y,z
35965,1.0,5.0,5.0,0.010395,0.608333,0.288462,0.367784,0.067402,0.080818
52281,4.0,0.0,2.0,0.133056,0.522222,0.250000,0.562384,0.103056,0.117610
6957,3.0,0.0,3.0,0.176715,0.502778,0.288462,0.610801,0.110526,0.125472
9163,4.0,4.0,1.0,0.170478,0.491667,0.250000,0.608007,0.110357,0.124214
50598,4.0,4.0,4.0,0.085239,0.522222,0.269231,0.505587,0.092869,0.105975
...,...,...,...,...,...,...,...,...,...
11284,2.0,1.0,3.0,0.176715,0.538889,0.307692,0.603352,0.110526,0.127358
44732,4.0,6.0,4.0,0.056133,0.500000,0.230769,0.468343,0.085059,0.096226
38158,2.0,4.0,7.0,0.027027,0.480556,0.288462,0.418063,0.075722,0.084906
860,3.0,0.0,2.0,0.145530,0.550000,0.307692,0.570764,0.102377,0.120126


In [29]:
from sklearn.metrics.pairwise import euclidean_distances

## Train a model using sklearn KNN Algorithm

In [30]:
from sklearn.neighbors import KNeighborsRegressor

In [31]:
knn=KNeighborsRegressor(n_neighbors=5)

In [32]:
knn

In [33]:
knn.fit(x_train_transformed,y_train)

In [34]:
y_pred=knn.predict(x_test_transformed)

In [35]:
from sklearn.metrics import r2_score

In [36]:
r2_score(y_pred,y_test)

0.9598128959744467

## Scrach implementation of knn

In [37]:
def knn_regression(X_train, y_train, X_test, k):
    preds = []

    for x_test in X_test.values:
        distances = np.sqrt(np.sum((X_train - x_test) ** 2, axis=1))
        k_idx = np.argsort(distances)[:k]
        preds.append(y_train.iloc[k_idx].mean())

    return np.array(preds)


In [38]:
k=5
y_pred=knn_regression(x_train_transformed,y_train,x_test_transformed,k)

In [39]:
from sklearn.metrics import r2_score
r2_score(y_pred,y_test)

0.9598134256703712

## Both the manual KNN implementation and the algorithm gave the same accuracy.