# KNN - Regression

In [1]:
# Importing Libraries

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Allow plots to appear directly in the notebook
%matplotlib inline

## Example : Advertising Data

In [2]:
# Read data into a DataFrame

# The CSV location can be overridden with the ADVERTISING_CSV environment
# variable so the notebook can run on other machines; when the variable is
# not set, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'ADVERTISING_CSV',
    'C:\\Users\\pc\\Desktop\\Data Science\\MachineLearning\\Starts\\Advertising.csv',
)

# index_col=0: use the first column of the file as the DataFrame index
df = pd.read_csv(DATA_PATH, index_col=0)

In [3]:
# Preview the first five rows of the data
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [4]:
# Summarise the columns: dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 7.8 KB


## Preparing X and y using pandas

In [5]:
# Declare the independent variables X and the dependent variable y

# names of the feature columns, kept in a Python list
feature_cols = ['TV', 'Radio', 'Newspaper']

# select those columns (all rows) from the original DataFrame
X = df.loc[:, feature_cols]

# the same selection can be written in one line with double square brackets,
# i.e. X = df[['TV', 'Radio', 'Newspaper']]
# the inner brackets are a list
# the outer brackets access a subset of the original DataFrame


# select the target variable (all rows of the 'Sales' column)
y = df.loc[:, 'Sales']



In [6]:
# Check the shapes of X and y
features_shape, target_shape = X.shape, y.shape
print(features_shape, target_shape)

(200, 3) (200,)


### Splitting X and y into training and testing sets

In [7]:
# Import the helper used to split the data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,   # same as the library default, spelled out
    random_state=0,   # fixed seed so the split is reproducible
)

In [8]:
# the split puts 75% of the rows in the training set and 25% in the testing set
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(150, 3)
(50, 3)
(150,)
(50,)


### Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# scaler to both splits.
# NOTE: X_train / X_test are overwritten here, so re-running this cell without
# re-running the split would scale data that has already been scaled.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Fitting KNN regression model on training dataset

In [10]:
from sklearn import neighbors

# number of neighbours used for this first model
initial_k = 3
regressor = neighbors.KNeighborsRegressor(n_neighbors=initial_k)

In [11]:
# Fitting the KNN regression model on the training data

regressor = regressor.fit(X_train, y_train)

### Make predictions on test dataset

In [12]:
# Make predictions on the testing set

y_pred = regressor.predict(X_test)

In [13]:
# Import module to be used for model evaluation

from sklearn import metrics

In [14]:
# Mean squared error between the test targets and the predictions
mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE", mse)

MSE 3.354955555555556


In [16]:
# Root mean squared error: the square root of the MSE

rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RMSE", rmse)

RMSE 1.8316537761147864


In [17]:
# Cross-check: put the first few actual and predicted values side by side

n_preview = 10
df_check = pd.DataFrame(
    {
        'Actual': y_test.iloc[:n_preview].to_numpy(),
        'Predicted': np.ravel(y_pred[:n_preview]),
    }
)
df_check

Unnamed: 0,Actual,Predicted
0,11.3,10.2
1,8.4,8.3
2,8.7,9.2
3,25.4,25.633333
4,11.7,11.766667
5,8.7,7.266667
6,7.2,10.633333
7,13.2,13.3
8,9.2,9.5
9,16.6,15.833333


# Choosing Value of K

In [18]:
# RMSE on the test set for each candidate k, stored in order k = 1..10
rmse_val = []

# NOTE: every k is scored on the same test split, so the best-looking k is
# tuned to that split. `regressor` is rebound on each pass and refers to the
# k = 10 model once the loop finishes.
for K in range(1, 11):  # iterate over k directly instead of re-assigning the loop variable
    regressor = neighbors.KNeighborsRegressor(n_neighbors=K)

    regressor.fit(X_train, y_train)   # fit the model
    pred = regressor.predict(X_test)  # make prediction on test set

    error = np.sqrt(metrics.mean_squared_error(y_test, pred))  # calculate rmse
    rmse_val.append(error)  # store rmse values
    print('RMSE value for k= ' , K , 'is:', error)

RMSE value for k=  1 is: 1.7532826355154494
RMSE value for k=  2 is: 1.780884611646695
RMSE value for k=  3 is: 1.8316537761147864
RMSE value for k=  4 is: 1.6848071106212723
RMSE value for k=  5 is: 1.70229727133659
RMSE value for k=  6 is: 1.840310964061358
RMSE value for k=  7 is: 2.010006599643953
RMSE value for k=  8 is: 2.003875152049149
RMSE value for k=  9 is: 2.1218364978142854
RMSE value for k=  10 is: 2.189726010257904
