# Assignment 4: K Nearest Neighbour Regressor

In class we implemented K Nearest Neighbors as a classifier. The task is to convert that K Nearest Neighbors code from a classifier into a regressor: analyze which parts of the code need to change, make those changes, and submit the updated code along with a demonstration of its use on a regression dataset.

In [50]:
import numpy as np
import pandas as pd
import statistics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [23]:
df = pd.read_csv('/kaggle/input/student-performance-grade-prediction-dataset/Student_Performance.csv')

# EDA

In [24]:
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [25]:
df.tail()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0
9999,7,74,No,8,1,64.0


In [26]:
df.shape

(10000, 6)

In [27]:
df.isnull().sum()

Hours Studied                       0
Previous Scores                     0
Extracurricular Activities          0
Sleep Hours                         0
Sample Question Papers Practiced    0
Performance Index                   0
dtype: int64

In [28]:
# Encode 'Extracurricular Activities' as 1 (Yes) / 0 (No).
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell keeps the already-encoded 0/1 values instead of turning
# them into NaN (which is what Series.map(dict) does for unmatched values).
activity_codes = {'Yes': 1, 'No': 0}
df['Extracurricular Activities'] = df['Extracurricular Activities'].map(
    lambda value: activity_codes.get(value, value)
)

# Fail loudly on unexpected labels rather than feeding them into the model.
assert df['Extracurricular Activities'].isin([0, 1]).all(), \
    "Unexpected value in 'Extracurricular Activities' (expected 'Yes'/'No')"

In [29]:
# Split the frame into the regression target (Y) and the feature matrix (X)
Y = df['Performance Index']
X = df.drop(columns=['Performance Index'])

In [30]:
X.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced
0,7,99,1,9,1
1,4,82,0,4,2
2,8,51,1,7,2
3,5,52,1,5,2
4,7,75,0,8,5


In [31]:
Y.head()

0    91.0
1    65.0
2    45.0
3    36.0
4    66.0
Name: Performance Index, dtype: float64

In [32]:
# 2. Convert the pandas objects into plain NumPy arrays
X, Y = X.to_numpy(), Y.to_numpy()

In [34]:
X.shape

(10000, 5)

In [35]:
Y.shape

(10000,)

In [36]:
# 3. Hold out 20% of the rows as a test set (fixed seed keeps the split reproducible)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [37]:
print("\nX.shape:", X.shape, "X_train.shape:", X_train.shape, "X_test.shape:", X_test.shape)
print("Y.shape:", Y.shape, "Y_train.shape:", Y_train.shape, "Y_test.shape:", Y_test.shape)


X.shape: (10000, 5) X_train.shape: (8000, 5) X_test.shape: (2000, 5)
Y.shape: (10000,) Y_train.shape: (8000,) Y_test.shape: (2000,)


## Append the Target Column to the Training Features

In [38]:
# Append Y_train as the last column of X_train so that every training row
# carries its own target value (the KNN code reads the target from row[-1]).
#
# np.column_stack promotes both inputs to a common dtype, whereas np.insert
# converts the inserted values to X_train's dtype and would silently truncate
# fractional targets when the features are integers.
#
# The shape check makes the cell safe to re-run: the target is only appended
# while X_train still has the same number of columns as X_test.
if X_train.shape[1] == X_test.shape[1]:
    X_train = np.column_stack((X_train, Y_train))

print("\nX_train after inserting Y_train (shape):", X_train.shape)


X_train after inserting Y_train (shape): (8000, 6)


In [39]:
print("First row of X_train after insert (features + target):", X_train[0]) 

First row of X_train after insert (features + target): [ 9 82  1  7  8 83]


# KNN Regressor Model Implementation

In [42]:
class KNN_Regressor():
    """Brute-force K-nearest-neighbours regressor.

    Every training row is expected to hold its feature values followed by the
    target value in the LAST position. A prediction for a query point is the
    arithmetic mean of the targets of the ``k`` closest training rows.
    """

    # Metric names accepted in ``distance_metric``.
    SUPPORTED_METRICS = ('euclidean', 'manhattan')

    def __init__(self, distance_metric):
        """Remember which distance metric to use.

        Parameters
        ----------
        distance_metric : str
            Either ``'euclidean'`` or ``'manhattan'``. The value is checked
            when a distance is actually computed.
        """
        self.distance_metric = distance_metric

    def _reduce_differences(self, differences):
        """Collapse per-feature differences (last axis) into distances.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(differences ** 2, axis=-1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(differences), axis=-1)
        raise ValueError(
            f"Unsupported distance_metric {self.distance_metric!r}; "
            f"expected one of {self.SUPPORTED_METRICS}"
        )

    @staticmethod
    def _query_features(test_data, n_features):
        """Return the first ``n_features`` values of the query point as an array.

        Raises
        ------
        ValueError
            If the query point has fewer than ``n_features`` values.
        """
        query = np.asarray(test_data)
        if query.ndim != 1 or query.shape[0] < n_features:
            raise ValueError(
                f"test data point must be 1-D with at least {n_features} "
                f"feature values, got shape {query.shape}"
            )
        # Extra trailing values (e.g. a target column) are ignored.
        return query[:n_features]

    def get_distance_metric(self, training_data_point, test_data_point):
        """Distance between one training row and one query point.

        Parameters
        ----------
        training_data_point : sequence
            Feature values followed by the target; the last element is
            excluded from the distance.
        test_data_point : sequence
            Feature values of the query point.

        Returns
        -------
        The Euclidean or Manhattan distance over the feature positions.

        Raises
        ------
        ValueError
            If the metric is unsupported or the query point is too short.
        """
        training_row = np.asarray(training_data_point)
        n_features = len(training_row) - 1  # last entry is the target
        query = self._query_features(test_data_point, n_features)
        return self._reduce_differences(training_row[:n_features] - query)

    def nearest_neighbors(self, X_train, test_data, k):
        """Return the ``k`` training rows closest to ``test_data``.

        Parameters
        ----------
        X_train : 2-D array-like
            Training rows (features + target in the last column).
        test_data : sequence
            Feature values of the query point.
        k : int
            Number of neighbours to return; must satisfy ``1 <= k <= len(X_train)``.

        Returns
        -------
        list
            The ``k`` nearest rows (each including its target), ordered from
            closest to farthest. Rows at equal distance keep their original
            relative order.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D, ``k`` is out of range, the metric is
            unsupported, or the query point is too short.
        """
        training_rows = np.asarray(X_train)
        if training_rows.ndim != 2:
            raise ValueError(
                f"X_train must be 2-D (rows x columns), got {training_rows.ndim}-D"
            )

        n_rows = training_rows.shape[0]
        if not 1 <= k <= n_rows:
            raise ValueError(
                f"k must be between 1 and the number of training rows ({n_rows}), got {k}"
            )

        n_features = training_rows.shape[1] - 1  # last column is the target
        query = self._query_features(test_data, n_features)

        # One vectorised pass over all rows instead of a Python loop per row.
        distances = self._reduce_differences(training_rows[:, :n_features] - query)

        # A stable sort keeps equally distant rows in their original order.
        nearest_indices = np.argsort(distances, kind='stable')[:k]
        return [training_rows[index] for index in nearest_indices]

    def predict(self, X_train, test_data, k):
        """Predict the target for ``test_data`` as the mean of its neighbours' targets.

        Parameters
        ----------
        X_train : 2-D array-like
            Training rows (features + target in the last column).
        test_data : sequence
            Feature values of the query point.
        k : int
            Number of neighbours to average over.

        Returns
        -------
        The mean of the last-column values of the ``k`` nearest training rows.

        Raises
        ------
        ValueError
            Propagated from ``nearest_neighbors`` for invalid inputs.
        """
        neighbors = self.nearest_neighbors(X_train, test_data, k)
        return np.mean([row[-1] for row in neighbors])


# Model Training and Prediction

In [43]:
regressor = KNN_Regressor(distance_metric='euclidean')

In [44]:
# Predict for a single test data point (e.g., the first one)
print("\n--- Single Prediction ---")
single_prediction = regressor.predict(X_train, X_test[0], k=5)
print("Test data point (X_test[0]):", X_test[0])
print("True value (Y_test[0]):", Y_test[0])
print("Predicted value for X_test[0]:", single_prediction)


--- Single Prediction ---
Test data point (X_test[0]): [ 3 48  1  7  4]
True value (Y_test[0]): 26.0
Predicted value for X_test[0]: 27.2


In [57]:
# Predict for all test data points
X_test_size = X_test.shape[0]
y_pred = [
    regressor.predict(X_train, X_test[row], k=5)
    for row in range(X_test_size)
]

In [59]:
y_pred = np.array(y_pred)

In [60]:
print(y_pred)

[27.2 36.  80.6 ... 44.4 32.6 68.4]


# Model Evaluation

In [62]:
# Error metrics comparing the test-set targets with the KNN predictions
mse = mean_squared_error(Y_test, y_pred)
mae = mean_absolute_error(Y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from MSE

In [63]:
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Mean Absolute Error (MAE): 1.9087
Mean Squared Error (MSE): 5.8641
Root Mean Squared Error (RMSE): 2.4216
