In [67]:
import numpy as np
import matplotlib.pyplot as plt
import math
import pandas as pd

In [68]:
from sklearn.model_selection import train_test_split

df = pd.read_csv('Salary_dataset.csv')
df.drop(columns='Unnamed: 0', inplace=True)

X = df['YearsExperience']
y = df['Salary']
X_train, X_test, y_train, y_test  = train_test_split(X, y)

In [73]:
class KNN:
    def __init__(self):
        pass
    
    @staticmethod
    def calculate_distance(x1, x2):
        """
        Calculates the Euclidean distance between two points.
        """
        return np.sqrt(np.sum((x1 - x2) ** 2))

    @staticmethod
    def find_nearest_neighbors(X_train, y_train, query_point, num_neighbors, distance_func=None):
        """
        Finds the nearest neighbors to a query point.
        """
        # Input validation
        if not isinstance(num_neighbors, int) or num_neighbors <= 0:
            raise ValueError("num_neighbors must be a positive integer")
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length")
        if distance_func is not None and not callable(distance_func):
            raise ValueError("distance_func must be a callable function")
        
        # Default distance function (Euclidean distance)
        if distance_func is None:
            distance_func = lambda x1, x2: np.sqrt(np.sum((x1 - x2) ** 2))
        
        # Calculate distances using the provided distance function
        distances = np.array([distance_func(query_point, x) for x in X_train])
        
        # Find indices of nearest neighbors
        nearest_indices = np.argsort(distances)[:num_neighbors]
        nearest_neighbors = [(X_train[i], y_train[i], distances[i]) for i in nearest_indices]
        return nearest_neighbors

    @staticmethod
    def predict_knn_regression(X_train, y_train, query_point, num_neighbors, weighting='uniform', sigma=1.0):
        """
        Predicts a target value using KNN regression.
        """
        # Input validation
        if not isinstance(num_neighbors, int) or num_neighbors <= 0:
            raise ValueError("num_neighbors must be a positive integer")
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length")
        if weighting not in ['uniform', 'inverse_distance', 'gaussian']:
            raise ValueError("weighting must be one of 'uniform', 'inverse_distance', or 'gaussian'")
        
        # Find nearest neighbors
        nearest_neighbors = KNN.find_nearest_neighbors(X_train, y_train, query_point, num_neighbors)
        distances = np.array([neighbor[2] for neighbor in nearest_neighbors])
        
        # Handle special case when distance is 0
        if np.any(distances == 0):
            return nearest_neighbors[np.argmax(distances)][1]
        
        # Predict based on weighting scheme
        if weighting == 'uniform':
            neighbor_outputs = np.array([neighbor[1] for neighbor in nearest_neighbors])      
            predicted_value = np.mean(neighbor_outputs)
        elif weighting == 'inverse_distance':
            weights = 1 / distances
            weighted_sum = np.sum(weights * np.array([neighbor[1] for neighbor in nearest_neighbors]))
            total_weight = np.sum(weights)
            if total_weight == 0:
                raise ValueError("Total weight is zero, cannot perform inverse distance weighting")
            predicted_value = weighted_sum / total_weight
        elif weighting == 'gaussian':
            weights = np.exp(- (distances ** 2) / (2 * sigma ** 2))
            weighted_sum = np.sum(weights * np.array([neighbor[1] for neighbor in nearest_neighbors]))
            total_weight = np.sum(weights)
            if total_weight == 0:
                raise ValueError("Total weight is zero, cannot perform Gaussian weighting")
            predicted_value = weighted_sum / total_weight
        
        return predicted_value
    
    @staticmethod
    def predict_knn_classification(X_train, y_train, test_point, num_neighbors):
        """
        Predicts a class label using KNN classification.
        """
        # Input validation
        if not isinstance(num_neighbors, int) or num_neighbors <= 0:
            raise ValueError("num_neighbors must be a positive integer")
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length")
        
        # Find nearest neighbors
        nearest_neighbors = KNN.find_nearest_neighbors(X_train, y_train, test_point, num_neighbors)
        if len(nearest_neighbors) == 0:
            raise ValueError("No nearest neighbors found")
        
        # Count occurrences of each label among nearest neighbors
        neighbor_labels = np.array([neighbor[1] for neighbor in nearest_neighbors])
        unique_labels, counts = np.unique(neighbor_labels, return_counts=True)
        
        # Select the most common label as the prediction
        predicted_label = unique_labels[np.argmax(counts)]
        return predicted_label


In [74]:
model = KNN()

y_pred = []
for query_point in X_test:
    # Use the trained model to make predictions for each data point in X_test
    predicted_value = model.predict_knn_regression(X_train, y_train, query_point, num_neighbors=3, weighting='gaussian', sigma=1.0)
    y_pred.append(predicted_value)

KeyError: 13