In [15]:
import numpy as np
from collections import Counter

In [16]:
class CustomKnn:
    """Container for a k-nearest-neighbours model's settings and training state.

    Attributes
    ----------
    k : int
        Number of neighbours to consult.
    X, y :
        Training features / labels; ``None`` until they are assigned.
    feature_encoding : dict
        Per-feature mapping storage, keyed by feature index; starts empty.
    """

    def __init__(self, k=3):
        """Store ``k`` and initialise empty training state.

        Raises
        ------
        TypeError
            If ``k`` is not an integer (Python ``int`` or NumPy integer).
        ValueError
            If ``k`` is zero or negative.
        """
        # A fractional k would pass the positivity check and only blow up
        # later when used as a count, so reject it at construction time.
        if not isinstance(k, (int, np.integer)):
            raise TypeError("k must be an integer")
        if k <= 0:
            raise ValueError("k must be positive")
        self.k = k
        self.X = None
        self.y = None
        self.feature_encoding = {}

    def validate(self, X, y=None):
        """Check that ``X`` is non-empty and, if given, ``y`` matches its length.

        Returns ``True`` when the checks pass.

        Raises
        ------
        ValueError
            If ``X`` is ``None`` or empty, or ``len(X) != len(y)``.
        """
        if X is None or len(X) == 0:
            raise ValueError("X can't be empty")
        if y is not None and len(X) != len(y):
            raise ValueError("y and X must have same length")
        return True

In [17]:
def encode_categories(self, data, feature_idx, fit=True):
    """Map the categorical values of one feature to integer codes.

    Parameters
    ----------
    data : iterable
        Values of a single feature column (must be hashable).
    feature_idx : int
        Key under which the mapping is stored in ``self.feature_encoding``.
    fit : bool, default True
        When True, (re)build the mapping from the distinct values in
        ``data``; codes are assigned in sorted order starting at 0.
        When False, reuse the mapping already stored for ``feature_idx``.

    Returns
    -------
    list
        One code per input value. Values absent from the mapping receive
        the median of the known codes.

    Raises
    ------
    KeyError
        If ``fit`` is False and no mapping exists for ``feature_idx``.
    """
    if fit:
        distinct = set(data)
        try:
            unique_vals = sorted(distinct)
        except TypeError:
            # Mutually unorderable values (e.g. strings mixed with None for
            # missing entries) cannot be sorted directly; fall back to a
            # deterministic order by type name, then repr.
            unique_vals = sorted(distinct, key=lambda v: (type(v).__name__, repr(v)))
        self.feature_encoding[feature_idx] = {val: idx for idx, val in enumerate(unique_vals)}

    encoding_map = self.feature_encoding[feature_idx]
    encoded = []
    # Computed lazily, once, the first time an unseen value shows up.
    unknown_code = None

    for val in data:
        if val in encoding_map:
            encoded.append(encoding_map[val])
        else:
            if unknown_code is None:
                unknown_code = np.median(list(encoding_map.values()))
            encoded.append(unknown_code)

    return encoded

# Attach the module-level function to the class so instances can call it as a method.
CustomKnn.encode_categories = encode_categories



In [18]:
def preprocess(self, X, fit=True):
    """Turn raw feature rows into a 2-D array with one column per feature.

    Each column is handled on its own:

    * numeric columns (Python or NumPy ints/floats, with ``None`` marking a
      missing entry) keep their values; missing entries are replaced by the
      mean of the non-missing values in ``X``, or by 0 if all are missing;
    * all other columns are delegated to ``self.encode_categories``.

    Parameters
    ----------
    X : sequence of rows
        Rows are indexed by feature position; the feature count is taken
        from the first row.
    fit : bool, default True
        When True, the kind of each column is inferred from its values.
        When False, a column is categorical exactly when
        ``self.feature_encoding`` holds a mapping for it, so data is
        transformed the same way it was when fitted.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), n_features)``.

    Raises
    ------
    ValueError
        If ``fit`` is False and a column without a stored mapping contains
        non-numeric values.
    """
    numeric_types = (int, float, np.integer, np.floating)
    X_processed = []
    n_features = len(X[0])

    for feature_idx in range(n_features):
        feature_values = [row[feature_idx] for row in X]

        # None is tolerated here so that missing numeric entries actually
        # reach the imputation code below.
        looks_numeric = all(
            val is None or isinstance(val, numeric_types) for val in feature_values
        )

        if fit:
            is_numeric = looks_numeric
            if is_numeric:
                # Forget any mapping left over from an earlier fit in which
                # this column was categorical.
                self.feature_encoding.pop(feature_idx, None)
        else:
            is_numeric = feature_idx not in self.feature_encoding
            if is_numeric and not looks_numeric:
                raise ValueError(
                    f"Feature {feature_idx} has no stored encoding but contains non-numeric values"
                )

        if is_numeric:
            valid_values = [val for val in feature_values if val is not None]
            if len(valid_values) == 0:
                feature_values = [0] * len(feature_values)
            else:
                # NOTE: the mean comes from the rows passed in this call.
                mean_val = np.mean(valid_values)
                feature_values = [val if val is not None else mean_val for val in feature_values]
            X_processed.append(feature_values)
        else:
            encoded = self.encode_categories(feature_values, feature_idx, fit=fit)
            X_processed.append(encoded)

    # Columns were collected feature-by-feature; transpose back to row-major.
    return np.array(X_processed).T

# Attach the module-level function to the class so instances can call it as a method.
CustomKnn.preprocess = preprocess

In [19]:
def fit(self, X, y):
    """Store the training set on the model.

    ``X`` and ``y`` are first checked with ``self.validate``; the features
    are then run through ``self.preprocess`` in fitting mode and kept in
    ``self.X``, while the labels are kept in ``self.y`` as a NumPy array.

    Returns the model itself so calls can be chained.
    """
    self.validate(X, y)

    prepared_features = self.preprocess(X, fit=True)
    self.X = prepared_features

    label_array = np.array(y)
    self.y = label_array

    return self

# Attach the module-level function to the class so instances can call it as a method.
CustomKnn.fit = fit

In [20]:
def distance(self, x1, x2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        Points with matching (or broadcastable) shapes. Plain Python
        sequences are accepted as well as NumPy arrays.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x1 - x2) ** 2))``.
    """
    # Cast to float before subtracting: plain lists do not support ``-``,
    # and unsigned-integer arrays would wrap around on negative differences.
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

# Attach the module-level function to the class so instances can call it as a method.
CustomKnn.distance = distance

In [21]:
def get_neighbors(self, x):
    """Return the indices of the ``self.k`` training rows closest to ``x``.

    Distances are measured with ``self.distance`` against every row of
    ``self.X``. Indices are returned nearest first; rows at equal distance
    keep their original relative order because the sort is stable.
    """
    dists = [self.distance(x, train_point) for train_point in self.X]

    # Rank row indices by their distance instead of sorting (idx, dist) pairs.
    ranked = sorted(range(len(dists)), key=dists.__getitem__)

    return ranked[:self.k]

# Attach the module-level function to the class so instances can call it as a method.
CustomKnn.get_neighbors = get_neighbors


In [22]:
def predict(self, X, return_details=False):
    """Classify each row of ``X`` by majority vote among its nearest neighbours.

    Parameters
    ----------
    X : sequence of rows
        Raw feature rows; each must have as many features as the training data.
    return_details : bool, default False
        When True, return one dict per row with the keys ``'prediction'``,
        ``'neighbors'`` (training-row indices), ``'neighbor_labels'`` and
        ``'neighbor_distances'``.

    Returns
    -------
    numpy.ndarray or list of dict
        Array of predicted labels, or the detail dicts described above.

    Raises
    ------
    ValueError
        If the model has not been fitted, if ``self.validate`` rejects ``X``,
        or if the rows of ``X`` do not have the fitted number of features.
    """
    if self.X is None:
        raise ValueError("Model must be fitted first")

    self.validate(X)

    # Guard against a width mismatch before doing any work: a single-feature
    # row would otherwise be silently broadcast against the training rows
    # and yield a meaningless distance. Only the first row is inspected,
    # matching how preprocessing determines the feature count.
    if self.X.ndim == 2 and len(X[0]) != self.X.shape[1]:
        raise ValueError(
            f"X has {len(X[0])} features per row, but the model was fitted with {self.X.shape[1]}"
        )

    X_test = self.preprocess(X, fit=False)

    predictions = []
    for x in X_test:
        neighbors = self.get_neighbors(x)
        neighbor_labels = [self.y[idx] for idx in neighbors]
        # most_common orders equal counts by first appearance, so a tied
        # vote goes to the label that appears earliest in neighbor_labels.
        winner = Counter(neighbor_labels).most_common(1)[0][0]

        if return_details:
            neighbor_distances = [self.distance(x, self.X[idx]) for idx in neighbors]
            predictions.append({
                'prediction': winner,
                'neighbors': neighbors,
                'neighbor_labels': neighbor_labels,
                'neighbor_distances': neighbor_distances
            })
        else:
            predictions.append(winner)

    return predictions if return_details else np.array(predictions)

# Attach the module-level function to the class so instances can call it as a method.
CustomKnn.predict = predict


In [23]:
# Toy data set: each record holds four feature values followed by its class label.
training_data = [
    [1, 4, 'A', 'D', 'Good'],
    [5, 9, 'E', 'I', 'Good'],
    [6, 7, 'F', 'G', 'Good'],
    [1, 1, 'A', 'A', 'Good'],
    [3, 8, 'C', 'H', 'Good'],
    [10, 4, 'J', 'D', 'Bad'],
    [5, 3, 'E', 'C', 'Bad'],
    [2, 3, 'B', 'C', 'Bad'],
    [6, 3, 'F', 'C', 'Bad'],
    [2, 8, 'B', 'H', 'Bad']
]

# Split every record into its feature vector (first four entries) and its label (fifth entry).
X = [record[:4] for record in training_data]
y = [record[4] for record in training_data]

# Short summary of what was loaded.
print(f"Number of training samples: {len(X)}")
print(f"Number of features: {len(X[0])}")
print(f"Classes: {set(y)}")
print("\nFirst 3 training samples:")
for sample_no, (features, label) in enumerate(zip(X[:3], y[:3]), start=1):
    print(f"  Sample {sample_no}: {features} -> {label}")

Number of training samples: 10
Number of features: 4
Classes: {'Bad', 'Good'}

First 3 training samples:
  Sample 1: [1, 4, 'A', 'D'] -> Good
  Sample 2: [5, 9, 'E', 'I'] -> Good
  Sample 3: [6, 7, 'F', 'G'] -> Good


In [24]:
# Single query point, laid out like a training feature row (x1..x4).
X_test = [[13, 7, 'Z', 'Z']]

print("Test Data Point to Classify:")
# Print each component as "x<position> = <value>", positions starting at 1.
for position, component in enumerate(X_test[0], start=1):
    print(f"  x{position} = {component}")

Test Data Point to Classify:
  x1 = 13
  x2 = 7
  x3 = Z
  x4 = Z


In [30]:

# Fit a 3-neighbour model on the toy data and explain its vote for the test point.
# Relies on CustomKnn, X, y, X_test and training_data from earlier cells.
knn = CustomKnn(k=3)
# k is written once (above); the messages below read it back so they cannot drift.
print(f"Testing Knn with k={knn.k}")
knn.fit(X, y)

results = knn.predict(X_test, return_details=True)
result = results[0]

print(f"\nTest Point: {X_test[0]}")
print(f"\n✓ PREDICTION: {result['prediction']}")
print(f"\nThe {len(result['neighbors'])} Nearest Neighbors:")


for j, idx in enumerate(result['neighbors']):
    print(f"\nNeighbor {j+1}:")
    print(f"  Training Data: {training_data[idx][:4]} -> {training_data[idx][4]}")
    print(f"  Distance: {result['neighbor_distances'][j]:.4f}")

label_counts = Counter(result['neighbor_labels'])

# The labels are NumPy scalars; convert the keys to plain str so the dict
# prints as {'Bad': 2, ...} instead of {np.str_('Bad'): 2, ...}.
vote_counts = {str(label): count for label, count in label_counts.items()}
print(f"Vote Count: {vote_counts}")
print(f"Winner: {result['prediction']} ({'GOOD' if result['prediction'] == 'Good' else 'BAD'})")


Testing Knn with k=3

Test Point: [13, 7, 'Z', 'Z']

✓ PREDICTION: Bad

The 3 Nearest Neighbors:

Neighbor 1:
  Training Data: [10, 4, 'J', 'D'] -> Bad
  Distance: 4.9497

Neighbor 2:
  Training Data: [6, 7, 'F', 'G'] -> Good
  Distance: 7.1764

Neighbor 3:
  Training Data: [6, 3, 'F', 'C'] -> Bad
  Distance: 8.3367
Vote Count: {np.str_('Bad'): 2, np.str_('Good'): 1}
Winner: Bad (BAD)


In [29]:

# Fit a 5-neighbour model on the toy data and explain its vote for the test point.
# Relies on CustomKnn, X, y, X_test and training_data from earlier cells.
knn5 = CustomKnn(k=5)
# k is written once (above); the messages below read it back so they cannot drift.
print(f"Testing knn with k={knn5.k}")
knn5.fit(X, y)

results5 = knn5.predict(X_test, return_details=True)
result5 = results5[0]

print(f"\nTest Point: {X_test[0]}")
print(f"\n✓ PREDICTION: {result5['prediction']}")
print(f"\nThe {len(result5['neighbors'])} Nearest Neighbors:")


for j, idx in enumerate(result5['neighbors']):
    print(f"\nNeighbor {j+1}:")
    print(f"  Training Data: {training_data[idx][:4]} -> {training_data[idx][4]}")
    print(f"  Distance: {result5['neighbor_distances'][j]:.4f}")

label_counts5 = Counter(result5['neighbor_labels'])

# The labels are NumPy scalars; convert the keys to plain str so the dict
# prints as {'Bad': 3, ...} instead of {np.str_('Bad'): 3, ...}.
vote_counts5 = {str(label): count for label, count in label_counts5.items()}
print(f"Vote Count: {vote_counts5}")
print(f"Winner: {result5['prediction']} ({'GOOD' if result5['prediction'] == 'Good' else 'BAD'})")


Testing knn with k=5

Test Point: [13, 7, 'Z', 'Z']

✓ PREDICTION: Bad

The 5 Nearest Neighbors:

Neighbor 1:
  Training Data: [10, 4, 'J', 'D'] -> Bad
  Distance: 4.9497

Neighbor 2:
  Training Data: [6, 7, 'F', 'G'] -> Good
  Distance: 7.1764

Neighbor 3:
  Training Data: [6, 3, 'F', 'C'] -> Bad
  Distance: 8.3367

Neighbor 4:
  Training Data: [5, 9, 'E', 'I'] -> Good
  Distance: 8.6313

Neighbor 5:
  Training Data: [5, 3, 'E', 'C'] -> Bad
  Distance: 9.0830
Vote Count: {np.str_('Bad'): 3, np.str_('Good'): 2}
Winner: Bad (BAD)


In [28]:

# Fit a 7-neighbour model on the toy data and explain its vote for the test point.
# Relies on CustomKnn, X, y, X_test and training_data from earlier cells.
knn7 = CustomKnn(k=7)
# k is written once (above); the messages below read it back so they cannot drift.
print(f"Testing k={knn7.k}")

knn7.fit(X, y)

results7 = knn7.predict(X_test, return_details=True)
result7 = results7[0]

print(f"\nTest Point: {X_test[0]}")
print(f"\n✓ PREDICTION: {result7['prediction']}")
print(f"\nThe {len(result7['neighbors'])} Nearest Neighbors:")


for j, idx in enumerate(result7['neighbors']):
    print(f"\nNeighbor {j+1}:")
    print(f"  Training Data: {training_data[idx][:4]} -> {training_data[idx][4]}")
    print(f"  Distance: {result7['neighbor_distances'][j]:.4f}")

label_counts7 = Counter(result7['neighbor_labels'])

# The labels are NumPy scalars; convert the keys to plain str so the dict
# prints as {'Bad': 4, ...} instead of {np.str_('Bad'): 4, ...}.
vote_counts7 = {str(label): count for label, count in label_counts7.items()}
print(f"Vote Count: {vote_counts7}")
print(f"Winner: {result7['prediction']} ({'GOOD' if result7['prediction'] == 'Good' else 'BAD'})")


Testing k=7

Test Point: [13, 7, 'Z', 'Z']

✓ PREDICTION: Bad

The 7 Nearest Neighbors:

Neighbor 1:
  Training Data: [10, 4, 'J', 'D'] -> Bad
  Distance: 4.9497

Neighbor 2:
  Training Data: [6, 7, 'F', 'G'] -> Good
  Distance: 7.1764

Neighbor 3:
  Training Data: [6, 3, 'F', 'C'] -> Bad
  Distance: 8.3367

Neighbor 4:
  Training Data: [5, 9, 'E', 'I'] -> Good
  Distance: 8.6313

Neighbor 5:
  Training Data: [5, 3, 'E', 'C'] -> Bad
  Distance: 9.0830

Neighbor 6:
  Training Data: [3, 8, 'C', 'H'] -> Good
  Distance: 10.1735

Neighbor 7:
  Training Data: [2, 8, 'B', 'H'] -> Bad
  Distance: 11.2472
Vote Count: {np.str_('Bad'): 4, np.str_('Good'): 3}
Winner: Bad (BAD)
