In [20]:
import pandas as pd
# Load the raw weather/play table.
df = pd.read_csv("data/Weather_Play.csv")

# Integer code assigned to every categorical value, keyed by column name.
# Note that `windy` is keyed by booleans, not by the strings 'True'/'False'.
ENCODINGS = {
    'outlook': {'sunny': 0, 'overcast': 1, 'rainy': 2},
    'temp.': {'hot': 0, 'cool': 1, 'mild': 2},
    'humidity': {'normal': 0, 'high': 1},
    'windy': {False: 0, True: 1},
    'play': {'no': 0, 'yes': 1},
}

for column, codes in ENCODINGS.items():
    encoded = df[column].map(codes)
    # Series.map yields NaN for any value that is not a key of the dict, so a
    # typo, different casing or a missing cell would otherwise slip through
    # silently and end up in X / y. Fail loudly instead.
    unmapped = df.loc[encoded.isna(), column].unique().tolist()
    if unmapped:
        raise ValueError(
            f"Column '{column}' contains unmapped or missing values: {unmapped}"
        )
    df[column] = encoded

print(df)

# Feature matrix (every column except the target) and target vector.
X = df.drop(['play'], axis = 1).values
y = df['play'].values

    outlook  temp.  humidity  windy  play
0         0      0         1      0     0
1         0      0         1      1     0
2         1      0         1      0     1
3         2      2         1      0     1
4         2      1         0      0     1
5         2      1         0      1     0
6         1      1         0      1     1
7         0      2         1      0     0
8         0      1         0      0     1
9         2      2         0      0     1
10        0      2         0      1     1
11        1      2         1      1     1
12        1      0         0      0     1
13        2      2         1      1     0


In [21]:
import numpy as np
from collections import Counter

class KNN:
    """Brute-force k-nearest-neighbours classifier.

    A query point is labelled with the most common label among its ``k``
    closest training points.

    param k: Number of nearest neighbors to consider (positive integer).
        If ``k`` exceeds the number of training samples, all of them vote.
    param metric: Type of distance metric ('euclidean', 'manhattan').
        The name is checked when a distance is computed, not at construction.

    Raises:
        ValueError: If ``k`` is not a positive integer.
    """
    def __init__(self, k=5, metric = 'euclidean'):
        # k <= 0 would select no neighbours and fail later with an opaque
        # IndexError inside predict, so reject it up front.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
        self.metric = metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training set.

        param X: Training samples, one per row (any array-like of numbers).
        param y: Labels, one per training sample (any array-like).

        Raises:
            ValueError: If X and y are empty or differ in number of samples.
        """
        # Convert to arrays so lists work too; float features avoid the
        # wrap-around that unsigned integer subtraction would produce.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim < 1 or y.ndim < 1 or X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set")
        self.X_train = X
        self.y_train = y

    # Compute the distance between two points based on the chosen metric
    def distance(self, x1, x2):
        """Return the distance between points ``x1`` and ``x2``.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric name.
        """
        if self.metric not in ("euclidean", "manhattan"):
            raise ValueError(f"Unknown distance metric '{self.metric}'")

        # Work in float so list inputs are accepted and unsigned integer
        # inputs cannot wrap around when subtracted.
        diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
        if self.metric == "euclidean":
            return np.sqrt(np.sum(diff**2))
        return np.sum(np.abs(diff))

    def predict_batch(self, X):
        """Predict a label for every sample in ``X``; returns a list."""
        y_pred = [self.predict(x) for x in np.asarray(X, dtype=float)]
        return y_pred

    def predict(self, x):
        """Predict the label of a single sample ``x``.

        Ties between equidistant neighbours are resolved by training order;
        ties between labels go to the label whose neighbour is met first
        (i.e. the closer one).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN instance is not fitted yet; call fit(X, y) first")

        # Compute distance to all points in train set
        distances = [self.distance(x, x_train) for x_train in self.X_train]
        # Stable sort keeps equidistant points in training order, making the
        # selected neighbours deterministic.
        top_idx = np.argsort(distances, kind="stable")[:self.k]
        # Get top K label
        k_nearests = self.y_train[top_idx]
        # Majority vote among the k nearest labels
        label = Counter(k_nearests).most_common(1)[0][0]

        return label

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Euclidean model: build, fit, predict
knn_euclidean = KNN(k=5, metric='euclidean')
knn_euclidean.fit(X_train, y_train)
# Cast each predicted label to a plain Python int for clean output
predictions_euclidean = [int(label) for label in knn_euclidean.predict_batch(X_test)]

# Manhattan model: build, fit, predict
knn_manhattan = KNN(k=5, metric='manhattan')
knn_manhattan.fit(X_train, y_train)
predictions_manhattan = [int(label) for label in knn_manhattan.predict_batch(X_test)]

# Show both sets of predictions next to the true test labels
print("Predictions (Euclidean):", predictions_euclidean)
print("Predictions (Manhattan):", predictions_manhattan)
print("Label of data test:", y_test)

Predictions (Euclidean): [1, 1, 1]
Predictions (Manhattan): [0, 1, 1]
[1 1 0]
