In [10]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
# Read the KNN exercise data set into a DataFrame.
# Note: despite its name, `url` holds a relative filesystem path.
url = "../data/knn.csv"
df = pd.read_csv(filepath_or_buffer=url)
# Trailing expression: the cell displays the number of rows that were loaded.
df.shape[0]

6

In [11]:
# Quick structural overview of the loaded frame: its dimensions, a preview of
# (at most) the first ten rows, the column labels, and the per-column dtypes.
print("Dataset shape:", df.shape)

overview_sections = (
    ("\nFirst few rows:", df.head(10)),
    ("\nColumn names:", df.columns.tolist()),
    ("\nData types:", df.dtypes),
)
for heading, content in overview_sections:
    print(heading)
    print(content)

Dataset shape: (6, 4)

First few rows:
   X1  X2  X3      Y
0   0   3   0    red
1   2   0   0    red
2   0   1   3    red
3   0   1   2  green
4  -1   0   1  green
5   1   1   1    red

Column names:
['X1', 'X2', 'X3', 'Y']

Data types:
X1     int64
X2     int64
X3     int64
Y     object
dtype: object


In [13]:
# K-nearest-neighbours classification of a single test point.
#
# Computes the Euclidean distance from every observation in `df` to the test
# point (0, 0, 0) over the feature columns X1, X2, X3, ranks the observations
# by that distance, and predicts the class label Y for K=1 (label of the single
# closest observation) and K=3 (majority vote among the three closest).
test_point = np.array([0, 0, 0])

# Feature columns used for the distance computation; the label column is 'Y'.
X_cols = ['X1', 'X2', 'X3']
feature_data = df[X_cols].values

# Euclidean (L2) distance from each observation (one per row) to the test point
distances = np.linalg.norm(feature_data - test_point, axis=1)

# Attach the distances to a copy so the original frame stays untouched
df_with_distances = df.copy()
df_with_distances['distance'] = distances

# Rank observations from nearest to farthest. A stable sort keeps the original
# row order among equidistant observations, so the selected neighbours are
# deterministic even when distances tie.
df_sorted = df_with_distances.sort_values('distance', kind='mergesort')

print("Observations sorted by distance from test point (0,0,0):")
print(df_sorted[X_cols + ['Y', 'distance']].head(10))

# K=1 prediction: the label of the single closest observation
k1_nearest = df_sorted.iloc[0]
k1_prediction = k1_nearest['Y']
k1_coords = ", ".join(str(k1_nearest[col]) for col in X_cols)
print(f"\nK=1 Prediction: {k1_prediction}")
print(f"Reason: The closest observation is at distance {k1_nearest['distance']:.4f}")
print(f"This observation has coordinates ({k1_coords}) and Y={k1_prediction}")

# K=3 prediction: Y is categorical, so the prediction is the most frequent
# label (majority vote) among the three nearest neighbours, not an average.
# Series.mode() returns the modes in sorted order, so if several labels were
# tied for most frequent, [0] would pick the first of them in sort order.
k3_nearest = df_sorted.head(3)
k3_prediction = k3_nearest['Y'].mode()[0]

print(f"\nK=3 Prediction: {k3_prediction}")
print(f"Reason: Majority vote among the {len(k3_nearest)} nearest neighbors:")
# Iterate over the rows actually selected, so a data set with fewer than three
# observations does not raise an IndexError.
for rank, (_, obs) in enumerate(k3_nearest.iterrows(), start=1):
    coords = ", ".join(str(obs[col]) for col in X_cols)
    print(f"  Neighbor {rank}: ({coords}) -> Y={obs['Y']}, distance={obs['distance']:.4f}")

Observations sorted by distance from test point (0,0,0):
   X1  X2  X3      Y  distance
4  -1   0   1  green  1.414214
5   1   1   1    red  1.732051
1   2   0   0    red  2.000000
3   0   1   2  green  2.236068
0   0   3   0    red  3.000000
2   0   1   3    red  3.162278

K=1 Prediction: green
Reason: The closest observation is at distance 1.4142
This observation has coordinates (-1, 0, 1) and Y=green

K=3 Prediction: red
Reason: Average of 3 nearest neighbors:
  Neighbor 1: (-1, 0, 1) -> Y=green, distance=1.4142
  Neighbor 2: (1, 1, 1) -> Y=red, distance=1.7321
  Neighbor 3: (2, 0, 0) -> Y=red, distance=2.0000
