In [1]:
# Libraries

from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# KNN Implementation

class KNN:
    """
    k-nearest-neighbours classifier using Euclidean distance and a majority vote.

    Parameters:
        k (int): Number of nearest training samples that vote on each prediction.
                 If k exceeds the number of training samples, all of them vote.

    Raises:
        ValueError: If k is smaller than 1.
    """

    def __init__(self, k=3):
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
    
    def fit(self, X_train, y_train):
        """
        Stores the training data (nothing is learned ahead of prediction time).

        Parameters:
            X_train (array-like): Training samples, one sample per row.
            y_train (array-like): Label of each training sample, same length as X_train.

        Raises:
            ValueError: If X_train is empty or its length differs from y_train.
        """
        # Coerce to arrays so plain Python lists work with the arithmetic below.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train and y_train must have the same length, "
                f"got {len(X_train)} and {len(y_train)}"
            )
        self.X_train = X_train
        self.y_train = y_train
    
    def _euclidean_distance(self, x1, x2):
        """Returns the Euclidean (L2) distance between the points x1 and x2."""
        x1 = np.asarray(x1)
        x2 = np.asarray(x2)
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def predict(self, X_test):
        """
        Predicts a label for every sample in X_test.

        Parameters:
            X_test (array-like): Samples to classify, one sample per row.

        Returns:
            np.array: Predicted labels, one per row of X_test.
        """
        X_test = np.asarray(X_test)
        predictions = [self._predict(x) for x in X_test]
        return np.array(predictions)
    
    def _predict(self, x):
        """Returns the majority label among the k training samples closest to x."""
        # Distances from x to every training sample in one vectorised pass.
        # Flattening each sample keeps 1-D (single-feature) training data working.
        diffs = (self.X_train - x).reshape(len(self.X_train), -1)
        distances = np.sqrt(np.sum(diffs ** 2, axis=1))
        # A stable sort makes equidistant neighbours resolve deterministically:
        # the sample that appears first in the training set wins.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        # Labels of the k nearest neighbours, ordered nearest first
        k_nearest_labels = self.y_train[k_indices]
        # Majority vote; on a tied count the label seen first (the nearest) is returned
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [3]:
# Function for generating synthetic data

def generate_synthetic_data(n_samples=100, n_features=2, separation=2.0, random_state=None):
    """
    Generates synthetic data for binary classification.

    Each class is drawn from a unit-variance Gaussian; class 0 is centred on the
    origin and class 1 on the point whose every coordinate equals `separation`.
    
    Parameters:
        n_samples (int): Total number of samples. They are split as evenly as
                         possible; when n_samples is odd, class 1 gets the extra one.
        n_features (int): Number of features for each sample.
        separation (float): Per-feature offset between the means of the two classes.
        random_state (int or None): Seed for a private random generator, making the
                         output reproducible. With None (the default) the global
                         NumPy random state is used, exactly as before.
        
    Returns:
        X (np.array): Feature matrix of shape (n_samples, n_features),
                      class 0 rows first, then class 1 rows.
        y (np.array): Labels (0.0 or 1.0) of shape (n_samples,).
    """
    # Fall back to the module-level generator when unseeded so that callers who
    # rely on np.random.seed(...) keep getting the same draws.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Give the remainder to class 1 so the total is exactly n_samples.
    n_class0 = n_samples // 2
    n_class1 = n_samples - n_class0
    
    # Class 0: standard normal around the origin
    X_class0 = rng.randn(n_class0, n_features)
    y_class0 = np.zeros(n_class0)
    
    # Class 1: standard normal shifted by `separation` along every feature
    X_class1 = rng.randn(n_class1, n_features) + separation
    y_class1 = np.ones(n_class1)
    
    # Combine the data
    X = np.vstack((X_class0, X_class1))
    y = np.hstack((y_class0, y_class1))
    
    return X, y

In [4]:
# Evaluation Metric

def f_beta_score(y_true, y_pred, beta=1.0):
    """
    Computes the F-beta score between true and predicted labels.

    The score is (1 + beta^2) * P * R / (beta^2 * P + R), where P is precision
    and R is recall with respect to the positive class (label 1).
    
    Parameters:
        y_true (array-like): True labels (0 or 1).
        y_pred (array-like): Predicted labels (0 or 1), same shape as y_true.
        beta (float): Weighting factor for the F-beta score.
                      beta > 1 favors recall, beta < 1 favors precision.
    
    Returns:
        float: The F-beta score, or 0.0 when there are no true positives.

    Raises:
        ValueError: If y_true and y_pred do not have the same shape.
    """
    # Coerce to arrays: with plain lists `y_true == 1` is a single False,
    # which would silently yield a score of 0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    # Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    
    # Calculate precision and recall, defining each as 0 when its denominator is 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    # Both are zero exactly when TP == 0; return early to avoid dividing by zero
    if precision + recall == 0:
        return 0.0
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    
    # Return a plain Python float on every path
    return float(f_beta)

In [5]:
# Generating synthetic data
# Seed NumPy's global random generator first so that a fresh
# Restart Kernel -> Run All produces the same dataset on every run.

np.random.seed(42)
X, y = generate_synthetic_data(n_samples=2000, n_features=2, separation=3.0)

In [6]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build a 3-nearest-neighbours classifier and hand it the training split

knn = KNN(3)
knn.fit(X_train=X_train, y_train=y_train)

In [8]:
# Predict a label for every sample in the test split

y_preds = knn.predict(X_test=X_test)

In [9]:
# Score the predictions with F1, i.e. the F-beta score at beta=1
f1_score = f_beta_score(y_true=y_test, y_pred=y_preds, beta=1.0)
print("F1 Score:", f1_score)

F1 Score: 0.9924812030075187
