<a href="https://colab.research.google.com/github/MDJubayer255/ml_Lab01/blob/main/LabReport02_knnFromScratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import random
random.seed(42)
np.random.seed(42)

# Custom KNN Implementation
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    The arguments are subtracted element-wise, so they are expected to be
    NumPy arrays (or values NumPy can broadcast) of compatible shape.
    """
    offset = x1 - x2
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)

class CustomKNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Training data is stored verbatim by ``fit``; each prediction computes
    the distance from the query point to every stored sample and returns
    the most common label among the ``k`` closest ones.
    """

    def __init__(self, k=3):
        """Create a classifier that votes among the ``k`` nearest samples.

        Raises:
            ValueError: if ``k`` is not a positive integer.
        """
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)

    def fit(self, X, y):
        """Store the training samples and their labels.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples labels.

        Raises:
            ValueError: if ``X`` is not 2-D, ``y`` is not 1-D, the lengths
                differ, or there are no samples.
        """
        # Convert up front so later indexing is always positional; a pandas
        # Series with a non-default index would otherwise be looked up by
        # index label instead of by position.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-D, got {y.ndim} dimension(s)")
        if len(X) != len(y):
            raise ValueError(
                f"X and y have different lengths: {len(X)} != {len(y)}"
            )
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        X = np.asarray(X, dtype=float)
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Return the majority label among the ``k`` nearest training samples.

        If fewer than ``k`` training samples exist, all of them vote.
        """
        # Distances to every training sample in one vectorized pass.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        # A stable sort keeps equidistant samples in training order, which
        # makes the choice of neighbours deterministic.
        nearest = np.argsort(distances, kind="stable")[:self.k]
        # Counter keeps insertion order, so on a tied vote the label of the
        # closest neighbour among the tied classes wins.
        votes = Counter(self.y_train[nearest].tolist())
        return votes.most_common(1)[0][0]

# Custom evaluation metrics
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels, same shape as ``y_true``.

    Returns:
        The accuracy as a float in [0, 1], or NaN when both inputs are empty.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with ``==`` yields a single
    # bool rather than an element-wise comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    if y_true.size == 0:
        return np.float64("nan")
    return np.mean(y_true == y_pred)

def confusion_matrix_custom(y_true, y_pred, labels=None):
    """Build a confusion matrix with true classes as rows, predictions as columns.

    Args:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels, same length as ``y_true``.
        labels: optional sequence fixing the classes and their row/column
            order. Defaults to the sorted unique values found in either
            ``y_true`` or ``y_pred``.

    Returns:
        Integer array of shape (n_labels, n_labels) where entry ``[i, j]``
        counts samples whose true class is ``labels[i]`` and whose predicted
        class is ``labels[j]``.

    Raises:
        ValueError: if the inputs differ in length, or a value is not
            present in an explicitly supplied ``labels``.
    """
    true_vals = np.asarray(y_true).tolist()
    pred_vals = np.asarray(y_pred).tolist()
    if len(true_vals) != len(pred_vals):
        raise ValueError(
            f"length mismatch: {len(true_vals)} true vs {len(pred_vals)} predicted"
        )

    if labels is None:
        # Use the union so a class that only shows up in the predictions
        # (or is missing from a small test split) still gets a row/column.
        label_list = np.unique(true_vals + pred_vals).tolist()
    else:
        label_list = np.asarray(labels).tolist()

    # Map each label to its matrix position instead of indexing by the raw
    # label value, which is only valid when labels are exactly 0..n-1.
    index_of = {label: pos for pos, label in enumerate(label_list)}
    matrix = np.zeros((len(label_list), len(label_list)), dtype=int)
    for t, p in zip(true_vals, pred_vals):
        try:
            matrix[index_of[t], index_of[p]] += 1
        except KeyError as err:
            raise ValueError(f"label {err.args[0]!r} is not in labels") from err
    return matrix

def precision_recall_f1(y_true, y_pred):
    """Return macro-averaged (precision, recall, F1) for the predictions.

    Per-class scores are derived from the confusion matrix and then averaged
    with equal weight per class. Any score whose denominator is zero is
    counted as 0.
    """
    cm = confusion_matrix_custom(y_true, y_pred)
    n_classes = cm.shape[0]

    true_pos = np.diag(cm)
    predicted_totals = cm.sum(axis=0)  # tp + fp for each class (column sums)
    actual_totals = cm.sum(axis=1)     # tp + fn for each class (row sums)

    # Divide only where the denominator is non-zero; other entries stay 0.
    per_class_precision = np.divide(
        true_pos,
        predicted_totals,
        out=np.zeros(n_classes, dtype=float),
        where=predicted_totals != 0,
    )
    per_class_recall = np.divide(
        true_pos,
        actual_totals,
        out=np.zeros(n_classes, dtype=float),
        where=actual_totals != 0,
    )

    pr_sum = per_class_precision + per_class_recall
    per_class_f1 = np.divide(
        2 * per_class_precision * per_class_recall,
        pr_sum,
        out=np.zeros(n_classes, dtype=float),
        where=pr_sum != 0,
    )

    return (
        np.mean(per_class_precision),
        np.mean(per_class_recall),
        np.mean(per_class_f1),
    )

# Search for the best k and test-split ratio
def find_best_k_and_split(X, y, title, *, k_range=range(1, 16),
                          splits=(0.2, 0.3, 0.4), n_repeats=10):
    """Grid-search ``k`` and the test-split ratio for ``CustomKNN``.

    For every (split, k) pair the data is re-split ``n_repeats`` times with
    ``train_test_split``, a ``CustomKNN`` is fitted on each training part,
    and the test accuracies are averaged. Each average is printed, and the
    pair with the highest average is returned (the first one wins on ties).

    No ``random_state`` is passed to ``train_test_split``, so the splits
    depend on the state of the global NumPy random generator.

    Args:
        X: feature matrix.
        y: label vector aligned with ``X``.
        title: dataset name, used only in the printed header.
        k_range: iterable of neighbour counts to try.
        splits: iterable of ``test_size`` values to try.
        n_repeats: number of random re-splits averaged per (split, k) pair.

    Returns:
        Tuple ``(best_k, best_split)``.

    Raises:
        ValueError: if ``k_range`` or ``splits`` is empty, or ``n_repeats``
            is less than 1.
    """
    k_values = list(k_range)
    split_values = list(splits)
    if not k_values or not split_values:
        raise ValueError("k_range and splits must each contain at least one value")
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be at least 1, got {n_repeats!r}")

    # Fall back to the first candidates if no average ever exceeds zero.
    best_acc = 0
    best_k = k_values[0]
    best_split = split_values[0]

    print(f"\n{'='*10} {title} Dataset: Finding Best k and Split {'='*10}")
    for split in split_values:
        for k in k_values:
            scores = []
            for _ in range(n_repeats):
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)
                model = CustomKNN(k=k)
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                scores.append(accuracy(y_test, y_pred))
            avg_acc = np.mean(scores)
            print(f"Split={split}, k={k}, Avg Accuracy={avg_acc:.4f}")
            # Strict comparison: an equal later score does not replace the
            # current best.
            if avg_acc > best_acc:
                best_acc = avg_acc
                best_k = k
                best_split = split

    return best_k, best_split

# ---- Iris experiment ----
# Load the Iris features and integer class targets from scikit-learn.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Standardize the features.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split, so test samples contribute to the scaling statistics.
scaler = StandardScaler()
X_iris = scaler.fit_transform(X_iris)

# Pick k and the test-split ratio by averaged accuracy over repeated splits.
best_k_iris, best_split_iris = find_best_k_and_split(X_iris, y_iris, "Iris")

# Final evaluation on one more random split (no random_state is passed, so
# it differs from the splits used during the search above).
X_train, X_test, y_train, y_test = train_test_split(X_iris, y_iris, test_size=best_split_iris)
custom_model = CustomKNN(k=best_k_iris)
custom_model.fit(X_train, y_train)
y_pred = custom_model.predict(X_test)

print(f"\nCustom KNN on Iris Dataset (k={best_k_iris}, split={best_split_iris}):")
print("Accuracy:", accuracy(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix_custom(y_test, y_pred))
p, r, f = precision_recall_f1(y_test, y_pred)
print(f"Precision: {p:.2f}, Recall: {r:.2f}, F1-score: {f:.2f}")

# Compare with scikit-learn's KNeighborsClassifier, using the same k and the
# same train/test split as the custom model.
sk_model = KNeighborsClassifier(n_neighbors=best_k_iris)
sk_model.fit(X_train, y_train)
y_pred_sk = sk_model.predict(X_test)

print("\nSklearn KNN on Iris Dataset:")
print("Accuracy:", accuracy_score(y_test, y_pred_sk))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_sk))
print("Classification Report:\n", classification_report(y_test, y_pred_sk))

# ---- News experiment ----
# Hand-written toy dataset: eight headlines, four per class.
news_data = {
    'text': [
        'Prime Minister holds a press conference on economy.',
        'Bangladesh wins gold in cricket final!',
        'Budget discussion continues in parliament.',
        'Shakib Al Hasan leads team to victory.',
        'New education policy announced.',
        'Champions League final draws huge crowd.',
        'Cabinet meeting reviews progress.',
        'Football match ends in dramatic penalty shootout.'
    ],
    'label': [
        'politics',
        'sports',
        'politics',
        'sports',
        'politics',
        'sports',
        'politics',
        'sports'
    ]
}

# Turn the headlines into dense TF-IDF vectors and the string labels into
# integer codes.
df_news = pd.DataFrame(news_data)
vectorizer = TfidfVectorizer()
X_news = vectorizer.fit_transform(df_news['text']).toarray()
y_news = pd.factorize(df_news['label'])[0]

# Standardize the TF-IDF features, then search for k and the split ratio.
# NOTE(review): with only eight samples every training split is smaller than
# the largest k tried by the search (15), and the scaler and vectorizer are
# fitted on all samples before splitting.
X_news = StandardScaler().fit_transform(X_news)
best_k_news, best_split_news = find_best_k_and_split(X_news, y_news, "News")

# Final evaluation on one more random split with the selected settings.
X_train, X_test, y_train, y_test = train_test_split(X_news, y_news, test_size=best_split_news)
custom_model_news = CustomKNN(k=best_k_news)
custom_model_news.fit(X_train, y_train)
y_pred = custom_model_news.predict(X_test)

print(f"\nCustom KNN on News Dataset (k={best_k_news}, split={best_split_news}):")
print("Accuracy:", accuracy(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix_custom(y_test, y_pred))
p, r, f = precision_recall_f1(y_test, y_pred)
print(f"Precision: {p:.2f}, Recall: {r:.2f}, F1-score: {f:.2f}")

# Compare with scikit-learn's KNeighborsClassifier, using the same k and the
# same train/test split as the custom model.
sk_model_news = KNeighborsClassifier(n_neighbors=best_k_news)
sk_model_news.fit(X_train, y_train)
y_pred_sk = sk_model_news.predict(X_test)

print("\nSklearn KNN on News Dataset:")
print("Accuracy:", accuracy_score(y_test, y_pred_sk))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_sk))
print("Classification Report:\n", classification_report(y_test, y_pred_sk))



Split=0.2, k=1, Avg Accuracy=0.9433
Split=0.2, k=2, Avg Accuracy=0.9467
Split=0.2, k=3, Avg Accuracy=0.9367
Split=0.2, k=4, Avg Accuracy=0.9400
Split=0.2, k=5, Avg Accuracy=0.9533
Split=0.2, k=6, Avg Accuracy=0.9600
Split=0.2, k=7, Avg Accuracy=0.9700
Split=0.2, k=8, Avg Accuracy=0.9367
Split=0.2, k=9, Avg Accuracy=0.9500
Split=0.2, k=10, Avg Accuracy=0.9500
Split=0.2, k=11, Avg Accuracy=0.9533
Split=0.2, k=12, Avg Accuracy=0.9667
Split=0.2, k=13, Avg Accuracy=0.9800
Split=0.2, k=14, Avg Accuracy=0.9667
Split=0.2, k=15, Avg Accuracy=0.9433
Split=0.3, k=1, Avg Accuracy=0.9489
Split=0.3, k=2, Avg Accuracy=0.9244
Split=0.3, k=3, Avg Accuracy=0.9444
Split=0.3, k=4, Avg Accuracy=0.9467
Split=0.3, k=5, Avg Accuracy=0.9511
Split=0.3, k=6, Avg Accuracy=0.9467
Split=0.3, k=7, Avg Accuracy=0.9444
Split=0.3, k=8, Avg Accuracy=0.9600
Split=0.3, k=9, Avg Accuracy=0.9444
Split=0.3, k=10, Avg Accuracy=0.9689
Split=0.3, k=11, Avg Accuracy=0.9467
Split=0.3, k=12, Avg Accuracy=0.9533
Split=0.3, k=13, A

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
