<a href="https://colab.research.google.com/github/MdSyfulIslam/CSE-412-Machine-Learning-Lab/blob/main/Lab_Report_2_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [6]:

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The operands are subtracted element-wise, so they must support
    NumPy-style arithmetic; the squared differences are summed over every
    element before the square root is taken.
    """
    offset = x1 - x2
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)

class CustomKNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3):
        # k <= 0 used to slice the neighbour list incorrectly (k < 0) or fail
        # with an obscure IndexError (k == 0); reject it up front instead.
        # ``None`` is left untouched for backward compatibility.
        if k is not None and k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training samples and labels.

        Inputs are converted to NumPy arrays so that pandas objects are
        indexed by position (not by index label) and so that unsigned or
        boolean features cannot wrap around / fail when subtracted.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y)
        if len(X_arr) != len(y_arr):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(X_arr)} and {len(y_arr)}"
            )
        self.X_train = X_arr
        self.y_train = y_arr

    def predict(self, X_test):
        """Return an array with one predicted label per row of ``X_test``."""
        X_test = np.asarray(X_test, dtype=float)
        return np.array([self._predict(x) for x in X_test])

    def _predict(self, x):
        """Predict the label of a single sample by majority vote."""
        # Distance to every training sample in one vectorised step; the sum
        # runs over all non-sample axes so 1-D training data also works.
        diff = self.X_train - x
        feature_axes = tuple(range(1, diff.ndim))
        distances = np.sqrt(np.sum(diff ** 2, axis=feature_axes))
        # A stable sort keeps equidistant samples in training order, making
        # tie-breaking deterministic across NumPy versions and platforms.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]
        # On a vote tie, most_common returns the label encountered first,
        # i.e. the one belonging to the nearest of the tied neighbours.
        return Counter(k_labels).most_common(1)[0][0]


In [7]:
def custom_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to NumPy arrays first; without that, two plain
    Python lists would be compared as whole objects and the result would be
    0.0 or 1.0 regardless of how many elements match.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

def custom_confusion_matrix(y_true, y_pred):
    """Build a confusion matrix with true labels on rows, predictions on columns.

    Rows and columns follow the sorted unique labels found in ``y_true`` and
    ``y_pred`` combined. Each label is mapped to its position in that sorted
    list, so labels need not be the integers ``0..n-1``: non-contiguous,
    negative or string labels are all supported.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_labels, n_labels)``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different lengths.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )
    labels = np.unique(np.concatenate((y_true, y_pred)))
    # Raw label values are not valid indices in general; look up positions.
    position = {label: idx for idx, label in enumerate(labels.tolist())}
    matrix = np.zeros((len(labels), len(labels)), dtype=int)
    for t, p in zip(y_true.tolist(), y_pred.tolist()):
        matrix[position[t], position[p]] += 1
    return matrix

def custom_precision(y_true, y_pred, label):
    """Return the precision for ``label``: TP / (TP + FP).

    Inputs are converted to NumPy arrays so the comparisons are element-wise
    even when plain Python lists are passed. Returns ``0.0`` when ``label``
    is never predicted (no true or false positives).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    predicted = y_pred == label
    tp = np.sum(predicted & (y_true == label))
    fp = np.sum(predicted & (y_true != label))
    return tp / (tp + fp) if (tp + fp) > 0 else 0.0

def custom_recall(y_true, y_pred, label):
    """Return the recall for ``label``: TP / (TP + FN).

    Inputs are converted to NumPy arrays so the comparisons are element-wise
    even when plain Python lists are passed. Returns ``0.0`` when ``label``
    never occurs in ``y_true``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    actual = y_true == label
    tp = np.sum((y_pred == label) & actual)
    fn = np.sum((y_pred != label) & actual)
    return tp / (tp + fn) if (tp + fn) > 0 else 0.0

def custom_f1(y_true, y_pred, label):
    """Return the F1 score for ``label``.

    This is the harmonic mean of ``custom_precision`` and ``custom_recall``
    for the same label, or 0 when both of them are 0.
    """
    precision = custom_precision(y_true, y_pred, label)
    recall = custom_recall(y_true, y_pred, label)
    if (precision + recall) > 0:
        return 2 * precision * recall / (precision + recall)
    return 0


In [9]:
# Load the Iris feature matrix and class targets from scikit-learn.
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Best configuration found so far by the grid search below.
best_k_iris = None
# Start below any attainable accuracy so the first configuration is always
# recorded; starting at 0 with a strict ">" would leave best_y_iris empty
# (and the report below failing with a KeyError) if every accuracy were 0.
best_acc_iris = -1.0
best_ratio_iris = 0
best_y_iris = {}

# The split is seeded, so it depends only on the test ratio: build each split
# once here instead of once per (k, ratio) pair.
iris_splits = {
    ratio: train_test_split(X_iris, y_iris, test_size=ratio, random_state=42)
    for ratio in [0.2, 0.3, 0.4]
}

# NOTE(review): k and the split ratio are chosen on the same test split whose
# scores are reported below, so the printed metrics are optimistic.
for k in range(1, 11):
    for ratio, (X_train, X_test, y_train, y_test) in iris_splits.items():
        model = CustomKNN(k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = custom_accuracy(y_test, y_pred)

        # Strict ">" keeps the first (smallest k, then smallest ratio)
        # configuration among those with equal accuracy.
        if acc > best_acc_iris:
            best_acc_iris = acc
            best_k_iris = k
            best_ratio_iris = ratio
            best_y_iris = {'y_test': y_test, 'y_pred': y_pred}

# round() rather than int(): ratio * 100 is a float product that can land just
# below the intended integer, which int() would truncate.
print(f"Iris Dataset: Best Accuracy = {best_acc_iris:.2f}, Best k = {best_k_iris}, Split = {round((1-best_ratio_iris)*100)}:{round(best_ratio_iris*100)}")
print("Confusion Matrix:\n", custom_confusion_matrix(best_y_iris['y_test'], best_y_iris['y_pred']))
# Per-class metrics for every class present in the best split's test labels.
for lbl in np.unique(best_y_iris['y_test']):
    print(f"Class {lbl}: Precision = {custom_precision(best_y_iris['y_test'], best_y_iris['y_pred'], lbl):.2f}, "
          f"Recall = {custom_recall(best_y_iris['y_test'], best_y_iris['y_pred'], lbl):.2f}, "
          f"F1 = {custom_f1(best_y_iris['y_test'], best_y_iris['y_pred'], lbl):.2f}")


Iris Dataset: Best Accuracy = 1.00, Best k = 1, Split = 80:20
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Class 0: Precision = 1.00, Recall = 1.00, F1 = 1.00
Class 1: Precision = 1.00, Recall = 1.00, F1 = 1.00
Class 2: Precision = 1.00, Recall = 1.00, F1 = 1.00


In [10]:
# Read the news CSV and discard every row that contains a missing value.
news_df = pd.read_csv("news_data.csv").dropna()
# Message bodies as plain strings and their class labels, in row order.
texts = news_df['message'].astype(str).tolist()
labels = news_df['label'].tolist()

# Vocabulary: every distinct lower-cased, whitespace-separated token, sorted
# so that each word has a fixed position in the count vectors.
vocab = sorted({token.lower() for message in texts for token in message.split()})
def text_to_vector(text, vocabulary=None):
    """Return the bag-of-words count vector of ``text``.

    The text is lower-cased and split on whitespace; element ``i`` of the
    result is the number of times ``vocabulary[i]`` occurs among the tokens.

    Parameters
    ----------
    text : str
        Text to vectorise.
    vocabulary : sequence of str, optional
        Words defining the vector positions. Defaults to the module-level
        ``vocab``.
    """
    if vocabulary is None:
        vocabulary = vocab
    # Count all tokens in a single pass instead of rescanning the token list
    # once per vocabulary word; a Counter yields 0 for absent words.
    counts = Counter(text.lower().split())
    return np.array([counts[word] for word in vocabulary])

# Vectorise every message and encode the string labels as integers.
X_news = np.array([text_to_vector(text) for text in texts])
le = LabelEncoder()
y_news = le.fit_transform(labels)

# Best configuration found so far by the grid search below.
best_k_news = None
# Start below any attainable accuracy so the first configuration is always
# recorded; starting at 0 with a strict ">" would leave best_y_news empty
# (and the report below failing with a KeyError) if every accuracy were 0.
best_acc_news = -1.0
best_ratio_news = 0
best_y_news = {}

# The split is seeded, so it depends only on the test ratio: build each split
# once here instead of once per (k, ratio) pair.
news_splits = {
    ratio: train_test_split(X_news, y_news, test_size=ratio, random_state=42)
    for ratio in [0.2, 0.3, 0.4]
}

# NOTE(review): k and the split ratio are chosen on the same test split whose
# scores are reported below, so the printed metrics are optimistic.
for k in range(1, 11):
    for ratio, (X_train, X_test, y_train, y_test) in news_splits.items():
        model = CustomKNN(k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = custom_accuracy(y_test, y_pred)

        # Strict ">" keeps the first (smallest k, then smallest ratio)
        # configuration among those with equal accuracy.
        if acc > best_acc_news:
            best_acc_news = acc
            best_k_news = k
            best_ratio_news = ratio
            best_y_news = {'y_test': y_test, 'y_pred': y_pred}


# round() rather than int(): ratio * 100 is a float product that can land just
# below the intended integer, which int() would truncate.
print(f" News Dataset: Best Accuracy = {best_acc_news:.2f}, Best k = {best_k_news}, Split = {round((1-best_ratio_news)*100)}:{round(best_ratio_news*100)}")
print("Confusion Matrix:\n", custom_confusion_matrix(best_y_news['y_test'], best_y_news['y_pred']))
# Per-class metrics; le.classes_ maps each encoded label back to its name.
for lbl in np.unique(best_y_news['y_test']):
    print(f"Class {lbl} ({le.classes_[lbl]}): Precision = {custom_precision(best_y_news['y_test'], best_y_news['y_pred'], lbl):.2f}, "
          f"Recall = {custom_recall(best_y_news['y_test'], best_y_news['y_pred'], lbl):.2f}, "
          f"F1 = {custom_f1(best_y_news['y_test'], best_y_news['y_pred'], lbl):.2f}")


 News Dataset: Best Accuracy = 0.62, Best k = 1, Split = 60:40
Confusion Matrix:
 [[17  0]
 [15  8]]
Class 0 (politics): Precision = 0.53, Recall = 1.00, F1 = 0.69
Class 1 (sports): Precision = 1.00, Recall = 0.35, F1 = 0.52


In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Iris: refit scikit-learn's KNN on the best k / split found by the custom
# search (same seed, hence the same split) and compare accuracies.
X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_iris, test_size=best_ratio_iris, random_state=42
)
sk_model = KNeighborsClassifier(n_neighbors=best_k_iris).fit(X_train, y_train)
y_pred_sk = sk_model.predict(X_test)
print(f"\n Iris Sklearn Accuracy: {np.mean(y_test == y_pred_sk):.2f} vs Custom: {best_acc_iris:.2f}")

# News: same comparison on the bag-of-words news data.
X_train, X_test, y_train, y_test = train_test_split(
    X_news, y_news, test_size=best_ratio_news, random_state=42
)
sk_model = KNeighborsClassifier(n_neighbors=best_k_news).fit(X_train, y_train)
y_pred_sk = sk_model.predict(X_test)
print(f" News Sklearn Accuracy: {np.mean(y_test == y_pred_sk):.2f} vs Custom: {best_acc_news:.2f}")



 Iris Sklearn Accuracy: 1.00 vs Custom: 1.00
 News Sklearn Accuracy: 0.62 vs Custom: 0.62
