In [6]:
class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        return [self._predict(x) for x in X_test]

    def _predict(self, x):
        distances = [(self._euclidean_distance(x, x_train), label)
                     for x_train, label in zip(self.X_train, self.y_train)]
        distances.sort()
        k_nearest = [label for _, label in distances[:self.k]]
        return max(set(k_nearest), key=k_nearest.count)

    def _euclidean_distance(self, x1, x2):
        return sum((a - b) ** 2 for a, b in zip(x1, x2)) ** 0.5

In [7]:
def accuracy(y_true, y_pred):
    return sum(yt == yp for yt, yp in zip(y_true, y_pred)) / len(y_true)

def confusion_matrix(y_true, y_pred, labels):
    matrix = [[0 for _ in labels] for _ in labels]
    label_to_index = {label: idx for idx, label in enumerate(labels)}
    for yt, yp in zip(y_true, y_pred):
        matrix[label_to_index[yt]][label_to_index[yp]] += 1
    return matrix

def precision_recall_f1(y_true, y_pred):
    labels = list(set(y_true))
    scores = {}
    for label in labels:
        tp = sum((yt == label and yp == label) for yt, yp in zip(y_true, y_pred))
        fp = sum((yt != label and yp == label) for yt, yp in zip(y_true, y_pred))
        fn = sum((yt == label and yp != label) for yt, yp in zip(y_true, y_pred))

        prec = tp / (tp + fp) if (tp + fp) else 0
        rec = tp / (tp + fn) if (tp + fn) else 0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0

        scores[label] = {'precision': prec, 'recall': rec, 'f1': f1}
    return scores

In [11]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = iris.data.tolist(), iris.target.tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = KNNClassifier(k=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred, list(set(y))))
print("Precision, Recall, F1:", precision_recall_f1(y_test, y_pred))

Accuracy: 1.0
Confusion Matrix: [[16, 0, 0], [0, 17, 0], [0, 0, 12]]
Precision, Recall, F1: {0: {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 1: {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}, 2: {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}}


In [10]:

import pandas as pd
from sklearn.model_selection import train_test_split

# Load Kaggle news dataset
df = pd.read_json('/content/drive/MyDrive/ml/News_Category_Dataset_v3.json', lines=True)
df = df[['headline', 'category']]
df = df[df['category'].isin(['POLITICS', 'SPORTS', 'TECH'])]  # Keep 3 categories for simplicity
texts = df['headline'].tolist()
labels = df['category'].tolist()

# Use 5% of the dataset, split as 4% training and 1% testing
from sklearn.utils import shuffle
texts, labels = shuffle(texts, labels, random_state=42)
limit = int(0.05 * len(texts))
texts = texts[:limit]
labels = labels[:limit]

# TF-IDF preprocessing

def tokenize(text):
    return text.lower().split()

def compute_tf(text):
    tokens = tokenize(text)
    tf = {}
    for word in tokens:
        tf[word] = tf.get(word, 0) + 1
    total = len(tokens)
    for word in tf:
        tf[word] /= total
    return tf

def compute_idf(corpus):
    import math
    idf = {}
    total_docs = len(corpus)
    for doc in corpus:
        for word in set(tokenize(doc)):
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(total_docs / idf[word])
    return idf

def compute_tfidf(text, idf):
    tf = compute_tf(text)
    return {word: tf[word] * idf.get(word, 0) for word in tf}

def vectorize(tfidf_dict, vocab):
    return [tfidf_dict.get(word, 0.0) for word in vocab]

idf = compute_idf(texts)
vocab = sorted(idf.keys())
X = [vectorize(compute_tfidf(text, idf), vocab) for text in texts]
y = labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

model = KNNClassifier(k=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("News Accuracy:", accuracy(y_test, y_pred))
print("News F1:", precision_recall_f1(y_test, y_pred))

News Accuracy: 0.9090909090909091
News F1: {'SPORTS': {'precision': 0, 'recall': 0.0, 'f1': 0}, 'POLITICS': {'precision': 0.9090909090909091, 'recall': 1.0, 'f1': 0.9523809523809523}}
