In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

In [None]:
def evaluate_knn(features, labels, n_neighbors=5, test_size=0.2, random_state=None):
    """
    Splits the dataset, trains a kNN classifier, and evaluates performance.

    Prints the test-set accuracy and the classification report.

    Parameters:
    - features: Feature matrix (can be tfidf or LSA-transformed).
    - labels: Ground truth labels.
    - n_neighbors: Number of neighbors for kNN.
    - test_size: Fraction of data to use for testing.
    - random_state: Seed forwarded to train_test_split. The default (None)
      keeps the previous behaviour of drawing a different split on every
      call; pass an int to make the reported numbers reproducible.

    Returns:
    - accuracy: Accuracy score on the test set.
    - knn: The trained kNN model.
    """
    trainX, testX, trainY, testY = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # weights='distance': nearer neighbours count more than farther ones
    # when voting (see the KNeighborsClassifier documentation).
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights='distance')
    knn.fit(trainX, trainY)

    predict = knn.predict(testX)
    accuracy = accuracy_score(testY, predict)
    print("Accuracy:", accuracy)
    print(classification_report(testY, predict))

    return accuracy, knn

In [None]:
def evaluate_decision_tree(X, y, test_size=0.2, max_depth=70, random_state=None):
    """
    Splits the dataset, trains a Decision Tree classifier,
    and evaluates performance.

    Prints the test-set accuracy and the classification report.

    Parameters:
    - X: Feature matrix.
    - y: Ground truth labels.
    - test_size: Fraction of data to use for testing.
    - max_depth: Maximum depth of the decision tree.
    - random_state: Seed forwarded to both train_test_split and
      DecisionTreeClassifier. The default (None) keeps the previous
      unseeded behaviour; pass an int for reproducible results.

    Returns:
    - accuracy: Accuracy score on the test set.
    - dt_model: The trained Decision Tree model.
    """
    # 1. Split Data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 2. Instantiate Model
    # The tree is seeded as well as the split: without it, a fixed split
    # alone would not be enough to make repeated runs comparable.
    dt_model = DecisionTreeClassifier(
        max_depth=max_depth,
        criterion='gini',
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=random_state,
    )

    # 3. Train
    dt_model.fit(X_train, y_train)

    # 4. Predict
    y_pred = dt_model.predict(X_test)

    # 5. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Decision Tree (max_depth={max_depth}) Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return accuracy, dt_model

In [None]:
def evaluate_naive_bayes(X, y, test_size=0.2, model=None, random_state=None):
    """
    Splits the dataset, trains a Naive Bayes classifier (MultinomialNB by
    default), and evaluates performance.

    Prints the test-set accuracy and the classification report.

    Parameters:
    - X: Feature matrix. With the default MultinomialNB, LSA features may
      not be usable because they can contain negative values.
    - y: Ground truth labels.
    - test_size: Fraction of data to use for testing.
    - model: Optional unfitted estimator to use instead of MultinomialNB,
      e.g. GaussianNB() for LSA-transformed features. This replaces the
      old workflow of editing the function body to swap the estimator.
    - random_state: Seed forwarded to train_test_split. The default (None)
      keeps the previous unseeded behaviour; pass an int for a
      reproducible split.

    Returns:
    - accuracy: Accuracy score on the test set.
    - nb_model: The trained Naive Bayes model.
    """
    # 1. Split Data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 2. Instantiate Model (only when the caller did not supply one)
    nb_model = MultinomialNB() if model is None else model

    # 3. Train
    nb_model.fit(X_train, y_train)

    # 4. Predict
    y_pred = nb_model.predict(X_test)

    # 5. Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print("Naive Bayes Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return accuracy, nb_model

In [None]:
# Run all three evaluations on the same feature matrix and labels.
# NOTE(review): X_tfidf and y are not defined in the visible cells --
# presumably built in an earlier cell; confirm on a fresh kernel.
# Each call performs its own train/test split, so the three scores are
# not necessarily measured on the same test rows.

accuracy, model = evaluate_knn(features=X_tfidf, labels=y, n_neighbors=5)

acc_dt, model_dt = evaluate_decision_tree(X=X_tfidf, y=y, max_depth=70)

acc_nb, model_nb = evaluate_naive_bayes(X=X_tfidf, y=y)