In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

class FeatureExtractor:
    """Bag-of-words and TF-IDF feature extraction with shared settings.

    Both vectorizers are configured identically: same vocabulary cap, same
    n-gram range, English stop words removed, ``min_df=2`` and ``max_df=0.8``.
    """

    def __init__(self, max_features=5000, ngram_range=(1, 2)):
        """Create the two (unfitted) vectorizers.

        Args:
            max_features: Upper bound on the vocabulary size of each vectorizer.
            ngram_range: ``(min_n, max_n)`` n-gram sizes to extract.
        """
        self.max_features = max_features
        self.ngram_range = ngram_range

        # One settings dict so the two vectorizers cannot drift apart.
        shared_settings = dict(
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=2,
            max_df=0.8,
            stop_words="english",
        )
        self.bow_vectorizer = CountVectorizer(**shared_settings)
        self.tfidf_vectorizer = TfidfVectorizer(**shared_settings)

    def fit_transform_bow(self, texts):
        """Fit the count vectorizer on ``texts``.

        Returns:
            ``(X, feature_names)``: the document-term matrix and the fitted
            vocabulary as returned by ``get_feature_names_out()``.
        """
        X = self.bow_vectorizer.fit_transform(texts)
        return X, self.bow_vectorizer.get_feature_names_out()

    def fit_transform_tfidf(self, texts):
        """Fit the TF-IDF vectorizer on ``texts``.

        Returns:
            ``(X, feature_names)``: the TF-IDF matrix and the fitted
            vocabulary as returned by ``get_feature_names_out()``.
        """
        X = self.tfidf_vectorizer.fit_transform(texts)
        return X, self.tfidf_vectorizer.get_feature_names_out()

    def get_top_features(self, vectorizer, X, n=20):
        """Rank features by their mean column value in ``X``.

        Args:
            vectorizer: Fitted object exposing ``get_feature_names_out()``.
            X: Matrix (dense or sparse) whose columns line up with the
                vectorizer's feature names.
            n: Maximum number of features to return.

        Returns:
            List of ``(feature_name, mean_score)`` tuples, highest mean first.
            Empty when ``n`` is not positive.
        """
        if n <= 0:
            return []

        # X.mean(axis=0) may be a 1 x n_features matrix; flatten it to 1-D.
        avg_scores = np.asarray(X.mean(axis=0)).ravel()
        features = vectorizer.get_feature_names_out()
        top_idx = np.argsort(avg_scores)[::-1][:n]
        return [(features[i], avg_scores[i]) for i in top_idx]

    def visualize_top_features(self, vectorizer, X, n=20):
        """Draw a horizontal bar chart of the ``n`` highest-scoring features.

        Nothing is drawn when there are no features to show.
        """
        top = self.get_top_features(vectorizer, X, n)
        if not top:
            # zip(*[]) cannot be unpacked into two names; nothing to plot anyway.
            return

        labels, scores = zip(*top)

        # The axis label must describe the matrix actually passed in: only a
        # TfidfVectorizer produces TF-IDF weights, otherwise X holds raw counts.
        if isinstance(vectorizer, TfidfVectorizer):
            score_label = "Average TF-IDF Score"
        else:
            score_label = "Average Term Count"

        plt.figure(figsize=(8, 5))
        # Reverse so the highest-scoring feature ends up at the top of the chart.
        plt.barh(labels[::-1], scores[::-1])
        plt.xlabel(score_label)
        plt.title("Top Informative Features")
        plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix
)

class ModelTrainer:
    """Train several classifiers, score each on a validation split, keep the best.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Display name -> fitted estimator.
        self.models = {}
        # Display name -> metrics dict produced by ``_metrics``.
        self.results = {}
        # Fitted estimator with the highest validation weighted F1 so far
        # (None until at least one model has been trained).
        self.best_model = None

    def train_logistic_regression(self, X_train, y_train, X_val, y_val):
        """Grid-search ``C`` for logistic regression, then score on validation data.

        Returns:
            ``(model, metrics)``: the best estimator found by the 5-fold grid
            search and its validation metrics dict.
        """
        grid = GridSearchCV(
            LogisticRegression(max_iter=1000, random_state=42),
            {"C": [0.1, 1.0, 10.0]},
            cv=5,
            scoring="f1_weighted",
        )
        grid.fit(X_train, y_train)
        return self._evaluate_and_record(
            "Logistic Regression", grid.best_estimator_, X_val, y_val
        )

    def train_naive_bayes(self, X_train, y_train, X_val, y_val):
        """Fit a default ``MultinomialNB`` and score it on validation data.

        Returns:
            ``(model, metrics)``: the fitted model and its validation metrics dict.
        """
        model = MultinomialNB()
        model.fit(X_train, y_train)
        return self._evaluate_and_record("Naive Bayes", model, X_val, y_val)

    def train_knn(self, X_train, y_train, X_val, y_val):
        """Grid-search ``n_neighbors`` for cosine-distance KNN, then score it.

        Returns:
            ``(model, metrics)``: the best estimator found by the 5-fold grid
            search and its validation metrics dict.
        """
        grid = GridSearchCV(
            KNeighborsClassifier(metric="cosine"),
            {"n_neighbors": [3, 5, 7]},
            cv=5,
            scoring="f1_weighted",
        )
        grid.fit(X_train, y_train)
        return self._evaluate_and_record("KNN", grid.best_estimator_, X_val, y_val)

    def _evaluate_and_record(self, name, model, X_val, y_val):
        """Score ``model`` on the validation split and store it under ``name``."""
        metrics = self._metrics(y_val, model.predict(X_val))
        self.models[name] = model
        self.results[name] = metrics
        self._refresh_best_model()
        return model, metrics

    def _refresh_best_model(self):
        """Point ``best_model`` at the stored model with the highest weighted F1."""
        if not self.results:
            self.best_model = None
            return
        # Recomputed from scratch so that retraining under an existing name
        # (which overwrites its entry) cannot leave a stale best model behind.
        best_name = max(self.results, key=lambda key: self.results[key]["f1"])
        self.best_model = self.models[best_name]

    def _metrics(self, y_true, y_pred):
        """Return accuracy, weighted precision/recall/F1 and the confusion matrix.

        ``zero_division=0`` makes classes that were never predicted contribute 0
        instead of triggering a warning.
        """
        return {
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
            "recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
            "f1": f1_score(y_true, y_pred, average="weighted", zero_division=0),
            "confusion_matrix": confusion_matrix(y_true, y_pred)
        }

    def compare_models(self):
        """Return the stored results as a DataFrame with one row per model.

        Columns are the metric names; the ``confusion_matrix`` column holds
        the raw matrix for each model.
        """
        return pd.DataFrame(self.results).T


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
import seaborn as sns

def evaluate_model(best_model, X_train, y_train, X_test, y_test, texts_test):
    """Print and plot a test-set evaluation report for a fitted classifier.

    The report consists of accuracy and weighted F1, a confusion-matrix
    heatmap, a 5-fold cross-validated weighted F1 on the training data,
    one-vs-rest ROC curves per class, and up to five misclassified examples.

    Args:
        best_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_train, y_train: Training data, used only for cross-validation.
        X_test, y_test: Held-out data the report is computed on.
        texts_test: Raw texts aligned positionally with ``X_test``; either a
            pandas object (accessed through ``.iloc``) or a plain sequence.

    Returns:
        None. All results are printed or plotted.
    """
    # Work with positional arrays: a pandas Series keeps its original index
    # after a split, so ``y_test[i]`` would be a label lookup, not "row i".
    y_true = np.asarray(y_test)
    y_pred = np.asarray(best_model.predict(X_test))
    y_proba = best_model.predict_proba(X_test)

    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Weighted F1:", f1_score(y_true, y_pred, average="weighted"))

    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title("Confusion Matrix")
    plt.show()

    scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring="f1_weighted")
    # Escape sequence keeps the plus-minus sign intact whatever the file encoding.
    print(f"CV Mean: {scores.mean():.3f} \u00b1 {scores.std():.3f}")

    # predict_proba columns follow the model's ``classes_`` order, which can
    # differ from np.unique(y_test) when a class is absent from the test split.
    class_labels = getattr(best_model, "classes_", np.unique(y_true))
    for col, label in enumerate(class_labels):
        y_true_bin = (y_true == label).astype(int)
        if np.unique(y_true_bin).size < 2:
            # ROC/AUC need both positives and negatives in the test split.
            continue
        fpr, tpr, _ = roc_curve(y_true_bin, y_proba[:, col])
        auc = roc_auc_score(y_true_bin, y_proba[:, col])
        plt.plot(fpr, tpr, label=f"{label} (AUC={auc:.2f})")

    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.title("ROC Curves")
    plt.show()

    # Show at most five misclassified examples with the model's top confidence.
    mis_idx = np.flatnonzero(y_pred != y_true)[:5]
    for i in mis_idx:
        text = texts_test.iloc[i] if hasattr(texts_test, "iloc") else texts_test[i]
        print(text)
        print("Pred:", y_pred[i], "True:", y_true[i],
              "Conf:", np.max(y_proba[i]))
        print()
