In [7]:
import pickle
from pathlib import Path
from typing import Optional

import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [2]:
def train_evaluate_xgboost(
    texts,
    labels,
    test_size: float = 0.2,
    random_state: int = 42,
    max_features: int = 5000,
    stop_words: str = "english",
    xgb_params: Optional[dict] = None,
):
    """Train a TF-IDF + XGBoost classifier and score it on a held-out split.

    The data is split with stratification on ``labels``; the TF-IDF vectorizer
    is fitted on the training part only and then applied to the test part.

    Args:
        texts: Documents to classify (passed to ``train_test_split`` and then
            to ``TfidfVectorizer``).
        labels: One class label per document. Presumably integer-encoded
            class ids -- TODO confirm against the dataset.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed used for both the split and the XGBoost model.
        max_features: Forwarded to ``TfidfVectorizer``.
        stop_words: Forwarded to ``TfidfVectorizer``.
        xgb_params: Optional extra ``XGBClassifier`` keyword arguments. They
            override the defaults built here (``objective``, ``eval_metric``,
            ``random_state``). The passed dict itself is not modified.

    Returns:
        Tuple ``(vectorizer, model, metrics)`` where ``metrics`` is a dict
        with the keys ``"accuracy"``, ``"f1_weighted"`` and
        ``"classification_report"``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, stratify=labels, test_size=test_size, random_state=random_state
    )

    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Decide binary vs. multi-class once; both defaults below depend on it.
    is_multiclass = len(set(labels)) > 2
    params = {
        "objective": "multi:softprob" if is_multiclass else "binary:logistic",
        "eval_metric": "mlogloss" if is_multiclass else "logloss",
        "random_state": random_state,
    }
    if xgb_params:
        params.update(xgb_params)

    model = xgb.XGBClassifier(**params)

    # NOTE(review): .toarray() materialises the full dense matrix, which is
    # memory-hungry for large corpora. It is kept as-is because feeding the
    # sparse matrix directly may change results (XGBoost is documented to
    # treat absent sparse entries as missing) -- confirm before switching.
    model.fit(X_train_vec.toarray(), y_train)
    y_pred = model.predict(X_test_vec.toarray())

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_weighted": f1_score(y_test, y_pred, average="weighted"),
        "classification_report": classification_report(y_test, y_pred),
    }

    return vectorizer, model, metrics

In [3]:
# Baseline run on the binary-label CSV. `vec` and `xgb_clf` produced here are
# the objects that the final cell pickles.
df = pd.read_csv(r"./../cleaned_datasets/cleaned_news_binary.csv")
vec, xgb_clf, stats = train_evaluate_xgboost(
    texts=df["text"],
    labels=df["label"],
)

# Headline scores first, then the per-class breakdown.
print("Accuracy:", stats["accuracy"])
print("Weighted F1:", stats["f1_weighted"])
print(stats["classification_report"])

Accuracy: 0.8977981969486823
Weighted F1: 0.8975522023162581
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      5825
           1       0.94      0.85      0.89      5711

    accuracy                           0.90     11536
   macro avg       0.90      0.90      0.90     11536
weighted avg       0.90      0.90      0.90     11536



In [4]:
# Same baseline pipeline, this time on the multinomial-label CSV.
df = pd.read_csv(r"./../cleaned_datasets/cleaned_news_multinomial.csv")
vec1, xgb_clf1, stats1 = train_evaluate_xgboost(
    texts=df["text"],
    labels=df["label"],
)

# Headline scores first, then the per-class breakdown.
print("Accuracy:", stats1["accuracy"])
print("Weighted F1:", stats1["f1_weighted"])
print(stats1["classification_report"])

Accuracy: 0.826369625520111
Weighted F1: 0.7952779646401442
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      5196
           1       0.98      0.92      0.95      4694
           2       0.26      0.14      0.18       491
           3       0.25      0.11      0.15       526
           4       0.25      0.07      0.11       420
           5       0.15      0.02      0.03       209

    accuracy                           0.83     11536
   macro avg       0.44      0.37      0.38     11536
weighted avg       0.78      0.83      0.80     11536



In [5]:
def train_xgb_with_oversampling(
    texts,
    labels,
    test_size: float = 0.2,
    random_state: int = 42,
    max_features: int = 5000,
    stop_words: str = "english",
    xgb_params: Optional[dict] = None,
):
    """Train a TF-IDF + XGBoost classifier on a randomly oversampled training set.

    The data is split with stratification on ``labels`` first. Only the
    vectorized training part is passed through ``RandomOverSampler``; the test
    part is left untouched and used for scoring. The resampled label counts
    are printed.

    Args:
        texts: Documents to classify (passed to ``train_test_split`` and then
            to ``TfidfVectorizer``).
        labels: One class label per document. Presumably integer-encoded
            class ids -- TODO confirm against the dataset.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed used for the split, the oversampler and the model.
        max_features: Forwarded to ``TfidfVectorizer``.
        stop_words: Forwarded to ``TfidfVectorizer``.
        xgb_params: Optional extra ``XGBClassifier`` keyword arguments,
            mirroring ``train_evaluate_xgboost``. They override the defaults
            built here (``eval_metric``, ``random_state``). The passed dict
            itself is not modified.

    Returns:
        Tuple ``(vectorizer, model, metrics)`` where ``metrics`` is a dict
        with the keys ``"accuracy"``, ``"f1_weighted"`` and
        ``"classification_report"``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, stratify=labels, test_size=test_size, random_state=random_state
    )

    vec = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)

    # Oversample after the split so the test part is never resampled.
    ros = RandomOverSampler(random_state=random_state)
    X_res, y_res = ros.fit_resample(X_train_vec, y_train)
    print("Resampled label counts:", dict(pd.Series(y_res).value_counts()))

    is_multiclass = len(set(labels)) > 2
    params = {
        "eval_metric": "mlogloss" if is_multiclass else "logloss",
        "random_state": random_state,
    }
    if xgb_params:
        params.update(xgb_params)

    model = xgb.XGBClassifier(**params)

    # NOTE(review): .toarray() materialises the full dense (oversampled)
    # matrix, which is memory-hungry. It is kept as-is because feeding the
    # sparse matrix directly may change results (XGBoost is documented to
    # treat absent sparse entries as missing) -- confirm before switching.
    model.fit(X_res.toarray(), y_res)
    y_pred = model.predict(X_test_vec.toarray())

    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_weighted": f1_score(y_test, y_pred, average="weighted"),
        "classification_report": classification_report(y_test, y_pred),
    }

    return vec, model, metrics

In [6]:
# Multinomial-label CSV again, now trained with random oversampling of the
# training split.
df = pd.read_csv(r"./../cleaned_datasets/cleaned_news_multinomial.csv")
vec2, xgb_clf2, stats2 = train_xgb_with_oversampling(
    texts=df["text"],
    labels=df["label"],
)

# Headline scores first, then the per-class breakdown.
print("Accuracy:", stats2["accuracy"])
print("Weighted F1:", stats2["f1_weighted"])
print(stats2["classification_report"])

Resampled label counts: {1: 20780, 0: 20780, 3: 20780, 4: 20780, 5: 20780, 2: 20780}
Accuracy: 0.7662101248266296
Weighted F1: 0.8095594159327983
              precision    recall  f1-score   support

           0       0.98      0.78      0.87      5196
           1       1.00      0.91      0.95      4694
           2       0.20      0.36      0.26       491
           3       0.16      0.30      0.21       526
           4       0.15      0.24      0.19       420
           5       0.09      0.26      0.14       209

    accuracy                           0.77     11536
   macro avg       0.43      0.48      0.44     11536
weighted avg       0.87      0.77      0.81     11536



In [8]:
# Persist the vectorizer/model pair from the binary-dataset run (`vec`,
# `xgb_clf`). The multinomial and oversampled models (`xgb_clf1`, `xgb_clf2`)
# are not saved here.
MODEL_PATH = Path("../models/xgb_pipeline.pkl")

# Opening a file for writing fails with FileNotFoundError when its directory
# is missing, so create the directory first.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

with MODEL_PATH.open("wb") as f:
    pickle.dump({"vectorizer": vec, "model": xgb_clf}, f)