In [1]:
import os
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import RandomOverSampler

In [2]:
def train_evaluate_logistic_regression(
    texts,
    labels,
    test_size: float = 0.2,
    random_state: int = 42,
    max_features: int = 5000,
    stop_words: str = "english",
):
    """Train a class-weighted logistic regression on TF-IDF features and score it.

    The data is split once with stratification on ``labels``; the TF-IDF
    vocabulary is learned from the training split only and then applied to the
    held-out split.

    Parameters
    ----------
    texts, labels
        Documents and their targets, passed straight to ``train_test_split``.
    test_size : float
        Fraction of the data held out for evaluation.
    random_state : int
        Seed forwarded to ``train_test_split``.
    max_features : int
        ``max_features`` forwarded to ``TfidfVectorizer``.
    stop_words : str
        ``stop_words`` forwarded to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, model, metrics)`` where ``metrics`` is a dict with the
        keys ``"accuracy"``, ``"f1_weighted"`` and ``"classification_report"``,
        all computed on the held-out split.
    """
    texts_fit, texts_eval, labels_fit, labels_eval = train_test_split(
        texts,
        labels,
        stratify=labels,
        test_size=test_size,
        random_state=random_state,
    )

    # Learn the vocabulary/IDF weights from the training documents only.
    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    features_fit = vectorizer.fit_transform(texts_fit)
    features_eval = vectorizer.transform(texts_eval)

    # class_weight="balanced" is this function's answer to class imbalance.
    model = LogisticRegression(
        solver="lbfgs",
        max_iter=1000,
        class_weight="balanced",
    )
    model.fit(features_fit, labels_fit)

    predictions = model.predict(features_eval)

    return vectorizer, model, {
        "accuracy": accuracy_score(labels_eval, predictions),
        "f1_weighted": f1_score(labels_eval, predictions, average="weighted"),
        "classification_report": classification_report(labels_eval, predictions),
    }

In [3]:
# Binary-label run: load the cleaned CSV and fit the class-weighted baseline.
df = pd.read_csv(r"../cleaned_datasets/cleaned_news_binary.csv")
vec, clf, stats = train_evaluate_logistic_regression(
    texts=df["text"],
    labels=df["label"],
)

# Held-out metrics: headline numbers first, then the per-class breakdown.
print("Accuracy: ", stats["accuracy"])
print("Weighted F1: ", stats["f1_weighted"])
print(stats["classification_report"])

Accuracy:  0.9012656033287101
Weighted F1:  0.9012238143406656
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      5825
           1       0.91      0.88      0.90      5711

    accuracy                           0.90     11536
   macro avg       0.90      0.90      0.90     11536
weighted avg       0.90      0.90      0.90     11536



In [4]:
# Multi-class run: same class-weighted baseline, applied to the multinomial CSV.
# Note that `df` is rebound here and no longer refers to the binary dataset.
df = pd.read_csv(r"../cleaned_datasets/cleaned_news_multinomial.csv")
vec1, clf1, stats1 = train_evaluate_logistic_regression(
    texts=df["text"],
    labels=df["label"],
)

# Held-out metrics: headline numbers first, then the per-class breakdown.
print("Accuracy: ", stats1["accuracy"])
print("Weighted F1: ", stats1["f1_weighted"])
print(stats1["classification_report"])

Accuracy:  0.7838938973647711
Weighted F1:  0.8202753848114679
              precision    recall  f1-score   support

           0       0.97      0.83      0.89      5196
           1       1.00      0.91      0.95      4694
           2       0.21      0.35      0.26       491
           3       0.18      0.23      0.20       526
           4       0.16      0.28      0.21       420
           5       0.10      0.29      0.15       209

    accuracy                           0.78     11536
   macro avg       0.44      0.48      0.44     11536
weighted avg       0.87      0.78      0.82     11536



In [5]:
def train_with_oversampling(
    texts,
    labels,
    test_size: float = 0.2,
    random_state: int = 42,
    max_features: int = 5000,
    stop_words: str = "english",
):
    """Train logistic regression on a randomly oversampled TF-IDF training set.

    The data is split once with stratification on ``labels``. TF-IDF is fitted
    on the training split, and only that training matrix is passed through
    ``RandomOverSampler``; the held-out split is vectorized but never
    resampled, so metrics are computed on the held-out data as split.

    Side effect: prints the per-label counts of the resampled training set.

    Parameters
    ----------
    texts, labels
        Documents and their targets, passed straight to ``train_test_split``.
    test_size : float
        Fraction of the data held out for evaluation.
    random_state : int
        Seed used for both the split and the oversampler.
    max_features : int
        ``max_features`` forwarded to ``TfidfVectorizer``.
    stop_words : str
        ``stop_words`` forwarded to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(vectorizer, model, metrics)``. ``metrics`` holds ``"accuracy"``,
        ``"f1_weighted"`` and the text report under ``"classification_report"``
        (the key used by ``train_evaluate_logistic_regression``) as well as
        under the legacy key ``"report"``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        labels,
        stratify=labels,
        test_size=test_size,
        random_state=random_state,
    )

    vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Resample after the split and on the training matrix only, so no
    # duplicated training row can end up in the evaluation data.
    ros = RandomOverSampler(random_state=random_state)
    X_res, y_res = ros.fit_resample(X_train_vec, y_train)
    print("Resampled label counts:", dict(pd.Series(y_res).value_counts()))

    # No class_weight here: the resampling above is the imbalance treatment.
    model = LogisticRegression(solver="lbfgs", max_iter=1000)
    model.fit(X_res, y_res)

    y_pred = model.predict(X_test_vec)
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "f1_weighted": f1_score(y_test, y_pred, average="weighted"),
        "classification_report": classification_report(y_test, y_pred),
    }
    # Keep the original key so existing callers reading metrics["report"] work.
    metrics["report"] = metrics["classification_report"]
    return vectorizer, model, metrics

In [6]:
# Multi-class run with random oversampling in place of class weighting.
# The CSV is read again here, so this cell does not need `df` from earlier cells.
df = pd.read_csv(r"../cleaned_datasets/cleaned_news_multinomial.csv")
vec2, clf2, stats2 = train_with_oversampling(
    texts=df["text"],
    labels=df["label"],
)

# Held-out metrics: headline numbers first, then the per-class breakdown.
print("Accuracy: ", stats2["accuracy"])
print("Weighted F1: ", stats2["f1_weighted"])
print(stats2["report"])

Resampled label counts: {1: 20780, 0: 20780, 3: 20780, 4: 20780, 5: 20780, 2: 20780}
Accuracy:  0.7863210818307905
Weighted F1:  0.8211469496515972
              precision    recall  f1-score   support

           0       0.97      0.83      0.90      5196
           1       1.00      0.91      0.95      4694
           2       0.21      0.33      0.26       491
           3       0.18      0.25      0.21       526
           4       0.17      0.28      0.21       420
           5       0.10      0.27      0.15       209

    accuracy                           0.79     11536
   macro avg       0.44      0.48      0.44     11536
weighted avg       0.87      0.79      0.82     11536



In [None]:
# Persist the oversampling run's fitted vectorizer and classifier together so
# they can be reloaded as a matching pair.
model_path = "../models/logistic_pipeline.pkl"

# open(..., "wb") does not create missing parent directories; make sure the
# target folder exists so this cell also works on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, "wb") as f:
    pickle.dump({"vectorizer": vec2, "model": clf2}, f)