In [1]:
import os, json
import pandas as pd
import numpy as np
from joblib import dump
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

from features import extract_handcrafted_features

In [2]:
# Location of the labelled messages dataset (CSV) read by main().
DATA_PATH = '../data/messages.csv'

# Directory that receives the fitted vectorizer, model and feature config.
ART_DIR = '../artifacts/log_reg'

# Create the artifact directory up front; a pre-existing directory is fine.
os.makedirs(name=ART_DIR, exist_ok=True)

In [3]:
def main(data_path=None, art_dir=None, test_size=0.2, random_state=42, threshold=0.5):
    """Train, evaluate and persist a logistic-regression message classifier.

    The model is fitted on word uni/bi-gram TF-IDF features concatenated with
    the output of ``extract_handcrafted_features``. Evaluation metrics for the
    held-out split are printed, then the fitted vectorizer, the model and a
    small feature-config JSON file are written to the artifact directory.

    Args:
        data_path: CSV file with at least the columns ``text`` and ``label``
            (0/1). Defaults to the module-level ``DATA_PATH``.
        art_dir: Directory the artifacts are written to (created if missing).
            Defaults to the module-level ``ART_DIR``.
        test_size: Fraction of the rows held out for evaluation.
        random_state: Seed for the train/test split.
        threshold: Probability of class 1 at or above which a message is
            predicted as class 1.

    Raises:
        KeyError: If the CSV lacks the ``text`` or ``label`` column.
    """
    # Resolve defaults at call time so edits to the config cell are picked up
    # without having to re-run this definition.
    data_path = DATA_PATH if data_path is None else data_path
    art_dir = ART_DIR if art_dir is None else art_dir

    df = pd.read_csv(data_path)

    # The csv must provide the columns: text, label (0/1)
    missing_cols = [col for col in ('text', 'label') if col not in df.columns]
    if missing_cols:
        raise KeyError(f"{data_path} is missing required column(s): {missing_cols}")

    # Drop incomplete rows: astype(str) would turn a missing text into the
    # literal token 'nan', and astype(int) fails on a missing label.
    n_rows = len(df)
    df = df.dropna(subset=['text', 'label'])
    if len(df) < n_rows:
        print(f"Dropped {n_rows - len(df)} row(s) with missing text or label")

    texts = df['text'].astype(str).tolist()
    y = df['label'].astype(int).to_numpy()

    # stratify=y keeps the class proportions the same in the train and test
    # splits, so the minority class is not under-represented in either one.
    X_train_txt, X_test_txt, y_train, y_test = train_test_split(
        texts,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    vec = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        min_df=2,
        max_features=40000,
    )

    # Fit the vocabulary on the training split only; the test split is merely
    # transformed so it cannot leak into the features.
    X_train_tfidf = vec.fit_transform(X_train_txt)
    X_test_tfidf = vec.transform(X_test_txt)

    X_train_hand = extract_handcrafted_features(X_train_txt)
    X_test_hand = extract_handcrafted_features(X_test_txt)

    # Combine sparse tfidf with dense handcrafted; ask for CSR directly rather
    # than the COO matrix hstack builds by default.
    X_train = hstack([X_train_tfidf, X_train_hand], format='csr')
    X_test = hstack([X_test_tfidf, X_test_hand], format='csr')

    # Names recorded in feature_config.json. They are maintained by hand, so
    # flag a mismatch with the number of columns the extractor produced.
    handcrafted_features = ["num_urls", "num_digits", "msg_len", "num_exclaim", "has_currency", "has_urgent"]
    n_hand = X_train.shape[1] - X_train_tfidf.shape[1]
    if n_hand != len(handcrafted_features):
        print(f"Warning: extractor produced {n_hand} handcrafted column(s) but "
              f"feature_config lists {len(handcrafted_features)} name(s)")

    model = LogisticRegression(max_iter=2000, class_weight='balanced')
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    proba = model.predict_proba(X_test)[:, 1]
    pred = (proba >= threshold).astype(int)

    print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
    print("\nClassification Report:\n", classification_report(y_test, pred, digits=4))
    print("ROC-AUC:", roc_auc_score(y_test, proba))

    # Do not rely on another cell having created the output directory.
    os.makedirs(art_dir, exist_ok=True)

    dump(vec, os.path.join(art_dir, 'vec.joblib'))
    dump(model, os.path.join(art_dir, 'mod_logreg.joblib'))
    with open(os.path.join(art_dir, 'feature_config.json'), 'w') as f:
        json.dump({
            "handcrafted_features": handcrafted_features,
        }, f, indent=2)

    print("\nSaved artifacts to: ", art_dir)

# Run the full training pipeline when this cell/script is executed directly
# (in a notebook kernel __name__ is "__main__", so this runs on cell execution).
if __name__ == "__main__":
    main()


Confusion Matrix:
 [[898   6]
 [  8 120]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9912    0.9934    0.9923       904
           1     0.9524    0.9375    0.9449       128

    accuracy                         0.9864      1032
   macro avg     0.9718    0.9654    0.9686      1032
weighted avg     0.9864    0.9864    0.9864      1032

ROC-AUC: 0.9972950082964601

Saved artifacts to:  ../artifacts/log_reg
