### My introduction to the Ag News Dataset

In [46]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer # https://habr.com/ru/companies/otus/articles/755772/
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [19]:
dataset = load_dataset("ag_news")



In [20]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [21]:
dataset["train"][:5]

{'text': ["Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
  'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.',
  "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.",
  'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.',
  'Oil prices soar to all-time record, posing new menace to 

In [22]:
dataset["train"].features

{'text': Value('string'),
 'label': ClassLabel(names=['World', 'Sports', 'Business', 'Sci/Tech'])}

In [23]:
for i in range(3):
    print(f'{i+1}:\n    Text: { dataset["train"][i]["text"] }\n    Label: {dataset["train"][i]["label"]}\n')

1:
    Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
    Label: 2

2:
    Text: Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
    Label: 2

3:
    Text: Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
    Label: 2



In [29]:
# Проверка на кол-во пропущенных данных в ДатаСете.
# Их нет поскольку кол-во таких missing данных равно 0.
texts = dataset["train"][:2000]["text"]
missing = sum(len(t.strip()) == 0 for t in texts)
missing

0

In [30]:
# Разделим наш Train часть из ДатаСета на два:
#   1) 90% также для train
#   2) 10% для валидации и настройки гиперпараметров
split = dataset["train"].train_test_split(test_size=0.1, seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 108000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12000
    })
})

In [31]:
# Инициализация данных на:
#   1) Training
#   2) Validation
#   3) Testing
train_ds = split['train']
val_ds = split['test']
test_ds = dataset['test']

In [32]:
len(train_ds), len(val_ds), len(test_ds)

(108000, 12000, 7600)

In [34]:
# Разделим наши Train данные на таргетную (label будет храниться в y_train)
# и на text по которой будем классифицировать
X_train, y_train = train_ds['text'], train_ds['label']
X_train[0], y_train[0] 

('Despair and Anger in Small Russian Town After Siege  BESLAN, Russia (Reuters) - The killing of more than 320  children, parents and teachers during the bloody end to a  53-hour school siege left barely a family untouched in the  small Russian town of Beslan.',
 0)

In [35]:
# То же самое и для validation и testing
X_val, y_val = val_ds["text"], val_ds["label"]
X_test, y_test = test_ds["text"], test_ds["label"]

### TF-IDF (превращаем тексты в числа)
Формула вычисления TF-IDF
Формула TF-IDF комбинирует понятия TF и IDF, чтобы вычислить важность каждого слова в каждом документе. Формально, формула выглядит следующим образом:

    TF-IDF(t, d) = TF(t, d) * IDF(t)
где:

    TF(t, d) - Частота термина (TF) для слова "t" в документе "d".

    IDF(t) - Обратная частота документа (IDF) для слова "t".

In [57]:
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # униграммы + биграммы (сочетание только слов так и двух слов грубо говоря)
    min_df=2,             # отсекаем совсем редкие
    max_df=0.95           # отсекаем слишком частые
)

In [58]:
X_train_vec = tfidf.fit_transform(X_train)
X_val_vec = tfidf.transform(X_val)
X_test_vec= tfidf.transform(X_test)

In [59]:
X_train_vec.shape, X_val_vec.shape

((108000, 421815), (12000, 421815))

In [60]:
clf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

In [61]:
clf.fit(X_train_vec, y_train)

In [62]:
val_pred = clf.predict(X_val_vec)

In [63]:
acc = accuracy_score(y_val, val_pred)
f1  = f1_score(y_val, val_pred, average="macro")

acc, f1

(0.9185, 0.9179970584966738)

In [64]:
print(classification_report(y_val, val_pred))
confusion_matrix(y_val, val_pred)

              precision    recall  f1-score   support

           0       0.94      0.90      0.92      3009
           1       0.95      0.98      0.97      3034
           2       0.90      0.88      0.89      2900
           3       0.89      0.91      0.90      3057

    accuracy                           0.92     12000
   macro avg       0.92      0.92      0.92     12000
weighted avg       0.92      0.92      0.92     12000



array([[2706,   98,  117,   88],
       [  28, 2980,   13,   13],
       [  73,   25, 2553,  249],
       [  86,   35,  153, 2783]])

In [66]:
test_pred = clf.predict(X_test_vec)

In [68]:
test_acc = accuracy_score(y_test, test_pred)
test_f1  = f1_score(y_test, test_pred, average="macro")

print(classification_report(y_test, test_pred))
test_acc, test_f1

              precision    recall  f1-score   support

           0       0.93      0.90      0.92      1900
           1       0.95      0.98      0.96      1900
           2       0.89      0.88      0.89      1900
           3       0.89      0.90      0.90      1900

    accuracy                           0.92      7600
   macro avg       0.92      0.92      0.92      7600
weighted avg       0.92      0.92      0.92      7600



(0.9168421052631579, 0.916625204795872)

In [71]:
from pathlib import Path
import joblib

Path("models").mkdir(exist_ok=True)

joblib.dump(tfidf, "models/tfidf.joblib")
joblib.dump(clf,   "models/logreg.joblib")

['models/logreg.joblib']

In [74]:
import json
from datetime import datetime

metrics = {
    "model": "tfidf_logreg",
    "dataset": "ag_news",
    "val_accuracy": acc,
    "val_f1_macro": f1,
    "test_accuracy": test_acc,
    "test_f1_macro": test_f1,
    "n_features": X_train_vec.shape[1],
    "timestamp": datetime.now().isoformat()
}

Path("reports").mkdir(exist_ok=True)
with open("reports/metrics_tfidf_logreg.json", "w") as f:
    json.dump(metrics, f, indent=2)