# Linear Baseline

In [1]:
import numpy as np

from datasets import load_dataset, load_metric

import sklearn
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
import wandb

import class_attention as cat

In [2]:
# Load the news-category dataset by its hub identifier; later cells read the
# 'train' and 'validation' splits and the 'headline', 'category' and
# 'category_num' columns from it.
news_dataset = load_dataset("Fraser/news-category-dataset")

Using custom data configuration default
Reusing dataset news_category (/home/vlialin/.cache/huggingface/datasets/news_category/default/0.0.0/737b7b6dff469cbba49a6202c9e94f9d39da1fed94e13170cf7ac4b61a75fb9c)


In [3]:
# Build the train/validation splits used by every model below.
# When p < 1.0 a toy subset is sampled instead of the full data; some care is
# needed to keep that toy dataset consistent, so read carefully:
#   1. sample the training split,
#   2. record which classes survived the sampling,
#   3. if any class was lost, pass the validation split through split_classes
#      with the surviving classes (presumably to drop the lost ones),
#   4. sample the validation split with the same fraction.
p = 1.0  # fraction of each split to keep; 1.0 means "use everything"

all_classes = list(set(news_dataset['train']['category']))
classes_left = all_classes

if p < 1.0:
    # Use the configured fraction (it was previously hard-coded to 0.1, which
    # silently ignored the value of `p`).
    train_set = cat.utils.sample_dataset(news_dataset['train'], p=p)

    classes_left = list(set(train_set['category']))

    valid_set = news_dataset['validation']
    if len(all_classes) > len(classes_left):
        # NOTE(review): `split_classes` is neither defined nor imported in this
        # notebook, so this line raises NameError when reached. It presumably
        # lives in `cat.utils` — confirm the qualified name and signature.
        _, valid_set = split_classes(valid_set, valid_classes=classes_left)

    valid_set = cat.utils.sample_dataset(valid_set, p=p)

else:
    train_set = news_dataset['train']
    valid_set = news_dataset['validation']


In [4]:
# Linear baseline: TF-IDF over 1- to 4-grams of the headline (vocabulary
# capped at 100k features) fed into a LinearSVC with default settings.
vectorizer = TfidfVectorizer(max_features=100_000, ngram_range=(1, 4))

y_train = train_set["category"]
X_train = vectorizer.fit_transform(train_set["headline"])

model = LinearSVC()
# Kept as the last expression so the fitted estimator is shown as cell output.
model.fit(X_train, y_train)

LinearSVC()

In [5]:
# Evaluation inputs for the linear baseline. Despite the `_test` suffix these
# come from the validation split; the already-fitted vectorizer is reused
# (transform only, no refit).
y_test = valid_set["category"]
X_test = vectorizer.transform(valid_set["headline"])

In [6]:
# Score the linear baseline on the validation features; the value is shown as
# the cell output (`score` is mean accuracy for scikit-learn classifiers).
model.score(X_test, y_test)

0.5882704371203824

# BERT Baseline

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [8]:
# Name of the pretrained checkpoint shared by the tokenizer and the classifier.
MODEL = "distilbert-base-uncased"

# Size of the classification head: one output per distinct training category.
num_labels = len(set(news_dataset["train"]["category"]))

tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
# NOTE: this rebinds `model`, which held the LinearSVC baseline until now;
# re-running the linear-baseline scoring cell after this one will not work.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=num_labels,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [9]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach the label column.

    Args:
        examples: mapping with a "headline" entry (passed to the module-level
            ``tokenizer`` with truncation enabled) and a "category_num" entry
            (copied through unchanged).

    Returns:
        A plain dict holding every key produced by the tokenizer plus a
        "label" key set to ``examples["category_num"]``.
    """
    encoded = dict(tokenizer(examples["headline"], truncation=True))
    encoded["label"] = examples["category_num"]
    return encoded

In [10]:
# Apply the tokenizing preprocessor to both splits in batched mode
# (train first, then validation).
encoded_train_set, encoded_valid_set = (
    split.map(preprocess_function, batched=True)
    for split in (train_set, valid_set)
)

Loading cached processed dataset at /home/vlialin/.cache/huggingface/datasets/news_category/default/0.0.0/737b7b6dff469cbba49a6202c9e94f9d39da1fed94e13170cf7ac4b61a75fb9c/cache-4e82867a3b7eb9ca.arrow
Loading cached processed dataset at /home/vlialin/.cache/huggingface/datasets/news_category/default/0.0.0/737b7b6dff469cbba49a6202c9e94f9d39da1fed94e13170cf7ac4b61a75fb9c/cache-a292d1160d0db749.arrow


In [11]:
# Per-device batch size, used for both training and evaluation.
batch_size = 24

args = TrainingArguments(
    output_dir="debug_outputs",
    # Optimisation schedule. weight_decay is left at the library default
    # (an explicit 0.01 override is currently disabled).
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch and restore the checkpoint with the best
    # "accuracy" value when training ends.
    evaluation_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
)

In [12]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair, where ``predictions``
            holds per-class scores along axis 1 and ``labels`` holds the
            reference class indices.

    Returns:
        ``{"accuracy": fraction of rows whose arg-max class equals the label}``.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    hits = predicted_classes == labels

    return {"accuracy": np.mean(hits)}


In [13]:
# Wire the model, training arguments, encoded splits, tokenizer and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train_set,
    eval_dataset=encoded_valid_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Overrides a private field on the training arguments.
# NOTE(review): presumably forces single-GPU training on a multi-GPU host;
# this relies on library internals — confirm it is still needed and honoured.
trainer.args._n_gpu = 1

In [14]:
# Run fine-tuning; the returned training summary is shown as the cell output.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mguitaricet[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.19 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,1.3176,1.244684,0.639052,4.0006,2510.357
2,1.1013,1.192125,0.649109,4.0008,2510.272
3,0.909,1.212866,0.656676,4.0014,2509.894
4,0.7573,1.245844,0.657274,3.9985,2511.697
5,0.6572,1.28406,0.655481,3.9996,2511.03


TrainOutput(global_step=33480, training_loss=0.9981828969866572, metrics={'train_runtime': 1625.4308, 'train_samples_per_second': 20.598, 'total_flos': 7818527364200568, 'epoch': 5.0})