# Transformer Model

In [1]:
import torch
from sklearn.datasets import fetch_20newsgroups
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch
from torch.utils.data import Dataset
# Show the installed PyTorch version so the environment used for this run is
# recorded alongside the notebook output.
print(torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


1.10.1+cu113


In [2]:
# Pick the compute device once: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
# Fetch the train and test subsets with identical settings; the `remove`
# argument asks scikit-learn to drop headers, footers and quoted replies.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset_name, remove=('headers', 'footers', 'quotes'))
    for subset_name in ('train', 'test')
)

# Quick sanity summary of what was loaded.
print(
    f"Number of training samples: {len(newsgroups_train.data)}",
    f"Number of test samples: {len(newsgroups_test.data)}",
    f"Number of classes: {len(newsgroups_train.target_names)}",
    sep="\n",
)

Number of training samples: 11314
Number of test samples: 7532
Number of classes: 20


In [4]:
# Checkpoint identifier shared by the tokenizer and the model loaded later.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode both splits with the same options: over-long documents are truncated
# and every sequence within a split is padded to a common length.
train_encodings, test_encodings = (
    tokenizer(split.data, truncation=True, padding=True)
    for split in (newsgroups_train, newsgroups_test)
)

In [5]:
class NewsgroupsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample.
        labels: Indexable collection of integer class ids, one per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every encoding field is converted to a tensor, and the label is added
        under the ``'labels'`` key as a ``torch.long`` tensor.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Pair each tokenized split with its integer class labels.
train_dataset = NewsgroupsDataset(encodings=train_encodings, labels=newsgroups_train.target)
test_dataset = NewsgroupsDataset(encodings=test_encodings, labels=newsgroups_test.target)

# Load the pretrained checkpoint with one output per newsgroup and place it on
# the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(newsgroups_train.target_names),
).to(device)

# Fine-tuning hyperparameters.
# NOTE(review): logging_steps is larger than the 4245 steps this run reports,
# so presumably no intermediate training loss gets logged - confirm intended.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    warmup_steps=500,
    logging_steps=50000,
)

# NOTE(review): eval_dataset is supplied, but this cell never calls an
# evaluation method, so the test split is not scored here.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Kept as the final expression so the notebook displays the returned value.
trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

{'train_runtime': 773.131, 'train_samples_per_second': 43.902, 'train_steps_per_second': 5.491, 'train_loss': 0.8346691190002945, 'epoch': 3.0}





TrainOutput(global_step=4245, training_loss=0.8346691190002945, metrics={'train_runtime': 773.131, 'train_samples_per_second': 43.902, 'train_steps_per_second': 5.491, 'train_loss': 0.8346691190002945, 'epoch': 3.0})

## Predict

In [6]:
# Example sentences to classify with the fine-tuned model.
texts = ["The new Health tools have made cardiovascular and heart surgeries much easier.", "AI has vastly improved the software scene. Software engineers are much more efficient."]

# The model was just fine-tuned and nothing earlier in the notebook switches it
# back to evaluation mode. Without this call dropout stays active and the
# predictions below can change from run to run.
model.eval()

# Reuse the `tokenizer` created earlier in the notebook (same checkpoint)
# instead of loading it a second time.
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

# Inputs must live on the same device as the model.
input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# No gradients are needed for inference.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

logits = outputs.logits

# Normalise the scores over the class dimension, then take the most likely
# class for each text.
probabilities = torch.softmax(logits, dim=1)
predictions = torch.argmax(probabilities, dim=1)

# Translate class indices into newsgroup names.
predicted_labels = [newsgroups_train.target_names[class_id] for class_id in predictions.tolist()]

print(predicted_labels)

['sci.med', 'comp.sys.mac.hardware']
