-------------------------
**Author**: Gunnvant

**Description**: Classification training loop using pytorch

------------------------

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW

In [3]:
from datasets import load_dataset
import evaluate

In [4]:
raw_data = load_dataset("csv",data_files="../dataset.csv")

### Dataprep
- train,test and evaluation split
- tokenize and pad the dataset

In [5]:
ckpt = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(ckpt,num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(ckpt)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
raw_train_test=raw_data['train'].train_test_split(0.2)

In [8]:
raw_test_valid = raw_train_test['test'].train_test_split(0.2)

In [9]:
from datasets import DatasetDict
raw_train_test_valid = DatasetDict({
'train':raw_train_test['train'],
'test':raw_test_valid['train'],
'valid':raw_train_test['test']
})

In [10]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

In [11]:
raw_train_test_valid['train'][0]

{'text': 'How does harry potter get down a hill? walking. jk. rowling.',
 'humor': True}

In [12]:
def get_label(example):
    example['labels'] = int(example['humor'])
    return example
def preprocess(example):
    return tokenizer(example['text'],padding=True)

In [13]:
preprocessed_dataset = raw_train_test_valid.map(get_label)
preprocessed_dataset = preprocessed_dataset.map(preprocess)

Map: 100%|██████████████████████████████████████████████████████| 160000/160000 [00:07<00:00, 21036.49 examples/s]
Map: 100%|████████████████████████████████████████████████████████| 32000/32000 [00:01<00:00, 22161.36 examples/s]
Map: 100%|████████████████████████████████████████████████████████| 40000/40000 [00:01<00:00, 22140.16 examples/s]
Map: 100%|███████████████████████████████████████████████████████| 160000/160000 [00:23<00:00, 6906.70 examples/s]
Map: 100%|█████████████████████████████████████████████████████████| 32000/32000 [00:04<00:00, 7073.69 examples/s]
Map: 100%|█████████████████████████████████████████████████████████| 40000/40000 [00:05<00:00, 7186.99 examples/s]


In [14]:
preprocessed_dataset['train'][0]

{'text': 'How does harry potter get down a hill? walking. jk. rowling.',
 'humor': True,
 'labels': 1,
 'input_ids': [101,
  2129,
  2515,
  4302,
  10693,
  2131,
  2091,
  1037,
  2940,
  1029,
  3788,
  1012,
  1046,
  2243,
  1012,
  5216,
  2989,
  1012,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
preprocessed_dataset = preprocessed_dataset.remove_columns(['text','humor'])
preprocessed_dataset.set_format("torch")

In [16]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    preprocessed_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    preprocessed_dataset["valid"], batch_size=8, collate_fn=data_collator
)

In [18]:
b1 = next(iter(train_dataloader))

In [27]:
b1

{'labels': tensor([1, 0, 0, 0, 0, 0, 1, 1]), 'input_ids': tensor([[  101,  2129,  2079,  2017,  2113,  2065,  2017,  1005,  2128,  2012,
          1037,  5637, 26375,  1029,  1996,  2980, 16168,  2015,  5510,  2066,
          4485,  1012,   102],
        [  101,  5294,  1010,  4121,  8398,  4171,  2933,  1024,  2028,  2062,
          4872,  2921,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 16941,  4491,  2006,  5085,  2024,  3976,  1997,  8169,  1999,
          3617,  3690,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2439,  3635,  1024,  8201,  3766,  2439,  6445,  7038,
          2000,  5547, 11888,  3255,   102,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  2129,  2115, 19857,  6593, 24133,  3637,  6134,  2003, 23217,
          2075,  2115,  6897,  5119,   102,     0,     0,     0,     0,     0,
             0,     0,     0

### Model check

In [19]:
output = model(**b1)

In [25]:
output

SequenceClassifierOutput(loss=tensor(0.7317, grad_fn=<NllLossBackward0>), logits=tensor([[-0.1140,  0.0048],
        [-0.1782,  0.0135],
        [-0.1257,  0.0554],
        [-0.1708,  0.0559],
        [-0.1644,  0.0187],
        [-0.1333, -0.0180],
        [-0.1235, -0.0148],
        [-0.1320, -0.0266]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [24]:
optimizer = AdamW(model.parameters(), lr=5e-5)



### Model training

In [28]:
from transformers import get_scheduler
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

20000


In [26]:
from tqdm.auto import tqdm

In [None]:
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)