In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
# Load the labelled examples from CSV; load_dataset exposes them under the "train" key
train = load_dataset("csv", data_files="data/train_cause.csv")
# Hold out 30% of the rows for evaluation. The split shuffles the rows, so a
# fixed seed is passed to get the same train/test partition (and therefore a
# comparable accuracy figure) on every Restart & Run All.
train = train['train'].train_test_split(test_size=0.3, seed=42)

# Tokenizer belonging to the bert-base-uncased checkpoint
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "description" field of ``example`` with the module-level tokenizer.

    Truncation is switched on; no padding is requested here.

    Returns the tokenizer's output for that field unchanged.
    """
    descriptions = example["description"]
    encoded = tokenizer(descriptions, truncation=True)
    return encoded

# Collator built on the same tokenizer; handed to the DataLoaders as collate_fn
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Run tokenize_function over every split of `train`, in batched mode
tokenized_train = train.map(tokenize_function, batched=True)

Map:   0%|          | 0/2724 [00:00<?, ? examples/s]

Map:   0%|          | 0/1168 [00:00<?, ? examples/s]

In [2]:
# Drop the raw text and index columns, and rename the target column
# from "label" to "labels"
tokenized_train = (
    tokenized_train
    .remove_columns(["description", "Index"])
    .rename_column("label", "labels")
)
# Switch the output format to torch
tokenized_train.set_format("torch")
# Show which columns remain in the training split
tokenized_train["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [3]:
#Loading the data using pytorch Dataloader
from torch.utils.data import DataLoader

# Settings shared by both loaders: batches of 8, collated by data_collator
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)

# Training batches are reshuffled; evaluation batches keep dataset order
train_dataloader = DataLoader(tokenized_train["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_train["test"], **loader_kwargs)

In [4]:
#Loading the Model
import torch
from transformers import AutoModelForSequenceClassification

# Seed torch BEFORE building the model. The classification head
# (classifier.weight / classifier.bias) is not part of the checkpoint and is
# newly initialised at load time, so seeding only afterwards would give a
# different starting head, and different results, on every run.
torch.manual_seed(42)

#There are 12 values for causes so num_labels is 12
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=12)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers import get_scheduler
import torch
torch.manual_seed(42)

#Setting the device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

#Initializing the optimizer
# transformers.AdamW is deprecated and no longer importable from recent
# transformers releases, so the PyTorch implementation is used instead.
# eps and weight_decay are set explicitly to the values the transformers
# class used by default (torch's own defaults are 1e-8 and 0.01), keeping the
# optimizer configuration as close as possible to the original.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)


# Linear learning-rate schedule (transformers.get_scheduler) with no warmup,
# sized to the total number of optimizer steps: batches per epoch * epochs
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1023




In [6]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step, across all epochs
progress_bar = tqdm(range(num_training_steps))

# Fine-tuning loop
model.train()
for _ in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the collated batch to the model's device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch carries "labels", and the loss is read
        # from the model output
        loss = model(**batch).loss
        loss.backward()

        # Apply the update, advance the LR schedule, then clear gradients
        # so they do not accumulate into the next step
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)

  0%|          | 0/1023 [00:00<?, ?it/s]

In [7]:
# Accuracy of the fine-tuned model over eval_dataloader
acc, nos = 0, 0  # acc: correct predictions so far, nos: examples seen so far
model.eval()
with torch.no_grad():  # inference only, no gradients needed
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit along the last axis
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch['labels']
        acc += (predictions == labels).sum().item()
        nos += labels.size(0)

print("Accuracy: %.2f%%" % ((acc / nos) * 100))

Accuracy: 99.83%


In [8]:
# Saving the model
model.save_pretrained("cause_model")
# Save the tokenizer into the same directory so that "cause_model" is
# self-contained: both the model and its tokenizer can then be reloaded with
# from_pretrained("cause_model") without referring back to the base checkpoint.
tokenizer.save_pretrained("cause_model")