In [1]:
import pandas as pd
import torch
import tqdm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.utils.data import Subset

from transformers import BertTokenizer, BertModel,BertForSequenceClassification



  from .autonotebook import tqdm as notebook_tqdm


### Let's load the classification data from ASNQ

In [2]:
# Load the "asnq" dataset; later cells index it by split name ('train', 'validation')
# and read the 'question', 'sentence' and 'label' fields of each row.
dataset = load_dataset("asnq")

{'question': 'what is the use of fn key in mac', 'sentence': 'It is typically found on laptops due to their keyboard size restrictions .', 'label': 0, 'sentence_in_long_answer': False, 'short_answer_in_sentence': False}


In [3]:
# Tokenizer for the 'bert-base-uncased' checkpoint. The Dataset classes defined in later
# cells read this module-level name directly inside __getitem__.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [16]:

# Build class-balanced subsets of ASNQ: for each split draw the same number of
# label-0 and label-1 rows and keep their row indices for torch's `Subset`.
SAMPLING_SEED = 0  # fixed so Restart & Run All picks the same rows every time
selected_indices = {}
for split_name, max_len in (('train', 2048), ('validation', 256)):
    labels = pd.Series(dataset[split_name]['label'])
    labels0 = labels[labels == 0]
    labels1 = labels[labels == 1]
    # Labels must be strictly binary; anything else would be silently dropped below.
    assert(len(labels) == len(labels0) + len(labels1))
    # `max_len` is the number of rows drawn per class (not a sequence length).
    selected_indices[split_name] = list(
        pd.concat([
            labels0.sample(max_len, random_state=SAMPLING_SEED),
            labels1.sample(max_len, random_state=SAMPLING_SEED),
        ]).index
    )

selected_labels_train = selected_indices['train']
selected_labels_val = selected_indices['validation']

In [17]:
class ASNQDataset(Dataset):
    """Map-style dataset that tokenizes ASNQ question/sentence pairs on access.

    Each item is the output of the module-level ``tokenizer`` for the string
    ``question + " [SEP] " + sentence`` (padded/truncated to 128 tokens), with an
    extra ``'label'`` entry holding the row's label as a float32 tensor of shape (1,).
    """

    def __init__(self, asnq_split):
        """Keep a reference to the split; rows are only read in ``__getitem__``.

        Args:
            asnq_split: indexable collection of rows exposing the keys
                'question', 'sentence' and 'label'.
        """
        self.data = asnq_split

    def __len__(self):
        """Return the number of rows in the wrapped split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and attach its label.

        Returns:
            The tokenizer output with an added 'label' tensor.
        """
        # Look the row up once instead of once per field: every self.data[idx]
        # access has to materialize the whole row again.
        row = self.data[idx]
        question = row['question']
        answer = row['sentence']
        label = row['label']

        output = tokenizer(question + " [SEP] " + answer,
                           add_special_tokens=True,
                           truncation=True,
                           max_length=128,
                           padding='max_length',
                           # NOTE(review): 'pt' tensors presumably carry a leading
                           # dimension of 1, which train()/evaluate() squeeze away.
                           return_tensors='pt')
        # float32 label wrapped in a list so it batches to shape (batch, 1).
        output['label'] = torch.tensor([label], dtype=torch.float32)
        return output

In [18]:
# Restrict each ASNQ split to the balanced index lists selected earlier.
train_dataset = Subset(dataset=ASNQDataset(dataset['train']),
                       indices=selected_labels_train)
val_dataset = Subset(dataset=ASNQDataset(dataset['validation']),
                     indices=selected_labels_val)

# Batches of 32; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

In [31]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained BERT with a single-output head, moved to the selected device.
# NOTE(review): with num_labels=1 transformers is understood to treat the task as
# regression when computing `outputs.loss` - confirm against the library docs.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=1,
)
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-5)

# NOTE(review): `criterion` is not referenced by train() or evaluate() in this
# notebook; train() backpropagates the loss returned by the model itself.
criterion = nn.CrossEntropyLoss()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
def evaluate():
    """Compute accuracy of the global ``model`` over the global ``val_loader``.

    A prediction is 1 when the model's single logit exceeds 0.5 and 0 otherwise;
    it counts as correct when it equals the batch's 'label' entry.

    Returns:
        A 0-dim tensor: number of correct predictions divided by the number of
        examples seen (callers use ``.item()`` to get a Python float).
    """
    corrects = 0
    total = 0
    model.eval()
    # No gradients are needed for scoring; disabling autograd avoids building a
    # graph for every validation batch.
    with torch.no_grad():
        for batch in tqdm.tqdm(val_loader):
            # squeeze(1) drops the singleton dimension between batch and sequence.
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            label = batch['label'].to(device)
            output = model(input_ids, attention_mask=attention_mask)
            # Threshold the raw output, cast to float so it compares with the labels.
            predictions = (output.logits > 0.5) * 1.
            corrects += (predictions == label).sum()
            total += output.logits.shape[0]
    return corrects / total


In [33]:
# evaluate()

In [34]:

def train(num_epochs=3):
    """Fine-tune the global ``model`` on ``train_loader`` and score it each epoch.

    Reads the module-level ``model``, ``optimizer``, ``train_loader`` and ``device``
    and calls ``evaluate()`` after every epoch. Prints one line per epoch:
    epoch index, mean training loss, validation accuracy.

    Args:
        num_epochs: number of passes over ``train_loader`` (default 3, the value
            that used to be hard-coded).

    Returns:
        Mean training loss of the last epoch, or ``None`` if ``num_epochs`` is
        not positive and no epoch ran.
    """
    avg_loss = None  # stays None when the loop body never executes
    for epoch in range(num_epochs):
        total_loss = 0
        # evaluate() switches the model to eval mode, so re-enable training mode
        # at the start of every epoch.
        model.train()
        for batch in tqdm.tqdm(train_loader):
            # squeeze(1) drops the singleton dimension between batch and sequence.
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].squeeze(1).to(device)
            label = batch['label'].to(device)
            # Passing labels makes the model compute and return its own loss.
            outputs = model(input_ids, attention_mask=attention_mask, labels=label)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        acc = evaluate()
        avg_loss = total_loss / len(train_loader)
        print(epoch, avg_loss, acc.item())
    return avg_loss

In [35]:
# Fine-tune on the balanced ASNQ subset. train() returns the last epoch's mean loss,
# which is what the notebook displays as this cell's result.
train()

100%|██████████| 128/128 [00:45<00:00,  2.82it/s]
100%|██████████| 16/16 [00:06<00:00,  2.35it/s]


0 0.18299589978414588 0.8359375


100%|██████████| 128/128 [02:06<00:00,  1.01it/s]
100%|██████████| 16/16 [00:05<00:00,  2.81it/s]


1 0.09128859001793899 0.837890625


100%|██████████| 128/128 [02:09<00:00,  1.01s/it]
100%|██████████| 16/16 [00:05<00:00,  2.85it/s]


2 0.04259881856341963 0.853515625


0.04259881856341963

In [36]:
# Load WikiQA. NOTE: this rebinds `dataset`, so from here on the name refers to WikiQA,
# not ASNQ; re-running an earlier cell that indexes `dataset` would now read WikiQA rows.
dataset = load_dataset("wiki_qa")


Downloading readme: 100%|██████████| 13.8k/13.8k [00:00<00:00, 13.9MB/s]
Downloading data: 100%|██████████| 594k/594k [00:00<00:00, 1.62MB/s]
Downloading data: 100%|██████████| 264k/264k [00:00<00:00, 1.50MB/s]
Downloading data: 100%|██████████| 2.00M/2.00M [00:00<00:00, 6.66MB/s]
Generating test split: 100%|██████████| 6165/6165 [00:00<00:00, 343868.56 examples/s]
Generating validation split: 100%|██████████| 2733/2733 [00:00<00:00, 909836.72 examples/s]
Generating train split: 100%|██████████| 20360/20360 [00:00<00:00, 1696519.97 examples/s]


In [38]:
# Display the WikiQA splits with their column names and row counts.
dataset

DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 6165
    })
    validation: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 2733
    })
    train: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 20360
    })
})

In [39]:

# Build class-balanced subsets of WikiQA: for each split draw the same number of
# label-0 and label-1 rows and keep their row indices for torch's `Subset`.
SAMPLING_SEED = 0  # fixed so Restart & Run All picks the same rows every time
selected_indices = {}
for split_name, max_len in (('train', 512), ('validation', 64)):
    labels = pd.Series(dataset[split_name]['label'])
    labels0 = labels[labels == 0]
    labels1 = labels[labels == 1]
    # Labels must be strictly binary; anything else would be silently dropped below.
    assert(len(labels) == len(labels0) + len(labels1))
    # `max_len` is the number of rows drawn per class (not a sequence length).
    selected_indices[split_name] = list(
        pd.concat([
            labels0.sample(max_len, random_state=SAMPLING_SEED),
            labels1.sample(max_len, random_state=SAMPLING_SEED),
        ]).index
    )

selected_labels_train = selected_indices['train']
selected_labels_val = selected_indices['validation']

In [50]:
class WikiDataset(Dataset):
    """Map-style dataset that tokenizes WikiQA question/answer pairs on access.

    Each item is the output of the module-level ``tokenizer`` for the string
    ``question + " [SEP] " + answer`` (padded/truncated to 128 tokens), with an
    extra ``'label'`` entry holding the row's label as a float32 tensor of shape (1,).
    """

    def __init__(self, asnq_split):
        """Keep a reference to the split; rows are only read in ``__getitem__``.

        Args:
            asnq_split: indexable collection of rows exposing the keys
                'question', 'answer' and 'label'. (The parameter name is kept
                for compatibility; this class reads WikiQA-style rows.)
        """
        self.data = asnq_split

    def __len__(self):
        """Return the number of rows in the wrapped split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and attach its label.

        Returns:
            The tokenizer output with an added 'label' tensor.
        """
        # Look the row up once instead of once per field: every self.data[idx]
        # access has to materialize the whole row again.
        row = self.data[idx]
        question = row['question']
        answer = row['answer']
        label = row['label']

        output = tokenizer(question + " [SEP] " + answer,
                           add_special_tokens=True,
                           truncation=True,
                           max_length=128,
                           padding='max_length',
                           # NOTE(review): 'pt' tensors presumably carry a leading
                           # dimension of 1, which train()/evaluate() squeeze away.
                           return_tensors='pt')
        # float32 label wrapped in a list so it batches to shape (batch, 1).
        output['label'] = torch.tensor([label], dtype=torch.float32)
        return output

In [53]:
# Restrict each WikiQA split to the balanced index lists selected earlier.
train_dataset = Subset(dataset=WikiDataset(dataset['train']),
                       indices=selected_labels_train)
val_dataset = Subset(dataset=WikiDataset(dataset['validation']),
                     indices=selected_labels_val)

# Rebinding train_loader/val_loader redirects train() and evaluate(), which read
# these module-level names, to the WikiQA data.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)

In [55]:
# Run the same training loop on the WikiQA loaders. train() reads the module-level
# `model` and `optimizer`, which no visible cell re-creates after the ASNQ run, so this
# continues from the weights already fine-tuned on ASNQ.
train()

100%|██████████| 32/32 [00:21<00:00,  1.46it/s]
100%|██████████| 4/4 [00:00<00:00,  5.34it/s]


0 0.1914022695273161 0.7265625


100%|██████████| 32/32 [00:20<00:00,  1.54it/s]
100%|██████████| 4/4 [00:00<00:00,  5.28it/s]


1 0.10124774661380798 0.6953125


100%|██████████| 32/32 [00:21<00:00,  1.49it/s]
100%|██████████| 4/4 [00:01<00:00,  3.82it/s]

2 0.04649127341690473 0.7265625





0.04649127341690473