In [1]:
from transformers import AutoTokenizer, AdamW, AutoModelForSequenceClassification, get_scheduler
from datasets import load_metric
from accelerate import Accelerator
from torch.utils.data import DataLoader
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


### Data Processing

In [2]:
# Read the raw table and drop exact duplicate rows in a single chained expression.
df = pd.read_csv("ind.csv").drop_duplicates()
# Last expression of the cell: display (rows, columns) after de-duplication.
df.shape

(79, 2)

In [3]:
# Identifier of the pretrained checkpoint. Both the tokenizer and the model
# below are loaded from this same value.
checkpoint = "bert-base-uncased"

In [4]:
# Load the tokenizer that belongs to `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
# Extract the input strings ("name") and their targets ("label") as plain
# Python lists, in row order, so they can be split and tokenized.
texts, labels = (df[column].tolist() for column in ("name", "label"))

In [6]:
# Hold out 20% of the samples for evaluation.
# `random_state` is fixed so that Restart & Run All reproduces the same split;
# without it every run evaluates on a different test set and the reported
# accuracies are not comparable between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

In [7]:
# Tokenize each split in one call. `padding=True` pads every sample to the
# longest sequence of that call and `truncation=True` caps over-long inputs,
# so all samples within a split end up with the same length.
tokenized_train, tokenized_test = (
    tokenizer(split_texts, padding=True, truncation=True)
    for split_texts in (train_texts, test_texts)
)

In [8]:
class CustomDataset(Dataset):
    """Map-style dataset pairing per-field encodings with one label per sample.

    Args:
        texts: Mapping from field name to an indexable collection holding one
            entry per sample (presumably the tokenizer output).
        labels: Indexable collection of numeric labels, aligned with ``texts``.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.texts = texts

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every field of ``self.texts`` is converted to a tensor; the label is
        stored under ``'labels'`` as a float32 tensor.
        """
        sample = {}
        for field, values in self.texts.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

In [9]:
# Wrap each tokenized split together with its labels so a DataLoader can
# index individual samples.
train_ds = CustomDataset(texts=tokenized_train, labels=train_labels)
test_ds = CustomDataset(texts=tokenized_test, labels=test_labels)

In [10]:
# Number of samples per batch, shared by training and evaluation.
batch_size = 8

# Training batches are reshuffled every epoch.
train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
# Evaluation gains nothing from shuffling; a fixed order keeps the evaluation
# pass deterministic and comparable between epochs and runs.
test_dl = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

### Load Model

In [11]:
# Create the Accelerator with default settings. It is used later to wrap the
# dataloaders, model and optimizer (`accelerator.prepare`) and to run the
# backward pass (`accelerator.backward`).
accelerator = Accelerator()

In [12]:
# Load the pretrained encoder with a freshly initialised classification head.
# num_labels=1 gives the head a single output unit; the dataset supplies
# float32 labels to match.
# NOTE(review): with a single label transformers presumably trains this as a
# regression (MSE loss) — confirm this is intended rather than num_labels=2.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# define optimizer
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation,
# so the optimizer is built from `torch.optim` instead.
# `eps` and `weight_decay` are pinned to the values the transformers class
# used by default (1e-6 and 0.0), because torch's own defaults differ
# (1e-8 and 0.01) and would silently change the update rule.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001, eps=1e-6, weight_decay=0.0)



In [14]:
# Hand the dataloaders, model and optimizer to the accelerator and rebind the
# names to the objects it returns, so that all later cells work with the
# prepared versions.
train_dl, test_dl, model, optimizer = accelerator.prepare(train_dl, test_dl, model, optimizer)

In [15]:
# Number of full passes over the training dataloader.
epochs = 100
# Total number of scheduler steps: the training loop steps the scheduler once
# per batch, so this is batches-per-epoch times epochs.
training_steps = epochs * len(train_dl)

In [16]:
# Linear learning-rate schedule with no warm-up phase, spanning every
# optimizer step of the run (`training_steps`).
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=training_steps,
    num_warmup_steps=0,
)

### Train Model

In [17]:
# Accuracy accumulator shared by the training and test phases of the loop:
# batches are queued with `add_batch` and scored with `compute`.
# NOTE(review): `datasets.load_metric` is deprecated (it emits the warning
# shown in this cell's output); the replacement lives in the separate
# `evaluate` package, which would be a new dependency for this notebook.
metric = load_metric("accuracy")

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [18]:
def _record_batch_accuracy(logits, labels):
    """Round the model outputs to class ids and queue them on `metric`.

    Args:
        logits: Raw model outputs for one batch.
        labels: Reference labels for the same batch.
    """
    # Detach so the metric never holds on to the autograd graph, and drop a
    # trailing size-1 dimension (present with a single-output head) so the
    # predictions line up one-to-one with the 1-D label tensor.
    preds = torch.round(logits.detach()).squeeze(-1)
    metric.add_batch(predictions=preds, references=labels)


for epoch in tqdm(range(epochs)):

    # TRAINING
    model.train()

    for batch in train_dl:

        # forward pass (the batch contains `labels`, so the model returns a loss)
        output = model(**batch)

        # metrics
        _record_batch_accuracy(output.logits, batch["labels"])

        # backprop; the scheduler is stepped once per batch, matching
        # `training_steps = epochs * len(train_dl)`
        accelerator.backward(output.loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # `compute()` scores everything queued since the previous call, so the
    # same `metric` object is reused for the test phase below.
    print("Training: ", metric.compute())

    # TESTING
    model.eval()

    # no gradients are needed for evaluation
    with torch.inference_mode():
        for batch in test_dl:

            # forward pass
            output = model(**batch)

            # metrics
            _record_batch_accuracy(output.logits, batch["labels"])

    print("Test: ", metric.compute())

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  1%|          | 1/100 [00:00<01:35,  1.04it/s]

Training:  {'accuracy': 0.2222222222222222}
Test:  {'accuracy': 0.5625}


  2%|▏         | 2/100 [00:01<01:21,  1.21it/s]

Training:  {'accuracy': 0.47619047619047616}
Test:  {'accuracy': 0.8125}


  3%|▎         | 3/100 [00:02<01:14,  1.30it/s]

Training:  {'accuracy': 0.8412698412698413}
Test:  {'accuracy': 1.0}


  4%|▍         | 4/100 [00:03<01:15,  1.27it/s]

Training:  {'accuracy': 0.9047619047619048}
Test:  {'accuracy': 1.0}


  5%|▌         | 5/100 [00:03<01:12,  1.31it/s]

Training:  {'accuracy': 0.9682539682539683}
Test:  {'accuracy': 1.0}


  6%|▌         | 6/100 [00:04<01:10,  1.33it/s]

Training:  {'accuracy': 0.9841269841269841}
Test:  {'accuracy': 1.0}


  7%|▋         | 7/100 [00:05<01:09,  1.33it/s]

Training:  {'accuracy': 0.9841269841269841}
Test:  {'accuracy': 1.0}


  8%|▊         | 8/100 [00:06<01:07,  1.37it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


  9%|▉         | 9/100 [00:06<01:06,  1.36it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 10%|█         | 10/100 [00:07<01:04,  1.39it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 11%|█         | 11/100 [00:08<01:03,  1.41it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 12%|█▏        | 12/100 [00:08<01:00,  1.45it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 13%|█▎        | 13/100 [00:09<00:59,  1.46it/s]

Training:  {'accuracy': 0.9841269841269841}
Test:  {'accuracy': 1.0}


 14%|█▍        | 14/100 [00:10<00:59,  1.46it/s]

Training:  {'accuracy': 0.9841269841269841}
Test:  {'accuracy': 1.0}


 15%|█▌        | 15/100 [00:10<00:58,  1.46it/s]

Training:  {'accuracy': 0.9841269841269841}
Test:  {'accuracy': 1.0}


 16%|█▌        | 16/100 [00:11<00:58,  1.44it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 17%|█▋        | 17/100 [00:12<00:59,  1.39it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 17%|█▋        | 17/100 [00:12<01:02,  1.33it/s]


KeyboardInterrupt: 