Future Implementation:
Also add Western names to training dataset to minimize bias

In [1]:
from transformers import AutoTokenizer, AdamW, AutoModelForSequenceClassification, get_scheduler
from datasets import load_metric
from accelerate import Accelerator
from torch.utils.data import DataLoader
import torch
from torch.utils.data import Dataset
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


### Data Processing

In [2]:
df = pd.read_csv("ind.csv")
df = df.drop_duplicates()
df.shape

(79, 2)

In [3]:
checkpoint = "bert-base-uncased"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [5]:
texts = df["name"].tolist()
labels = df["label"].tolist()

In [6]:
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2)

In [7]:
tokenized_train = tokenizer(train_texts, padding=True, truncation=True)
tokenized_test = tokenizer(test_texts, padding=True, truncation=True)

In [8]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.texts.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

In [9]:
train_ds = CustomDataset(tokenized_train, train_labels)
test_ds = CustomDataset(tokenized_test, test_labels)

In [10]:
batch_size = 8

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
test_dl = DataLoader(test_ds, shuffle=True, batch_size=batch_size)

### Load Model

In [11]:
# define accelerator that eases train-test process
accelerator = Accelerator()

In [12]:
# define model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# define optimizer
optimizer = AdamW(model.parameters(), lr=0.00001)



In [14]:
# set dataloaders, model and optimizer to accelerator (to gpu or smth)
train_dl, test_dl, model, optimizer = accelerator.prepare(train_dl, test_dl, model, optimizer)

In [15]:
# define epochs
epochs = 20
training_steps = epochs * len(train_dl)

In [16]:
# define scheduler for training
scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps=0,
    num_training_steps=training_steps
)

### Train Model

In [17]:
# set metrics
metric = load_metric("accuracy")

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [18]:
for epoch in tqdm(range(epochs)):
    
    # TRAINING
    model.train()

    for batch in train_dl:

        # forward pass
        output = model(**batch)
        loss = output.loss
        
        # metrics
        logits = output.logits
        preds = torch.round(logits)
        metric.add_batch(predictions=preds, references=batch["labels"])
        
        # backprop
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print("Training: ", metric.compute())

    
    # TESTING
    model.eval()

    with torch.inference_mode():
        for batch in test_dl:

            # forward pass
            output = model(**batch)
            
            # metrics
            logits = output.logits
            preds = torch.round(logits)
            metric.add_batch(predictions=preds, references=batch["labels"])

    print("Test: ", metric.compute())

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  5%|▌         | 1/20 [00:01<00:21,  1.12s/it]

Training:  {'accuracy': 0.5396825396825397}
Test:  {'accuracy': 0.9375}


 10%|█         | 2/20 [00:01<00:15,  1.15it/s]

Training:  {'accuracy': 0.8412698412698413}
Test:  {'accuracy': 0.9375}


 15%|█▌        | 3/20 [00:02<00:13,  1.24it/s]

Training:  {'accuracy': 0.9365079365079365}
Test:  {'accuracy': 1.0}


 20%|██        | 4/20 [00:03<00:12,  1.32it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 25%|██▌       | 5/20 [00:04<00:11,  1.31it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 30%|███       | 6/20 [00:04<00:10,  1.35it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 35%|███▌      | 7/20 [00:05<00:09,  1.39it/s]

Training:  {'accuracy': 0.9523809523809523}
Test:  {'accuracy': 1.0}


 40%|████      | 8/20 [00:06<00:08,  1.42it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 45%|████▌     | 9/20 [00:06<00:07,  1.40it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 50%|█████     | 10/20 [00:07<00:06,  1.44it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 55%|█████▌    | 11/20 [00:08<00:06,  1.45it/s]

Training:  {'accuracy': 0.9682539682539683}
Test:  {'accuracy': 1.0}


 60%|██████    | 12/20 [00:08<00:05,  1.51it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 65%|██████▌   | 13/20 [00:09<00:04,  1.55it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 70%|███████   | 14/20 [00:09<00:03,  1.54it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 75%|███████▌  | 15/20 [00:10<00:03,  1.52it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 80%|████████  | 16/20 [00:11<00:02,  1.55it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 85%|████████▌ | 17/20 [00:11<00:01,  1.54it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 90%|█████████ | 18/20 [00:12<00:01,  1.49it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


 95%|█████████▌| 19/20 [00:13<00:00,  1.50it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}


100%|██████████| 20/20 [00:13<00:00,  1.44it/s]

Training:  {'accuracy': 1.0}
Test:  {'accuracy': 1.0}





### Evaluate Model

In [46]:
test_value = "Fleeting Heart"

encoded_name = tokenizer(test_value, truncation=True, padding=True, return_tensors="pt")

model.to("cpu")

model.eval()
with torch.inference_mode():
    output = model(**encoded_name)
    pred = output.logits.round()

if pred.item() == 1:
    print("Indonesian name detected")
else:
    print("Not an Indonesian name")

Not an Indonesian name
