In [39]:
from typing import Dict, List, Optional
from collections import Counter
import os
import csv
from torch.utils.data import DataLoader
import torch.optim as optim
import torchmetrics
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from conlleval import evaluate

In [40]:
class Tokenizer:
    """Lower-casing whitespace tokenizer with a frequency-capped vocabulary.

    Index 0 is reserved for ``<pad>`` and index 1 for ``<unk>``; every other
    token receives an index in order of decreasing training-set frequency.
    """

    def __init__(self):
        # two special tokens for padding and unknown
        self.token2idx = {"<pad>": 0, "<unk>": 1}
        self.idx2token = ["<pad>", "<unk>"]
        self.is_fit = False

    @property
    def pad_id(self):
        """Index of the padding token."""
        return self.token2idx["<pad>"]

    def __len__(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.idx2token)

    def fit(self, train_texts: List[str], vocab_size: int = 40000):
        """Build the vocabulary from whitespace-separated training sentences.

        Args:
            train_texts: sentences; each one is lower-cased and split on whitespace.
            vocab_size: maximum vocabulary size *including* the two special tokens.
        """
        counter = Counter()
        for text in train_texts:
            counter.update(text.lower().split())

        specials = ["<pad>", "<unk>"]
        # Start again from the special tokens so that a second fit() replaces
        # the vocabulary instead of appending duplicate entries to it.
        vocabulary = list(specials)
        budget = max(vocab_size - len(specials), 0)
        for token, _count in counter.most_common():
            if len(vocabulary) - len(specials) >= budget:
                break
            # A literal "<pad>"/"<unk>" in the corpus must not be added a second
            # time: doing so would remap the special token to a later index and
            # silently change pad_id.
            if token in specials:
                continue
            vocabulary.append(token)

        self.idx2token = vocabulary
        self.token2idx = {token: i for i, token in enumerate(vocabulary)}
        self.is_fit = True

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Convert a sentence into token ids.

        Args:
            text: sentence to encode; lower-cased and split on whitespace.
            max_length: when given, the result is right-padded with the padding
                id or truncated so that it has exactly this many ids.

        Returns:
            List of token ids; tokens outside the vocabulary map to ``<unk>``.

        Raises:
            Exception: if ``fit`` has not been called yet.
        """
        if not self.is_fit:
            raise Exception("Please fit the tokenizer on the training tokens")

        unk_id = self.token2idx["<unk>"]
        token_ids = [self.token2idx.get(token, unk_id) for token in text.lower().split()]

        # Pad or truncate the token ids based on the max_length parameter
        if max_length is not None:
            if len(token_ids) < max_length:
                token_ids += [self.token2idx["<pad>"]] * (max_length - len(token_ids))
            else:
                token_ids = token_ids[:max_length]

        return token_ids


In [41]:
def load_raw_data(filepath: str, with_tags: bool = True, encoding: Optional[str] = None) -> Dict[str, List[str]]:
    """Read a data split from disk.

    Args:
        filepath: path of the file to read.
        with_tags: when True the file is parsed as a two-column CSV
            (sentence, tags); when False it is read as plain text with one
            sentence per line.
        encoding: text encoding handed to ``open``; ``None`` keeps the
            platform default, which is what the function always used.

    Returns:
        ``{'text': [...]}``, plus a parallel ``'tags'`` list when ``with_tags``
        is True.

    Raises:
        ValueError: if a non-empty CSV row does not have exactly two columns.
    """
    data = {'text': []}
    if with_tags:
        data['tags'] = []
        # The csv module asks for newline='' so that it can do its own newline
        # handling (matters for quoted fields that contain line breaks).
        with open(filepath, newline='', encoding=encoding) as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for a blank line (e.g. a trailing
                    # one); there is nothing to unpack, so skip it.
                    continue
                text, tags = row
                data['text'].append(text)
                data['tags'].append(tags)
    else:
        with open(filepath, encoding=encoding) as f:
            # Blank lines are kept (as '') so that line numbers keep matching
            # the input file.
            for line in f:
                data['text'].append(line.strip())
    return data

In [42]:
# All data files are expected in the current working directory.
data_dir = os.getcwd()
train_raw, val_raw = (
    load_raw_data(os.path.join(data_dir, filename))
    for filename in ("train.csv", "val.csv")
)
test_raw = load_raw_data(os.path.join(data_dir, "test_tokens.txt"), with_tags=False)

# The vocabulary is learned from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit(train_raw['text'])

In [43]:
# Sanity check: encode a sample sentence and pad the result to 10 ids.
newtext = 'how are you doing today ?'
tokenizer.encode(newtext, max_length=10)


[803, 57, 256, 1313, 755, 1813, 0, 0, 0, 0]

In [44]:
# upload the dataset
# for Google Colab, use this
#from google.colab import files
#uploaded = files.upload()

In [45]:
class NERDataset:
    """Map-style dataset of encoded sentences, optionally with encoded tags.

    Tag index 0 is reserved for padding, so the real tags start at 1.
    """
    tag2idx = {'O': 1, 'B-PER': 2, 'I-PER': 3, 'B-ORG': 4, 'I-ORG': 5, 'B-LOC': 6, 'I-LOC': 7, 'B-MISC': 8, 'I-MISC': 9}
    idx2tag = ['<pad>', 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG','B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

    def __init__(self, raw_data: Dict[str, List[str]], tokenizer: Tokenizer, max_length: int = 128):
        """Encode every sentence (and every tag line, if present) up front."""
        self.tokenizer = tokenizer
        self.token_ids = [
            tokenizer.encode(sentence, max_length=max_length)
            for sentence in raw_data['text']
        ]
        # Tags are only available for the training and validation splits.
        self.with_tags = 'tags' in raw_data
        self.tag_ids = []
        if self.with_tags:
            self.tag_ids = [
                self.encode_tags(tag_line, max_length=max_length)
                for tag_line in raw_data['tags']
            ]

    def encode_tags(self, tags: str, max_length: Optional[int] = None):
        """Turn a whitespace-separated tag string into a list of tag ids.

        With ``max_length`` set, the list is cut to that length or right-padded
        with 0 (the padding tag id) until it reaches it.
        """
        encoded = [self.tag2idx[label] for label in tags.split()]
        if max_length is None:
            return encoded
        shortfall = max_length - len(encoded)
        if shortfall < 0:
            # longer than max_length: keep only the leading part
            return encoded[:max_length]
        return encoded + [0] * shortfall  # 0 as padding for tags

    def __len__(self):
        """Number of sentences in the split."""
        return len(self.token_ids)

    def __getitem__(self, idx):
        """Return ``(token_ids, padding_mask[, tag_ids])`` for one sentence."""
        ids = torch.LongTensor(self.token_ids[idx])
        # True wherever the position holds the padding token
        pad_mask = ids == self.tokenizer.pad_id
        if not self.with_tags:
            # for testing
            return ids, pad_mask
        # for training and validation
        return ids, pad_mask, torch.LongTensor(self.tag_ids[idx])
        

In [46]:
# Wrap each raw split in a dataset that shares the fitted tokenizer.
tr_data, va_data, te_data = (
    NERDataset(raw_split, tokenizer)
    for raw_split in (train_raw, val_raw, test_raw)
)

In [47]:
# Spot-check one training example: the dataset size, the raw sentence, and the
# encoded item that NERDataset.__getitem__ returns for the same index.
samplenum = 265
print(len(tr_data))
print(train_raw['text'][samplenum])
print(tr_data[samplenum])


14041
The Greek socialist party 's executive bureau gave Prime Minister Costas Simitis its backing if he chooses to call snap elections , its general secretary Costas Skandalidis told reporters on Thursday .
(tensor([    2,  1638,  1466,   147,    15,   996,  1877,   407,   229,   103,
         2415,  2672,    63,  2698,   141,    26, 11142,     7,   629,  2673,
          269,     4,    63,   335,   750,  2415,  4652,    90,   524,    13,
           70,     3,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
           

In [48]:
class TransformerModel(nn.Module):
    """Token classifier: embedding -> Transformer encoder -> per-token linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding width, also the model width of the encoder.
        num_heads: attention heads per encoder layer.
        hidden_size: feed-forward width inside each encoder layer.
        num_layers: number of stacked encoder layers.
        num_classes: size of the output layer (default 10, the value that used
            to be hard-coded).
        dropout: dropout rate of the encoder layers (default 0.1, as before).

    NOTE(review): forward() feeds the raw embeddings to the encoder without any
    positional encoding, so word order is not visible to the model — confirm
    whether that is intended.
    """

    def __init__(self, vocab_size, embed_size, num_heads, hidden_size, num_layers,
                 num_classes=10, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        encoder_layer = nn.TransformerEncoderLayer(embed_size, num_heads, hidden_size, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        # Not used by forward(); kept so the module's parameters and
        # state_dict keys stay exactly as they were.
        self.layer_norm = nn.LayerNorm(embed_size)
        self.linear = nn.Linear(embed_size, num_classes)

    def forward(self, src, src_mask):
        """Compute per-token class scores.

        Args:
            src: token ids, shape (batch_size, max_length).
            src_mask: boolean mask of the same shape, True at padding positions.

        Returns:
            Scores of shape (batch_size, max_length, num_classes).
        """
        embedded = self.embedding(src)  # shape: (batch_size, max_length, embed_size)
        # The encoder is built with the default sequence-first layout, hence
        # the transpose on the way in and on the way out.
        encoded = self.transformer_encoder(
            embedded.transpose(0, 1), src_key_padding_mask=src_mask
        ).transpose(0, 1)  # shape: (batch_size, max_length, embed_size)
        output = self.linear(encoded)  # shape: (batch_size, max_length, num_classes)
        return output

In [49]:
def validate(
    model: nn.Module, 
    dataloader: DataLoader, 
    device: torch.device,
    epoch: Optional[int] = None,
    num_classes: int = 10,
):
    """Evaluate the model on a labelled dataloader and print loss and accuracy.

    Args:
        model: network called as ``model(input_ids, input_mask)``.
        dataloader: yields ``(input_ids, input_mask, tags)`` batches.
        device: device the batches and metrics are moved to.
        epoch: zero-based epoch index used in the log line. When omitted, the
            module-level ``epoch`` variable is used if one exists (this is what
            the function used to rely on); otherwise "?" is printed.
        num_classes: number of tag classes, padding class included.

    Returns:
        The token-weighted mean validation loss.
    """
    if epoch is None:
        # Backward compatibility for callers that do not pass the epoch: fall
        # back to the global that was previously read implicitly.
        epoch = globals().get("epoch")
    epoch_label = "?" if epoch is None else epoch + 1

    # The old `compute_on_step=False` keyword is dropped: only update() and
    # compute() are used here, which that flag did not influence.
    # NOTE(review): newer torchmetrics releases reportedly reject the keyword —
    # confirm against the installed version.
    acc_metric = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes).to(device)
    loss_metric = torchmetrics.MeanMetric().to(device)
    model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            # output shape: (batch_size, max_length, num_classes)
            logits = model(input_ids, input_mask)
            # ignore padding index 0 when calculating loss
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), tags.reshape(-1), ignore_index=0)

            # weight the batch loss by its number of non-padding tokens
            loss_metric.update(loss, input_mask.numel() - input_mask.sum())
            is_active = torch.logical_not(input_mask)  # non-padding elements
            # only consider non-padded tokens when calculating accuracy
            acc_metric.update(logits[is_active], tags[is_active])

    val_loss = loss_metric.compute()
    print(f"| Epoch {epoch_label} | Validate | loss {val_loss:.4f} | acc {acc_metric.compute():.4f} |!!!!!!!!!!!!!!!!!!!!!!!!!")

    return val_loss



In [50]:
#modify as required
def train(
    model: nn.Module, 
    dataloader: DataLoader, 
    optimizer: optim.Optimizer,
    device: torch.device,
    epoch: int,
    num_classes: int = 10,
):
    """Run one training epoch and print the mean loss and token accuracy.

    Args:
        model: network called as ``model(input_ids, input_mask)``.
        dataloader: yields ``(input_ids, input_mask, tags)`` batches.
        optimizer: optimizer stepping the model's parameters.
        device: device the batches and metrics are moved to.
        epoch: zero-based epoch index, used only in the log line.
        num_classes: number of tag classes, padding class included.
    """
    # The old `compute_on_step=False` keyword is dropped: only update() and
    # compute() are used here, which that flag did not influence.
    # NOTE(review): newer torchmetrics releases reportedly reject the keyword —
    # confirm against the installed version.
    acc_metric = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes).to(device)
    loss_metric = torchmetrics.MeanMetric().to(device)
    model.train()

    # loop through all batches in the training
    for batch in tqdm(dataloader):
        input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()
        # output shape: (batch_size, max_length, num_classes)
        logits = model(input_ids, input_mask)
        # ignore padding index 0 when calculating loss
        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), tags.reshape(-1), ignore_index=0)

        loss.backward()
        optimizer.step()

        # The metrics only need values, so hand them detached tensors.
        # The batch loss is weighted by its number of non-padding tokens.
        loss_metric.update(loss.detach(), input_mask.numel() - input_mask.sum())
        is_active = torch.logical_not(input_mask)  # non-padding elements
        # only consider non-padded tokens when calculating accuracy
        acc_metric.update(logits.detach()[is_active], tags[is_active])

    print(f"| Epoch {epoch+1} | Train | loss {loss_metric.compute():.4f} | acc {acc_metric.compute():.4f} |")


In [51]:
def predict(model: nn.Module, dataloader: DataLoader, device: torch.device) -> List[List[str]]:
    """Predict a tag sequence for every sentence served by ``dataloader``.

    Args:
        model: network called as ``model(input_ids, src_mask=...)``.
        dataloader: yields batches whose first two items are the token ids and
            the padding mask (True at padding positions).
        device: device the batches are moved to.

    Returns:
        One list of tag strings per sentence, as long as the sentence's number
        of non-padding tokens. Sentences that were truncated when encoded
        therefore get fewer tags than they have words.
    """
    model.eval()
    preds = []

    idx2tag = ['<pad>', 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, src_mask = batch[0].to(device), batch[1].to(device)
            logits = model(input_ids, src_mask=src_mask)
            # Class 0 is the padding label and never a valid tag for a real
            # token, so it is left out of the argmax (+1 restores the indices).
            batch_preds = (logits[..., 1:].argmax(dim=-1) + 1).tolist()
            # True length of each sentence = number of non-padding positions.
            # Counting them directly works for any padded width, not just 128.
            seq_lens = torch.logical_not(src_mask).sum(dim=1).tolist()
            for tags, seq_len in zip(batch_preds, seq_lens):
                # Convert the tag indices of the unpadded part to tag labels
                preds.append([idx2tag[idx] for idx in tags[:seq_len]])

    return preds

In [52]:
def init_weights(module):
    """Xavier-uniform initialise the weight of linear and embedding layers.

    Intended for ``model.apply``; every other module type is left untouched.
    """
    if not isinstance(module, (nn.Linear, nn.Embedding)):
        return
    nn.init.xavier_uniform_(module.weight)
#
# def init_weights(module):
#     if isinstance(module, (nn.Linear, nn.Embedding)):
#         nn.init.kaiming_uniform_(module.weight)

In [53]:
#simple trainer
# Seed torch's global RNG up front.
torch.manual_seed(321)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

#hyperparameters
BATCH_SIZE = 32
EMBED_SIZE = 256
NUM_HEADS = 4
HIDDEN_SIZE = 256  # forwarded as hidden_size, i.e. the encoder layers' feed-forward width
NUM_LAYERS = 3
LEARNING_RATE = 0.000456008621520148  # NOTE(review): presumably found by a hyperparameter search — confirm
EPOCHS = 7

# data loaders (only the training split is shuffled)
train_dataloader = DataLoader(tr_data, batch_size = BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(va_data, batch_size = BATCH_SIZE)
test_dataloader = DataLoader(te_data, batch_size = BATCH_SIZE)

# build the model and move it to device
model = TransformerModel(vocab_size = len(tokenizer),
    embed_size = EMBED_SIZE,
    num_heads = NUM_HEADS,
    hidden_size = HIDDEN_SIZE,
    num_layers = NUM_LAYERS,).to(device)

optimizer = optim.Adam(model.parameters(), lr= LEARNING_RATE)
# Re-initialise every nn.Linear / nn.Embedding weight with Xavier-uniform.
# The initialisation is in place, so the optimizer created above still refers
# to the same parameter tensors.
model.apply(init_weights)

# validate() is called without an epoch here; its log line picks up this loop
# variable `epoch` from module scope.
for epoch in range(EPOCHS):
    train(model, train_dataloader, optimizer, device, epoch)
    validate(model, val_dataloader, device)


# Score the validation predictions with conlleval.
prediction = predict(model, val_dataloader, device)

# evaluate() receives two flat tag streams that must line up token by token.
# An extra 'O' is appended after every sentence as a separator.
pred_tags = []
true_tags = []
for gold_line, sentence_pred in zip(val_raw['tags'], prediction):
    gold = gold_line.strip().split()
    # predict() returns fewer tags than words for a sentence that was truncated
    # when it was encoded. Pad those with 'O' (and never exceed the gold length)
    # so that one short sentence cannot shift every following token out of
    # alignment with its gold tag.
    missing = len(gold) - len(sentence_pred)
    aligned_pred = sentence_pred[:len(gold)] + ['O'] * max(missing, 0)

    true_tags.extend(gold)
    true_tags.append('O')
    pred_tags.extend(aligned_pred)
    pred_tags.append('O')

evaluate(true_tags, pred_tags)

cuda


100%|██████████| 439/439 [00:08<00:00, 48.89it/s]


| Epoch 1 | Train | loss 0.3059 | acc 0.9179 |


100%|██████████| 102/102 [00:00<00:00, 146.99it/s]


| Epoch 1 | Validate | loss 0.2547 | acc 0.9320 |!!!!!!!!!!!!!!!!!!!!!!!!!


100%|██████████| 439/439 [00:08<00:00, 50.55it/s]


| Epoch 2 | Train | loss 0.0954 | acc 0.9696 |


100%|██████████| 102/102 [00:00<00:00, 150.86it/s]


| Epoch 2 | Validate | loss 0.2584 | acc 0.9371 |!!!!!!!!!!!!!!!!!!!!!!!!!


100%|██████████| 439/439 [00:08<00:00, 50.96it/s]


| Epoch 3 | Train | loss 0.0578 | acc 0.9803 |


100%|██████████| 102/102 [00:00<00:00, 145.10it/s]


| Epoch 3 | Validate | loss 0.2995 | acc 0.9375 |!!!!!!!!!!!!!!!!!!!!!!!!!


100%|██████████| 439/439 [00:08<00:00, 51.06it/s]


| Epoch 4 | Train | loss 0.0435 | acc 0.9853 |


100%|██████████| 102/102 [00:00<00:00, 146.14it/s]


| Epoch 4 | Validate | loss 0.3157 | acc 0.9380 |!!!!!!!!!!!!!!!!!!!!!!!!!


100%|██████████| 439/439 [00:08<00:00, 50.94it/s]


| Epoch 5 | Train | loss 0.0330 | acc 0.9888 |


100%|██████████| 102/102 [00:00<00:00, 150.84it/s]


| Epoch 5 | Validate | loss 0.3041 | acc 0.9371 |!!!!!!!!!!!!!!!!!!!!!!!!!


100%|██████████| 439/439 [00:08<00:00, 50.23it/s]


| Epoch 6 | Train | loss 0.0273 | acc 0.9908 |


100%|██████████| 102/102 [00:00<00:00, 153.94it/s]


| Epoch 6 | Validate | loss 0.3608 | acc 0.9374 |!!!!!!!!!!!!!!!!!!!!!!!!!


100%|██████████| 439/439 [00:08<00:00, 51.60it/s]


| Epoch 7 | Train | loss 0.0232 | acc 0.9923 |


100%|██████████| 102/102 [00:00<00:00, 150.43it/s]


| Epoch 7 | Validate | loss 0.3923 | acc 0.9373 |!!!!!!!!!!!!!!!!!!!!!!!!!


100%|██████████| 102/102 [00:01<00:00, 100.34it/s]


processed 54612 tokens with 5942 phrases; found: 6002 phrases; correct: 4274.
accuracy:  69.67%; (non-O)
accuracy:  94.10%; precision:  71.21%; recall:  71.93%; FB1:  71.57
              LOC: precision:  85.16%; recall:  83.12%; FB1:  84.13  1793
             MISC: precision:  74.03%; recall:  74.51%; FB1:  74.27  928
              ORG: precision:  62.11%; recall:  63.68%; FB1:  62.89  1375
              PER: precision:  63.27%; recall:  65.47%; FB1:  64.35  1906


(71.20959680106631, 71.9286435543588, 71.56731413261889)

In [54]:
# Compare the gold tags with the model's predictions for one validation sentence.
samplenum = 2157

print(val_raw['text'][samplenum])
print(val_raw['tags'][samplenum])
print(prediction[samplenum])

9. Mauricio Gugelmin ( Brazil ) , Reynard Ford Cosworth , 54.762
O B-PER I-PER O B-LOC O O B-ORG I-ORG I-ORG O O
['O', 'B-PER', 'I-PER', 'O', 'B-LOC', 'O', 'O', 'I-PER', 'B-MISC', 'I-PER', 'O', 'I-PER']


In [55]:
# YOU SHOULD NOT CHANGE THIS CODEBLOCK
# make prediction on the test set and save to submission.txt
# (one line per test sentence; the predicted tags are joined by single spaces)
preds = predict(model, test_dataloader, device)
with open("submission.txt", "w") as f:
    for tags in preds:
        f.write(" ".join(tags) + "\n")

100%|██████████| 108/108 [00:00<00:00, 127.46it/s]


In [56]:
pwd

'D:\\UCSB\\CS190I\\mp2'

In [57]:
ls

 Volume in drive D is 新加卷
 Volume Serial Number is 76AF-40CE

 Directory of D:\UCSB\CS190I\mp2

03/17/2023  10:37 PM    <DIR>          .
03/15/2023  03:49 PM    <DIR>          ..
03/17/2023  10:35 PM    <DIR>          .idea
03/17/2023  09:35 PM    <DIR>          __pycache__
03/17/2023  09:33 PM           270,227 autobert.ipynb
03/16/2023  07:43 PM           160,863 autotune.ipynb
03/16/2023  12:55 AM            51,492 bert.ipynb
03/17/2023  09:35 PM             7,502 conlleval.py
03/17/2023  09:54 PM    <DIR>          flagged
03/17/2023  10:08 PM         1,929,345 full.ipynb
03/17/2023  10:18 PM             2,848 stupid methos.ipynb
03/17/2023  10:37 PM           125,188 submission.txt
03/17/2023  10:37 PM            29,352 submission_template.ipynb
03/17/2023  09:33 PM            45,992 template.ipynb
03/17/2023  08:47 PM            10,532 test.py
03/10/2023  10:27 PM           246,547 test_tokens.txt
03/10/2023  10:27 PM         1,668,107 train.csv
03/17/2023  10:18 PM           129,