<a href="https://colab.research.google.com/github/Koowater/goorm-Magicians/blob/main/edit_distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 편집거리 함수 정의

In [None]:
def edit(s1, s2, debug=False):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    Args:
        s1: First sequence (e.g. a string).
        s2: Second sequence (e.g. a string).
        debug: When true, print each completed table row (without its
            leading boundary cell) as it is computed.

    Returns:
        The edit distance as a non-negative integer.
    """
    # Iterate over the longer sequence in the outer loop so that each stored
    # row is only as wide as the shorter sequence.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    # Against an empty sequence the distance is just the other one's length.
    if not len(shorter):
        return len(longer)

    above = list(range(len(shorter) + 1))
    for row_index, item_long in enumerate(longer, start=1):
        # First cell: cost of deleting the first `row_index` items of `longer`.
        row = [row_index]
        for col, item_short in enumerate(shorter):
            via_insert = above[col + 1] + 1
            via_delete = row[col] + 1
            via_substitute = above[col] + (item_long != item_short)
            row.append(min(via_insert, via_delete, via_substitute))

        if debug:
            print(row[1:])

        above = row

    return above[-1]

### 학습할 때 loss와 함께 편집거리 계산

In [None]:
import os
from statistics import mean

import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from tqdm.notebook import tqdm

# Fine-tune the QA model for `train_epoch` epochs. After each epoch, report the
# dev loss and the mean edit distance between predicted and gold answer
# spans, then save a checkpoint under dump/.
# Relies on names defined elsewhere in the notebook: model, tokenizer,
# optimizer, accumulation, train_loader, dev_loader, edit, torch.
os.makedirs('dump', exist_ok=True)
train_epoch = 2  # number of training epochs
train_losses = []
dev_losses = []
dev_edit_ds = []

# Counts processed training batches across all epochs; used to decide when an
# accumulation window is complete.
step = 0

for epoch in range(train_epoch):
    print("Epoch", epoch)
    # Training
    running_loss = 0.
    losses = []
    progress_bar = tqdm(train_loader, desc='Train')
    for batch in progress_bar:
        # Remove the fields that are not passed to the model.
        # (Original note: Dongyoung's version of the code also lists
        # batch["offsets"] here.)
        del batch['guid'], batch['context'], batch['question'], batch['answer']
        batch = {key: value.cuda() for key, value in batch.items()}
        start = batch.pop('start')
        end = batch.pop('end')

        start_logits, end_logits = model(**batch, return_dict=False)
        loss = F.cross_entropy(start_logits, start) + F.cross_entropy(end_logits, end)
        # Scale so that the accumulated gradient is the mean over the window.
        (loss / accumulation).backward()
        running_loss += loss.item()
        del batch, start, end, start_logits, end_logits, loss

        step += 1
        # Only update the weights once every `accumulation` batches.
        # NOTE(review): `step` is not reset per epoch, so a partial window at
        # the end of an epoch carries its gradients into the next one --
        # confirm this is intended.
        if step % accumulation:
            continue

        clip_grad_norm_(model.parameters(), max_norm=1.)
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        losses.append(running_loss / accumulation)
        running_loss = 0.
        progress_bar.set_description(f"train_loss: {losses[-1]:.3f}")
    train_losses.append(mean(losses))
    # wandb.log({'train_loss': mean(train_losses)})  # log to wandb
    print(f"train score: {train_losses[-1]:.3f}")

    # Evaluation
    # Evaluate in eval mode so the dev metrics are deterministic, then put the
    # model back in whatever mode it was in so training is unaffected.
    was_training = model.training
    model.eval()
    losses = []
    edit_ds = []
    # Original note: each batch is a set of 16 samples when batch_size=64 and
    # accumulation=4, i.e. len(batch["question"]) = 64 / 4 = 16.
    for batch in tqdm(dev_loader, desc="evaluation"):
        # Per-sample pass: predict a span and compare its decoded text with
        # the decoded gold span.
        for i, (question, context) in enumerate(zip(batch['question'], batch['context'])):
            input_ids, token_type_ids = [
                torch.tensor(batch[key][i], dtype=torch.long, device='cuda')
                for key in ("input_ids", "token_type_ids")
            ]
            with torch.no_grad():
                start_logits, end_logits = model(
                    input_ids=input_ids[None, :],
                    token_type_ids=token_type_ids[None, :],
                    return_dict=False,
                )

            start = torch.argmax(start_logits.squeeze(0)).item()
            end = torch.argmax(end_logits.squeeze(0)).item()

            # Tokenize once; both the gold and the predicted span are slices
            # of the same token id sequence.
            token_ids = tokenizer(question + '[SEP]' + context)["input_ids"]
            y_true = tokenizer.decode(token_ids[batch['start'][i]:batch['end'][i] + 1])
            if end - start > 10:
                # Predicted spans longer than this are treated as "no answer".
                y_pred = ""
            else:
                y_pred = tokenizer.decode(token_ids[start:end + 1])
            edit_ds.append(edit(y_pred, y_true))

        # Batched pass: compute the dev loss for the whole batch.
        # (Original note: Dongyoung's version of the code also lists
        # batch["offsets"] here.)
        del batch['guid'], batch['context'], batch['question'], batch['answer']
        batch = {key: value.cuda() for key, value in batch.items()}
        s = batch.pop('start')
        e = batch.pop('end')

        with torch.no_grad():
            start_logits, end_logits = model(**batch, return_dict=False)
            loss = F.cross_entropy(start_logits, s) + F.cross_entropy(end_logits, e)
        losses.append(loss.item())

        del batch, start_logits, end_logits, loss, s, e

    model.train(was_training)

    dev_losses.append(mean(losses))
    dev_edit_ds.append(mean(edit_ds))
    print(f"Evaluation score: {dev_losses[-1]:.3f}")
    print(f"dev_edit_distance: {dev_edit_ds[-1]:.3f}")
    # Log to wandb
    # wandb.log({'dev_loss': mean(dev_losses), 'dev_edit_distance': mean(dev_edit_ds)})
    model.save_pretrained(f'dump/model.{epoch}')

### 학습시킨 후에 dev dataset에 대해서만 편집거리 계산

In [None]:
# Compute the mean edit distance between predicted and gold answer spans over
# the dev dataset, using the already-trained model.
# Relies on names defined elsewhere in the notebook: model, tokenizer,
# dev_dataset, edit, mean, torch.
edit_ds = []

# Evaluate in eval mode so the result is deterministic, then restore the mode
# the model was in.
was_training = model.training
model.eval()

# zip with range(len(...)) bounds the iteration to len(dev_dataset) samples;
# kept from the original in case dev_dataset does not stop iterating on its own.
for _, sample in zip(range(len(dev_dataset)), dev_dataset):
    input_ids, token_type_ids = [
        torch.tensor(sample[key], dtype=torch.long, device="cuda")
        for key in ("input_ids", "token_type_ids")
    ]

    with torch.no_grad():
        start_logits, end_logits = model(
            input_ids=input_ids[None, :],
            token_type_ids=token_type_ids[None, :],
            return_dict=False,
        )

    start = torch.argmax(start_logits.squeeze(0)).item()
    end = torch.argmax(end_logits.squeeze(0)).item()

    # Tokenize once; both the gold and the predicted span are slices of the
    # same token id sequence.
    token_ids = tokenizer(sample['question'] + '[SEP]' + sample['context'])['input_ids']
    y_true = tokenizer.decode(token_ids[sample['start']:sample['end'] + 1])
    if end - start > 10:
        # Predicted spans longer than this are treated as "no answer".
        y_pred = ''
    else:
        y_pred = tokenizer.decode(token_ids[start:end + 1])

    edit_ds.append(edit(y_pred, y_true))

model.train(was_training)

# Computed once after the loop instead of on every iteration.
dev_edit_distance = mean(edit_ds)

print(f"dev_edit_distance: {dev_edit_distance:.3f}")