In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the dataset cell
# further down changes into a directory below this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): versions are not pinned — consider pinning them for reproducible runs.
%pip install transformers plotly evaluate sacrebleu

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub before any checkpoint is requested.
login()

In [None]:
# Checkpoint identifiers and revisions for the two models under test.
# The *_CKPT values are passed to `from_pretrained`, the *_COMMIT values as `revision`.
# Both checkpoints are empty placeholders and must be filled in before loading.
CAUSAL_CKPT, CAUSAL_COMMIT = "", None
SEQ2SEQ_CKPT, SEQ2SEQ_COMMIT = "", None

In [None]:
import torch

# Choose the compute device once: CUDA if available, otherwise Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'

In [None]:
import json

%cd "/content/drive/MyDrive/02 ColabDir/na-lllm finetune-bart"

with open('./inqueries_aug_1_0.json', encoding='utf-8') as f:
    sample = json.load(f)['data']['test']

_join = lambda l: '\n'.join(l)
causalSample = list(map(lambda e: (f"<{e['organization']}> {e['title']}\n{_join(e['question'])}\n----답변----\n",
                                   _join(e['answer'])
                                  ), sample))
seqSample = list(map(lambda e: (f"<{e['organization']}> {e['title']}\n{_join(e['question'])}", _join(e['answer'])), sample))
seqSample[:10], causalSample[:10]

## Testing for the Causal Model

In [None]:
# result

# Generation settings for the causal model.
# NOTE(review): presumably the settings chosen after the experiments below — confirm.
config = dict(
    max_new_tokens=128,
    early_stopping=True,
    do_sample=False,
    num_beams=2,
    num_beam_groups=1,
    use_cache=False,
    temperature=1.0,
    top_k=10,
    # top_p: if set to a float < 1, only the smallest set of most probable tokens
    # whose probabilities add up to top_p or higher are kept for generation.
    # (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.top_p)
    top_p=1.0,
    diversity_penalty=0.0,
    repetition_penalty=1.2,
    length_penalty=1.0,
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal LM at the configured revision and move it to DEVICE;
# the tokenizer is loaded from the same checkpoint and revision.
causalModel = AutoModelForCausalLM.from_pretrained(CAUSAL_CKPT, revision=CAUSAL_COMMIT)
causalModel = causalModel.to(DEVICE)
causalTokenizer = AutoTokenizer.from_pretrained(CAUSAL_CKPT, revision=CAUSAL_COMMIT)

### Surveying the HuggingFace implementation of `generate()`

In [None]:
%%time
x = "<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?\n----답변----\n"
x_ids = causalTokenizer([x], padding=True, return_tensors='pt').input_ids.to(DEVICE)
x_mask = causalTokenizer([x], padding=True, return_tensors='pt').attention_mask.to(DEVICE)
x = causalModel.generate(inputs=x_ids,
                      attention_mask=x_mask,
                      pad_token_id=causalTokenizer.eos_token_id,
                      max_new_tokens=512,
                      do_sample=False,
                      num_beams=1,
                      # greedy search
                      )[0].tolist()
x = causalTokenizer.decode(x)
x

In [None]:
# Beam search with the causal model: ten beams, no sampling.
x = "<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?\n----답변----\n"
# Tokenize once and take both the ids and the attention mask from the same encoding.
x_enc = causalTokenizer([x], padding=True, return_tensors='pt')
x_ids = x_enc.input_ids.to(DEVICE)
x_mask = x_enc.attention_mask.to(DEVICE)
x = causalModel.generate(inputs=x_ids,
                      attention_mask=x_mask,
                      pad_token_id=causalTokenizer.eos_token_id,
                      max_new_tokens=512,
                      do_sample=False,
                      num_beams=10,
                      # beam search
                      # beam search performance diverges at either beam size = 10
                      # or beam size = 5
                      # Refer: Hargreaves et al. (2021)
                      #     https://doi.org/10.18653/v1/2021.eacl-main.219.
                      )[0].tolist()
x = causalTokenizer.decode(x)
x

In [None]:
# Beam-search multinomial sampling with the causal model.
# Seed the torch RNG so the sampled output is the same on every run of this cell.
torch.manual_seed(42)
x = "<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?\n----답변----\n"
# Tokenize once and take both the ids and the attention mask from the same encoding.
x_enc = causalTokenizer([x], padding=True, return_tensors='pt')
x_ids = x_enc.input_ids.to(DEVICE)
x_mask = x_enc.attention_mask.to(DEVICE)
x = causalModel.generate(inputs=x_ids,
                      attention_mask=x_mask,
                      pad_token_id=causalTokenizer.eos_token_id,
                      max_new_tokens=512,
                      do_sample=True,
                      num_beams=9,
                      # beam search multinomial
                      )[0].tolist()
x = causalTokenizer.decode(x)
x

In [None]:
# Diverse (group) beam search with the causal model.
x = "<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?\n----답변----\n"
# Tokenize once and take both the ids and the attention mask from the same encoding.
x_enc = causalTokenizer([x], padding=True, return_tensors='pt')
x_ids = x_enc.input_ids.to(DEVICE)
x_mask = x_enc.attention_mask.to(DEVICE)
x = causalModel.generate(inputs=x_ids,
                      attention_mask=x_mask,
                      pad_token_id=causalTokenizer.eos_token_id,
                      max_new_tokens=512,
                      do_sample=False,
                      num_beams=10,
                      # should be n*(num_beam_groups) where n is an integer
                      num_beam_groups=2,
                      # Group beam search needs a positive diversity penalty;
                      # with 0.0 every group produces the same beams.
                      diversity_penalty=1.0,
                      # diverse beam search
                      )[0].tolist()
x = causalTokenizer.decode(x)
x

In [None]:
# Contrastive search with the causal model.
x = "<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?\n----답변----\n"
# Tokenize once and take both the ids and the attention mask from the same encoding.
x_enc = causalTokenizer([x], padding=True, return_tensors='pt')
x_ids = x_enc.input_ids.to(DEVICE)
x_mask = x_enc.attention_mask.to(DEVICE)
x = causalModel.generate(inputs=x_ids,
                      attention_mask=x_mask,
                      pad_token_id=causalTokenizer.eos_token_id,
                      max_new_tokens=512,
                      penalty_alpha=0.6, # [0, 1]
                                         # greater means prioritize unique generations
                                         # lesser means prioritize model confidence
                      top_k=8, # typically [3,10]
                                # determines how many candidates to make
                      # contrastive search
                      # refer: Su et al. (2022)
                      # https://arxiv.org/pdf/2202.06417
                      repetition_penalty=1.1,
                      )[0].tolist()
x = causalTokenizer.decode(x, skip_special_tokens=True,
                        clean_up_tokenization_spaces=True)
print(x)

Overall, **beam search** with a proper repetition penalty gives the most plausible results.

### Running experiments

In [None]:
def generate(txts, model, tokenizer, args):
    """Generate text for a batch of prompts and decode the results.

    Args:
        txts: list of prompt strings. An empty list returns ``[]`` without
            touching the tokenizer or the model.
        model: object exposing ``generate(...)`` and a ``device`` attribute
            (e.g. a transformers model).
        tokenizer: callable tokenizer matching ``model``; it must be able to
            pad a batch and provide ``decode``.
        args: dict holding the generation settings read below
            (``max_new_tokens``, ``early_stopping``, ``do_sample``,
            ``num_beams``, ``num_beam_groups``, ``use_cache``,
            ``temperature``, ``top_k``, ``top_p``, ``diversity_penalty``,
            ``repetition_penalty``, ``length_penalty``).

    Returns:
        list of str: one decoded sequence per prompt, with special tokens
        (padding / end-of-sequence markers) removed.

    Raises:
        KeyError: if ``args`` lacks one of the settings listed above.
    """
    if not txts:
        return []

    # Tokenize once; ids and attention mask come from the same encoding.
    encoded = tokenizer(txts, return_tensors='pt', padding='longest')
    # Send the inputs to wherever the model lives instead of relying on a notebook global.
    device = model.device
    sequences = model.generate(
            inputs=encoded.input_ids.to(device),
            attention_mask=encoded.attention_mask.to(device),
            max_new_tokens=args['max_new_tokens'],
            early_stopping=args['early_stopping'],
            do_sample=args['do_sample'],
            num_beams=args['num_beams'],
            num_beam_groups=args['num_beam_groups'],
            use_cache=args['use_cache'],
            temperature=args['temperature'],
            top_k=args['top_k'],
            top_p=args['top_p'],
            diversity_penalty=args['diversity_penalty'],
            repetition_penalty=args['repetition_penalty'],
            length_penalty=args['length_penalty'],
        ).tolist()
    # Release cached GPU memory between batches.
    torch.cuda.empty_cache()
    # Strip pad/EOS markers so they do not end up in the text that gets scored.
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in sequences]

# Baseline generation settings for the causal-model experiments.
defaultconfig = dict(
    max_new_tokens=100,
    early_stopping=True,
    do_sample=False,
    num_beams=1,
    num_beam_groups=1,
    use_cache=False,
    temperature=1.0,
    top_k=50,
    # top_p: if set to a float < 1, only the smallest set of most probable tokens
    # whose probabilities add up to top_p or higher are kept for generation.
    # (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.top_p)
    top_p=1.0,
    diversity_penalty=0.0,
    repetition_penalty=2.5,
    length_penalty=1.0,
)

In [None]:
# Accumulators shared by the experiment cells: each cell appends one
# (label, data) tuple to each list.
generations = []
elapseds = []

In [None]:
# Sample count divided by 4 (the experiment loops below slice the prompts in groups of 4).
len(causalSample)/4

In [None]:
# Release cached CUDA memory before the experiment cells run.
torch.cuda.empty_cache()

In [None]:
# generation time per max_length

import plotly.express as px

from datetime import datetime

elapsed = list()
generated = list()

max_lengths = range(64, 513, 64)
batch_size = 4  # prompts per generate() call

torch.cuda.empty_cache()

# Pad on the left and reuse the EOS token as the padding token.
causalTokenizer.padding_side='left'
causalTokenizer.pad_token_id = causalTokenizer.eos_token_id

for i in max_lengths:
    print(i)
    # Work on a copy so the sweep never mutates the shared defaults.
    config = {**defaultconfig, 'max_new_tokens': i}
    gens = list()
    # Step through the samples by batch start index; unlike
    # range(int(len/4) + 1) this never yields an empty final batch.
    for start in range(0, len(causalSample), batch_size):
        batch = [s[0] for s in causalSample[start : start + batch_size]]
        since = datetime.now()
        gens.extend(generate(batch, causalModel, causalTokenizer, config))
        # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
        elapsed.append({"max_new_tokens": i, "elapsed": (datetime.now()-since).total_seconds()})
        torch.cuda.empty_cache()
    # One (max_new_tokens, generations) tuple per setting, like the other experiment cells.
    generated.append((i, gens))

generations.append(('max_new_tokens>time', generated))
elapseds.append(('max_new_tokens>time', elapsed))

px.line(elapsed, x='max_new_tokens', y='elapsed')

In [None]:
# generation performance per max_length

import evaluate

import plotly.express as px

from datetime import datetime

metric = evaluate.load('sacrebleu')

elapsed = list()
generated = list()

max_lengths = range(64, 513, 64)
batch_size = 4  # prompts per generate() call

torch.cuda.empty_cache()

# Pad on the left and reuse the EOS token as the padding token.
causalTokenizer.padding_side='left'
causalTokenizer.pad_token_id = causalTokenizer.eos_token_id

for i in max_lengths:
    print(i)
    # Work on a copy so the sweep never mutates the shared defaults.
    config = {**defaultconfig, 'max_new_tokens': i}
    gens = list()
    # Step through the samples by batch start index; unlike
    # range(int(len/4) + 1) this never yields an empty final batch.
    for start in range(0, len(causalSample), batch_size):
        prompts = [s[0] for s in causalSample[start : start + batch_size]]
        references = [[s[1]] for s in seqSample[start : start + batch_size]]
        since = datetime.now()
        gen = generate(prompts, causalModel, causalTokenizer, config)
        # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
        seconds = (datetime.now()-since).total_seconds()
        # Score only the text after the answer delimiter (the output contains the prompt too).
        score = metric.compute(
            predictions = [g.split('\n----답변----\n')[-1] for g in gen],
            references = references,
            tokenize='char'
        )['score']
        elapsed.append({"max_new_tokens": i, "elapsed": seconds, "score": score})
        gens.extend(gen)
        torch.cuda.empty_cache()
    generated.append((i, gens))

generations.append(('max_new_tokens>bleu', generated))
elapseds.append(('max_new_tokens>bleu', elapsed))

px.line(elapsed, x='max_new_tokens', y='score')

In [None]:
# generation performance per num_beams

from datetime import datetime

import evaluate
import plotly.express as px

from tqdm import tqdm

VAR_CONTROL = 'num_beams'
VAR_DEPEND = 'sacrebleu'

metric = evaluate.load('sacrebleu')

elapsed = list()
generated = list()

num_beams_list = range(1, 11, 1)
batch_size = 4  # prompts per generate() call

for i in tqdm(num_beams_list):
    # Work on a copy so the sweep never mutates the shared defaults.
    config = {**defaultconfig, 'max_new_tokens': 128, VAR_CONTROL: i}
    gens = list()
    # Step through the samples by batch start index; unlike
    # range(int(len/4) + 1) this never yields an empty final batch.
    for start in range(0, len(causalSample), batch_size):
        prompts = [s[0] for s in causalSample[start : start + batch_size]]
        references = [[s[1]] for s in seqSample[start : start + batch_size]]
        since = datetime.now()
        gen = generate(prompts, causalModel, causalTokenizer, config)
        # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
        seconds = (datetime.now()-since).total_seconds()
        # Score only the text after the answer delimiter (the output contains the prompt too).
        score = metric.compute(
            predictions = [g.split('\n----답변----\n')[-1] for g in gen],
            references = references,
            tokenize='char'
        )['score']
        # Key each row by VAR_CONTROL so it matches the column px.line() plots below.
        elapsed.append({VAR_CONTROL: i, "elapsed": seconds, "score": score})
        gens.extend(gen)
        torch.cuda.empty_cache()
    generated.append((i, gens))

generations.append((f"{VAR_CONTROL}>{VAR_DEPEND}", generated))
elapseds.append((f"{VAR_CONTROL}>{VAR_DEPEND}", elapsed))

px.line(elapsed, x=VAR_CONTROL, y='score')

## Testing for the Sequence-to-Sequence model

In [None]:
# result

# Generation settings for the sequence-to-sequence model.
# NOTE(review): presumably the settings chosen after the experiments below — confirm.
config = dict(
    max_new_tokens=120,
    early_stopping=True,
    do_sample=False,
    num_beams=1,
    num_beam_groups=1,
    use_cache=False,
    temperature=1.0,
    top_k=10,
    # top_p: if set to a float < 1, only the smallest set of most probable tokens
    # whose probabilities add up to top_p or higher are kept for generation.
    # (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.top_p)
    top_p=1.0,
    diversity_penalty=0.0,
    repetition_penalty=1.2,
    length_penalty=1.0,
    penalty_alpha=0.6,
)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the seq2seq model at the configured revision and move it to DEVICE;
# the tokenizer is loaded from the same checkpoint and revision.
seqModel = AutoModelForSeq2SeqLM.from_pretrained(SEQ2SEQ_CKPT, revision=SEQ2SEQ_COMMIT)
seqModel = seqModel.to(DEVICE)
seqTokenizer = AutoTokenizer.from_pretrained(SEQ2SEQ_CKPT, revision=SEQ2SEQ_COMMIT)

### Surveying the HuggingFace implementation of `generate()`

In [None]:
# Greedy search with the seq2seq model: one beam, no sampling.
x = seqTokenizer(
    ["<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?"],
    padding=True,
    return_tensors='pt',
)
x = seqModel.generate(
    inputs=x.input_ids.to(DEVICE),
    do_sample=False,
    num_beams=1,
)
# Decode the first (and only) sequence of the batch.
x = seqTokenizer.decode(x[0].tolist())
x

In [None]:
# Beam search with the seq2seq model: ten beams, no sampling.
# Beam-search performance diverges at either beam size = 10 or beam size = 5.
# Refer: Hargreaves et al. (2021)
#     https://doi.org/10.18653/v1/2021.eacl-main.219.
x = seqTokenizer(
    ["<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?"],
    padding=True,
    return_tensors='pt',
)
x = seqModel.generate(
    inputs=x.input_ids.to(DEVICE),
    do_sample=False,
    num_beams=10,
)
# Decode the first (and only) sequence of the batch.
x = seqTokenizer.decode(x[0].tolist())
x

In [None]:
# Beam-search multinomial sampling with the seq2seq model.
# Seed the torch RNG so the sampled output is the same on every run of this cell.
torch.manual_seed(42)
x = "<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?"
x = seqTokenizer([x], padding=True, return_tensors='pt').input_ids.to(DEVICE)
x = seqModel.generate(inputs=x,
                      do_sample=True,
                      num_beams=9,
                      max_new_tokens=100,
                      # beam search multinomial
                      )[0].tolist()
x = seqTokenizer.decode(x)
x

In [None]:
# Diverse (group) beam search with the seq2seq model.
x = "<경상남도 사천시> 소독업 신고\n소독업 신고는 어떻게 하나요?"
x = seqTokenizer([x], padding=True, return_tensors='pt').input_ids.to(DEVICE)
x = seqModel.generate(inputs=x,
                      do_sample=False,
                      num_beams=10,
                      # should be n*(num_beam_groups) where n is an integer
                      num_beam_groups=2,
                      # Group beam search needs a positive diversity penalty;
                      # with 0.0 every group produces the same beams.
                      diversity_penalty=1.0,
                      max_new_tokens=100,
                      # diverse beam search
                      )[0].tolist()
x = seqTokenizer.decode(x)
x

In [None]:
# Contrastive search with the seq2seq model.
# refer: Su et al. (2022)
# https://arxiv.org/pdf/2202.06417
x = seqTokenizer(
    ["<서울시> 민원 처리는 언제 진행되나요?\n민원 처리는 언제 진행되나요?"],
    padding=True,
    return_tensors='pt',
)
x = seqModel.generate(
    inputs=x.input_ids.to(DEVICE),
    max_new_tokens=120,
    # penalty_alpha lies in [0, 1]: greater values prioritize unique generations,
    # lesser values prioritize model confidence.
    penalty_alpha=0.6,
    # top_k is typically in [3, 10]; it determines how many candidates to make.
    top_k=8,
    repetition_penalty=1.1,
)
# Decode the first (and only) sequence, dropping special tokens.
x = seqTokenizer.decode(
    x[0].tolist(),
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(x)

Overall, **contrastive search** with a proper repetition penalty gives the most plausible results.

### Running experiments

In [None]:
def generate(txts, model, tokenizer, args):
    """Generate text for a batch of inputs with the seq2seq model and decode it.

    Args:
        txts: list of input strings. An empty list returns ``[]`` without
            touching the tokenizer or the model.
        model: object exposing ``generate(...)`` and a ``device`` attribute
            (e.g. a transformers model).
        tokenizer: callable tokenizer matching ``model``; it must be able to
            pad a batch and provide ``decode``.
        args: dict holding the generation settings read below
            (``max_new_tokens``, ``early_stopping``, ``do_sample``,
            ``num_beams``, ``num_beam_groups``, ``use_cache``,
            ``temperature``, ``top_k``, ``top_p``, ``diversity_penalty``,
            ``repetition_penalty``, ``length_penalty``). An optional
            ``penalty_alpha`` entry is forwarded as well when it is present
            and not ``None``.

    Returns:
        list of str: one decoded sequence per input, with special tokens
        (padding / start / end markers) removed.

    Raises:
        KeyError: if ``args`` lacks one of the required settings.
    """
    if not txts:
        return []

    encoded = tokenizer(txts, padding=True, return_tensors='pt')
    # Send the inputs to wherever the model lives instead of relying on a notebook global.
    device = model.device

    gen_kwargs = dict(
        max_new_tokens=args['max_new_tokens'],
        early_stopping=args['early_stopping'],
        do_sample=args['do_sample'],
        num_beams=args['num_beams'],
        num_beam_groups=args['num_beam_groups'],
        use_cache=args['use_cache'],
        temperature=args['temperature'],
        top_k=args['top_k'],
        top_p=args['top_p'],
        diversity_penalty=args['diversity_penalty'],
        repetition_penalty=args['repetition_penalty'],
        length_penalty=args['length_penalty'],
    )
    # Forward penalty_alpha when the config carries one; otherwise a sweep over
    # it would never reach model.generate() and could not change the output.
    penalty_alpha = args.get('penalty_alpha')
    if penalty_alpha is not None:
        gen_kwargs['penalty_alpha'] = penalty_alpha
        # Contrastive search in transformers requires the key/value cache.
        gen_kwargs['use_cache'] = True

    sequences = model.generate(
        inputs=encoded.input_ids.to(device),
        # The batch is padded, so pass the mask explicitly rather than letting it be inferred.
        attention_mask=encoded.attention_mask.to(device),
        **gen_kwargs,
    ).tolist()
    # Strip special tokens so they do not end up in the text that gets scored.
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in sequences]

# Baseline generation settings for the seq2seq-model experiments.
defaultconfig = dict(
    max_new_tokens=100,
    early_stopping=True,
    do_sample=False,
    num_beams=1,
    num_beam_groups=1,
    use_cache=False,
    temperature=1.0,
    top_k=50,
    # top_p: if set to a float < 1, only the smallest set of most probable tokens
    # whose probabilities add up to top_p or higher are kept for generation.
    # (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.top_p)
    top_p=1.0,
    diversity_penalty=0.0,
    repetition_penalty=2.5,
    length_penalty=1.0,
)

In [None]:
# Reset the shared accumulators for the seq2seq experiments: each cell
# appends one (label, data) tuple to each list.
generations = []
elapseds = []

In [None]:
# generation time per max_length

import plotly.express as px

from datetime import datetime

elapsed = list()
generated = list()

max_lengths = range(20, 201, 20)

for i in max_lengths:
    # Work on a copy so the sweep never mutates the shared defaults.
    config = {**defaultconfig, 'max_new_tokens': i}
    since = datetime.now()
    generated.append((i, generate([s[0] for s in seqSample], seqModel, seqTokenizer, config)))
    # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
    elapsed.append({"max_new_tokens": i, "elapsed": (datetime.now()-since).total_seconds()})

generations.append(('max_new_tokens>time', generated))
elapseds.append(('max_new_tokens>time', elapsed))

px.line(elapsed, x='max_new_tokens', y='elapsed')

In [None]:
# generation performance per max_length

import evaluate

import plotly.express as px

from datetime import datetime

metric = evaluate.load('sacrebleu')

elapsed = list()
generated = list()

max_lengths = range(20, 201, 20)

# Prompts and references stay the same for every setting, so build them once.
prompts = [s[0] for s in seqSample]
references = [[s[1]] for s in seqSample]

for i in max_lengths:
    # Work on a copy so the sweep never mutates the shared defaults.
    config = {**defaultconfig, 'max_new_tokens': i}
    since = datetime.now()
    gen = generate(prompts, seqModel, seqTokenizer, config)
    # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
    seconds = (datetime.now()-since).total_seconds()
    score = metric.compute(
        predictions = gen,
        references = references,
        tokenize='char'
    )['score']
    elapsed.append({"max_new_tokens": i, "elapsed": seconds, "score": score})
    generated.append((i, gen))


generations.append(('max_new_tokens>bleu', generated))
elapseds.append(('max_new_tokens>bleu', elapsed))


px.line(elapsed, x='max_new_tokens', y='score')

In [None]:
# generation performance per penalty_alpha

from datetime import datetime

import evaluate
import plotly.express as px

from tqdm import tqdm

VAR_CONTROL = 'penalty_alpha'
VAR_DEPEND = 'sacrebleu'

metric = evaluate.load('sacrebleu')

elapsed = list()
generated = list()

penalty_alpha_list = [i/10 for i in range(1, 11, 1)]

# Prompts and references stay the same for every setting, so build them once.
prompts = [s[0] for s in seqSample]
references = [[s[1]] for s in seqSample]

for i in tqdm(penalty_alpha_list):
    # Work on a copy so this sweep's values never leak into defaultconfig
    # (and from there into the sweeps that follow).
    config = {**defaultconfig, VAR_CONTROL: i}
    since = datetime.now()
    gen = generate(prompts, seqModel, seqTokenizer, config)
    # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
    seconds = (datetime.now()-since).total_seconds()
    # NOTE(review): the max_new_tokens cells score with tokenize='char'; this one
    # uses the metric's default tokenizer — confirm that is intended.
    score = metric.compute(
        predictions = gen,
        references = references,
    )['score']
    elapsed.append({VAR_CONTROL: i, "elapsed": seconds, "score": score})
    generated.append((i, gen))


generations.append((f"{VAR_CONTROL}>{VAR_DEPEND}", generated))
elapseds.append((f"{VAR_CONTROL}>{VAR_DEPEND}", elapsed))

px.line(elapsed, x=VAR_CONTROL, y='score')

In [None]:
# generation performance per top_k

from datetime import datetime

import evaluate
import plotly.express as px

from tqdm import tqdm

VAR_CONTROL = 'top_k'
VAR_DEPEND = 'sacrebleu'

metric = evaluate.load('sacrebleu')

elapsed = list()
generated = list()

top_k_list = range(3, 11)

# Prompts and references stay the same for every setting, so build them once.
prompts = [s[0] for s in seqSample]
references = [[s[1]] for s in seqSample]

for i in tqdm(top_k_list):
    # Work on a copy so this sweep's values never leak into defaultconfig
    # (and from there into the sweeps that follow).
    # NOTE(review): top_k only influences decoding when sampling or contrastive
    # search (penalty_alpha) is active — confirm the config enables one of them.
    config = {**defaultconfig, VAR_CONTROL: i}
    since = datetime.now()
    gen = generate(prompts, seqModel, seqTokenizer, config)
    # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
    seconds = (datetime.now()-since).total_seconds()
    score = metric.compute(
        predictions = gen,
        references = references,
    )['score']
    elapsed.append({VAR_CONTROL: i, "elapsed": seconds, "score": score})
    generated.append((i, gen))


generations.append((f"{VAR_CONTROL}>{VAR_DEPEND}", generated))
elapseds.append((f"{VAR_CONTROL}>{VAR_DEPEND}", elapsed))

px.line(elapsed, x=VAR_CONTROL, y='score')

In [None]:
# generation performance per repetition_penalty

from datetime import datetime

import evaluate
import plotly.express as px

from tqdm import tqdm

VAR_CONTROL = 'repetition_penalty'
VAR_DEPEND = 'sacrebleu'

metric = evaluate.load('sacrebleu')

elapsed = list()
generated = list()

repetition_penalty_list = [i/10 for i in range(1, 21)]

# Prompts and references stay the same for every setting, so build them once.
prompts = [s[0] for s in seqSample]
references = [[s[1]] for s in seqSample]

for i in tqdm(repetition_penalty_list):
    # Work on a copy so this sweep's values never leak into defaultconfig
    # (and from there into the sweeps that follow).
    config = {**defaultconfig, VAR_CONTROL: i}
    since = datetime.now()
    gen = generate(prompts, seqModel, seqTokenizer, config)
    # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
    seconds = (datetime.now()-since).total_seconds()
    score = metric.compute(
        predictions = gen,
        references = references,
    )['score']
    elapsed.append({VAR_CONTROL: i, "elapsed": seconds, "score": score})
    generated.append((i, gen))

generations.append((f"{VAR_CONTROL}>{VAR_DEPEND}", generated))
elapseds.append((f"{VAR_CONTROL}>{VAR_DEPEND}", elapsed))

px.line(elapsed, x=VAR_CONTROL, y='score')

In [None]:
# generation performance per temperature

from datetime import datetime

import evaluate
import plotly.express as px

from tqdm import tqdm

VAR_CONTROL = 'temperature'
VAR_DEPEND = 'sacrebleu'

metric = evaluate.load('sacrebleu')

elapsed = list()
generated = list()

temperature_list = [i/10 for i in range(1, 21)]

# Prompts and references stay the same for every setting, so build them once.
prompts = [s[0] for s in seqSample]
references = [[s[1]] for s in seqSample]

for i in tqdm(temperature_list):
    # Work on a copy so this sweep's values never leak into defaultconfig.
    # NOTE(review): temperature presumably has no effect unless do_sample=True —
    # confirm the intended decoding mode for this sweep.
    config = {**defaultconfig, VAR_CONTROL: i}
    since = datetime.now()
    gen = generate(prompts, seqModel, seqTokenizer, config)
    # total_seconds() keeps sub-second resolution; `.seconds` truncates to whole seconds.
    seconds = (datetime.now()-since).total_seconds()
    score = metric.compute(
        predictions = gen,
        references = references,
    )['score']
    elapsed.append({VAR_CONTROL: i, "elapsed": seconds, "score": score})
    generated.append((i, gen))

generations.append((f"{VAR_CONTROL}>{VAR_DEPEND}", generated))
elapseds.append((f"{VAR_CONTROL}>{VAR_DEPEND}", elapsed))

px.line(elapsed, x=VAR_CONTROL, y='score')

_The tests below are ignored, as they do not really matter for contrastive search._

In [None]:
# generation performance per temperature

# import evaluate

# import plotly.express as px

# from datetime import datetime

# metric = evaluate.load('sacrebleu')

# elapsed = list()
# generated = list()

# temperaturesBy10 = range(1, 21, 1)

# for i in temperaturesBy10:
#     config=defaultconfig
#     config['temperature']=i/10
#     since = datetime.now()
#     gen = generate([s[0] for s in seqSample], seqModel, seqTokenizer, config)
#     elapsed.append({"temperature": i/10, "elapsed": (datetime.now()-since).seconds,
#                     "score": metric.compute(
#                         predictions = gen,
#                         references = [[i[1]] for i in seqSample],
#                         tokenize='char'
#                     )['score']})
#     generated.append((i/10, gen))


# generations.append(('temperature>bleu', generated))
# elapseds.append(('temperature>bleu', elapsed))

# px.line(elapsed, x='temperature', y='score')

In [None]:
# generation performance per num_beam

# import evaluate

# import plotly.express as px

# from datetime import datetime

# metric = evaluate.load('sacrebleu')

# elapsed = list()
# generated = list()

# num_beams = range(1, 11)
# # refer Hargreaves et al. (2021) for survey of beam size-BLEU relations
# # https://aclanthology.org/2021.eacl-main.219.pdf

# for i in num_beams:
#     config=defaultconfig
#     config['num_beams']=i
#     since = datetime.now()
#     gen = generate([s[0] for s in seqSample], seqModel, seqTokenizer, config)
#     elapsed.append({"num_beam": i, "elapsed": (datetime.now()-since).seconds,
#                     "score": metric.compute(
#                         predictions = gen,
#                         references = [[i[1]] for i in seqSample],
#                         tokenize='char'
#                     )['score']})
#     generated.append((i, gen))


# generations.append(('num_beam>bleu', generated))
# elapseds.append(('num_beam>bleu', elapsed))

# px.line(elapsed, x='num_beam', y='score')