In [1]:
import os, sys
os.environ["CUDA_VISIBLE_DEVIVICES"]='1'
os.environ["HF_HOME"]='/home/jovyan/.cache/huggingface'
os.environ["HUGGINGFACE_HUB_CACHE"]='/home/jovyan/.cache/huggingface/hub' 
os.environ["TRANSFORMERS_CACHE"]='/home/jovyan/.cache/huggingface/hub'
os.environ["HF_DATASETS_CACHE"]='/home/jovyan/.cache/huggingface/datasets'

from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
prompts = list(map(lambda x: x.strip() if x.strip()[-1] == '?' else x.strip() + '?', pd.read_csv('local_prompt.csv', sep='\t')['prompt'].values))

# mT5 V2

In [None]:
tokenizer = AutoTokenizer.from_pretrained('./output_v2/cendol-mt5-xxl-merged-chat')
for model_path in [
    'indonlp/cendol-mt5-small-chat'
    'indonlp/cendol-mt5-base-chat'
    'indonlp/cendol-mt5-large-chat'
    'indonlp/cendol-mt5-xl-chat'
    'indonlp/cendol-mt5-xxl-merged-chat'
]:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda()
    model_name = model_path.split('/')[-1]
    print(f'===== {model_name} =====')
    responses = []
    for prompt in prompts:
        responses.append(
            tokenizer.decode(
                model.generate(tokenizer(prompt, return_tensors='pt')['input_ids'].cuda(), top_p=0.9, top_k=50, min_length=0, max_length=256, do_sample=True)[0]
            )
        )
        print(f'prompt: {prompt}')
        print(f'response: {responses[-1]}')
    pd.DataFrame({'prompts': prompts, 'responses': responses}).to_csv(f'./gen_save/{model_name}_v2.csv', index=False)

# mT5 V1

In [None]:
tokenizer = AutoTokenizer.from_pretrained('indonlp/cendol-mt5-small')
for model_path in [
    'indonlp/cendol-mt5-small'
    'indonlp/cendol-mt5-base'
    'indonlp/cendol-mt5-large'
    'indonlp/cendol-mt5-xl'
    'indonlp/cendol-mt5-xxl-merged'
]:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, torch_dtype=torch.float16).cuda()
    model_name = model_path.split('/')[-1]
    print(f'===== {model_name} =====')
    responses = []
    for prompt in prompts:
        responses.append(
            tokenizer.decode(
                model.generate(tokenizer(prompt, return_tensors='pt')['input_ids'].cuda(), top_p=0.9, top_k=50, min_length=0, max_length=256, do_sample=True)[0]
            )
        )
        print(f'prompt: {prompt}')
        print(f'response: {responses[-1]}')
    print()
    pd.DataFrame({'prompts': prompts, 'responses': responses}).to_csv(f'./gen_save/{model_name}_v1.csv', index=False)

# LLaMA V2

In [None]:
tokenizer = AutoTokenizer.from_pretrained('indonlp/cendol-llama2-13b-merged-chat', token='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')
for model_path in [
#     'indonlp/cendol-llama2-7b-chat',
    'indonlp/cendol-llama2-13b-merged-chat'
]:
    model = AutoModelForCausalLM.from_pretrained(model_path, token='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')
    model_name = model_path.split('/')[-1]
    print(f'===== {model_name} =====')
    responses = []
    for prompt in prompts:
        responses.append(
            tokenizer.decode(
                model.generate(tokenizer(prompt, return_tensors='pt')['input_ids'], top_p=0.9, top_k=50, min_length=0, max_length=512, do_sample=True)[0]
            )
        )
        print(f'prompt: {prompt}')
        print(f'response: {responses[-1]}')
    pd.DataFrame({'prompts': prompts, 'responses': responses}).to_csv(f'./gen_save/{model_name}_v2.csv', index=False)

Loading checkpoint shards: 100%|██████████| 11/11 [01:33<00:00,  8.53s/it]


===== cendol-llama2-13b-merged-chat =====
prompt: Apa itu STNK?
response: <s> Apa itu STNK?</s>
prompt: Gimana caranya perpanjang STNK?
response: <s> Gimana caranya perpanjang STNK?</s>
prompt: Tutorial perpanjang STNK?
response: <s> Tutorial perpanjang STNK?</s>
prompt: Di mana perpanjang STNK?
response: <s> Di mana perpanjang STNK?</s>
prompt: Cara mengurus STNK yang sudah tidak berlaku?
response: <s> Cara mengurus STNK yang sudah tidak berlaku?</s>
prompt: Cara mengurus KTP hilang?
response: <s> Cara mengurus KTP hilang? Cara mengurus KTP hilang begini nih: 1. Mengajukan permohonan penggantian KTP 
 1. Cari dokumen yang diperlukan 
 2. Daftar ke kantor kependudukan 
 3. Pilihlah jenis kartu yang diinginkan 
 4. Periksa dokumen yang diperlukan 
 5. Mengajukan permohonan 
 6. Menunggu hasil penggantian KTP</s>
prompt: KTP gue ilang coy?
response: <s> KTP gue ilang coy?</s>
prompt: Kapan harus bayar pajak?
response: <s> Kapan harus bayar pajak?</s>
prompt: Gimana cara ngurus NPWP?
resp

# LLaMA V1

In [None]:
tokenizer = AutoTokenizer.from_pretrained('indonlp/cendol-llama2-13b-merged', token='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')
for model_path in [
    'indonlp/cendol-llama2-7b',
    'indonlp/cendol-llama2-13b-merged'
]:
    model = AutoModelForCausalLM.from_pretrained(model_path, token='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')
    model_name = model_path.split('/')[-1]
    print(f'===== {model_name} =====')
    responses = []
    for prompt in prompts:
        responses.append(
            tokenizer.decode(
                model.generate(tokenizer(prompt, return_tensors='pt')['input_ids'], top_p=0.9, top_k=50, min_length=0, max_length=512, do_sample=True)[0]
            )
        )
        print(f'prompt: {prompt}')
        print(f'response: {responses[-1]}')
    pd.DataFrame({'prompts': prompts, 'responses': responses}).to_csv(f'./gen_save/{model_name}_v1.csv', index=False)

tokenizer_config.json: 100%|██████████| 776/776 [00:00<00:00, 638kB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 507kB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:01<00:00, 1.54MB/s]
config.json: 100%|██████████| 636/636 [00:00<00:00, 412kB/s]
pytorch_model.bin.index.json: 100%|██████████| 23.9k/23.9k [00:00<00:00, 13.2MB/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s][A
pytorch_model-00001-of-00002.bin:   0%|          | 10.5M/9.98G [00:02<36:28, 4.55MB/s][A
pytorch_model-00001-of-00002.bin:   0%|          | 21.0M/9.98G [00:02<21:00, 7.90MB/s][A
pytorch_model-00001-of-00002.bin:   0%|          | 31.5M/9.98G [00:03<15:30, 10.7MB/s][A
pytorch_model-00001-of-00002.bin:   0%|          | 41.9M/9.98G [00:04<13:46, 12.0MB/s][A
pytorch_model-00001-of-00002.bin:   1%|          | 52.4M/9.98G [00:04<12:46, 12.9MB/s][A
pytorch_model-00001-of-00002.bin:   1%|          | 62.9M/9.98G [00:05<11:24, 14.5MB/s][A
pytorch_model-00001-of-0000

===== cendol-llama2-7b =====
prompt: Apa itu STNK?
response: <s> Apa itu STNK?</s>
prompt: Gimana caranya perpanjang STNK?
response: <s> Gimana caranya perpanjang STNK?</s>
prompt: Tutorial perpanjang STNK?
response: <s> Tutorial perpanjang STNK?</s>
prompt: Di mana perpanjang STNK?
response: <s> Di mana perpanjang STNK?</s>
prompt: Cara mengurus STNK yang sudah tidak berlaku?
response: <s> Cara mengurus STNK yang sudah tidak berlaku?</s>
prompt: Cara mengurus KTP hilang?
response: <s> Cara mengurus KTP hilang? How to lose your ID card?</s>
prompt: KTP gue ilang coy?
response: <s> KTP gue ilang coy?</s>
prompt: Kapan harus bayar pajak?
response: <s> Kapan harus bayar pajak?</s>
prompt: Gimana cara ngurus NPWP?
response: <s> Gimana cara ngurus NPWP?</s>
prompt: Cara pembuatan NPWP?
response: <s> Cara pembuatan NPWP?</s>
prompt: Apa itu BPJS?
response: <s> Apa itu BPJS?</s>
prompt: Gimana caranya daftar BPJS?
response: <s> Gimana caranya daftar BPJS?</s>
prompt: Bagaimana prosedur daftar

Loading checkpoint shards: 100%|██████████| 6/6 [00:50<00:00,  8.39s/it]


===== cendol-llama2-13b-merged =====
prompt: Apa itu STNK?
response: <s> Apa itu STNK?</s>
prompt: Gimana caranya perpanjang STNK?
response: <s> Gimana caranya perpanjang STNK?</s>
prompt: Tutorial perpanjang STNK?
response: <s> Tutorial perpanjang STNK?</s>
prompt: Di mana perpanjang STNK?
response: <s> Di mana perpanjang STNK?</s>
prompt: Cara mengurus STNK yang sudah tidak berlaku?
response: <s> Cara mengurus STNK yang sudah tidak berlaku?</s>
prompt: Cara mengurus KTP hilang?
response: <s> Cara mengurus KTP hilang?</s>
prompt: KTP gue ilang coy?
response: <s> KTP gue ilang coy?!?!?! FYI. My ID card is missing?!?!?! FYI.</s>
prompt: Kapan harus bayar pajak?
response: <s> Kapan harus bayar pajak?</s>
prompt: Gimana cara ngurus NPWP?
response: <s> Gimana cara ngurus NPWP?</s>
prompt: Cara pembuatan NPWP?
response: <s> Cara pembuatan NPWP?</s>
prompt: Apa itu BPJS?
response: <s> Apa itu BPJS?</s>
prompt: Gimana caranya daftar BPJS?
response: <s> Gimana caranya daftar BPJS?</s>
prompt: 

# Baselines

In [3]:
for model_path in [
#     'aisingapore/sealion7b-instruct-nc',
#     'Ichsan2895/Merak-7B-v4',
#     'SeaLLMs/SeaLLM-7B-Chat',
#     'meta-llama/Llama-2-7b-chat-hf',
#     'bigscience/bloom-7b1---MBZUAI/bactrian-x-bloom-7b1-lora',
#     'bigscience/bloom-7b1---haonan-li/bactrian-id-bloom-7b1-lora',
    'bigscience/bloomz-7b1',
#     'meta-llama/Llama-2-13b-chat-hf',
#     'bigscience/mt0-xxl',
]:
    if 'bactrian' in model_path:
        model_name, adapter_name = model_path.split('---')

        tokenizer = AutoTokenizer.from_pretrained(model_name, truncation_side='left', padding_side='right', trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
        model = PeftModel.from_pretrained(model, adapter_name, torch_dtype=torch.float16).cuda()
        model = model.merge_and_unload()
        model_name = adapter_name.split('/')[-1]
    elif 'mt0' in model_path:
        tokenizer = AutoTokenizer.from_pretrained(model_path, token='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')
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path, torch_dtype=torch.float16, token='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')
        model_name = model_path.split('/')[-1]
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path, token='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', trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, token='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', trust_remote_code=True).cuda()
        model_name = model_path.split('/')[-1]

    print(f'===== {model_name} =====')
    responses = []
    for prompt in prompts:
        if 'sealion' in model_path:
            prompt_template = "### USER:\n{human_prompt}\n\n### RESPONSE:\n"
            formatted_prompt = prompt_template.format(human_prompt=prompt)
        elif 'bactrian' in model_name or 'bloomz' in model_name:
            prompt_template = "USER: {human_prompt}\nRESPONSE:"
            formatted_prompt = prompt_template.format(human_prompt=prompt)
        else:
            chats = [
               {"role": "user", "content": prompt},
               {"role": "assistant", "content": ""},
            ]
            formatted_prompt = tokenizer.apply_chat_template(chats, tokenize=False)
        input_ids = tokenizer(formatted_prompt, return_tensors='pt', add_special_tokens=False)['input_ids'].cuda()
        responses.append(
            tokenizer.decode(
                model.generate(
                    input_ids, eos_token_id=tokenizer.eos_token_id, top_p=0.9, top_k=50, min_length=0, max_length=768, do_sample=True
                )[0,input_ids.shape[1]:], skip_special_tokens=True
            )
        )
        print(f'prompt: {prompt}')
        print(f'response: {responses[-1]}')
    pd.DataFrame({'prompts': prompts, 'responses': responses}).to_csv(f'./gen_save/baseline_{model_name}.csv', index=False)

===== bloomz-7b1 =====
prompt: Apa itu STNK?
response:  Surat Tanda Nomor Kendaraan
prompt: Gimana caranya perpanjang STNK?
response:  Via internet
prompt: Tutorial perpanjang STNK?
response:  Yes
prompt: Di mana perpanjang STNK?
response:  kantor samsat
prompt: Cara mengurus STNK yang sudah tidak berlaku?
response:  Anda bisa langsung pergi ke kantor polisi untuk mengurusnya.
prompt: Cara mengurus KTP hilang?
response:  Coba hubungi kelurahan
prompt: KTP gue ilang coy?
response:  Oh, ya saya tau..
prompt: Kapan harus bayar pajak?
response:  Setiap tahun
prompt: Gimana cara ngurus NPWP?
response:  Cek tabel yang disediakan di situs Kementerian Keuangan.
prompt: Cara pembuatan NPWP?
response:  Anda perlu mengisi formulir dan membayar biaya yang diminta.
prompt: Apa itu BPJS?
response:  Badan Penyelenggara Jaminan Sosial
prompt: Gimana caranya daftar BPJS?
response:  Menelepon nomor 135
prompt: Bagaimana prosedur daftar BPJS Ketenagakerjaan?
response:  bisa melalui aplikasi Mobile JKK.
p

In [4]:
for model_path in [
#     'meta-llama/Llama-2-13b-chat-hf',
    'bigscience/mt0-xxl',
]:
    print(f'===== {model_path} =====')
    if 'bactrian' in model_path:
        model_name, adapter_name = model_path.split('---')

        tokenizer = AutoTokenizer.from_pretrained(model_name, truncation_side='left', padding_side='right', trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
        model = PeftModel.from_pretrained(model, adapter_name)
        model.merge_and_unload()
        model_name = adapter_name.split('/')[-1]
    elif 'mt0' in model_path:
        tokenizer = AutoTokenizer.from_pretrained(model_path, token='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')
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path, torch_dtype=torch.float16, token='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')
        model_name = model_path.split('/')[-1]
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_path, token='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', trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, token='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', trust_remote_code=True).cuda()
        model_name = model_path.split('/')[-1]

    print(f'===== Loaded {model_name} =====')
    responses = []
    for prompt in prompts:
        if 'sealion' in model_path:
            prompt_template = "### USER:\n{human_prompt}\n\n### RESPONSE:\n"
            formatted_prompt = prompt_template.format(human_prompt=prompt)
        else:
            chats = [
               {"role": "user", "content": prompt},
               {"role": "assistant", "content": ""},
            ]
            formatted_prompt = tokenizer.apply_chat_template(chats, tokenize=False)
        input_ids = tokenizer(formatted_prompt, return_tensors='pt', add_special_tokens=False)['input_ids']
        if 'mt0' in model_path:
            responses.append(
                tokenizer.decode(
                    model.generate(
                        input_ids, eos_token_id=tokenizer.eos_token_id, top_p=0.9, top_k=50, min_length=0, max_length=768, do_sample=True
                    )[0,:], skip_special_tokens=True
                )
            )
        else:
            responses.append(
                tokenizer.decode(
                    model.generate(
                        input_ids, eos_token_id=tokenizer.eos_token_id, top_p=0.9, top_k=50, min_length=0, max_length=768, do_sample=True
                    )[0,input_ids.shape[1]:], skip_special_tokens=True
                )
            )
        print(f'prompt: {prompt}')
        print(f'response: {responses[-1]}')
    pd.DataFrame({'prompts': prompts, 'responses': responses}).to_csv(f'./gen_save/baseline_{model_name}.csv', index=False)

===== bigscience/mt0-xxl =====


Loading checkpoint shards: 100%|██████████| 6/6 [00:52<00:00,  8.70s/it]

No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



===== Loaded mt0-xxl =====


RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'