In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [1]:
from trl import CPOConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import TrainingArguments

In [5]:
training_args = TrainingArguments(
        output_dir = '',
        num_train_epochs=1,
        evaluation_strategy='steps',
        metric_for_best_model='eval_loss',
        # per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True if True else False,
        bf16=False, # bf16 is not supported by non-Ampere GPUs
        fp16=True,
        tf32=False,
        group_by_length=True, # pad batches by its group, more efficient
        load_best_model_at_end=True,
        report_to = 'wandb',
        include_inputs_for_metrics=True,
        # disable_tqdm=False,  # disable tqdm since with packing values are in correct
    )

training_args = training_args.set_dataloader(train_batch_size=16,
                                            eval_batch_size=16,
                                            pin_memory=True,
                                            num_workers=4,
                                            sampler_seed=42)
training_args = training_args.set_lr_scheduler(name='cosine', num_epochs=1, warmup_ratio=0.01,)
training_args = training_args.set_optimizer(name='paged_adamw_8bit', learning_rate=1e-5, weight_decay=0.1,)
training_args = training_args.set_evaluate(strategy = 'steps', steps = 100, delay = 0, accumulation_steps=10, batch_size = 16)
training_args = training_args.set_save(strategy="steps", steps = 100, total_limit=10)
training_args = training_args.set_logging(strategy="steps", steps=100, report_to = ['wandb'])

In [7]:
CPOConfig(*training_args)

TypeError: trl.trainer.cpo_config.CPOConfig() argument after * must be an iterable, not TrainingArguments

In [5]:
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-Instruct-v0.2', cache_dir = '/data2/brian/.cache')
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf', cache_dir = '/data2/brian/.cache')

tokenizer_config.json: 100%|██████████| 1.62k/1.62k [00:00<00:00, 3.96MB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 12.9MB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 3.27MB/s]
special_tokens_map.json: 100%|██████████| 414/414 [00:00<00:00, 967kB/s]


In [3]:
model_path = 'unsloth/gemma-7b-it-bnb-4bit'
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir = '/data2/brian/.cache')
model = AutoModelForCausalLM.from_pretrained(model_path, cache_dir = '/data2/brian/.cache')

`low_cpu_mem_usage` was None, now set to True since model is quantized.


KeyboardInterrupt: 

In [2]:
model_path = '/data2/brian/personal/translation/checkpoints/llama2_04240005/checkpoint-38220'
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir = '/data2/brian/.cache')
model = AutoModelForCausalLM.from_pretrained(model_path, cache_dir = '/data2/brian/.cache')

NameError: name 'AutoTokenizer' is not defined

In [5]:
import datasets
from datasets import load_dataset_builder, VerificationMode
builder = load_dataset_builder(
    './wmt14/wmt_utils.py',
    language_pair = ('fr', 'en'),
    subsets = {
        datasets.Split.TRAIN: ["europarl_v7", "newscommentary_v9"],
        datasets.Split.VALIDATION: ['newstest2013'],
        datasets.Split.TEST: ['newstest2014']
    },
    cache_dir = '/data2/brian/.cache/dataset'
)
builder.download_and_prepare(verification_mode=VerificationMode.NO_CHECKS)
dataset = builder.as_dataset()

In [6]:
dataset['train'][1000]['translation']['en']

'It is also legitimate for there to be inter-company agreements and voluntary restraint agreements, since all these agreements make it possible to reduce the detrimental effects of competition on social or ecological requirements.'

In [7]:
tokenizer(dataset['train'][1000]['translation']['en'], return_tensors='pt').input_ids

tensor([[    1,   739,   338,   884, 25204,  6490,   363,   727,   304,   367,
          1006, 29899, 14518,  8571,  4110,   322, 27081,   653,  1791,  5270,
          8571,  4110, 29892,  1951,   599,  1438,  8571,  4110,  1207,   372,
          1950,   304, 10032,   278,  1439, 29878,  2073,   284,  9545,   310,
         13888,   373,  5264,   470,   321, 21553,   936, 11780, 29889]])

In [6]:
prompt = lambda x: f"<s>### Instruction: Please translate the input sentence written in French to English\n### Input: {x['fr']}\n### Output: {x['en']}<\s>"

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 2185517
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})

In [41]:
import random
batch = dataset['test']['translation'][random.randint(0, len(dataset))]
sys_prompt = 'You are a translator. Translate the sentence in French to English. Do not continue writing with anything that is unrelated to the given sentence.'
inference_sample = f"<s>[INST] {sys_prompt} {batch['fr']} [/INST] "
inference_sample = f"<s>[INST] <<SYS>>\n{sys_prompt}\n<</SYS>>\n\n{batch['fr']} [/INST] "


In [42]:
print(inference_sample, batch['en'])

<s>[INST] <<SYS>>
You are a translator. Translate the sentence in French to English. Do not continue writing with anything that is unrelated to the given sentence.
<</SYS>>

Equipé d'un wingsuit (une combinaison munie d'ailes), il est passé à 160 km/h au-dessus du célèbre sanctuaire Monserrate, situé à plus de 3 000 mètres d'altitude, où de nombreux badauds s'étaient rassemblés pour observer son exploit. [/INST]  Wearing a wingsuit, he flew past over the famous Monserrate Sanctuary at 160km/h. The sanctuary is located at an altitude of over 3000 meters and numerous spectators had gathered there to watch his exploit.


In [43]:
output = model.generate(**tokenizer(inference_sample, return_tensors = 'pt'), max_length = 256)
print(tokenizer.decode(output[0]))

<s><s> [INST] <<SYS>>
You are a translator. Translate the sentence in French to English. Do not continue writing with anything that is unrelated to the given sentence.
<</SYS>>

Equipé d'un wingsuit (une combinaison munie d'ailes), il est passé à 160 km/h au-dessus du célèbre sanctuaire Monserrate, situé à plus de 3 000 mètres d'altitude, où de nombreux badauds s'étaient rassemblés pour observer son exploit. [/INST] Equipped with a wingsuit (a suit with wings), he flew at 160 km/h over the famous Monserrate sanctuary, located at more than 3 000 meters above sea level, where many spectators had gathered to watch him.</s>


In [3]:
o_tokenizer = AutoTokenizer.from_pretrained('unsloth/mistral-7b-bnb-4bit', cache_dir = '/data2/brian/.cache')
o_model = AutoModelForCausalLM.from_pretrained('unsloth/mistral-7b-bnb-4bit', cache_dir = '/data2/brian/.cache')

tokenizer_config.json: 100%|██████████| 971/971 [00:00<00:00, 3.08MB/s]
tokenizer.model: 100%|██████████| 493k/493k [00:00<00:00, 12.8MB/s]
tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 1.82MB/s]
special_tokens_map.json: 100%|██████████| 438/438 [00:00<00:00, 1.34MB/s]
`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [11]:
o_output = o_model.generate(**o_tokenizer(inference_sample, return_tensors = 'pt'), max_length = 256)
print(o_tokenizer.decode(o_output[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> ### Instruction: Please translate the input sentence written in French to English
### Input: Equipé d'un wingsuit (une combinaison munie d'ailes), il est passé à 160 km/h au-dessus du célèbre sanctuaire Monserrate, situé à plus de 3 000 mètres d'altitude, où de nombreux badauds s'étaient rassemblés pour observer son exploit.
### Output: Equipped with a wingsuit (a suit equipped with wings), he reached 160 km/h above the famous Monserrate sanctuary, located at more than 3,000 meters above sea level, where many spectators had gathered to observe his feat.

import re
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import ngram_bleu_score
from nltk.translate.


In [1]:
from trl import SFTTrainer
from utils import apply_chat_template_llama
from train import compute_metrics
trainer = SFTTrainer(model = model, formatting_func = apply_chat_template_llama, compute_metrics=compute_metrics)

NameError: name 'model' is not defined

In [46]:
"""
<s><s> ### Instruction: Please translate the input sentence written in French to English
### Input: Spectaculaire saut en "wingsuit" au-dessus de Bogota
### Output: Spectacular Wingsuit Jump Over Bogota
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output: The debate is closed.
### Input: Le débat est clos.
### Output
"""[154+len('### Output: '):]
# .find('### Output:')

'Spectacular Wingsuit Jump Over Bogota\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output: The debate is closed.\n### Input: Le débat est clos.\n### Output\n'