In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments
from transformers.integrations import WandbCallback
from trl.trainer import DataCollatorForCompletionOnlyLM
from trl import SFTTrainer
from datasets import load_dataset, Dataset
import torch
import wandb
import numpy as np
import os
# --- Experiment configuration ---
# OPT-125m is small enough for fairly fast generation on CPU (the author measured
# about 0.8s per small sentence), which makes it handy for development,
# debugging and testing.
model_name = "facebook/opt-125m"
dataset_name = "GAIR/lima"
test = False  # switch read by the split and trainer settings further down

# --- Model, tokenizer and raw dataset ---
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
ds = load_dataset(dataset_name, "plain_text")

# Dataset setup
## Data collator: tells the trainer how to split and mask each example (teacher forcing)
instruction_template = "### Human:"
response_template = "### Assistant:"
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    instruction_template=instruction_template,
    response_template=response_template,
    mlm=False,
)
# Author's notes on DataCollatorForCompletionOnlyLM:
#   * It locates the instruction template by looping through the tokens (something
#     we could easily implement ourselves), and then the assistant template.
#   * The input_ids are NOT cut: the model is still given the answer. Only the
#     labels are set to -100 for everything up to where the answer begins, which
#     makes the loss function ignore those positions.
#   * Training on it therefore amounts to teacher forcing.
#   * The trainer's dataloader seems to call the collator's `torch_call` method,
#     either while the dataset is built or when a batch is fetched (not verified).
## Then pre-process the dataset
def process_ds(ds, human_tag=None, assistant_tag=None):
    """Turn single-turn conversations into one prompt/answer string per example.

    Rows whose ``source`` is ``"multi_turn"`` are filtered out. Every remaining
    row is mapped to a ``text`` value of the form
    ``"<human_tag> <prompt><assistant_tag> <answer>"`` and ``map`` is asked to
    remove the ``conversations`` and ``source`` columns.

    Args:
        ds: Object exposing ``filter`` and ``map`` (e.g. a ``datasets.Dataset``)
            whose rows have ``conversations`` and ``source`` fields.
        human_tag: Marker written before the prompt. When ``None`` the
            notebook-level ``instruction_template`` is used.
        assistant_tag: Marker written before the answer. When ``None`` the
            notebook-level ``response_template`` is used.

    Returns:
        The result of ``ds.filter(...).map(...)``.

    Raises:
        AssertionError: If a kept row does not hold exactly two turns.
    """
    # Resolve the defaults at call time so the notebook-level templates are still
    # honoured whenever the caller does not override them.
    human_tag = instruction_template if human_tag is None else human_tag
    assistant_tag = response_template if assistant_tag is None else assistant_tag

    def aux(example):
        turns = example["conversations"]
        assert len(turns) == 2, f"expected one prompt and one answer, got {len(turns)} turns"
        prompt, answer = turns
        return {"text": f"{human_tag} {prompt}{assistant_tag} {answer}"}

    return ds.filter(
        lambda row: row["source"] != "multi_turn"  # We only want the instruction tuning bit
    ).map(
        aux, remove_columns=["conversations", "source"]
    )

# Split the processed training data into a train and a validation part.
# `train_size` can be given as a fraction or as an absolute number of rows; the
# test configuration keeps a single training example.
# The seed is fixed so the partition is the same on every run; without it each
# kernel restart would shuffle differently and produce a different split.
train_ds = process_ds(ds["train"]).train_test_split(
    train_size=1 if test else 0.8,
    seed=42,
)
train_split, val_split = train_ds["train"], train_ds["test"]
eval_ds = ds["test"]["conversations"]  # conversations column of the dataset's own test split
len_eval = len(eval_ds)
len_train_split = len(train_split)

# Setup trainer
# Checkpoints and logs default to the original cluster location. Set the
# LIMA_WORK_DIR environment variable to redirect them without editing this cell.
work_dir = os.environ.get("LIMA_WORK_DIR", "/work3/s184399")
training_args = TrainingArguments(
    # The test configuration repeats many epochs with batch size 1
    # (the split above keeps a single training example in that mode).
    num_train_epochs=200 if test else 3,
    per_device_train_batch_size=1 if test else 16,
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    logging_steps=25,
    do_train=True,
    output_dir=f"{work_dir}/checkpoints/lima-opt-125m",
    logging_dir=f"{work_dir}/logs/lima-opt-125m",
    overwrite_output_dir=True,
    report_to="wandb",
    run_name="opt-125m code test" if test else "finetune-opt-125m",
)
# NOTE(review): no evaluation schedule is configured above, so `eval_dataset` is
# presumably only used when evaluation is triggered explicitly - confirm intended.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    dataset_text_field="text",
    args=training_args,
    train_dataset=train_split,
    eval_dataset=val_split,
    max_seq_length=1024,
)
# --- Inspect one batch exactly as the trainer's dataloader produces it ---
print("\n\nDataloader")
# Fetching a batch evidently calls the data collator's `.torch_call`. The author saw a
# "Could not find response key `### Assistant:` " issue that disappears when the
# next line is commented out.
example = next(iter(trainer.get_train_dataloader()))
example = dict((key, value[0]) for key, value in example.items())  # first sequence of the batch only
# Author's open question: the labels look wrong here and the collator does not
# seem to repair them afterwards - unresolved.
print(example)
print("\n\n--------\n\n")
print(trainer.tokenizer.decode(example["labels"][example["labels"].ne(-100)]))


# --- Manually run the data collator on the same sequence ---
print("\n\nData collator")
ex = trainer.data_collator.torch_call([example["input_ids"]])
tokens, labels = ex["input_ids"][0], ex["labels"][0]

print(trainer.tokenizer.decode(tokens))
print("\n\n--------\n\n")
print(trainer.tokenizer.decode(labels[labels.ne(-100)]))

Using the latest cached version of the dataset since GAIR/lima couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at /zhome/18/7/137746/.cache/huggingface/datasets/GAIR___lima/plain_text/0.0.1/68958e98267f5fb4a52a03ebcdae4ae59213fa7c (last modified on Fri Mar  1 11:09:21 2024).




Dataloader
{'input_ids': tensor([    2, 48134,  3861,  ...,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]), 'labels': tensor([-100, -100, -100,  ..., -100, -100, -100])}


--------


 Setting isBodyHtml to true allows you to use HTML tags in the message body:
```
msg = new MailMessage("xxxx@gmail.com",
                "yyyy@gmail.com", "Message from PSSP System",
                "This email sent by the PSSP system<br />" +
                "<b>this is bold text!</b>");

msg.IsBodyHtml = true;
```


Data collator
</s>### Human: How to send HTML-formatted email in C#?### Assistant: Setting isBodyHtml to true allows you to use HTML tags in the message body:
```
msg = new MailMessage("xxxx@gmail.com",
                "yyyy@gmail.com", "Message from PSSP System",
                "This email sent by the PSSP system<br />" +
                "<b>this is bold text!</b>");

msg.IsBodyHtml = true;
```<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [9]:
# Recover the prompt: keep the text before the first response marker, then drop
# the instruction marker.
s = trainer.tokenizer.decode(tokens)
s.partition(response_template)[0].replace(instruction_template, "")

'</s> How to send HTML-formatted email in C#?'

# Other datasets

In [19]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM  # Source: https://github.com/huggingface/trl/blob/14e0d788078be6406e580a2e8aa94cd451e5f909/trl/trainer/utils.py#L63

# Build a completion-only SFT trainer for the OpenAssistant Guanaco data.
dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

instruction_template = "### Human:"
response_template = "### Assistant:"
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    instruction_template=instruction_template,
    response_template=response_template,
    mlm=False,
)
# Author's notes on DataCollatorForCompletionOnlyLM:
#   * It locates the instruction template by looping through the tokens (something
#     we could easily implement ourselves), and then the assistant template.
#   * The input_ids are NOT cut: the model is still given the answer. Only the
#     labels are set to -100 for everything up to where the answer begins, which
#     makes the loss function ignore those positions.
#   * Training on it therefore amounts to teacher forcing.


trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="text",
    data_collator=collator,
)


Repo card metadata block was not found. Setting CardData to empty.
Detected kernel version 5.4.268, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [21]:
# Peek at the first raw training string. Index the row first so only that one
# example is read, instead of pulling the whole "text" column to take element 0.
dataset[0]["text"]

'### Human: Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.### Assistant: "Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has significant power over the wages and working conditions of their employees. The presence of a monopsony can result in lower wages and reduced employment opportunities for workers, as the employer has little incentive to increase wages or provide better working conditions.\n\nRecent research has identified potential monopsonies in industries such as retail and fast food, where a few large companies control a significant portion of the market (Bivens & Mishel, 2013). In these industries, workers often face low wages, limited benefits, and reduced bargaining power, lead

In [93]:
# Fetch one batch and decode a single sequence twice: first without its padding,
# then in full.
dl = trainer.get_train_dataloader()
i = 0
example = next(iter(dl))
tokens = example["input_ids"][i]
mask = example["attention_mask"][i]
# `decode` has no `mask` parameter (the extra keyword was accepted without error
# but has no documented effect), so select the attended positions explicitly
# before decoding.
print(trainer.tokenizer.decode(token_ids=tokens[mask.bool()]))
print("------")
print(trainer.tokenizer.decode(token_ids=tokens))

</s>How do I get a ```PriorityQueue``` to sort on what I want it to sort on?
Also, is there a difference between the ```offer``` and ```add``` methods?</s>## Java 8 solution
We can use ```lambda expression``` or ```method reference``` introduced in Java 8. In case we have some String values stored in the Priority Queue (having capacity 5) we can provide inline comparator (based on length of String) : 
Using lambda expression
```PriorityQueue pq=
                    new PriorityQueue(5,(a,b) -> a.length() - b.length());
```
Using Method reference
```PriorityQueue pq=
                new PriorityQueue(5, Comparator.comparing(String::length));
```
Then we can use any of them as:
```public static void main(String[] args) {
        PriorityQueue pq=
                new PriorityQueue(5, (a,b) -> a.length() - b.length());
       // or pq = new PriorityQueue(5, Comparator.comparing(String::length));
        pq.add("Apple");
        pq.add("PineApple");
        pq.add("Custard Apple");
        

In [94]:
# Display the batch fetched from the train dataloader (keys in the output below:
# input_ids, attention_mask, labels).
example

{'input_ids': tensor([[   2, 6179,  109,  ...,    1,    1,    1],
        [   2, 7608,   16,  ...,    1,    1,    1],
        [   2, 2264,   32,  ...,    1,    1,    1],
        ...,
        [   2,  281,   10,  ...,    1,    1,    1],
        [   2, 7608,   16,  ...,    1,    1,    1],
        [   2, 1779,   74,  ...,    1,    1,    1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[   2, 6179,  109,  ..., -100, -100, -100],
        [   2, 7608,   16,  ..., -100, -100, -100],
        [   2, 2264,   32,  ..., -100, -100, -100],
        ...,
        [   2,  281,   10,  ..., -100, -100, -100],
        [   2, 7608,   16,  ..., -100, -100, -100],
        [   2, 1779,   74,  ..., -100, -100, -100]])}

In [68]:
# Run the single token sequence through the collator again and print the decoded
# input_ids of the result.
print(
    trainer.tokenizer.decode(
        token_ids=trainer.data_collator.torch_call([tokens])["input_ids"][0]
    )
)

</s>### Human: 回答集编程和Prolog有什么联系和区别？### Assistant: 回答集编程（Answer Set Programming，ASP）和Prolog是两种逻辑编程语言，它们之间的联系和区别如下：

联系：
1. ASP和Prolog都使用规则和查询来表示和解决问题。
2. ASP和Prolog都支持递归和模式匹配。
3. ASP和Prolog都可以被用来表示和解决复杂的逻辑和知识表达式。

区别：
1. ASP的语法更接近于一种自然语言，而Prolog的语法更接近于一种函数式编程语言。
2. ASP和Prolog的语义不同。ASP使用基于稳定模型（stable model）的语义来定义规则的解释，而Prolog使用基于霍恩子句（Horn clause）的语义来定义规则的解释。
3. ASP和Prolog的应用领域不同。ASP通常用于复杂知识的表示和解决，如自然语言处理、图形学等领域，而Prolog通常用于人工智能、专家系统等领域。### Human: 看来prolog也是挺强大的。为什么它现在不火呢？请试着给出几个原因。### Assistant: Prolog在当今编程语言市场上的影响力不如一些其他语言，可能是由于以下几个原因：

1. **学习曲线较陡峭**
Prolog语法和概念与传统的命令式编程语言非常不同，这使得初学者在学习过程中可能会遇到一些挑战。在传统的命令式编程语言中，程序员需要考虑如何控制程序的流程以及如何处理数据。而在Prolog中，程序员需要思考如何描述问题和约束条


In [74]:
# Re-collate the sequence and decode only the label positions that are not
# masked out with -100.
labels = trainer.data_collator.torch_call([tokens])["labels"]
trainer.tokenizer.decode(token_ids=labels[labels.ne(-100)])

' 回答集编程（Answer Set Programming，ASP）和Prolog是两种逻辑编程语言，它们之间的联系和区别如下：\n\n联系：\n1. ASP和Prolog都使用规则和查询来表示和解决问题。\n2. ASP和Prolog都支持递归和模式匹配。\n3. ASP和Prolog都可以被用来表示和解决复杂的逻辑和知识表达式。\n\n区别：\n1. ASP的语法更接近于一种自然语言，而Prolog的语法更接近于一种函数式编程语言。\n2. ASP和Prolog的语义不同。ASP使用基于稳定模型（stable model）的语义来定义规则的解释，而Prolog使用基于霍恩子句（Horn clause）的语义来定义规则的解释。\n3. ASP和Prolog的应用领域不同。ASP通常用于复杂知识的表示和解决，如自然语言处理、图形学等领域，而Prolog通常用于人工智能、专家系统等领域。 Prolog在当今编程语言市场上的影响力不如一些其他语言，可能是由于以下几个原因：\n\n1. **学习曲线较陡峭**\nProlog语法和概念与传统的命令式编程语言非常不同，这使得初学者在学习过程中可能会遇到一些挑战。在传统的命令式编程语言中，程序员需要考虑如何控制程序的流程以及如何处理数据。而在Prolog中，程序员需要思考如何描述问题和约束条'

tensor([[False, False, False,  ...,  True,  True,  True]])
