In [69]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead

model_name = "jeggers/OpenELM-270M-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [76]:
# add hidden_size attribute to the model config
# take value from model_dim attribute in config
tokenizer.pad_token_id = 0
tokenizer.eos_token_id = 2

In [5]:
# push to hub
new_name = "jeggers/OpenELM-270M-Instruct"
model.push_to_hub(new_name)
tokenizer.push_to_hub(new_name)

model.safetensors: 100%|██████████| 1.09G/1.09G [12:13<00:00, 1.48MB/s]   
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 1.19MB/s]


CommitInfo(commit_url='https://huggingface.co/jeggers/OpenELM-270M-Instruct/commit/1d0c39b78a165d32275707622fcd2fef721ce8e8', commit_message='Upload tokenizer', commit_description='', oid='1d0c39b78a165d32275707622fcd2fef721ce8e8', pr_url=None, pr_revision=None, pr_num=None)

In [55]:
# try to load new model with trl

trl_model = AutoModelForCausalLMWithValueHead.from_pretrained(new_name, trust_remote_code=True)



In [77]:
# generate some text
input_texts = ["one two three four", "A quick", "The godfathers of AI are"]
batch = tokenizer(input_texts, padding="longest", return_tensors="pt")
print(batch)

{'input_ids': tensor([[    0,     0,     0,     0,     1,   697,  1023,  2211,  3023],
        [    0,     0,     0,     0,     0,     0,     1,   319,  4996],
        [    1,   450,  7339, 29888, 19467,   310,   319, 29902,   526]]), 'attention_mask': tensor([[0, 0, 0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [79]:

out = trl_model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_new_tokens=20, do_sample=True)
out_texts = tokenizer.batch_decode(out, skip_special_tokens=True)
print(out_texts)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['one two three four five six seven eight ninety o thirties hundred sixties eighties seventies\n-', 'A quick reference guide to the 20 most common types of adware applications, categorized by name or', "The godfathers of AI are meeting in Israel on Saturday, August 25 for the sixth meeting of the world's most"]


In [80]:
tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")



In [83]:
print(tokenizer.pad_token, tokenizer.eos_token)

None None


In [34]:
tokenizer.clean_up_tokenization_spaces

True

In [53]:
text="""
1. Let S < a. B ≥ T.
2. By P(T)≥a + P(S). B ≥ P(B) → By⊆P(B) + P(S)→by⊆−P(S) → Step-by-step: a+B≥ a ≤ a. Step-by-step: a∈S≥0.Proof of Theorem1.Proof of Theorem1.Proof of Theorem1.proof of Theorem1.
[37]: a. 3(b. a.)"""

In [54]:
toks = tokenizer(text, return_tensors="pt")
print(len(toks["input_ids"][0]))
# decode and encode again
text = tokenizer.decode(toks["input_ids"][0], skip_special_tokens=True)
print(text)
toks = tokenizer(text, return_tensors="pt")
print(len(toks["input_ids"][0]))


122

1. Let S < a. B ≥ T.
2. By P(T)≥a + P(S). B ≥ P(B) → By⊆P(B) + P(S)→by⊆−P(S) → Step-by-step: a+B≥ a ≤ a. Step-by-step: a∈S≥0.Proof of Theorem1.Proof of Theorem1.Proof of Theorem1.proof of Theorem1.
[37]: a. 3(b. a.)
122


In [90]:
import torch
x = torch.tensor([   -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
        10728,    35,  2182,    35, 10728,    48, 10520,    35,  2182, 10728,
          221,  6153, 28508,    48,   221,  6402,    48, 10520,    35,  2182,
           35, 10728,    35, 10728, 10520,    35,  2182, 10728,    35,  6153,
        28508,    51, 10728,    35,  2182,    35, 10728, 10520,    35,  2182,
         6324,    35, 10728,    35,  6153, 28508,    51, 10728,    35,  2182,
           35, 10728, 10520,    35,  2182,    35, 10728,    35,  6153, 28508,
        10728,   243,    51, 10520,    35,  6153, 28508, 10728,    35,  6153,
        28508, 10728,    35,  6153, 28508,  6153, 28508,  6153, 28508,  6153,
        28508,  6153, 28508, 35979, 10728,    35,  6153, 28508, 35979,  6153,
        28508, 35979, 35979, 13901,  4533,  9095, 26506,  6153, 28508,  6153,
        28508,  6153, 28508,  6153, 28508,  6153, 28508,  6153, 28508,  6153,
        28508,  6153, 28508,  6153, 28508,  6153, 28508,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
        12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153, 12131,  6153,
           -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
           -1,    -1,    -1,    -1,    -1,    -1])
print(x.size())
# count number of greater than 0
print((x > 0).sum())

# replace -1 with 1
x = x.masked_fill(x < 0, 1)
# test tokenizer
text  = tokenizer.batch_decode(x, skip_special_tokens=True)
print(type(tokenizer))
print(text)
print(type(text))
toks = tokenizer(text, return_tensors="pt")
print(len(toks["input_ids"][0]))
text2 = tokenizer.decode(toks["input_ids"][0], skip_special_tokens=True)
print(text2==text)

torch.Size([396])
tensor(200)
<class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>
['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Step', '-', 'by', '-', 'Step', ':', ' Step', '-', 'by', 'Step', '\n', 'Str', 'ategy', ':', '\n', 'Answer', ':', ' Step', '-', 'by', '-', 'Step', '-', 'Step', ' Step', '-', 'by',