In [1]:
from sentencepiece import SentencePieceProcessor
from logging import getLogger
from typing import List
import os
from transformers import AutoTokenizer, AutoModelForCausalLM


logger = getLogger()


class Tokenizer:
    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        logger.info(f"Reloaded SentencePiece model from {model_path}")

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

In [47]:
import json

with open('sess_outputs.json', 'r') as f:
    output = json.load(f)

In [48]:
len(output)

58

In [46]:
print(output[20]['session_3']['facts']['bot_1'])

 * bot_1 is a musician and a teacher at a middle school in New York City
* bot_1 has been blogging and creating YouTube content for nearly a decade
* bot_1 has been offering tutoring math services, which earns passive income
* bot_1 has been offering advice to bot_0 on side income opportunities
* bot_1 suggests that bot_0 should blog and teach music for side income
* bot_1 advises bot_0 to start blogging and earning passive income through math tutoring
* bot_1 plans to continue blogging and creating YouTube content in addition to teaching full-time
* bot_1 encourages bot_0 to pursue a career in music and to work towards getting a record deal.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("samwit/vicuna-13b-8bit")

model = AutoModelForCausalLM.from_pretrained("samwit/vicuna-13b-8bit")

Downloading:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/548 [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("eachadea/vicuna-13b-1.1")

model = AutoModelForCausalLM.from_pretrained("eachadea/vicuna-7b-1.1")

In [3]:
tokenizer = Tokenizer('./LLaMA_tokenizer.model')

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [4]:
text = "</s> Human: Write poem about linux\n</s> Assistant:"

In [5]:
tokenized = tokenizer.encode(text, True, True)

In [6]:
import torch

In [8]:
tokenized = torch.tensor([tokenized])

In [9]:
generated_indices = model.generate(input_ids=tokenized, max_new_tokens=12).detach().cpu()

In [10]:
tokenizer.decode(generated_indices[0].tolist())

'</s> Human: Write poem about linux\n</s> Assistant: Home / News / News / The Best of the Best'