In [6]:
import json
import random
import uuid
from ollama import Client
from datasets import load_dataset, IterableDataset, Dataset
from datasets import interleave_datasets

In [3]:
# Address of the Ollama server that the chat requests in this notebook are sent to.
# NOTE(review): some ollama-python releases name this constructor argument `host`
# rather than `base_url` — confirm against the installed version.
OLLAMA_BASE_URL = "http://127.0.0.1:8008"
client = Client(base_url=OLLAMA_BASE_URL)

In [4]:
# Read the vocabulary mapping from disk; only its keys (the token strings) are used below.
with open("vocab.json", "rb") as f:
    vocab_data = json.load(f)

# Keep tokens longer than 3 characters that start with "Ġ", then drop that leading
# character. (Presumably "Ġ" is the tokenizer's word-start marker — TODO confirm.)
ALL_TOKENS = [
    token[1:]
    for token in vocab_data.keys()
    if len(token) > 3 and token.startswith("Ġ")
]

def get_random_token():
    """Return one randomly chosen entry from the module-level ``ALL_TOKENS`` list."""
    chosen = random.choice(ALL_TOKENS)
    return chosen

In [5]:
def data_from_phi():
    """Ask the ``phi`` model to describe a randomly drawn word in 50 words.

    A word is taken from ``get_random_token()``, embedded in a single user
    message, and sent through the module-level ``client``.

    Returns:
        The ``content`` field of the ``message`` entry in the chat response.
    """
    word = get_random_token()
    prompt = f'tell me about the word {word} in 50 words'
    reply = client.chat(
        model='phi',
        messages=[{'role': 'user', 'content': prompt}],
    )
    return reply["message"]["content"]

class _DatasetGeneratorPickle:
    def __init__(self, generator, generator_id=None):
        self.generator = generator
        self.generator_id = (
            generator_id if generator_id is not None else str(uuid.uuid4())
        )

    def __call__(self, *args, **kwargs):
        return self.generator(*args, **kwargs)

    def __reduce__(self):
        return (_DatasetGeneratorPickle_raise, (self.generator_id,))


def _DatasetGeneratorPickle_raise(*args, **kwargs):
    raise AssertionError("cannot actually unpickle _DatasetGeneratorPickle!")

def phi_data_generator(n, *args, **kwargs):
    """Lazily yield ``n`` records, each ``{"text": <result of data_from_phi()>}``.

    ``data_from_phi`` is called once per record, at the moment the record is
    requested. Extra positional and keyword arguments are accepted but unused.
    """
    for _ in range(n):
        yield {"text": data_from_phi()}

In [8]:
# Wrap the phi generator in the pickle stub before handing it to `datasets`,
# then expose it as an iterable dataset that requests n=10 generated samples.
phi_generator_wrapper = _DatasetGeneratorPickle(phi_data_generator)
phi2_gen_dataset = IterableDataset.from_generator(
    phi_generator_wrapper,
    gen_kwargs={"n": 10},
)

# Stream the TinyStories training split rather than loading it up front.
tiny_stories_dataset = load_dataset(
    "roneneldan/TinyStories",
    split="train",
    streaming=True,
)



In [9]:
# Combine the generated phi samples and the TinyStories stream into one dataset.
# NOTE(review): no `probabilities` or `stopping_strategy` is passed, so the library
# defaults apply — presumably strict alternation that stops when the first source
# is exhausted; confirm against the installed `datasets` version.
dataset = interleave_datasets([phi2_gen_dataset, tiny_stories_dataset])

In [11]:
# Walk the interleaved dataset and print every sample dict it produces.
# Each iteration that draws from the phi source triggers a live chat request.
for sample in dataset:
    print(sample)

{'text': ' Mei is a Chinese name that can be written using different characters, each with its own meaning. It represents beauty, grace, and elegance. In Mandarin, it means "graceful."\n'}
{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}
{'text': ' Adding is an operation that combines two or more numbers, variables, or values to