# Mixo-LLM

My own LLM (Large Language Model), trained entirely from scratch. I also built the Tokenizer from Scratch

In [1]:
import requests
import os
import json
import jsonlines
from tqdm import tqdm
import sys
import random
import fileinput

In [None]:
dataset_urls = [ #(url, filename)
    ("https://huggingface.co/datasets/nvidia/OpenMathInstruct-1/resolve/main/correct_solutions/train.jsonl?download=true", "Nvidia_OpenMathInstruct-1_train.jsonl"),
    ("https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_all.messages.jsonl.gz?download=true", "OpenAssistant_oasst1_2023-04-12_oasst_all.messages.jsonl.gz"),
    ("https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json?download=true", "LLaVA-Instruct-150K_llava_instruct_150k.json"),
    ("https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_v1_5_mix665k.json?download=true", "LLaVA-Instruct-150K_llava_v1_5_mix665k.json"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00000-of-00004-49a07627b3b5bdbe.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-0-of-4.parquet"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00001-of-00004-62d2ea1ccbf3c546.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-1-of-4.parquet"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00002-of-00004-5d4368eca33ee435.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-2-of-4.parquet"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00003-of-00004-ef0356d35c1172f0.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-3-of-4.parquet"),
    ("https://huggingface.co/datasets/Open-Orca/SlimOrca/resolve/main/oo-labeled_correct.gpt4.sharegpt.jsonl?download=true", "Open-Orca_SlimOrca_oo-labeled_correct.gpt4.sharegpt.jsonl"),
    ("https://huggingface.co/datasets/openbmb/UltraFeedback/resolve/main/ultrachat.jsonl?download=true", "OpenBMB_UltraFeedback_ultrachat.jsonl"),
    ("https://huggingface.co/datasets/openbmb/UltraFeedback/resolve/main/sharegpt.jsonl?download=true", "OpenBMB_UltraFeedback_sharegpt.jsonl")
]

for url, filename in dataset_urls:
    filename = "./data/" + filename
    if not os.path.exists(filename):
        print(f"Downloading {filename}")
        r = requests.get(url, allow_redirects=True)
        open(filename, "wb").write(r.content)
    else:
        print(f"{filename} already exists")

In [2]:
class Message:
    author: str # either "user" or "assistant"
    text: str
    def __init__(self, author, text):
        self.author = author
        self.text = text
    
    def __str__(self):
        return f"{self.author}: {self.text}"
    
    def __hash__(self) -> int:
        return hash((self.author, self.text))

class Conversation:
    messages: list[Message]
    system_prompt: str
    
    def __init__(self, messages, system_prompt=""):
        self.messages = messages
        self.system_prompt = system_prompt
    
    def __str__(self):
        return self.system_prompt + str(len(self.messages)) + " messages:\n" + "\n".join([str(m) for m in self.messages[:10]]) + "..."
    
    def __hash__(self) -> int:
        return hash(tuple(self.messages))
    
def from_LLava_json(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [x] working
    # LLava json is in this format:
    """
    [
        {
            "id": 0,
            "conversations": [
                {
                    "from": "human",
                    "value": TEXT
                },
                {
                    "from": "gpt",
                    "value": TEXT
                }
                ...
            ]
        },
        ...
    ]
    """
    with open(filepath, "r") as f:
        data = json.load(f)
        conversations = []
        for conversation in tqdm(data):
            messages = []
            for message in conversation["conversations"]:
                if message["from"] == "human":
                    author = "user"
                else:
                    author = "assistant"
                messages.append(Message(author, message["value"]))
            conversations.append(Conversation(messages))
    
    return conversations

def from_Nvidia_OMI_jsonl(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [x] working
    # Nvidia OMI jsonl is in this format:
    """
    {"question": TEXT, "expected_answer": TEXT}
    ...
    """
    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            messages = [Message("user", obj["question"]), Message("assistant", obj["expected_answer"])]
            conversations.append(Conversation(messages))
    return conversations

def from_OpenBMB_UltraFeedback_jsonl(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [x] working
    # OpenBMB jsonl is in this format:
    """
    {"source": "sharegpt", "instruction": USER_INPUT, "completions": [ {"response": ASSISTANT_OUTPUT, "custom_system_prompt": SYSTEM_PROMPT}, ... ]}
    """

    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            user_message = Message("user", obj["instruction"])
            
            for completion in obj["completions"]:
                assistant_message = Message("assistant", completion["response"])
                system_prompt = completion["custom_system_prompt"]
                conversations.append(Conversation([user_message, assistant_message], system_prompt))
    return conversations

def from_OpenOrca_jsonl(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [ ] working
    # OpenOrca jsonl is in this format:
    """
    {"conversations": [{"from": "system", "value": SYSTEM_PROMPT}, {"from": "human", "value": USER_INPUT}, {"from": "gpt", "value": ASSISTANT_OUTPUT}]}
    """
    
    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            messages = []
            system_prompt = ""
            for json_obj in obj["conversations"]:

                if json_obj["from"] == "system":
                    system_prompt = json_obj["value"]
                else:
                    if json_obj["from"] == "human":
                        author = "user"
                    else:
                        author = "assistant"
                    messages.append(Message(author, json_obj["value"]))

            conversations.append(Conversation(messages, system_prompt))
    return conversations

def from_OASST_jsonl(filepath) -> list[Conversation]:
    # [ ] working
    # OASST jsonl is in this format:
    pass

def from_own_format(filepath) -> list[Conversation]:
    # [ ] working
    # own format is in this format:
    # {"system_prompt": SYSTEM_PROMPT, "messages": [{"author": "user", "text": USER_INPUT}, {"author": "assistant", "text": ASSISTANT_OUTPUT}]}

    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            messages = [Message(m["author"], m["text"]) for m in obj["messages"]]
            conversations.append(Conversation(messages, obj["system_prompt"]))
    return conversations

In [None]:
# write all conversations to a single jsonl file
conversations = []
conversations.extend(from_LLava_json("./data/LLaVA-Instruct-150K_llava_instruct_150k.json"))
conversations.extend(from_LLava_json("./data/LLaVA-Instruct-150K_llava_v1_5_mix665k.json"))
conversations.extend(from_Nvidia_OMI_jsonl("./data/Nvidia_OpenMathInstruct-1_train.jsonl"))
conversations.extend(from_OpenBMB_UltraFeedback_jsonl("./data/OpenBMB_UltraFeedback_sharegpt.jsonl"))
conversations.extend(from_OpenBMB_UltraFeedback_jsonl("./data/OpenBMB_UltraFeedback_ultrachat.jsonl"))
conversations.extend(from_OpenOrca_jsonl("./data/Open-Orca_SlimOrca_oo-labeled_correct.gpt4.sharegpt.jsonl"))

with jsonlines.open("./data/conversations.jsonl", "w") as writer:
    for conversation in tqdm(conversations):
        json_obj = {"system_prompt": conversation.system_prompt, "messages": [{"author": m.author, "text": m.text} for m in conversation.messages]}
        writer.write(json_obj)

print("Done!")

In [3]:
conversations = from_own_format("./data/conversations.jsonl")

3040280it [01:02, 48697.61it/s] 


In [4]:
print(len(conversations))

3040280


In [5]:
# filter out everything that contains "<image>" or "<url>" and remove duplicates
conversations_filtered = set()
for conversation in tqdm(conversations):
    if "<image>" in str(conversation) or "<url>" in str(conversation):
        continue
    conversations_filtered.add(conversation)

print(len(conversations_filtered))


100%|██████████| 3040280/3040280 [00:46<00:00, 65298.58it/s] 

2257941





In [6]:
all_text_arr = set()
for conversation in tqdm(conversations_filtered):
    convo_arr = []
    for message in conversation.messages:
        convo_arr.append("<|" + message.author + "|> " + message.text.replace("\n", "<|br|>")  + "<|end|>")
    
    convo_str = "<|system|> " + conversation.system_prompt.replace("\n", "<|br|>") + "<|end|>" + "".join(convo_arr)
    all_text_arr.add(convo_str)



with open("./data/all_text.txt", "w", encoding="utf-8") as f:
    for line in tqdm(all_text_arr):
        f.write(line + "\n")

100%|██████████| 2257941/2257941 [00:12<00:00, 185043.88it/s]
100%|██████████| 690670/690670 [00:06<00:00, 102608.25it/s]


In [25]:
# build a custom tokenizer
# generate a vocab list
# find the most common words, char combinations and chars

vocab = {}
for line in tqdm(all_text_arr):
    for word in line.split():
        vocab[word] = vocab.get(word, 0) + 1
    
    for char in line:
        vocab[char] = vocab.get(char, 0) + 1

# sort the vocab by frequency
vocab = dict(sorted(vocab.items(), key=lambda item: item[1], reverse=True))

100%|██████████| 690670/690670 [07:35<00:00, 1514.82it/s]


In [29]:
vocab_final = {
    "<|system|>": 0,
    "<|user|>": 1,
    "<|assistant|>": 2,
    "<|end|>": 3,
    "<|br|>": 4
}
    
max_vocab_size = 32_000

for i, (word, freq) in enumerate(vocab.items()):
    if i >= max_vocab_size:
        break
    vocab_final[word] = len(vocab_final)

# sort by length
vocab_final = dict(sorted(vocab_final.items(), key=lambda item: len(item[0]), reverse=True))

with open("./data/vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab_final, f, ensure_ascii=False, indent=4)

In [30]:
# for loading
vocab_final = json.load(open("./data/vocab_final.json", "r", encoding="utf-8"))

In [31]:
class Tokenizer:
    __vocab: dict[str, int]
    __vocab_reversed: dict[int, str]
    __vocab_size: int
    max_token_len: int

    def __init__(self, vocab: dict[str, int]):
        self.__vocab = vocab
        self.__vocab_reversed = {v: k for k, v in vocab.items()}
        self.__vocab_size = len(vocab)

        self.max_token_len = max([len(token) for token in vocab.keys()])
    
    def encode(self, text: str) -> list[int]:
        out_tokens = []
        
        sub_str = text[:self.max_token_len]
        while len(sub_str) > 0:
            if sub_str in self.__vocab:
                out_tokens.append(self.__vocab[sub_str])
                text = text[len(sub_str):]
                sub_str = text[:self.max_token_len]
            else:
                sub_str = sub_str[:-1]
        
        return out_tokens
    
    def decode(self, tokens: list[int]) -> str:
        return "".join([self.__vocab_reversed[t] for t in tokens])
    
    def vocab_size(self) -> int:
        return self.__vocab_size

In [34]:
# Phase 3:
# Tokenize the text
# generate training pairs

tokenizer = Tokenizer(vocab_final)

data_pairs = []
n_tokens = 0

with fileinput.input(files=("./data/all_text.txt"), encoding="utf-8") as f:
    for line in tqdm(f):
        tokens = tokenizer.encode(line)

        n_tokens += len(tokens)

        # find system token
        if 0 not in tokens:
            system_prompt = []
            system_token_idx = 0
        else:
            system_token_idx = tokens.index(0)
            system_prompt = tokens[:system_token_idx]
        
        # find user token
        if 1 not in tokens:
            user_token_idx = system_token_idx
            user_message = []
        else:
            user_token_idx = tokens.index(1)
            user_message = tokens[system_token_idx:user_token_idx]
        
        # find assistant token
        if 2 not in tokens:
            assistant_token_idx = user_token_idx
            assistant_message = []
        else:
            assistant_token_idx = tokens.index(2)
            assistant_message = tokens[user_token_idx:assistant_token_idx]
        
        data_pairs.append((system_prompt, user_message, assistant_message))

# save to jsonl
with jsonlines.open("./data/data_pairs.jsonl", "w") as writer:
    for pair in tqdm(data_pairs):
        writer.write({"system_prompt": pair[0], "user_message": pair[1], "assistant_message": pair[2]})


0it [00:00, ?it/s]

697261it [53:14, 218.26it/s]
100%|██████████| 697261/697261 [00:35<00:00, 19631.32it/s]

447527360





In [35]:
print(n_tokens)

print(f"{n_tokens:_}")

447527360
447_527_360


In [36]:
def generate_training_pairs(system_prompt_tokens, user_input_tokens, assistant_output_tokens):
    # generate all substrings of the assistant output from 1 to len

    training_pairs = []
    for i in range(1, len(assistant_output_tokens)):
        X = system_prompt_tokens + user_input_tokens + assistant_output_tokens[:i]
        y = assistant_output_tokens[i]
        training_pairs.append((X, y))
    
    return training_pairs

In [75]:
# Phase 4:
# Train a model
# model is trained on the generated training pairs

import torch
import random

def get_model(
        vocab_size: int,
        context_window_size: int,
        n_total_neurons: int,
        output_size: int
        ):
    
    layers = [
        "input",
        torch.nn.Linear(vocab_size, context_window_size),
        torch.nn.ReLU(),
        "hidden",
        "output",
        torch.nn.Linear(output_size, vocab_size),
        #torch.nn.Linear(vocab_size, output_size)
    ]

    # dynamically create the model
    n_neurons = (vocab_size * context_window_size) + (context_window_size * output_size)

    layers_out = []
    for layer in layers:
        if isinstance(layer, str):
            if layer == "hidden":
                # dynamically create hidden layers such that the total number of neurons is approx n_total_neurons
                n_hidden_layers = (abs(n_total_neurons - n_neurons) ** (1/3)) // 2
                neurons_per_layer = int(n_neurons // n_hidden_layers)
                shape = (int(neurons_per_layer ** 0.5), int(neurons_per_layer ** 0.5))

                layers_out.append(torch.nn.Linear(context_window_size, shape[0]))

                for i in range(int(n_hidden_layers)):
                    layers_out.append(torch.nn.Linear(shape[0], shape[1]))
                    layers_out.append(torch.nn.ReLU())
                
                layers_out.append(torch.nn.Linear(shape[1], output_size))
        else:
            layers_out.append(layer)
    
    return torch.nn.Sequential(*layers_out)

def train_model(
        model: torch.nn.Module,
        vocab_size: int,
        X_ARRAY: list[list[int]],
        y_ARRAY: list[int],
        epochs: int,
        learning_rate: float
        ):
    
    X_tensors = []
    for x in X_ARRAY:
        x_tensor = []
        for i in x:
            x_tensor.append([0] * vocab_size)
            x_tensor[-1][i] = 1
        X_tensors.append(torch.tensor(x_tensor, dtype=torch.float32))
    
    y_tensors = []
    for y in y_ARRAY:
        y_tensor = [0] * vocab_size
        y_tensor[y] = 1
        y_tensors.append(torch.tensor(y_tensor, dtype=torch.float32))
    
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        for i in range(len(X_tensors)):
            X = X_tensors[i]
            y = y_tensors[i]

            optimizer.zero_grad()
            y_pred = model(X)
            # shape (15, vocab_size)
            y_pred = y_pred[-1]

            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
        
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

def generate(
    model: torch.nn.Module,
    tokenizer: Tokenizer,
    system_prompt: str,
    user_input: str,
    max_tokens: int
    ) -> str:

    system_prompt_tokens = tokenizer.encode(system_prompt)
    user_input_tokens = tokenizer.encode(user_input)

    X = system_prompt_tokens + user_input_tokens
    
    for i in range(max_tokens):
        x_tensor = []
        for i in X:
            x_tensor.append([0] * tokenizer.vocab_size())
            x_tensor[-1][i] = 1
        x_tensor = torch.tensor(x_tensor, dtype=torch.float32)

        y_pred = model(x_tensor)
        y_pred = y_pred[-1]

        y_pred = torch.argmax(y_pred).item()
        X.append(y_pred)

        if tokenizer.decode([y_pred]) == "<|end|>":
            break
    
    return tokenizer.decode(X)

In [38]:
tokenizer = Tokenizer(vocab_final)

In [77]:

model = get_model(
    vocab_size=tokenizer.vocab_size(),
    context_window_size=128,
    n_total_neurons=10_536_000,
    output_size=1
)

print("Model initialized")

print(generate(model, tokenizer, "<|system|>You are a helpful assistant.<|end|>", "<|user|>What is the capital of France?<|end|>", 100))


with jsonlines.open("./data/data_pairs.jsonl") as reader:
    for idx, obj in tqdm(enumerate(reader)):
        system_prompt = obj["system_prompt"]
        user_input = obj["user_message"]
        assistant_output = obj["assistant_message"]

        if system_prompt == []:
            system_prompt = tokenizer.encode("<|system|>You are a helpful assistant.<|end|>")
        

        train_pairs = generate_training_pairs(system_prompt, user_input, assistant_output)

        X_ARRAY = [pair[0] for pair in train_pairs]
        y_ARRAY = [pair[1] for pair in train_pairs]

        train_model(
            model=model,
            vocab_size=tokenizer.vocab_size(),
            X_ARRAY=X_ARRAY,
            y_ARRAY=y_ARRAY,
            epochs=10,
            learning_rate=3e-3
        )

        print(generate(model, tokenizer, "<|system|>You are a helpful assistant.<|end|>", "<|user|>What is the capital of France?<|end|>", 100))

print("Model trained")


Model initialized
<|system|>You are a helpful assistant.<|end|><|user|>What is the capital of France?<|end|>SouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonSouthamptonS

0it [00:00, ?it/s]

Epoch 1/10, Loss: 10.50253963470459
Epoch 2/10, Loss: 10.759696006774902
Epoch 3/10, Loss: 11.913043022155762
Epoch 4/10, Loss: 9.906529426574707
Epoch 5/10, Loss: 8.837465286254883
Epoch 6/10, Loss: 8.118941307067871
Epoch 7/10, Loss: 7.557490825653076
Epoch 8/10, Loss: 7.092545509338379
Epoch 9/10, Loss: 6.6931562423706055
Epoch 10/10, Loss: 6.344078540802002


1it [02:26, 146.44s/it]

<|system|>You are a helpful assistant.<|end|><|user|>What is the capital of France?<|end|>                                                                                                    
Epoch 1/10, Loss: 6.901194095611572
Epoch 2/10, Loss: 5.75759220123291
Epoch 3/10, Loss: 5.363442897796631
Epoch 4/10, Loss: 5.3113017082214355
Epoch 5/10, Loss: 5.3710103034973145
Epoch 6/10, Loss: 5.456414222717285
Epoch 7/10, Loss: 5.541018962860107
Epoch 8/10, Loss: 5.6177897453308105
Epoch 9/10, Loss: 5.685538291931152
Epoch 10/10, Loss: 5.745357036590576


2it [34:06, 1178.09s/it]

<|system|>You are a helpful assistant.<|end|><|user|>What is the capital of France?<|end|>                                                                                                    


: 

In [None]:
system_prompt = "<|system|>You are a helpful assistant.<|end|>"
input_str = "<|user|>" + str(input("User input: ")) + "<|end|>"

output_str = generate(
    model=model,
    tokenizer=tokenizer,
    system_prompt="<|system|>You are a helpful assistant.<|end|>",
    user_input=input_str,
    max_tokens=100
)

print(output_str)