# Mixo-LLM

My own LLM (Large Language Model), trained entirely from scratch. I also built the Tokenizer from Scratch

In [1]:
%pip install requests jsonlines tqdm torch

[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.2/731.7 MB[0m [31m6.8 MB/s[0m eta [36m0:01:14[0m[31mERROR: Could not install packages due to an OSError: [Errno 28] No space left on device
[0m[31m
[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.3/731.7 MB[0m [31m6.8 MB/s[0m eta [36m0:01:14[0m
[?25hNote: you may need to restart the kernel to use updated packages.


In [2]:
import requests
import os
import json
import jsonlines
from tqdm import tqdm
import sys
import random
import fileinput

In [None]:
dataset_urls = [ #(url, filename)
    ("https://huggingface.co/datasets/nvidia/OpenMathInstruct-1/resolve/main/correct_solutions/train.jsonl?download=true", "Nvidia_OpenMathInstruct-1_train.jsonl"),
    ("https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_all.messages.jsonl.gz?download=true", "OpenAssistant_oasst1_2023-04-12_oasst_all.messages.jsonl.gz"),
    ("https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json?download=true", "LLaVA-Instruct-150K_llava_instruct_150k.json"),
    ("https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_v1_5_mix665k.json?download=true", "LLaVA-Instruct-150K_llava_v1_5_mix665k.json"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00000-of-00004-49a07627b3b5bdbe.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-0-of-4.parquet"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00001-of-00004-62d2ea1ccbf3c546.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-1-of-4.parquet"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00002-of-00004-5d4368eca33ee435.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-2-of-4.parquet"),
    ("https://huggingface.co/datasets/nomic-ai/gpt4all-j-prompt-generations/resolve/main/data/train-00003-of-00004-ef0356d35c1172f0.parquet?download=true", "NomicAI_gpt4all-j-prompt-generations_train-3-of-4.parquet"),
    ("https://huggingface.co/datasets/Open-Orca/SlimOrca/resolve/main/oo-labeled_correct.gpt4.sharegpt.jsonl?download=true", "Open-Orca_SlimOrca_oo-labeled_correct.gpt4.sharegpt.jsonl"),
    ("https://huggingface.co/datasets/openbmb/UltraFeedback/resolve/main/ultrachat.jsonl?download=true", "OpenBMB_UltraFeedback_ultrachat.jsonl"),
    ("https://huggingface.co/datasets/openbmb/UltraFeedback/resolve/main/sharegpt.jsonl?download=true", "OpenBMB_UltraFeedback_sharegpt.jsonl")
]

for url, filename in dataset_urls:
    filename = "./data/" + filename
    if not os.path.exists(filename):
        print(f"Downloading {filename}")
        r = requests.get(url, allow_redirects=True)
        open(filename, "wb").write(r.content)
    else:
        print(f"{filename} already exists")

In [3]:
class Message:
    author: str # either "user" or "assistant"
    text: str
    def __init__(self, author, text):
        self.author = author
        self.text = text
    
    def __str__(self):
        return f"{self.author}: {self.text}"
    
    def __hash__(self) -> int:
        return hash((self.author, self.text))

class Conversation:
    messages: list[Message]
    system_prompt: str
    
    def __init__(self, messages, system_prompt=""):
        self.messages = messages
        self.system_prompt = system_prompt
    
    def __str__(self):
        return self.system_prompt + str(len(self.messages)) + " messages:\n" + "\n".join([str(m) for m in self.messages[:10]]) + "..."
    
    def __hash__(self) -> int:
        return hash(tuple(self.messages))
    
def from_LLava_json(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [x] working
    # LLava json is in this format:
    """
    [
        {
            "id": 0,
            "conversations": [
                {
                    "from": "human",
                    "value": TEXT
                },
                {
                    "from": "gpt",
                    "value": TEXT
                }
                ...
            ]
        },
        ...
    ]
    """
    with open(filepath, "r") as f:
        data = json.load(f)
        conversations = []
        for conversation in tqdm(data):
            messages = []
            for message in conversation["conversations"]:
                if message["from"] == "human":
                    author = "user"
                else:
                    author = "assistant"
                messages.append(Message(author, message["value"]))
            conversations.append(Conversation(messages))
    
    return conversations

def from_Nvidia_OMI_jsonl(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [x] working
    # Nvidia OMI jsonl is in this format:
    """
    {"question": TEXT, "expected_answer": TEXT}
    ...
    """
    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            messages = [Message("user", obj["question"]), Message("assistant", obj["expected_answer"])]
            conversations.append(Conversation(messages))
    return conversations

def from_OpenBMB_UltraFeedback_jsonl(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [x] working
    # OpenBMB jsonl is in this format:
    """
    {"source": "sharegpt", "instruction": USER_INPUT, "completions": [ {"response": ASSISTANT_OUTPUT, "custom_system_prompt": SYSTEM_PROMPT}, ... ]}
    """

    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            user_message = Message("user", obj["instruction"])
            
            for completion in obj["completions"]:
                assistant_message = Message("assistant", completion["response"])
                system_prompt = completion["custom_system_prompt"]
                conversations.append(Conversation([user_message, assistant_message], system_prompt))
    return conversations

def from_OpenOrca_jsonl(filepath) -> list[Conversation]:
    print(f"Loading {filepath}")
    # [ ] working
    # OpenOrca jsonl is in this format:
    """
    {"conversations": [{"from": "system", "value": SYSTEM_PROMPT}, {"from": "human", "value": USER_INPUT}, {"from": "gpt", "value": ASSISTANT_OUTPUT}]}
    """
    
    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            messages = []
            system_prompt = ""
            for json_obj in obj["conversations"]:

                if json_obj["from"] == "system":
                    system_prompt = json_obj["value"]
                else:
                    if json_obj["from"] == "human":
                        author = "user"
                    else:
                        author = "assistant"
                    messages.append(Message(author, json_obj["value"]))

            conversations.append(Conversation(messages, system_prompt))
    return conversations

def from_OASST_jsonl(filepath) -> list[Conversation]:
    # [ ] working
    # OASST jsonl is in this format:
    pass

def from_own_format(filepath) -> list[Conversation]:
    # [ ] working
    # own format is in this format:
    # {"system_prompt": SYSTEM_PROMPT, "messages": [{"author": "user", "text": USER_INPUT}, {"author": "assistant", "text": ASSISTANT_OUTPUT}]}

    with jsonlines.open(filepath) as reader:
        conversations = []
        for obj in tqdm(reader):
            messages = [Message(m["author"], m["text"]) for m in obj["messages"]]
            conversations.append(Conversation(messages, obj["system_prompt"]))
    return conversations

In [3]:
# write all conversations to a single jsonl file
conversations = []
conversations.extend(from_LLava_json("./data/LLaVA-Instruct-150K_llava_instruct_150k.json"))
conversations.extend(from_LLava_json("./data/LLaVA-Instruct-150K_llava_v1_5_mix665k.json"))
conversations.extend(from_Nvidia_OMI_jsonl("./data/Nvidia_OpenMathInstruct-1_train.jsonl"))
conversations.extend(from_OpenBMB_UltraFeedback_jsonl("./data/OpenBMB_UltraFeedback_sharegpt.jsonl"))
conversations.extend(from_OpenBMB_UltraFeedback_jsonl("./data/OpenBMB_UltraFeedback_ultrachat.jsonl"))
conversations.extend(from_OpenOrca_jsonl("./data/Open-Orca_SlimOrca_oo-labeled_correct.gpt4.sharegpt.jsonl"))

with jsonlines.open("./data/conversations.jsonl", "w") as writer:
    for conversation in tqdm(conversations):
        json_obj = {"system_prompt": conversation.system_prompt, "messages": [{"author": m.author, "text": m.text} for m in conversation.messages]}
        writer.write(json_obj)

print("Done!")

Loading ./data/LLaVA-Instruct-150K_llava_instruct_150k.json


100%|██████████| 157712/157712 [00:02<00:00, 74816.63it/s]


Loading ./data/LLaVA-Instruct-150K_llava_v1_5_mix665k.json


100%|██████████| 665298/665298 [00:17<00:00, 37168.35it/s] 


Loading ./data/Nvidia_OpenMathInstruct-1_train.jsonl


1579780it [00:27, 57497.66it/s]


Loading ./data/OpenBMB_UltraFeedback_sharegpt.jsonl


19949it [00:02, 7208.04it/s]


Loading ./data/OpenBMB_UltraFeedback_ultrachat.jsonl


9929it [00:01, 7470.47it/s]


Loading ./data/Open-Orca_SlimOrca_oo-labeled_correct.gpt4.sharegpt.jsonl


517982it [00:14, 35391.36it/s]
100%|██████████| 3040280/3040280 [01:14<00:00, 40610.06it/s]


Done!


In [4]:
conversations = from_own_format("./data/conversations.jsonl")

3040280it [01:11, 42597.21it/s] 


In [5]:
print(len(conversations))

3040280


In [6]:
# filter out everything that contains "<image>" or "<url>" and remove duplicates
conversations_filtered = set()
for conversation in tqdm(conversations):
    if "<image>" in str(conversation) or "<url>" in str(conversation):
        continue
    conversations_filtered.add(conversation)

print(len(conversations_filtered))


100%|██████████| 3040280/3040280 [00:41<00:00, 73736.04it/s] 

2257941





In [7]:
all_text_arr = set()
for conversation in tqdm(conversations_filtered):
    convo_arr = []
    for message in conversation.messages:
        convo_arr.append("<|" + message.author + "|> " + message.text.replace("\n", "<|br|>")  + "<|end|>")
    
    convo_str = "<|system|> " + conversation.system_prompt.replace("\n", "<|br|>") + "<|end|>" + "".join(convo_arr)
    all_text_arr.add(convo_str)



with open("./data/all_text.txt", "w", encoding="utf-8") as f:
    for line in tqdm(all_text_arr):
        f.write(line + "\n")

100%|██████████| 2257941/2257941 [00:11<00:00, 197732.40it/s]
100%|██████████| 690670/690670 [00:03<00:00, 192104.26it/s]


In [12]:
# build a custom tokenizer
# generate a vocab list
# find the most common words, char combinations and chars

vocab = {}
for line in tqdm(all_text_arr):
    for word in line.split():
        vocab[word] = vocab.get(word, 0) + 1
    
    for char in line:
        vocab[char] = vocab.get(char, 0) + 1
    
    continue

    for w_size in range(2, 4):
        for i in range(len(line) - w_size):
            vocab[line[i:i+w_size]] = vocab.get(line[i:i+w_size], 0) + 1

  0%|          | 0/690670 [00:00<?, ?it/s]

100%|██████████| 690670/690670 [07:10<00:00, 1605.20it/s]


In [13]:
with open("./data/vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab, f, ensure_ascii=False, indent=4)

In [14]:
vocab_filtered = {}

for key, value in tqdm(vocab.items()):
    if value > 35:
        vocab_filtered[key] = value

vocab_filtered = sorted(vocab_filtered.items(), key=lambda x: x[1], reverse=True)

print(len(vocab_filtered))

max_vocab_size = 32_000
vocab_filtered = vocab_filtered[:max_vocab_size]

100%|██████████| 5221896/5221896 [00:02<00:00, 2358671.91it/s]


153331


In [15]:
with open("./data/vocab_filtered.json", "w", encoding="utf-8") as f:
    json.dump(vocab_filtered, f, ensure_ascii=False, indent=4)

In [16]:
vocab_final = { # token_str: token_id
    "<|system|>": 0,
    "<|user|>": 1,
    "<|assistant|>": 2,
    "<|end|>": 3,
    "<|br|>": 4
}
for i, (key, value) in enumerate(vocab_filtered):
    if "<" in key or ">" in key or "|" in key:
        continue
    vocab_final[key] = len(vocab_final)

print(len(vocab_final))

with open("./data/vocab_final.json", "w", encoding="utf-8") as f:
    json.dump(vocab_final, f, ensure_ascii=False, indent=4)

30763


In [4]:
# for loading
vocab_final = json.load(open("./data/vocab_final.json", "r", encoding="utf-8"))

In [5]:
class Tokenizer:
    __vocab: dict[str, int]
    __vocab_reversed: dict[int, str]
    __vocab_size: int

    def __init__(self, vocab: dict[str, int]):
        self.__vocab = vocab
        self.__vocab_reversed = {v: k for k, v in vocab.items()}
        self.__vocab_size = len(vocab)
    
    def encode(self, text: str) -> list[int]:
        out_tokens = []
        
        current_substring = ""
        for char in text:
            current_substring += char
            if current_substring in self.__vocab:
                out_tokens.append(self.__vocab[current_substring])
                current_substring = ""
        
        return out_tokens
    
    def decode(self, tokens: list[int]) -> str:
        return "".join([self.__vocab_reversed[t] for t in tokens])
    
    def vocab_size(self) -> int:
        return self.__vocab_size

In [20]:
# Phase 3:
# Tokenize the text
# generate training pairs

tokenizer = Tokenizer(vocab_final)

data_pairs = []

with fileinput.input(files=("./data/all_text.txt"), encoding="utf-8") as f:
    for line in tqdm(f):
        tokens = tokenizer.encode(line)

        # find system token
        if 0 not in tokens:
            system_prompt = []
            system_token_idx = 0
        else:
            system_token_idx = tokens.index(0)
            system_prompt = tokens[:system_token_idx]
        
        # find user token
        if 1 not in tokens:
            user_token_idx = system_token_idx
            user_message = []
        else:
            user_token_idx = tokens.index(1)
            user_message = tokens[system_token_idx:user_token_idx]
        
        # find assistant token
        if 2 not in tokens:
            assistant_token_idx = user_token_idx
            assistant_message = []
        else:
            assistant_token_idx = tokens.index(2)
            assistant_message = tokens[user_token_idx:assistant_token_idx]
        
        data_pairs.append((system_prompt, user_message, assistant_message))

# save to jsonl
with jsonlines.open("./data/data_pairs.jsonl", "w") as writer:
    for pair in tqdm(data_pairs):
        writer.write({"system_prompt": pair[0], "user_message": pair[1], "assistant_message": pair[2]})

14067it [00:08, 1668.05it/s]


KeyboardInterrupt: 

In [6]:
def generate_training_pairs(system_prompt_tokens, user_input_tokens, assistant_output_tokens):
    # generate all substrings of the assistant output from 1 to len

    training_pairs = []
    for i in range(1, len(assistant_output_tokens)):
        X = system_prompt_tokens + user_input_tokens + assistant_output_tokens[:i]
        y = assistant_output_tokens[i]
        training_pairs.append((X, y))
    
    return training_pairs

In [22]:
# Phase 4:
# Train a model
# model is trained on the generated training pairs

import torch
import torch.nn as nn
import random

class Network(nn.Module):
    # Linear Transformer
    # using Dense and Dropout layers
    # softmax at the end

    def __init__(self, vocab_size, num_neurons, num_layers, dropout, output_n_tokens):
        self.vocab_size = vocab_size
        self.context_window = num_neurons
        
        super(Network, self).__init__()

        self.layers = nn.ModuleList()
        
        #input layer (context window, vocab_size)
        self.layers.append(nn.Linear(self.context_window, self.vocab_size))

        # hidden layers
        for _ in range(num_layers // 2):
            self.layers.append(nn.Linear(num_neurons, num_neurons))
            self.layers.append(nn.Dropout(dropout))
        
        self.layers.append(nn.Linear(self.vocab_size, self.vocab_size))

        for _ in range(num_layers // 2):
            self.layers.append(nn.Linear(num_neurons, num_neurons))
            self.layers.append(nn.Dropout(dropout))
        
        # output layer (vocab_size, output_n_tokens)
        self.layers.append(nn.Linear(self.vocab_size, output_n_tokens))
        self.layers.append(nn.Softmax(dim=1))



    
    def forward(self, x):
        x_input = torch.zeros((self.context_window, self.vocab_size))
        for i, token in enumerate(x):
            x_input[i, int(token)] = 1

        # rotate 90 degrees
        x_input = torch.rot90(x_input, 1, [0, 1])
        
        
        # truncate from the left
        if len(x_input) > self.context_window:
            x_input = x_input[-self.context_window:]
        
        
        for idx, layer in enumerate(self.layers):
            print(x_input.shape)
            
            # if we are at one of the midle layers with shape (num_neurons, num_neurons)
            # idx is in between 1 and len(self.layers) - 2
            # not including the 1
            if idx > 1 and idx < len(self.layers) - 2:
                # squeeze the input to num_neurons, num_neurons
                x_input = x_input.squeeze()
                

            x_input = layer(x_input)
        
        x_input = x_input.squeeze()
        
        return torch.argmax(x_input)
    
    
    def generate(self, system_prompt, user_input, max_len):
        # generate a response
        # system_prompt and user_input are lists of tokens
        # max_len is the maximum length of the response

        x: list[int] = system_prompt + user_input
        for _ in range(max_len):
            y = self.forward(x)
            x.append(y)
        
        return x
    
    def train(self, X_arr, y_arr, train_lr, epochs, batch_size):
        # train the model
        # X_arr and y_arr are lists of tuples
        # each tuple contains a list of tokens and a single token
        # X_arr is the input and y_arr is the output

        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=train_lr)

        for epoch in range(epochs):
            epoch_loss = 0.0

            combined_data = list(zip(X_arr, y_arr))
            random.shuffle(combined_data)
            X_arr, y_arr = zip(*combined_data)

            # Mini-batch training
            for i in range(0, len(X_arr), batch_size):
                optimizer.zero_grad()
                
                X_batch = X_arr[i:i+batch_size]
                y_batch = y_arr[i:i+batch_size]


                max_len = max([len(x) for x in X_batch])

                x_batch_padded = []
                for x in X_batch:
                    x_batch_padded.append(x + [0] * (max_len - len(x)))

                x_tensors = [torch.tensor(x, dtype=torch.float32) for x in x_batch_padded]

                inputs = torch.stack(x_tensors)

                max_len_y = max([len(y) for y in y_batch])
                y_batch_padded = []
                for y in y_batch:
                    y_batch_padded.append(y + [0] * (max_len_y - len(y)))
                
                y_tensors = [torch.tensor([y], dtype=torch.long) for y in y_batch_padded]
                targets = torch.stack(y_tensors)



                outputs = [self.forward(x) for x in inputs]
                loss = criterion(outputs, targets)
                epoch_loss += loss.item()

                loss.backward()
                optimizer.step()
            
            print(f"Epoch {epoch + 1} loss: {epoch_loss / len(X_arr)}")

ModuleNotFoundError: No module named 'torch'

In [7]:
tokenizer = Tokenizer(vocab_final)

In [None]:
model = Network(tokenizer.vocab_size(), 256, 3, 0.2)

print("Model initialized")

batch_size = 32
train_lr = 0.001
epochs = 10

with jsonlines.open("./data/data_pairs.jsonl") as reader:
    X_arr = []
    y_arr = []
    for obj in tqdm(reader):
        X_arr.append(obj["system_prompt"] + obj["user_message"])
        y_arr.append(obj["assistant_message"])

        if len(X_arr) == batch_size:
            model.train(X_arr, y_arr, train_lr, epochs, batch_size)
            X_arr = []
            y_arr = []


In [76]:
n = 50
tokens_out = []
for _ in range(n):
    random_token_id = int(abs(random.gauss(100, 100)))
    tokens_out.append(random_token_id)

for token in tokens_out:
    txt = tokenizer.decode([token])

    if len(txt) != 1:
        print(txt, end = " ")

question. an people answer information. like some hypothesis information. think de no and complete first many answer information two give both assistant her 

In [78]:
for token in tokens_out:
    print(token, tokenizer.decode([token]))
    print("---")

223 question.
---
71 an
---
36 I
---
10 i
---
125 people
---
43 '
---
51 )
---
98 answer
---
229 information.
---
220 é
---
131 like
---
171 some
---
214 hypothesis
---
229 information.
---
133 think
---
72 =
---
225 de
---
232 no
---
100 6
---
304 –
---
9 n
---
220 é
---
129 ]
---
31 and
---
238 complete
---
45 1
---
92 9
---
166 first
---
210 many
---
98 answer
---
156 information
---
61 H
---
157 two
---
78 F
---
190 }
---
134 give
---
227 both
---
36 I
---
164 !
---
181 assistant
---
84 5
---
12 o
---
84 5
---
76 3
---
283 ，
---
124 her
---
49 2
---
46 C
---
9 n
---
76 3
---
