In [24]:
%pip install pyarrow jsonlines pandas matplotlib -q

Note: you may need to restart the kernel to use updated packages.


In [1]:
from pyarrow import parquet as pq
import jsonlines
import pandas as pd
from tqdm import tqdm
import json
import os
import subprocess
import requests

In [None]:
# Cosmopedia-100k parquet shards as (download URL, local file name) pairs.
files = [
    ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00000-of-00002.parquet?download=true", "train-00000-of-00002.parquet"),
    ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00001-of-00002.parquet?download=true", "train-00001-of-00002.parquet")
]

# exist_ok makes the cell re-runnable without a try/except around mkdir.
os.makedirs("./data", exist_ok=True)

for url, file in files:
    fp = "./data/" + file
    if os.path.exists(fp):
        # Shard already downloaded by a previous run.
        continue

    # Download into a temporary ".part" file and rename only on success, so an
    # HTTP error or an interrupted transfer never leaves a broken file at `fp`
    # that the existence check above would then skip forever.
    tmp_fp = fp + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving the error page as parquet.
        response.raise_for_status()
        with open(tmp_fp, "wb") as f:
            # Stream in 1 MiB chunks rather than buffering the whole shard.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(tmp_fp, fp)

    print(f"Downloaded {file}")

In [None]:
# Read both downloaded parquet shards and convert each to a pandas DataFrame.
shard_paths = (
    "./data/train-00000-of-00002.parquet",
    "./data/train-00001-of-00002.parquet",
)
pq_file_01, pq_file_02 = [pq.read_table(path).to_pandas() for path in shard_paths]

In [None]:
# Turn every DataFrame row of both shards into a plain dict, in shard order.
json_conversations = []

for shard_frame in (pq_file_01, pq_file_02):
    rows_with_progress = tqdm(shard_frame.iterrows(), total=len(shard_frame))
    # iterrows yields (index, Series); only the Series is kept.
    json_conversations.extend(record.to_dict() for _, record in rows_with_progress)


print(json_conversations[0].keys())

In [None]:
# Persist the conversations as JSON Lines: one record per line.
with jsonlines.open("./data/train.jsonl", mode="w") as writer:
    writer.write_all(tqdm(json_conversations))

In [2]:
# Reuse the in-memory conversations when an earlier cell already built them;
# otherwise reload them from ./data/train.jsonl.
try:
    # len() only inspects the list, so re-running this cell never reorders it.
    conversations_loaded = len(json_conversations) > 0
except NameError:
    # Fresh kernel: the variable has not been defined yet.
    conversations_loaded = False

if not conversations_loaded:
    # Explicit encoding so the read does not depend on the platform default.
    with open("./data/train.jsonl", "r", encoding="utf-8") as reader:
        json_conversations = [json.loads(line) for line in reader]

# Show the first record, one (key, value) pair per line.
print("\n".join([str(val) for val in json_conversations[0].items()]))

('prompt', 'Here is an extract from a webpage: "What can cause my settlement offer to be delayed?\nWhen you’ve been injured in an Austin truck accident, one of the most common questions is how long it will take for the insurance company to make an offer to settle your case. The answer depends on a variety of factors.\nThe process starts with filing an insurance claim and providing evidence that shows exactly what happened during the accident and who was at fault. This can involve gathering key Austin truck accident evidence such as:\n- Medical records\n- Photographs or video footage of the crash scene\n- Witness statements\n- Other documents related to your injuries and damages.\nOnce this information has been collected by both sides, negotiations may begin between your Austin truck accident lawyer and the insurance company on how much compensation should be offered in exchange for settling the case out of court.\nIt is important to remember that every truck accident case is unique so 

In [None]:
# Count occurrences of every whitespace-separated word across the "text" and
# "prompt" fields of all conversations.
word_freq = {}
for conv in tqdm(json_conversations):
    # "text" is processed before "prompt"; this fixes the insertion order of
    # newly seen words in the dict.
    for field in ("text", "prompt"):
        for word in conv[field].split():
            word_freq[word] = word_freq.get(word, 0) + 1

# Cache the counts on disk so later sessions can skip the counting pass.
with open("./data/word_freq.json", mode="w") as writer:
    json.dump(word_freq, writer, indent=4)

In [3]:
# Make sure `word_freq` exists: keep the in-memory dict when an earlier cell
# built it, otherwise load the cached counts from ./data/word_freq.json.
try:
    word_freq
except NameError:
    # Only "variable not defined" triggers the reload; any other error
    # (for example a missing or corrupt cache file) is allowed to surface.
    with open("./data/word_freq.json", "r", encoding="utf-8") as reader:
        word_freq = json.load(reader)

# Sanity check on either path: occurrences of "the" (None when absent).
n = word_freq.get("the")
print(n)

In [4]:
class Tokenizer:
    """Word-level tokenizer whose token ids are frequency ranks.

    Words are sorted by descending frequency and numbered from 0, so the most
    frequent word has id 0. Words that are not in the vocabulary encode to
    ``-1``; ids that are not in the vocabulary decode to ``"N/A"``.
    """

    __token_dict: dict[str, int]
    __reverse_token_dict: dict[int, str]

    def __init__(self, word_freq: dict[str, int]):
        """Build the vocabulary from a ``word -> occurrence count`` mapping."""
        # Sort the words by frequency (most frequent first).
        sorted_words = sorted(word_freq, key=word_freq.get, reverse=True)

        self.__token_dict = {word: i for i, word in enumerate(sorted_words)}
        self.__reverse_token_dict = {i: word for word, i in self.__token_dict.items()}

    @property
    def vocab_size(self) -> int:
        """Number of words currently in the vocabulary.

        Exposed as a read-only attribute (``tokenizer.vocab_size``) that is
        derived from the vocabulary itself, so it cannot go stale.
        """
        return len(self.__token_dict)

    def reduce_vocab_size(self, new_vocab_size: int):
        """Keep only the ``new_vocab_size`` most frequent words.

        Raises:
            ValueError: if ``new_vocab_size`` is negative (a negative slice
                start would otherwise silently remove the wrong entries).
        """
        if new_vocab_size < 0:
            raise ValueError("new_vocab_size must be non-negative")

        # The dict preserves insertion order, which is frequency order, so
        # everything past the cut-off is the least frequent tail.
        words_to_cut = list(self.__token_dict)[new_vocab_size:]
        for word in words_to_cut:
            del self.__reverse_token_dict[self.__token_dict[word]]
            del self.__token_dict[word]

    def __get_token(self, word: str) -> int:
        """Return the id of ``word`` or ``-1`` when it is unknown.

        For words longer than one character, a single trailing and then a
        single leading character out of ``,.!?`` is removed before lookup.
        """
        # NOTE(review): lookup happens after stripping, so a vocabulary built
        # from raw split() words may contain punctuated entries that plain
        # input never maps to; confirm whether the vocabulary should be
        # normalized the same way.
        if len(word) > 1:
            punctuations = ",.!?"
            if word[-1] in punctuations:
                word = word[:-1]
            if word[0] in punctuations:
                word = word[1:]

        return self.__token_dict.get(word, -1)

    def encode(self, text: str) -> list[int]:
        """Split ``text`` on whitespace and map every word to its id."""
        return [self.__get_token(word) for word in text.split()]

    def encode_one_hot(self, text: str) -> list[list[int]]:
        """Encode ``text`` as one one-hot row of length ``vocab_size`` per word.

        Unknown words produce an all-zero row; indexing with ``-1`` would
        otherwise mark the last vocabulary entry.
        """
        size = self.vocab_size
        one_hot_tokens = []
        for tok in self.encode(text):
            one_hot = [0] * size
            if tok >= 0:
                one_hot[tok] = 1
            one_hot_tokens.append(one_hot)
        return one_hot_tokens

    def __get_word(self, token: int) -> str:
        """Return the word for ``token`` or ``"N/A"`` when the id is unknown."""
        return self.__reverse_token_dict.get(token, "N/A")

    def decode(self, tokens: list[int]) -> str:
        """Map ids back to words and join them with single spaces."""
        return " ".join([self.__get_word(tok) for tok in tokens])

    def decode_one_hot_tokens(self, one_hot_tokens: list[list[int]]) -> str:
        """Decode rows of scores by taking the index of each row's maximum.

        A row with no non-zero entry (what ``encode_one_hot`` emits for an
        unknown word) decodes to ``"N/A"``; so does any index missing from
        the vocabulary.
        """
        words = []
        for one_hot in one_hot_tokens:
            if not any(one_hot):
                words.append("N/A")
            else:
                words.append(self.__get_word(one_hot.index(max(one_hot))))
        return " ".join(words)

def pad_or_truncate(tokens: list[int], length: int) -> list[int]:
    """Return a new list derived from ``tokens`` and sized by ``length``.

    When ``tokens`` is shorter than ``length`` it is right-padded with ``0``;
    otherwise the slice ``tokens[:length]`` is returned.
    """
    missing = length - len(tokens)
    if missing > 0:
        return tokens + [0] * missing
    return tokens[:length]

In [6]:
# Build the tokenizer from the word counts and report the vocabulary size.
tokenizer = Tokenizer(word_freq=word_freq)
print(tokenizer.vocab_size)

1451398


In [12]:
# Round-trip demo: encode a sample sentence, then decode the ids again.
txt = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

tokens = tokenizer.encode(txt)
txt_out = tokenizer.decode(tokens)

# One line for the ids, one for the decoded text.
print(tokens, txt_out, sep="\n")

[88961, 84452, 75405, 6083, -1, 112288, 107941, 168554, 190837, 121, 220485, 179950, 251287, 102847, 197808, 1058, 295006, 93829, -1]
Lorem ipsum dolor sit N/A consectetur adipiscing elit Sed do eiusmod tempor incididunt ut labore et dolore magna N/A


In [13]:
import torch
import torch.nn as nn

# Report the torch build and CUDA availability, then pick the compute device.
cuda_available = torch.cuda.is_available()

print(torch.__version__)
print(cuda_available)

device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(device)

2.4.0.dev20240406
True
cuda


In [14]:
def pad_or_truncate_tensor(tensor: torch.Tensor, length: int) -> torch.Tensor:
    """Force the first dimension of ``tensor`` to ``length``.

    Shorter tensors are extended along dim 0 with zeros; trailing dimensions
    of any rank are preserved. Tensors that are long enough are sliced to
    their first ``length`` entries.

    The padding is allocated on the input's device, so tensors that already
    live on an accelerator can be padded. It uses torch's default dtype, and
    ``torch.cat`` applies type promotion, so padding an integer tensor yields
    a floating-point result while truncation keeps the input dtype.

    Args:
        tensor: tensor with at least one dimension.
        length: target size of dim 0.

    Returns:
        A tensor whose dim 0 has size ``length``.
    """
    current = tensor.size(0)
    if current >= length:
        return tensor[:length]

    padding = torch.zeros((length - current, *tensor.shape[1:]), device=tensor.device)
    return torch.cat([tensor, padding], dim=0)

In [15]:
class Network(nn.Module):
    """Stack of fully connected layers mapping a context window to a context window.

    The input layer maps ``context_window_size`` features to
    ``hidden_layer_size``, followed by ``n_hidden_layers`` square linear
    layers and an output layer back to ``context_window_size``. No activation
    function is applied between the layers.
    """

    def __init__(self, hidden_layer_size: int, n_hidden_layers: int, context_window_size: int):
        super().__init__()

        self.context_window_size = context_window_size

        self.input_layer = nn.Linear(context_window_size, hidden_layer_size)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_layer_size, hidden_layer_size) for _ in range(n_hidden_layers)]
        )
        self.output_layer = nn.Linear(hidden_layer_size, context_window_size)

    def forward(self, x):
        """Apply the input layer, every hidden layer in order, then the output layer."""
        x = self.input_layer(x)
        for layer in self.hidden_layers:
            x = layer(x)
        x = self.output_layer(x)
        return x

    def random_init(self):
        """Re-initialise every layer: Xavier-uniform weights and zero biases."""
        for layer in [self.input_layer, *self.hidden_layers, self.output_layer]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def generate_text(self, input_text: str, tokenizer: "Tokenizer", n_words: int):
        """Generate ``n_words`` words by repeatedly feeding the model its own output.

        Every forward pass yields ``context_window_size`` values. Each value is
        turned into a token id with ``int(abs(value))`` and decoded by
        ``tokenizer``; the ids of one pass become the input of the next.

        Args:
            input_text: prompt, encoded with ``tokenizer.encode``.
            tokenizer: object providing ``encode`` and ``decode``.
            n_words: number of words to return; values <= 0 yield ``""``.

        Returns:
            The generated words joined by single spaces.
        """
        # Follow the model's own parameters instead of a module-level global,
        # so the method works wherever the network has been moved.
        model_device = next(self.parameters()).device

        tokens = tokenizer.encode(input_text)
        # Pad on the CPU first and move afterwards: the tensor and its padding
        # are then guaranteed to be on the same device when concatenated.
        input_tensor = pad_or_truncate_tensor(
            torch.tensor(tokens, dtype=torch.float32), self.context_window_size
        ).to(model_device)

        output_text = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            while len(output_text) < n_words:
                output = self(input_tensor)
                window_ids = [int(abs(elem)) for elem in output.cpu().tolist()]
                output_text.extend(tokenizer.decode([token_idx]) for token_idx in window_ids)
                # A full pass emits exactly one window of ids, so sliding the
                # window one id at a time ends with the window holding these ids.
                input_tensor = torch.tensor(window_ids, dtype=torch.float32, device=model_device)

        # The last pass may overshoot; return exactly the requested count.
        return " ".join(output_text[:n_words])


In [16]:
# Read the whole JSONL dataset into memory, one object per line.
with jsonlines.open("./data/train.jsonl", mode="r") as reader:
    data = [record for record in reader]

In [17]:
# Show which fields the first record carries.
print(data[0].keys())

dict_keys(['prompt', 'text_token_length', 'text', 'seed_data', 'format', 'audience'])


In [19]:
# Tokenize every record: encoded prompts go to X, encoded texts go to y.
X, y = [], []

for conv in tqdm(data):
    prompt_ids = tokenizer.encode(conv["prompt"])
    text_ids = tokenizer.encode(conv["text"])
    X.append(torch.tensor(prompt_ids))
    y.append(torch.tensor(text_ids))

100%|██████████| 100000/100000 [01:11<00:00, 1389.75it/s]


In [20]:
# Longest tokenized prompt and longest tokenized text.
max_len_x = max(map(len, X))
max_len_y = max(map(len, y))

print(max_len_x, max_len_y) # 415, 1805

# Number of tokens every sequence is padded or truncated to.
context_window_size = 128

415 1805


In [21]:
# Bring every sequence to the context window length, stack the sequences into
# one tensor each for inputs and targets, and move both to the device.
X = torch.stack([pad_or_truncate_tensor(seq, context_window_size) for seq in X]).to(device)
y = torch.stack([pad_or_truncate_tensor(seq, context_window_size) for seq in y]).to(device)

In [32]:
# Model hyper-parameters.
hidden_layer_size = 512
n_hidden_layers = 8

# Create the model, move it to the device, and initialise its layers.
network = Network(
    hidden_layer_size=hidden_layer_size,
    n_hidden_layers=n_hidden_layers,
    context_window_size=context_window_size,
)
network = network.to(device)
network.random_init()

In [25]:
import matplotlib.pyplot as plt

In [33]:
# Training

lr = 0.01
n_epochs = 10
batch_size = 16

# NOTE(review): the targets look like padded token ids stored as floats, which
# CrossEntropyLoss would presumably interpret as class probabilities; confirm
# that this is the intended objective.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(network.parameters(), lr=lr)


# One loss value per optimisation step, accumulated across all epochs.
loss_data = []

for epoch in range(n_epochs):
    batch_starts = range(0, len(X), batch_size)
    for i in tqdm(batch_starts):
        X_batch, y_batch = X[i:i+batch_size], y[i:i+batch_size]

        optimizer.zero_grad()
        output = network(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        loss_data.append(loss.item())

    # Plot the loss history so far and save it to ./data/loss_{epoch}.png.
    fig, ax = plt.subplots()
    ax.plot(loss_data)
    fig.savefig(f"./data/loss_{epoch}.png")
    plt.close(fig)


  0%|          | 0/6250 [00:00<?, ?it/s]

100%|██████████| 6250/6250 [01:31<00:00, 67.99it/s]
100%|██████████| 6250/6250 [01:33<00:00, 67.14it/s]
  1%|          | 54/6250 [00:00<01:35, 65.04it/s]


KeyboardInterrupt: 

In [40]:
# Run one forward pass on a sample prompt and decode the result.
input_txt = "What is the capital of France?"
input_tensor = torch.tensor(tokenizer.encode(input_txt), dtype=torch.float32)
input_tensor = pad_or_truncate_tensor(input_tensor, context_window_size).to(device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    raw_output = network(input_tensor)

output_tokens = [abs(int(elem)) for elem in raw_output.cpu().tolist()]

# Rescale (not clamp) the values so the largest one lands on the last valid
# token id, vocab_size - 1. Skip the rescale when every value truncated to 0,
# which would otherwise divide by zero.
largest = max(output_tokens)
if largest > 0:
    ratio = (tokenizer.vocab_size - 1) / largest
    output_tokens = [int(elem * ratio) for elem in output_tokens]

output_txt = tokenizer.decode(output_tokens)

print(output_tokens)
print(output_txt)

[25379, 663177, 613752, 607243, 240456, 75395, 583388, 51917, 668095, 334030, 926524, 507964, 190899, 440002, 415373, 28005, 1073567, 304250, 496763, 121913, 40124, 845659, 1275422, 36011, 133177, 561422, 320588, 323446, 174176, 1451397, 366151, 290013, 498514, 999331, 692271, 682804, 224635, 750064, 430428, 435790, 696832, 106507, 501817, 665300, 45532, 265862, 413418, 76504, 346599, 1114848, 781563, 175212, 632697, 598536, 464661, 1032256, 149386, 81014, 94202, 463561, 51578, 858420, 226122, 263647, 639873, 398834, 145652, 23854, 79516, 479303, 324166, 498998, 335094, 488753, 361490, 764914, 588352, 143092, 196442, 781722, 58220, 798254, 4632, 653663, 27158, 733618, 1257249, 378667, 573787, 187665, 853626, 545068, 134581, 322306, 892675, 550250, 505948, 12881, 1082951, 19640, 434391, 514661, 711830, 231292, 671137, 891662, 1120361, 344056, 59589, 1046690, 389833, 608195, 117927, 445728, 513043, 136505, 576637, 155823, 808869, 717704, 498538, 200016, 156523, 1296531, 543738, 1410077, 

In [122]:
import random

def generate_random_token(vocab_size: int) -> int:
    """Draw a random token id in ``[0, vocab_size - 1]``, biased towards 0.

    The id is the truncated absolute value of a Gaussian sample with mean 0
    and standard deviation ``vocab_size // 10000``. The deviation is kept at
    1 or more so vocabularies smaller than 10000 do not always yield 0, and
    the result is capped at the last valid id.

    Raises:
        ValueError: if ``vocab_size`` is not positive (no valid id exists).
    """
    if vocab_size <= 0:
        raise ValueError("vocab_size must be positive")

    expected_value = 0
    variation = max(1, vocab_size // 10000)

    token = abs(int(random.gauss(expected_value, variation)))
    return min(token, vocab_size - 1)

# Sample one context window of random token ids and show the decoded words.
input_tokens = [
    generate_random_token(tokenizer.vocab_size) for _ in range(context_window_size)
]

print(tokenizer.decode(input_tokens))

A should within at also Use Provide both each while related examples, this Illustrate: As how information It creating was information just or start educates bring These individuals 2. suitable The related consider This various tone without by including content With content around based without that his individuals take it but it her This moving accessible. a with further time before two future provide is has was such a post data an based time was different unit Write These When she explore In what In help after also accessible. Do - examples, in-depth A For Aim we over concepts can many potential do also A Aim students extract upon suitable public but offers critical all many related topic, there allows any by do only take webpage: upon had
