In [1]:
from pyarrow import parquet as pq
import jsonlines
import pandas as pd
from tqdm import tqdm
import json
import os
import subprocess

In [None]:
# (url, local file name) pairs for the two parquet shards of the cosmopedia-100k dataset.
files = [
    ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00000-of-00002.parquet?download=true", "train-00000-of-00002.parquet"),
    ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00001-of-00002.parquet?download=true", "train-00001-of-00002.parquet")
]

# The download target directory must exist before wget can write into it.
os.makedirs("./data", exist_ok=True)

for url, file in files:
    fp = "./data/" + file
    if os.path.exists(fp):
        # Already downloaded by a previous run.
        continue

    downloaded = False
    try:
        # check=True turns a non-zero wget exit status into an exception instead
        # of silently continuing with a missing or broken file.
        subprocess.run(["wget", url, "-O", fp], check=True)
        downloaded = True
    finally:
        # A failed or interrupted download may leave a partial file behind; remove
        # it so the os.path.exists() guard above does not mistake it for a
        # complete download on the next run.
        if not downloaded and os.path.exists(fp):
            os.remove(fp)

In [None]:
# Read both parquet shards and convert each one to a pandas DataFrame.
pq_file_01, pq_file_02 = [
    pq.read_table(shard_path).to_pandas()
    for shard_path in (
        "./data/train-00000-of-00002.parquet",
        "./data/train-00001-of-00002.parquet",
    )
]

In [None]:
# Flatten both DataFrames into one list holding a plain dict per row
# (one progress bar is shown per DataFrame).
json_conversations = [
    record.to_dict()
    for frame in (pq_file_01, pq_file_02)
    for _row_label, record in tqdm(frame.iterrows(), total=len(frame))
]

# Show which columns each record carries.
print(json_conversations[0].keys())

In [None]:
# Persist every record as one JSON object per line, with a progress bar.
with jsonlines.open("./data/train.jsonl", "w") as jsonl_writer:
    jsonl_writer.write_all(tqdm(json_conversations))

In [None]:
# Reuse the conversations already held in memory; on a fresh kernel (name not
# defined) or when the list is empty, reload them from the JSON-lines file.
# The check is read-only, so re-running this cell never reorders the list.
if not globals().get("json_conversations"):
    # The file was written through jsonlines, so read it back as UTF-8
    # regardless of the platform's default encoding.
    with open("./data/train.jsonl", "r", encoding="utf-8") as reader:
        json_conversations = [json.loads(line) for line in reader]

# Show the fields of the first record, one (key, value) pair per line.
print("\n".join([str(val) for val in json_conversations[0].items()]))

In [None]:
# Count how often each whitespace-separated word occurs across the "text" and
# "prompt" fields of every conversation (text first, then prompt, per record).
word_freq = {}
for conv in tqdm(json_conversations):
    for field in ("text", "prompt"):
        for word in conv[field].split():
            word_freq[word] = word_freq.get(word, 0) + 1

# Cache the counts on disk so later sessions can skip the counting pass.
with open("./data/word_freq.json", "w") as writer:
    json.dump(word_freq, writer, indent=4)

In [2]:
# Reuse the in-memory counts when this kernel already computed them; otherwise
# load the cached copy. Only the "name not defined" case triggers the reload,
# so unrelated errors are no longer silently swallowed.
try:
    word_freq
except NameError:
    with open("./data/word_freq.json", "r") as reader:
        word_freq = json.load(reader)

# Sanity check: occurrence count of a very common word (None if it is absent).
n = word_freq.get("the")
print(n)

In [3]:
class Tokenizer:
    """Whitespace word-level tokenizer whose token ids are frequency ranks.

    Words are sorted by descending count, so id 0 is the most frequent word.
    Words missing from the vocabulary are also encoded as id 0, which means an
    unknown word decodes to the most frequent word.

    Attributes:
        vocab_size: Number of words currently in the vocabulary. This is a
            plain integer attribute; read it as ``tokenizer.vocab_size``.
    """

    __token_dict: dict[str, int]
    __reverse_token_dict: dict[int, str]

    # NOTE: a former ``vocab_size()`` method was removed. It was shadowed by this
    # instance attribute, so calling it on an instance raised TypeError.
    vocab_size: int

    def __init__(self, word_freq: dict[str, int]):
        """Build the vocabulary from a ``word -> occurrence count`` mapping."""
        # Sort the words by frequency; ties keep the mapping's iteration order.
        sorted_words = sorted(word_freq, key=word_freq.get, reverse=True)

        self.__token_dict = {word: i for i, word in enumerate(sorted_words)}
        self.__reverse_token_dict = {i: word for word, i in self.__token_dict.items()}
        self.vocab_size = len(self.__token_dict)

    def reduce_vocab_size(self, new_vocab_size: int):
        """Keep only the ``new_vocab_size`` most frequent words.

        Raises:
            ValueError: If ``new_vocab_size`` is negative.
        """
        if new_vocab_size < 0:
            raise ValueError(f"new_vocab_size must be >= 0, got {new_vocab_size}")

        # The dict preserves insertion order (most frequent first), so everything
        # past the cut-off index is the least frequent tail.
        for word in list(self.__token_dict)[new_vocab_size:]:
            del self.__reverse_token_dict[self.__token_dict.pop(word)]

        self.vocab_size = len(self.__token_dict)

    def __get_token(self, word: str) -> int:
        """Return the id of ``word``, or 0 when it is not in the vocabulary.

        For words longer than one character, one trailing and then one leading
        character from ``,.!?`` is stripped before the lookup.
        """
        if len(word) > 1:
            punctuations = ",.!?"
            if word[-1] in punctuations:
                word = word[:-1]
            if word[0] in punctuations:
                word = word[1:]
        return self.__token_dict.get(word, 0)

    def encode(self, text: str) -> list[int]:
        """Split ``text`` on whitespace and return one token id per word."""
        return [self.__get_token(word) for word in text.split()]

    def encode_one_hot(self, text: str) -> list[list[int]]:
        """Encode ``text`` as a list of one-hot vectors of length ``vocab_size``."""
        one_hot_tokens = []
        for tok in self.encode(text):
            one_hot = [0] * self.vocab_size
            one_hot[tok] = 1
            one_hot_tokens.append(one_hot)
        return one_hot_tokens

    def decode(self, tokens: list[int]) -> str:
        """Join the words for ``tokens`` with single spaces.

        Raises:
            KeyError: If a token id is not in the current vocabulary.
        """
        return " ".join([self.__reverse_token_dict[token] for token in tokens])

    def decode_one_hot_tokens(self, one_hot_tokens: list[list[int]]) -> str:
        """Decode one-hot (or score) vectors by taking each vector's argmax."""
        tokens = [self.__reverse_token_dict[one_hot.index(max(one_hot))] for one_hot in one_hot_tokens]
        return " ".join(tokens)

def pad_or_truncate(tokens: list[int], length: int, pad_token: int = 0) -> list[int]:
    """Return a copy of ``tokens`` that is exactly ``length`` items long.

    Args:
        tokens: Token ids to fit; the input list is not modified.
        length: Target length of the returned list.
        pad_token: Id appended on the right when ``tokens`` is too short.
            Defaults to 0, the value that was previously hard-coded.

    Returns:
        ``tokens`` right-padded with ``pad_token``, or cut to its first
        ``length`` items when it is too long.
    """
    missing = length - len(tokens)
    if missing > 0:
        return tokens + [pad_token] * missing
    return tokens[:length]

In [4]:
# Build the tokenizer from the word counts, then keep only the 32_000 most
# frequent words and print the resulting vocabulary size as a check.
tokenizer = Tokenizer(word_freq)
tokenizer.reduce_vocab_size(32_000)
print(tokenizer.vocab_size)

32000


In [5]:
# Round-trip sanity check: encode a sentence to token ids and decode it back.
txt = "the quick brown fox jumps over the lazy dog"

# Despite the name, encode() returns plain integer token ids, not one-hot vectors.
oh_toks = tokenizer.encode(txt)

txt_out = tokenizer.decode(oh_toks)
print(txt_out)

the quick brown fox jumps over the lazy dog


In [6]:
import torch
import torch.nn as nn

print(torch.__version__)
print(torch.cuda.is_available())

# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, so later `.to(gpu)` calls do not fail on machines without CUDA.
# The name `gpu` is kept because other cells refer to it.
gpu = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

2.4.0.dev20240406
True


In [7]:
class Network(nn.Module):
    """Fully connected network mapping a one-hot token sequence to per-position logits.

    Input:  one-hot encoded tokens with ``context_length * vocab_size`` values
            per sample, e.g. shape ``(batch, context_length, vocab_size)``.
    Output: raw scores of shape ``(batch, output_size, vocab_size)``.

    Layer widths:
        context_length * vocab_size
        -> context_length / 2                       (first hidden layer)
        -> context_length / 4                       (each further hidden layer)
        -> output_size * vocab_size                 (output layer)

    Args:
        context_length: Length of the input token sequence.
        vocab_size: Size of the vocabulary.
        output_size: Length of the output token sequence.
        n_hidden_layers: Number of hidden layers; values below 1 behave like 1.
    """

    def __init__(self, context_length: int, vocab_size: int, output_size: int, n_hidden_layers: int):
        super().__init__()

        self.context_length = context_length
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.n_hidden_layers = n_hidden_layers

        # Chain the widths so each layer's input size equals the previous layer's
        # output size for every n_hidden_layers. The earlier fixed sizes only
        # lined up when n_hidden_layers was exactly 2.
        widths = [context_length * vocab_size, int(context_length / 2)]
        widths += [int(context_length / 4)] * (n_hidden_layers - 1)
        widths.append(output_size * vocab_size)

        # The output layer stays in `hidden_layers` (as before) so parameter
        # names in saved state dicts are unchanged.
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])]
        )

        self.activation = nn.ReLU()

    def forward(self, x):
        """Return logits of shape ``(batch, output_size, vocab_size)`` for one-hot input ``x``."""
        x = x.reshape(-1, self.context_length * self.vocab_size)
        last_index = len(self.hidden_layers) - 1
        for index, layer in enumerate(self.hidden_layers):
            x = layer(x)
            # No ReLU after the output layer: the scores must be able to go
            # negative, otherwise every logit is clamped at zero.
            if index != last_index:
                x = self.activation(x)
        return x.view(-1, self.output_size, self.vocab_size)

    def generate(self, x):
        """Alias for :meth:`forward`."""
        return self.forward(x)

In [None]:
def generate_text(model, tokenizer, text, max_len=100):
    """Greedily generate ``max_len`` tokens after ``text`` and return them decoded.

    The prompt is encoded, then padded or truncated to the model's context
    length. At each step the window is one-hot encoded (the model views its
    input as ``context_length * vocab_size`` values per sample), the argmax of
    the last output position is taken as the next token, and the window slides
    by one token so its length never exceeds the context length.

    Args:
        model: Network exposing ``generate``; ``context_length`` and
            ``vocab_size`` attributes are used when present.
        tokenizer: Object providing ``encode``, ``decode`` and ``vocab_size``.
        text: Prompt text.
        max_len: Number of tokens to generate.

    Returns:
        The generated tokens decoded to a string (the prompt is not included).
    """
    context_length = getattr(model, "context_length", 256)
    vocab_size = getattr(model, "vocab_size", tokenizer.vocab_size)

    # Run on whatever device the model lives on instead of a hard-coded global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    window = pad_or_truncate(tokenizer.encode(text), context_length)
    out_tokens = []

    # Remember the current mode so evaluation does not leave the model switched
    # to eval mode for the caller.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_len):
                token_ids = torch.tensor([window], dtype=torch.long, device=device)
                one_hot = nn.functional.one_hot(token_ids, num_classes=vocab_size).float()
                output = model.generate(one_hot)
                # Scores of the last output position: (1, vocab_size).
                output = output[:, -1, :]
                token = torch.argmax(output).item()
                out_tokens.append(token)
                # Drop the oldest token and append the new one.
                window = window[1:] + [token]
    finally:
        model.train(was_training)

    return tokenizer.decode(out_tokens)

In [8]:
# Load every record of the JSON-lines training file into memory.
with jsonlines.open("./data/train.jsonl", "r") as jsonl_reader:
    conversations = [record for record in jsonl_reader]


In [9]:
# Work with the first 8192 conversations only.
data = conversations[:8192]

In [10]:
# Encode each conversation into fixed-length token id sequences:
# X holds the "text" field and y the "prompt" field, both padded/truncated to 256.
X = []
y = []

for conv in tqdm(data):
    X.append(pad_or_truncate(tokenizer.encode(conv["text"]), 256))
    y.append(pad_or_truncate(tokenizer.encode(conv["prompt"]), 256))

# Token ids are class indices, so store them as integers (int64) rather than
# floats: integer ids are what one-hot encoding and class-index
# cross-entropy targets require.
X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)

100%|██████████| 8192/8192 [00:05<00:00, 1624.37it/s]


In [12]:
# train the model

# Build the network and move it, together with the full X / y tensors, to the
# `gpu` device. With context_length=256 and the 32_000-word vocabulary printed
# earlier, the first Linear layer alone maps 256 * 32_000 = 8_192_000 inputs to
# 128 units (roughly 1.05e9 weights), so this cell needs a lot of memory.
model = Network(context_length=256, vocab_size=tokenizer.vocab_size, output_size=256, n_hidden_layers=1).to(gpu)
X = X.to(gpu)
y = y.to(gpu)

: 

In [None]:
# Learning rate handed to the optimizer.
# NOTE(review): 0.1 looks large for Adam — confirm this value is intended.
lr = 0.1

# Optimizer over all model parameters, and the loss used by the training loop.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [None]:
num_epochs = 128
batch_size = 24

for i in range(num_epochs):
    # generate_text(...) below may leave the model in eval mode, so switch back
    # to training mode at the start of every epoch.
    model.train()
    for j in tqdm(range(0, len(X), batch_size)):
        optimizer.zero_grad()

        # The model views its input as context_length * vocab_size values per
        # sample, so expand the token ids of this batch to one-hot vectors.
        # Encoding per batch keeps the full one-hot data set out of memory.
        token_ids = X[j:j+batch_size].long()
        inputs = nn.functional.one_hot(token_ids, num_classes=tokenizer.vocab_size).float()
        # inputs shape: (batch_size, seq_len, vocab_size), e.g. (24, 256, 32_000)

        output = model(inputs)
        # output shape: (batch_size, seq_len, vocab_size)

        # Class-index targets must be integer (long) tensors.
        target = y[j:j+batch_size].long()
        # target shape: (batch_size, seq_len)

        # Flatten so each position is one classification example:
        # scores (batch_size * seq_len, vocab_size) vs. ids (batch_size * seq_len,)
        output = output.view(-1, tokenizer.vocab_size)
        target = target.reshape(-1)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    # i is zero-based, so i + 1 epochs have completed here; the loss shown is
    # that of the last batch of the epoch.
    print(f"after {i + 1} epochs: loss = {loss.item()}")
    print(generate_text(model, tokenizer, "what is the capital of France?"))
    # generate_text(...) calls model.eval() so we need to put it back to training mode