In [1]:
%pip install jsonlines tqdm requests pyarrow tensorflow==2.9.0 torch tensorflow-hub tensorflow-text pandas keras sentencepiece transformers -q

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: pytorch-lightning 1.7.0 has a non-standard dependency specifier torch>=1.9.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [2]:
import jsonlines
from tqdm import tqdm
import json
import os
import requests
from pyarrow import parquet as pq
import re

In [3]:
import tensorflow as tf
import tensorflow_hub as tf_hub
import tensorflow_text as tf_text

print(f"Num GPUs Available: {tf.config.experimental.list_physical_devices('GPU')}")

Num GPUs Available: []


In [4]:
model_path = "./models/USE-3-Large/"
if not os.path.exists(model_path):
    model_path = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"

USE_model = tf_hub.load(model_path)

In [5]:
if not (os.path.exists("./data/train-00000-of-00002.parquet") and os.path.exists("./data/train-00001-of-00002.parquet")):
    files = [
        ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00000-of-00002.parquet?download=true", "train-00000-of-00002.parquet"),
        ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00001-of-00002.parquet?download=true", "train-00001-of-00002.parquet")
    ]

    try:
        os.mkdir("./data")
    except FileExistsError:
        pass

    for url, file in files:
        fp = "./data/" + file
        if os.path.exists(fp):
            continue
        
        content = requests.get(url).content
        with open(fp, "wb") as f:
            f.write(content)
        
        print(f"Downloaded {file}")


    pq_file_01 = pq.read_table("./data/train-00000-of-00002.parquet").to_pandas()
    pq_file_02 = pq.read_table("./data/train-00001-of-00002.parquet").to_pandas()

    json_conversations = []

    for pq_file in [pq_file_01, pq_file_02]:
        for row in tqdm(pq_file.iterrows(), total=len(pq_file)):
            json_conversations.append(row[1].to_dict())


    with jsonlines.open("./data/train.jsonl", "w") as writer:
        for conv in tqdm(json_conversations):
            writer.write(conv)

else:
    print("Data already downloaded")

Data already downloaded


In [6]:
n_to_load = 4000

data = []

with jsonlines.open("./data/train.jsonl", "r") as reader:
    for i, obj in enumerate(reader):
        data.append(obj)
        if i >= n_to_load:
            break

print(f"Loaded {len(data)} conversations")

Loaded 4001 conversations


In [7]:
sentence_endings = re.compile(r'[.!?]')

sentences = []
for obj in tqdm(data):
    prompt = obj["prompt"]
    text = obj["text"]

    combined_text = prompt + " " + text

    
    start = 0
    for re_match in sentence_endings.finditer(combined_text):
        end = re_match.end()
        sentences.append(combined_text[start:end].strip())
        start = end

print(len(sentences))

100%|██████████| 4001/4001 [00:00<00:00, 14628.89it/s]


190141


In [22]:
# using Mixtral 7b tokenizer
from transformers import LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained("kittn/mistral-7B-v0.1-hf")

toks = tokenizer.encode("Hello, my name is John. I like to play football.")
print(toks)

[1, 22557, 28725, 586, 1141, 349, 2215, 28723, 315, 737, 298, 1156, 6569, 28723]


In [25]:
X = []  # 512 dimensional embeddings from USE_3_Large
y = []  # encoded text (not one-hot)

batch_size_use = 64
for i in tqdm(range(0, len(sentences), batch_size_use)):
    batch = sentences[i:i+batch_size_use]

    embeddings = USE_model(batch).numpy().tolist()

    for j, embedding in enumerate(embeddings):
        text = sentences[i + j]
        encoded_text = tokenizer.encode(text)
        X.append(embedding)
        y.append(encoded_text)
    
    tf.keras.backend.clear_session()

print(len(X), len(y))

  4%|▍         | 121/2971 [05:38<2:13:03,  2.80s/it]


KeyboardInterrupt: 

In [None]:
with jsonlines.open("./data/train_embeddings.jsonl", "w") as writer:
    for i in tqdm(range(len(X))):
        writer.write({"X": X[i], "y": y[i]})

In [None]:
with jsonlines.open("./data/train_embeddings.jsonl", "r") as reader:
    X = []
    y = []
    for obj in tqdm(reader):
        X.append(obj["X"])
        y.append(obj["y"])

print(len(X), len(y))
print(max([len(tokens) for tokens in y]))

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class TransformerNetwork(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dim_feedforward, num_tokens, max_seq_length):
        super(TransformerNetwork, self).__init__()

        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward),
            num_layers
        )
        self.fc = nn.Linear(d_model, num_tokens)
        self.pos_encoder = nn.Embedding(max_seq_length, d_model)
        self.d_model = d_model

    def forward(self, x):
        seq_length = x.size(1)
        pos = torch.arange(seq_length, dtype=torch.long, device=x.device)
        x = x + self.pos_encoder(pos)
        x = self.transformer_encoder(x)
        x = self.fc(x)
        return x


In [26]:
d_model = 512
nhead = 8
num_layers = 6
dim_feedforward = 2048
num_tokens = 32_000
max_seq_length = 512

batch_size = 64
num_epochs = 10
lr = 0.0001

In [27]:
# pad or truncate y to max_seq_length
for i in range(len(y)):
    y[i] = y[i][:max_seq_length] + [0] * (max_seq_length - len(y[i]))


In [28]:
X = torch.tensor(X, dtype=torch.float32).to(device)
y = torch.tensor(y, dtype=torch.int32).to(device)

dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

NameError: name 'torch' is not defined

In [None]:
model = TransformerNetwork(d_model, nhead, num_layers, dim_feedforward, num_tokens, max_seq_length).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
for epoch in range(num_epochs):
    for batch_X, batch_y in dataloader:
        batch_X = batch_X.to('cuda')  # Move the input batch to GPU
        batch_y = batch_y.to('cuda')  # Move the target batch to GPU
        
        optimizer.zero_grad()
        
        outputs = model(batch_X)
        outputs = outputs.view(-1, num_tokens)
        batch_y = batch_y.view(-1)
        
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")