In [None]:
%pip install jsonlines tqdm requests pyarrow tensorflow torch tensorflow-hub tensorflow-text pandas keras -q

In [None]:
import requests

ext_ip = requests.get("https://checkip.amazonaws.com").text.strip()
loc_json = requests.get(f"http://ip-api.com/json/{ext_ip}").json()
print("\n".join(str((k, v)) for k, v in loc_json.items()))

In [4]:
import jsonlines
from tqdm import tqdm
import json
import os
import requests
from pyarrow import parquet as pq
import re

In [3]:
import tensorflow as tf
import tensorflow_hub as tf_hub
import tensorflow_text as tf_text

print(f"Num GPUs Available: {tf.config.experimental.list_physical_devices('GPU')}")

Num GPUs Available: []


In [5]:
model_path = "./models/USE-3-Large/"
if not os.path.exists(model_path):
    model_path = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3"

USE_model = tf_hub.load(model_path)

2024-04-12 11:34:49.247919: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [6]:
if not (os.path.exists("./data/train-00000-of-00002.parquet") and os.path.exists("./data/train-00001-of-00002.parquet")):
    files = [
        ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00000-of-00002.parquet?download=true", "train-00000-of-00002.parquet"),
        ("https://huggingface.co/datasets/HuggingFaceTB/cosmopedia-100k/resolve/main/data/train-00001-of-00002.parquet?download=true", "train-00001-of-00002.parquet")
    ]

    try:
        os.mkdir("./data")
    except FileExistsError:
        pass

    for url, file in files:
        fp = "./data/" + file
        if os.path.exists(fp):
            continue
        
        content = requests.get(url).content
        with open(fp, "wb") as f:
            f.write(content)
        
        print(f"Downloaded {file}")


    pq_file_01 = pq.read_table("./data/train-00000-of-00002.parquet").to_pandas()
    pq_file_02 = pq.read_table("./data/train-00001-of-00002.parquet").to_pandas()

    json_conversations = []

    for pq_file in [pq_file_01, pq_file_02]:
        for row in tqdm(pq_file.iterrows(), total=len(pq_file)):
            json_conversations.append(row[1].to_dict())


    with jsonlines.open("./data/train.jsonl", "w") as writer:
        for conv in tqdm(json_conversations):
            writer.write(conv)

else:
    print("Data already downloaded")

Data already downloaded


In [13]:
n_to_load = 4000

data = []

with jsonlines.open("./data/train.jsonl", "r") as reader:
    for i, obj in enumerate(reader):
        data.append(obj)
        if i >= n_to_load:
            break

print(f"Loaded {len(data)} conversations")

Loaded 4001 conversations


In [8]:
sentence_endings = re.compile(r'[.!?]')

sentences = []
for obj in tqdm(data):
    prompt = obj["prompt"]
    text = obj["text"]

    combined_text = prompt + " " + text

    
    start = 0
    for re_match in sentence_endings.finditer(combined_text):
        end = re_match.end()
        sentences.append(combined_text[start:end].strip())
        start = end

print(len(sentences))

100%|██████████| 4000/4000 [00:01<00:00, 3487.86it/s]

190098





In [9]:
class Tokenizer:
    __token_dict: dict[str, int]
    __reverse_token_dict: dict[int, str]

    vocab_size: int

    def __init__(self, word_freq: dict[str, int]):
        self.vocab_size = len(word_freq)

        # Sort the words by frequency
        sorted_words = sorted(word_freq, key=word_freq.get, reverse=True)

        self.__token_dict = {}
        self.__reverse_token_dict = {}

        for i, word in enumerate(sorted_words):
            self.__token_dict[word] = i
            self.__reverse_token_dict[i] = word
    
    def reduce_vocab_size(self, new_vocab_size: int):
        # cut out the least frequent words
        words_to_cut = list(self.__token_dict.keys())[new_vocab_size:]
        for word in words_to_cut:
            del self.__reverse_token_dict[self.__token_dict[word]]
            del self.__token_dict[word]
        
        self.vocab_size = len(self.__token_dict)
    
    def __get_token(self, word: str) -> int:
        if len(word) > 1:
            punctuations = ",.!?"
            if word[-1] in punctuations:
                word = word[:-1]
            if word[0] in punctuations:
                word = word[1:]
        
        if word in self.__token_dict:
            return self.__token_dict[word]
        else:
            return -1
        
        
    def encode(self, text: str) -> list[int]:
        return [self.__get_token(word) for word in text.split()]
    
    def encode_one_hot(self, text: str) -> list[int]:
        tokens = self.encode(text)
        one_hot_tokens = []
        for tok in tokens:
            one_hot = [0] * self.vocab_size
            one_hot[tok] = 1
            one_hot_tokens.append(one_hot)
        return one_hot_tokens

    def __get_word(self, token: int) -> str:
        if token in self.__reverse_token_dict:
            return self.__reverse_token_dict[token]
        else:
            return "N/A"
    
    def decode(self, tokens: list[int]) -> str:
        return " ".join([self.__get_word(tok) for tok in tokens])
    
    def decode_one_hot_tokens(self, one_hot_tokens: list[int]) -> str:
        tokens = [self.__reverse_token_dict[one_hot.index(max(one_hot))] for one_hot in one_hot_tokens]
        return " ".join(tokens)
    
    def vocab_size(self) -> int:
        return self.vocab_size

def pad_or_truncate(tokens: list[int], length: int) -> list[int]:
    if len(tokens) < length:
        return tokens + [0] * (length - len(tokens))
    else:
        return tokens[:length]

In [10]:
with open("./data/word_freq.json", "r") as reader:
    word_freq = json.load(reader)

In [11]:
tokenizer = Tokenizer(word_freq)

In [12]:
X = []  # 512 dimensional embeddings from USE_3_Large
y = []  # encoded text (not one-hot)

batch_size_use = 256
for i in tqdm(range(0, len(sentences), batch_size_use)):
    batch = sentences[i:i+batch_size_use]

    embeddings = USE_model(batch).numpy().tolist()

    for j, embedding in enumerate(embeddings):
        text = sentences[i + j]
        encoded_text = tokenizer.encode(text)
        X.append(embedding)
        y.append(encoded_text)
    
    tf.keras.backend.clear_session()

print(len(X), len(y))

  1%|          | 5/743 [00:47<1:57:36,  9.56s/it]


KeyboardInterrupt: 

In [None]:
with jsonlines.open("./data/train_embeddings.jsonl", "w") as writer:
    for i in tqdm(range(len(X))):
        writer.write({"X": X[i], "y": y[i]})

In [None]:
with jsonlines.open("./data/train_embeddings.jsonl", "r") as reader:
    X = []
    y = []
    for obj in tqdm(reader):
        X.append(obj["X"])
        y.append(obj["y"])

print(len(X), len(y))
print(max([len(tokens) for tokens in y]))

In [None]:
def pad_or_truncate(tokens: list[int], length: int) -> list[int]:
    if len(tokens) < length:
        return tokens + [0] * (length - len(tokens))
    else:
        return tokens[:length]

In [None]:
y = [pad_or_truncate(tokens, 315) for tokens in y]

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
class Network(nn.Module):
    # Reverse Sentence Encoder Model:
    # Input shape: (512,)
    # from Universal Sentence Encoder Multilingual 3 Large
    # Output shape: (output_length,)
    # where output_length is the length of the encoded text
    # Outputs -> Tokens

    def __init__(self, output_length):

        super(Network, self).__init__()

        n_layers = 48
        hidden_size = 2048

        self.input_layer = nn.Linear(512, hidden_size)
        self.hidden_layers = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(n_layers)])
        self.output_layer = nn.Linear(hidden_size, output_length)
    
    def init_random(self):
        # initializes the model with random weights
        for layer in self.hidden_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)
    
    def forward(self, x):
        x = torch.relu(self.input_layer(x))
        for layer in self.hidden_layers:
            x = torch.relu(layer(x))
        x = self.output_layer(x)
        return x

In [None]:
lr = 0.01
epochs = 1
batch_size = 10
output_length = 315

In [None]:
model = Network(output_length).to(device)
model.init_random()

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
def train():
    model.train()
    loss_arr = []
    for i in tqdm(range(0, len(X), batch_size)):
        X_batch = torch.tensor(X[i:i+batch_size], device=device, dtype=torch.float32)
        y_batch = torch.tensor(y[i:i+batch_size], device=device, dtype=torch.float32)

        optimizer.zero_grad()

        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        loss_arr.append(loss.item())
    
    plt.plot(loss_arr)
    fp = "./data/loss_plt_"
    for i in range(100):
        if not os.path.exists(fp + str(i) + ".png"):
            plt.savefig(fp + str(i) + ".png")
            break
    plt.clf()
    

for epoch in range(epochs):
    train()

In [None]:
input_txt = "This is a test sentence."

input_embedding = USE_model([input_txt]).numpy().tolist()
input_embedding = torch.tensor(input_embedding, device=device, dtype=torch.float32)

output = model(input_embedding)
output = output.cpu().detach().numpy()[0]

print(output)

output_text = tokenizer.decode([int(token) for token in output])
print(output_text)