In [1]:
!pip install datasets
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194

In [2]:
from transformers import PreTrainedTokenizer, TFOpenAIGPTLMHeadModel
from transformers import OpenAIGPTLMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import OpenAIGPTConfig
import torch
import tensorflow as tf
from nltk.corpus import brown
import re
import nltk
from collections import Counter
from datasets import Dataset, load_dataset


nltk.download('brown')

def preprocess_word(word):
    cleaned_word = re.sub(r'[^a-zA-Z]', '', word).lower()
    return cleaned_word

def preprocess_corpus(corpus):
    cleaned_corpus = []
    for sentence in corpus:
        cleaned_sentence = [preprocess_word(word) for word in sentence if preprocess_word(word)]
        cleaned_corpus.append(cleaned_sentence)
    return cleaned_corpus

def tokenizer(corpus):
    corpus_char = []
    for sentence in corpus:
        sentence_str = ' '.join(sentence)
        sentence_char = list(sentence_str)
        corpus_char.append(sentence_char)
    return corpus_char

def train_test_split(corpus, ratio=0.7):
    corpus_len = len(corpus)
    train_id = int(corpus_len * ratio)
    train_set = corpus[:train_id]
    test_set = corpus[train_id:]
    return train_set, test_set

class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab):
        self.vocab = vocab
        super().__init__(vocab=vocab, pad_token="<pad>", unk_token="<unk>")

        self.ids_to_tokens = {v: k for k, v in vocab.items()}
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.pad_token_id = vocab[self.pad_token]

    def _tokenize(self, text):
        return list(text)

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get("<unk>"))

    def _convert_ids_to_tokens(self, index):
        return self.ids_to_tokens.get(index, "<unk>")

    def get_vocab(self):
        return self.vocab

    def encode(self, text, max_length=None, padding=True, truncation=True):
        tokens = self._tokenize(text)
        token_ids = [self._convert_token_to_id(token) for token in tokens]
        if truncation and max_length:
            token_ids = token_ids[:max_length]
        if padding and max_length:
            token_ids += [self._convert_token_to_id(self.pad_token)] * (max_length - len(token_ids))
        #attention_mask = [1] * len(token_ids)  # Add attention mask
        return token_ids

    def decode(self, token_ids):
        tokens = [self._convert_ids_to_tokens(token_id) for token_id in token_ids]
        return ''.join(tokens)

def prepare_data(corpus_tokenized):
    corpus_texts = [''.join(sentence) for sentence in corpus_tokenized]
    return Dataset.from_dict({'text': corpus_texts})



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [3]:

corpus = list(brown.sents())
corpus_processed = preprocess_corpus(corpus)
corpus_tokenized = tokenizer(corpus_processed)
train_set, test_set = train_test_split(corpus_tokenized)

train_text = "\n".join(["".join(sent) for sent in train_set])
test_text = "\n".join(["".join(sent) for sent in test_set])

with open('train.txt', 'w') as f:
    f.write(train_text)

with open('test.txt', 'w') as f:
    f.write(test_text)


dataset = load_dataset('text', data_files={'train': 'train.txt', 'validation': 'test.txt'})
train_dataset = dataset['train']
validation_dataset = dataset['validation']

flat_corpus = [item for sublist in corpus_tokenized for item in sublist]
vocab_counter = Counter(flat_corpus)
vocab = {word: idx for idx, (word, _) in enumerate(vocab_counter.items(), start=1)}
vocab["<pad>"] = 0  # Add padding token




tokenizer = CustomTokenizer(vocab)

# Tokenize the Dataset
def tokenize_function(examples):
    return {"input_ids": [tokenizer.encode(text, max_length=256, padding=True, truncation=True) for text in examples["text"]]}

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=["text"])



Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/40138 [00:00<?, ? examples/s]

Map:   0%|          | 0/17202 [00:00<?, ? examples/s]

In [4]:
from transformers import GPT2Config
from transformers import GPT2LMHeadModel

#configuration = GPT2Config.from_pretrained("/content/drive/MyDrive/GPT2-scratch/config.json")
#model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/GPT2-scratch/model.safetensors", config=configuration)
#configuration = GPT2Config()
#model = GPT2LMHeadModel(configuration)

config = OpenAIGPTConfig.from_pretrained("/content/drive/MyDrive/Colab Notebooks/brown-finetuned/config.json")
model = OpenAIGPTLMHeadModel.from_pretrained("/content/drive/MyDrive/Colab Notebooks/brown-finetuned/model.safetensors", config=config)
model.resize_token_embeddings(len(vocab))
#model = OpenAIGPTLMHeadModel(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


OpenAIGPTLMHeadModel(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(28, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=28, bias=False)
)

In [6]:


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
)



#Skip this unless you want to train
Before training, check the batch sizes in the training args to make sure you wont run out of GPU Ram (you will need to restart the runtime if so). L4 can do 40 batches at 256 tokens. A100 can do 100.

In [None]:
trainer.train()

In [None]:
model.save_pretrained("GPT2-scratch")


In [8]:
import torch.nn.functional as F
#model2 = OpenAIGPTLMHeadModel(config)

input_ids = torch.tensor(tokenizer.encode("th")).unsqueeze(0)  # Batch size 1
input_ids = input_ids.to(device)
#model2.to(device)
model.to(device)
outputs = model(input_ids, labels=input_ids)
loss, logits = outputs[:2]

last_token_logits = logits[0, -1, :]


# Apply softmax to convert logits to probabilities
probs = torch.softmax(last_token_logits, dim=-1)

num_preds=3
# Get the top num_preds predicted characters and their probabilities
top_preds = torch.topk(probs, num_preds)
predicted_chars = []
predicted_probs = []
predicted_ids = []
for i in range(num_preds):
    pred_id = top_preds.indices[i].item()
    pred_char = tokenizer.decode([pred_id])
    pred_prob = top_preds.values[i].item()

    predicted_ids.append(pred_id)
    predicted_chars.append(pred_char)
    predicted_probs.append(pred_prob)

print(predicted_chars, predicted_ids, predicted_probs)


with torch.no_grad():
    outputs = model.generate(input_ids, max_length=120, num_return_sequences=1, do_sample = True, temperature=0.7)
    #outputs = model.generate(input_ids, max_length=120, num_return_sequences=1, do_sample = True)

# Convert tensor to list of token IDs
token_ids = outputs[0].tolist()

# Decode the token IDs to a string
decoded_string = tokenizer.decode(token_ids)

print("Decoded string:", decoded_string)

['e', 'i', 'a'] [3, 18, 14] [0.8387231826782227, 0.10710516571998596, 0.023102300241589546]
Decoded string: the came problems responsibilities exploration and united of the mentional congress and the black subman from the felt i


In [9]:

def extract_unique_n_minus_1_grams(corpus, n):
    n_minus_1_grams = set()
    for sentence in corpus:
        sentence = ''.join(sentence)
        for i in range(len(sentence) - n + 2):
            context = sentence[i:i + n - 1]
            if context == '8':
              pass
            n_minus_1_grams.add(context)
    return n_minus_1_grams

def find_probabilities(unique_strings, long_string):
    frequencies = {string: 0 for string in unique_strings}

    # Length of the unique strings
    string_length = len(list(unique_strings)[0])

    # Total number of possible substrings of the same length in the long string
    total_substrings = len(long_string) - string_length + 1

    for string in unique_strings:
        frequencies[string] = long_string.count(string)


    probabilities = {string: freq / total_substrings for string, freq in frequencies.items()}

    return probabilities


def calculate_bigram_probabilities(model, tokenizer, corpus, test_set, ngram = 3):

    bigrams = extract_unique_n_minus_1_grams(corpus, ngram)
    weights = find_probabilities(bigrams, test_set)
    probabilities = []
    entropies = []

    for bigram in bigrams:
        input_ids = torch.tensor(tokenizer.encode(bigram)).unsqueeze(0)
        input_ids = input_ids.to(model.device)

        outputs = model(input_ids, labels=input_ids)
        loss, logits = outputs[:2]

        last_token_logits = logits[0, -1, :]

        # Apply softmax to convert logits to probabilities
        probs = torch.softmax(last_token_logits, dim=-1)

        # Calculate Shannon entropy
        entropy = torch.sum(probs * torch.log2(1/probs))
        entropy = entropy * weights[bigram]

        probabilities.append(probs)
        entropies.append(entropy)

    return bigrams, probabilities, entropies



#corpus = list(brown.sents())
corpus_processed = preprocess_corpus(corpus)
train_set, test_set = train_test_split(corpus_processed)
combined_string = ' '.join(word for sublist in test_set for word in sublist)


bigrams, probabilities, entropies = calculate_bigram_probabilities(model, tokenizer, corpus_processed, combined_string)

average_probability = sum(probs.mean() for probs in probabilities) / len(probabilities)
weighted_entropy = sum(entropy for entropy in entropies)

print("Average bigram probability:", average_probability)
print("Bigram entropy:", weighted_entropy)


Average bigram probability: tensor(0.0357, grad_fn=<DivBackward0>)
Bigram entropy: tensor(1.4612, grad_fn=<AddBackward0>)
