#Initialize by running all cells to "train model"

In [2]:
!pip install datasets
!pip install accelerate -U

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/547.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m450.6/547.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3

In [3]:
import re
import nltk
from nltk.corpus import shakespeare

nltk.download('shakespeare')


def extract_sentences_from_work(work):
    work_text = shakespeare.raw(work)
    sentences = []

    # Use regular expressions to find <SPEAKER> and <LINE> tags and extract the text
    speaker_pattern = re.compile(r'<SPEAKER>(.*?)</SPEAKER>', re.DOTALL)
    line_pattern = re.compile(r'<LINE>(.*?)</LINE>', re.DOTALL)

    speakers = speaker_pattern.findall(work_text)
    lines = line_pattern.findall(work_text)

    # Combine speaker and line text into sentences
    for speaker, line in zip(speakers, lines):
        # Remove any leading or trailing whitespace from the speaker and line
        speaker = speaker.strip()
        line = line.strip()
        # Combine speaker and line into a sentence
        sentence = f"{speaker}: {line}"
        sentences.append(sentence)

    return sentences


def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove characters that are not a-z or space
    cleaned_text = re.sub(r'[^a-z\s]', '', text)
    return cleaned_text


def validate_text(all_works_sentences):
    valid = True
    for work_sentences in all_works_sentences:
        for sentence in work_sentences:
            if not all(char in 'abcdefghijklmnopqrstuvwxyz ' for char in sentence):
                valid = False
                return valid
    return valid


def train_test_split(corpus):
    corpus_len = len(corpus)
    train_set_tmp = corpus[:corpus_len-1]
    train_set = []
    for work_lst in train_set_tmp:
        for paragraph in work_lst:
            train_set.append(paragraph)

    test_set_tmp = corpus[corpus_len-1:]
    test_set = []
    for work_lst in test_set_tmp:
        for paragraph in work_lst:
            test_set.append(paragraph)

    return train_set, test_set


def extract_unique_n_minus_1_grams(corpus, n):
    n_minus_1_grams = set()
    for sentence in corpus:
        sentence = ''.join(sentence)
        for i in range(len(sentence) - n + 2):
            context = sentence[i:i + n - 1]
            n_minus_1_grams.add(context)
    return n_minus_1_grams

def get_train_test_data():
    works = shakespeare.fileids()

    all_works_sentences = []
    for work in works:
        sentences = extract_sentences_from_work(work)
        cleaned_sentences = [list(clean_text(sentence)) for sentence in sentences]
        all_works_sentences.append(cleaned_sentences)

    if validate_text(all_works_sentences):
        print(f'Dictionary size is correct!')
    else:
        print(f'Dictionary size is wrong!')
    train_set, test_set = train_test_split(all_works_sentences)

    return all_works_sentences, train_set, test_set


[nltk_data] Downloading package shakespeare to /root/nltk_data...
[nltk_data]   Unzipping corpora/shakespeare.zip.


In [4]:
sents, train_set, test_set = get_train_test_data()

Dictionary size is correct!


In [5]:
from transformers import PreTrainedTokenizer, TFOpenAIGPTLMHeadModel
from transformers import OpenAIGPTLMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import OpenAIGPTConfig
from transformers import OpenAIGPTTokenizer
import torch
import tensorflow as tf
from nltk.corpus import brown
import re
import nltk
from collections import Counter
from datasets import Dataset

class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab):
        self.vocab = vocab
        super().__init__(vocab=vocab, pad_token="<pad>", unk_token="<unk>")

        self.ids_to_tokens = {v: k for k, v in vocab.items()}
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.pad_token_id = vocab[self.pad_token]

    def _tokenize(self, text):
        return list(text)

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get("<unk>"))

    def _convert_ids_to_tokens(self, index):
        return self.ids_to_tokens.get(index, "<unk>")

    def get_vocab(self):
        return self.vocab

    def encode(self, text, max_length=None, padding=True, truncation=True):
        tokens = self._tokenize(text)
        token_ids = [self._convert_token_to_id(token) for token in tokens]
        if truncation and max_length:
            token_ids = token_ids[:max_length]
        if padding and max_length:
            token_ids += [self._convert_token_to_id(self.pad_token)] * (max_length - len(token_ids))
        #attention_mask = [1] * len(token_ids)  # Add attention mask
        return token_ids

    def decode(self, token_ids):
        tokens = [self._convert_ids_to_tokens(token_id) for token_id in token_ids]
        return ''.join(tokens)

def prepare_data(corpus_tokenized):
    corpus_texts = [''.join(sentence) for sentence in corpus_tokenized]
    return Dataset.from_dict({'text': corpus_texts})



In [6]:
flat_corpus_t = [item for sublist in train_set for item in sublist]
flat_corpus_v = [item for sublist in test_set for item in sublist]
flat_corpus = flat_corpus_t + flat_corpus_v

vocab_counter = Counter(flat_corpus)
vocab = {word: idx for idx, (word, _) in enumerate(vocab_counter.items(), start=1)}
vocab["<pad>"] = 0  # Add padding token

train_text = "\n".join(["".join(sent) for sent in train_set])
test_text = "\n".join(["".join(sent) for sent in test_set])
with open('train.txt', 'w') as f:
    f.write(train_text)

with open('test.txt', 'w') as f:
    f.write(test_text)


In [7]:
from datasets import load_dataset

dataset = load_dataset('text', data_files={'train': 'train.txt', 'validation': 'test.txt'})

train_dataset = dataset['train']
validation_dataset = dataset['validation']


tokenizer = CustomTokenizer(vocab)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=False, max_length=256)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/6096 [00:00<?, ? examples/s]

Map:   0%|          | 0/841 [00:00<?, ? examples/s]

#Validate data input

In [8]:
print(len(tokenized_train_dataset['input_ids'][4]))
tokenizer._tokenize(train_dataset['text'][1])
print(train_dataset['text'][1])

49
cleopatra oerflows the measure those his goodly eyes


#Load Model

In [84]:
config = OpenAIGPTConfig.from_json_file("/content/drive/MyDrive/Colab Notebooks/shakespeare-finetuned/config.json")
model = OpenAIGPTLMHeadModel.from_pretrained("/content/drive/MyDrive/Colab Notebooks/shakespeare-finetuned/model.safetensors", config=config)

#tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
#model = OpenAIGPTLMHeadModel.from_pretrained("openai-community/openai-gpt")

#config = OpenAIGPTConfig()
#model = OpenAIGPTLMHeadModel(config)
model.resize_token_embeddings(len(vocab))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

OpenAIGPTLMHeadModel(
  (transformer): OpenAIGPTModel(
    (tokens_embed): Embedding(28, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (lm_head): Linear(in_features=768, out_features=28, bias=False)
)

##Train model (skip if not training)

In [76]:
#batchsize 90 on the A100
#batchsize 40 on the L4
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate = 1e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
)


In [154]:
trainer.train()

Step,Training Loss
500,1.2719
1000,1.164
1500,1.1265


TrainOutput(global_step=1530, training_loss=1.1863770777883094, metrics={'train_runtime': 379.5264, 'train_samples_per_second': 160.621, 'train_steps_per_second': 4.031, 'total_flos': 1945545335635968.0, 'train_loss': 1.1863770777883094, 'epoch': 10.0})

In [168]:
model.save_pretrained("./shakespeare-scratch")

##Predict next top 3 tokens, and generate a longer ouptut string (qualitative analysis)

In [82]:
input_ids = torch.tensor(tokenizer.encode('k')).unsqueeze(0)
input_ids = input_ids.to(model.device)
outputs = model(input_ids, labels=input_ids)

loss, logits = outputs[:2]

last_token_logits = logits[0, -1, :]


probs = torch.softmax(last_token_logits, dim=-1)

num_preds=3
# Get the top num_preds predicted characters and their probabilities
top_preds = torch.topk(probs, num_preds)
predicted_chars = []
predicted_probs = []
predicted_ids = []
for i in range(num_preds):
    pred_id = top_preds.indices[i].item()
    pred_char = tokenizer.decode([pred_id])
    pred_prob = top_preds.values[i].item()

    predicted_ids.append(pred_id)
    predicted_chars.append(pred_char)
    predicted_probs.append(pred_prob)

entropy = -torch.sum(probs * torch.log2(probs))

print(predicted_chars, predicted_ids, predicted_probs, entropy)

['k', 'c', 'o'] [22, 19, 5] [0.12855510413646698, 0.06109977886080742, 0.0578085258603096] tensor(4.5919, device='cuda:0', grad_fn=<NegBackward0>)


In [83]:
input_ids = torch.tensor(tokenizer.encode('as')).unsqueeze(0)
input_ids = input_ids.to(model.device)
with torch.no_grad():
    outputs = model.generate(input_ids, max_length=120, num_return_sequences=1)

generated_text = tokenizer.decode(outputs[0])

output_tensor = outputs.cpu().numpy()

token_ids = output_tensor[0].tolist()

# Decode the token IDs to a string
decoded_string = tokenizer.decode(token_ids)

print("Decoded string:", decoded_string)

Decoded string: asssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss


##Prepare data for entropy calculation, and do entropy calculation


In [32]:
full_test = ''.join(flat_corpus_v)
full_train = ''.join(flat_corpus_t)
joined_sents = [''.join(sublist) for sublist_list in sents for sublist in sublist_list]
joined_sents = ''.join(joined_sents)

In [81]:

def extract_unique_n_minus_1_grams(corpus, n):
    n_minus_1_grams = set()
    for sentence in corpus:
        sentence = ''.join(sentence)
        for i in range(len(sentence) - n + 2):
            context = sentence[i:i + n - 1]
            if context == 'xz':
              print(sentence)
            n_minus_1_grams.add(context)
    return n_minus_1_grams

def find_probabilities(unique_strings, long_string):
    frequencies = {string: 0 for string in unique_strings}

    # Length of the unique strings
    string_length = len(list(unique_strings)[0])

    # Total number of possible substrings of the same length in the long string
    total_substrings = len(long_string) - string_length + 1

    for string in unique_strings:
        frequencies[string] = long_string.count(string)


    probabilities = {string: freq / total_substrings for string, freq in frequencies.items()}

    return probabilities


def calculate_bigram_probabilities(model, tokenizer, corpus, test_set, ngram = 2):

    bigrams = extract_unique_n_minus_1_grams(corpus, ngram)
    weights = find_probabilities(bigrams, test_set)

    probabilities = []
    entropies = []

    for bigram in bigrams:
        input_ids = torch.tensor(tokenizer.encode(bigram)).unsqueeze(0)
        input_ids = input_ids.to(model.device)

        outputs = model(input_ids, labels=input_ids)
        loss, logits = outputs[:2]

        last_token_logits = logits[0, -1, :]

        # Apply softmax to convert logits to probabilities
        probs = torch.softmax(last_token_logits, dim=-1)

        # Calculate Shannon entropy
        entropy = torch.sum(probs * torch.log2(1/probs))
        entropy = entropy * weights[bigram]
        probabilities.append(probs)
        entropies.append(entropy)

    return bigrams, probabilities, entropies


bigrams, probabilities, entropies = calculate_bigram_probabilities(model, tokenizer, joined_sents, full_test)

average_probability = sum(probs.mean() for probs in probabilities) / len(probabilities)
weighted_entropy = sum(entropy for entropy in entropies)

print("Average bigram probability:", average_probability)
print("Bigram entropy:", weighted_entropy)


Average bigram probability: tensor(0.0357, device='cuda:0', grad_fn=<DivBackward0>)
Bigram entropy: tensor(4.1569, device='cuda:0', grad_fn=<AddBackward0>)
