In [4]:
!pip install nltk




In [1]:
import torch
import nltk
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from nltk.corpus import shakespeare, brown
from nltk.util import ngrams
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import os
nltk.download('shakespeare')
nltk.download('brown')

def preprocess_text(corpus):
    return ' '.join(corpus)


class TextDataset(Dataset):
    def __init__(self, tokenizer, texts, block_size):
        self.examples = []
        for text in texts:
            tokenized_text = tokenizer.encode(text, add_special_tokens=True)
            for i in range(0, len(tokenized_text) - block_size + 1, block_size):
                self.examples.append(tokenized_text[i:i + block_size])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item])


EPOCHS = 10
BATCH_SIZE = 64
LEARNING_RATE = 5e-5
BLOCK_SIZE = 128

config = GPT2Config()
model = GPT2LMHeadModel(config)

tokenizer = AutoTokenizer.from_pretrained("gpt2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

shakespeare_corpus = shakespeare.words('hamlet.xml')
brown_corpus = brown.words(categories='news')

train_texts = [preprocess_text(shakespeare_corpus), preprocess_text(brown_corpus)]
train_dataset = TextDataset(tokenizer, train_texts, BLOCK_SIZE)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

model.train()
for epoch in range(EPOCHS):
    for batch in train_dataloader:
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {loss.item()}")

model_save_path = '/Users/howard/Documents/PyCharmProjects/EEC289A/final_project/distilgpt2_train from scratch'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")


[nltk_data] Downloading package shakespeare to /root/nltk_data...
[nltk_data]   Unzipping corpora/shakespeare.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (46674 > 1024). Running this sequence through the model will result in indexing errors


Epoch 1/10, Loss: 9.104451179504395
Epoch 2/10, Loss: 8.538995742797852
Epoch 3/10, Loss: 8.060266494750977
Epoch 4/10, Loss: 7.598653316497803
Epoch 5/10, Loss: 7.338159561157227
Epoch 6/10, Loss: 7.224276542663574
Epoch 7/10, Loss: 6.905210494995117
Epoch 8/10, Loss: 6.904689311981201
Epoch 9/10, Loss: 6.951355934143066
Epoch 10/10, Loss: 6.736823558807373
Model saved to /Users/howard/Documents/PyCharmProjects/EEC289A/final_project/distilgpt2_train from scratch


In [2]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config
from nltk.corpus import shakespeare, brown
from nltk.util import ngrams
from collections import Counter

def preprocess_text(corpus):
    return ' '.join(corpus)

def get_weighted_n_minus_1_grams(corpus, n):
    n_grams = list(ngrams(corpus, n))
    n_minus_1_grams = [gram[:-1] for gram in n_grams]
    n_minus_1_gram_frequencies = Counter(n_minus_1_grams)
    total_count = sum(n_minus_1_gram_frequencies.values())
    weighted_grams = {gram: count / total_count for gram, count in n_minus_1_gram_frequencies.items()}
    return weighted_grams

model_load_path = '/Users/howard/Documents/PyCharmProjects/EEC289A/final_project/distilgpt2_train from scratch'  # 修改为预训练模型路径
tokenizer = AutoTokenizer.from_pretrained(model_load_path)
model = GPT2LMHeadModel.from_pretrained(model_load_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

character_vocabulary = list("abcdefghijklmnopqrstuvwxyz ")
char_to_token_id = {char: tokenizer.encode(char, add_special_tokens=False)[0] for char in character_vocabulary}

def calculate_average_entropy(corpus, n):
    data = preprocess_text(corpus)
    weighted_n_minus_1_grams = get_weighted_n_minus_1_grams(data, n)
    n_minus_1_gram_keys = weighted_n_minus_1_grams.keys()

    n_minus_1_grams_list = []
    for gram in n_minus_1_gram_keys:
        n_minus_1_grams_list.append([char for char in gram])

    entropies = []

    for gram in n_minus_1_grams_list:
        input_text = ''.join(gram)
        input_ids = torch.tensor([tokenizer.encode(input_text, add_special_tokens=False)], device=device)

        with torch.no_grad():
            outputs = model(input_ids)
            logits = outputs.logits

        next_token_logits = logits[:, -1, :].squeeze()
        char_logits = [next_token_logits[char_to_token_id[char]].item() for char in character_vocabulary]
        char_logits_tensor = torch.tensor(char_logits)
        char_probs = torch.softmax(char_logits_tensor, dim=-1)

        char_probs_tensor = torch.tensor(char_probs)
        entropy = -torch.sum(char_probs_tensor * torch.log(char_probs_tensor + 1e-9))

        entropies.append(entropy.item())

    average_entropy = sum(weight * entropy for weight, entropy in zip(weighted_n_minus_1_grams.values(), entropies))

    return average_entropy

shakespeare_corpus = shakespeare.words('hamlet.xml')
average_entropy_shakespeare = calculate_average_entropy(shakespeare_corpus, n=3)
print(f"Average Entropy for Shakespeare: {average_entropy_shakespeare}")

brown_corpus = brown.words(categories='news')
average_entropy_brown = calculate_average_entropy(brown_corpus, n=3)
print(f"Average Entropy for Brown: {average_entropy_brown}")


  char_probs_tensor = torch.tensor(char_probs)


Average Entropy for Shakespeare: 2.9440640662201485
Average Entropy for Brown: 2.939032317695957


In [3]:
shakespeare_corpus = shakespeare.words('hamlet.xml')
average_entropy_shakespeare = calculate_average_entropy(shakespeare_corpus, n=2)
print(f"Average Entropy for Shakespeare: {average_entropy_shakespeare}")

brown_corpus = brown.words(categories='news')
average_entropy_brown = calculate_average_entropy(brown_corpus, n=2)
print(f"Average Entropy for Brown: {average_entropy_brown}")

  char_probs_tensor = torch.tensor(char_probs)


Average Entropy for Shakespeare: 2.9304886259870937
Average Entropy for Brown: 2.930089216527105
