In [None]:
from transformers import GPT2Tokenizer

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Example text
long_text = """
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Integer nec odio. Praesent libero. Sed cursus ante dapibus diam. Sed nisi. Nulla quis sem at nibh elementum imperdiet. Duis sagittis ipsum. Praesent mauris. Fusce nec tellus sed augue semper porta. Mauris massa. Vestibulum lacinia arcu eget nulla. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. 

In hac habitasse platea dictumst. Curabitur sodales ligula in libero. Sed dignissim lacinia nunc. Curabitur tortor. Pellentesque nibh. Aenean quam. In scelerisque sem at dolor. Maecenas mattis. Sed convallis tristique sem. Proin ut ligula vel nunc egestas porttitor. Morbi lectus risus, iaculis vel, suscipit quis, luctus non, massa. Fusce ac turpis quis ligula lacinia aliquet. Mauris ipsum. Nulla metus metus, ullamcorper vel, tincidunt sed, euismod in, nibh. Quisque volutpat condimentum velit. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. 

Nam nec ante. Sed lacinia, urna non tincidunt mattis, tortor neque adipiscing diam, a cursus ipsum ante quis turpis. Nulla facilisi. Ut fringilla. Suspendisse potenti. Nunc feugiat mi a tellus consequat imperdiet. Vestibulum sapien. Proin quam. Etiam ultrices. Suspendisse in justo eu magna luctus suscipit. Sed lectus. 

"""

# Tokenize the text
tokenized_text = tokenizer.encode(long_text, add_special_tokens=True, max_length=512, truncation=True)



In [None]:
txt = """
First line. 

second line

third line!
"""

enc = tokenizer.encode(txt, add_special_tokens=True, max_length=512, truncation=True)

enc

dec = tokenizer.decode(enc)

dec

In [None]:
txtlst = [
"""first line. 

second line

third Line!""",
"new document",
"new single line"
]

# Tokenize each document separately and concatenate them with separator tokens
tokenized_documents = []
for doc in txtlst:
    tokenized_doc = tokenizer.encode(doc, add_special_tokens=False)
    tokenized_documents.extend(tokenized_doc + [tokenizer.eos_token_id])  # Adding end-of-sequence token between documents


tokenized_documents

dec = [tokenizer.decode(enc) for enc in tokenized_documents]

dec

In [None]:
#SOTU

from nltk.corpus import state_union


In [None]:

docs = [state_union.raw(f) for f in state_union.fileids()]

tokenized_docs = []
for doc in docs:
    tokenized_doc = tokenizer.encode(doc, add_special_tokens=False, 
                                     #max_length=1600, 
                                     #truncation=False
                                     )
    tokenized_docs.extend(tokenized_doc + [tokenizer.eos_token_id])  # Adding end-of-sequence token between documents


In [None]:
len(tokenized_docs)

In [None]:
tokenized_docs[:100]

dec = tokenizer.decode(tokenized_docs[:100])

dec

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math

torch.cuda.empty_cache()

device='cpu'

if torch.cuda.is_available():
    device = 'cuda'

#import numpy as np

from transformers import GPT2Tokenizer

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, embed_size, max_sequence_length=5000,device='cpu'):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_sequence_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2) * -(math.log(10000.0) / embed_size))
        self.positional_encoding = torch.zeros(max_sequence_length, embed_size)
        self.positional_encoding[:, 0::2] = torch.sin(position * div_term)
        self.positional_encoding[:, 1::2] = torch.cos(position * div_term)
        self.positional_encoding = self.positional_encoding.unsqueeze(0).to(device=device)

    def forward(self, x):
        return x + self.positional_encoding[:, :x.size(1)].detach()


# Define your dataset class
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

# Define the Transformer model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, hidden_size, num_layers, dropout_prob, max_sequence_length=5000,device='cpu'):
        super(TransformerModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, max_sequence_length,device=device)
        self.transformer_encoder_layers = TransformerEncoderLayer(embed_size, num_heads, hidden_size, dropout_prob)
        self.transformer_encoder = TransformerEncoder(self.transformer_encoder_layers, num_layers)
        self.fc = nn.Linear(embed_size, vocab_size)
        
    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        embedded_with_position = self.positional_encoding(embedded)
        embedded_with_position = embedded_with_position.permute(1, 0, 2)  # Transformer expects sequence length first
        output = self.transformer_encoder(embedded_with_position)
        output = output.permute(1, 0, 2)  # Back to batch first
        logits = self.fc(output)
        return logits
    
# Define a function to generate text given a starting prompt
def generate_text(model, prompt, max_length=100, temperature=1.0):
    model.eval()
    with torch.no_grad():
        current_token = torch.tensor([[prompt]], dtype=torch.long)
        output_sequence = [prompt]
        
        for _ in range(max_length):
            logits = model(current_token)
            logits = logits[0, -1, :] / temperature
            probabilities = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probabilities, 1)
            output_sequence.append(next_token.item())
            current_token = next_token.unsqueeze(0)
            if next_token == 1:  # Stop generating if EOS token is generated
                break
                
    return output_sequence

def generate_text_cuda(model, prompt, max_length=100, temperature=1.0):
    model.eval()
    with torch.no_grad():
        current_token = torch.tensor([[prompt]], dtype=torch.long).cuda()
        output_sequence = [prompt]
        
        for _ in range(max_length):
            logits = model(current_token)
            logits = logits[0, -1, :] / temperature
            probabilities = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probabilities, 1)
            output_sequence.append(next_token.item())
            current_token = next_token.unsqueeze(0)
            if next_token == 1:  # Stop generating if EOS token is generated
                break
                
    return output_sequence

In [2]:



# Dummy data for demonstration
seq_length = 20
batch_size = 32
vocab_size = 10000
dummy_input = torch.randint(0, vocab_size, (seq_length, batch_size))  # Random input sequence

# Hyperparameters
embed_size = 128
num_heads = 4
hidden_size = 256
num_layers = 2
dropout_prob = 0.1

# Instantiate the model
model = TransformerModel(vocab_size, embed_size, num_heads, hidden_size, num_layers, dropout_prob)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Training loop (dummy example)
num_epochs = 10
for epoch in range(num_epochs):
    optimizer.zero_grad()
    logits = model(dummy_input)
    loss = criterion(logits.view(-1, vocab_size), dummy_input.view(-1))
    loss.backward()
    optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

# Save the trained model
#torch.save(model.state_dict(), 'small_transformer_model.pth')

Epoch [1/10], Loss: 9.345067977905273
Epoch [2/10], Loss: 9.10288143157959
Epoch [3/10], Loss: 8.877528190612793
Epoch [4/10], Loss: 8.660417556762695
Epoch [5/10], Loss: 8.456753730773926
Epoch [6/10], Loss: 8.261812210083008
Epoch [7/10], Loss: 8.048837661743164
Epoch [8/10], Loss: 7.841767311096191
Epoch [9/10], Loss: 7.635434627532959
Epoch [10/10], Loss: 7.436553001403809


In [3]:
generated_sequence = generate_text(model, prompt=10)  # You can provide any integer as the starting prompt
print('Generated sequence:', generated_sequence)


Generated sequence: [10, 9906, 4485, 3379, 2575, 4871, 1690, 8158, 529, 3708, 8759, 2922, 9740, 1695, 1498, 7538, 2346, 9762, 7206, 4156, 901, 5765, 7035, 8598, 7048, 9014, 7972, 9525, 5764, 6930, 508, 8478, 9505, 2680, 540, 2939, 4200, 3585, 5569, 8594, 6750, 3076, 5262, 6919, 3901, 2126, 4422, 1098, 6945, 7915, 3519, 928, 6261, 671, 188, 3948, 5523, 5015, 8954, 556, 495, 9509, 2258, 5571, 2533, 6244, 3202, 1785, 5253, 4033, 8984, 8994, 8870, 6128, 1999, 1763, 5037, 5721, 4980, 4682, 9374, 6571, 4989, 7951, 589, 2674, 1797, 5766, 3641, 563, 3925, 2646, 7261, 6291, 6276, 2901, 865, 9560, 8785, 3246, 6569]


In [4]:
torch.cuda.is_available()

True

In [5]:
torch.cuda.empty_cache()


In [6]:
from nltk.corpus import state_union

docs = [state_union.raw(f) for f in state_union.fileids()]

tokenized_docs = []
for doc in docs:
    tokenized_doc = tokenizer.encode(doc, add_special_tokens=False, 
                                     #max_length=1600, 
                                     #truncation=False
                                     )
    tokenized_docs.extend(tokenized_doc + [tokenizer.eos_token_id])  # Adding end-of-sequence token between documents


Token indices sequence length is longer than the specified maximum sequence length for this model (2264 > 1024). Running this sequence through the model will result in indexing errors


In [7]:
input_ids = torch.tensor(tokenized_docs)

dataset = MyDataset(input_ids)

batch_size = 512

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)


In [11]:
vocab_size = tokenizer.vocab_size

# Hyperparameters
num_heads = 6
embed_subsize = 32
embed_size = num_heads*embed_subsize
hidden_size = num_heads*embed_subsize
num_layers = 4
dropout_prob = 0.1


model = TransformerModel(vocab_size, embed_size, num_heads, hidden_size, num_layers, dropout_prob,device='cuda').cuda()



In [12]:
lr = 1e-4
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=lr)


In [13]:
# Training loop (dummy example)
num_epochs = 2
for epoch in range(num_epochs):

    total_loss = 0.0

    for batch in dataloader:

        batch=batch.cuda().unsqueeze(0)
        optimizer.zero_grad()
        logits = model(batch)
        loss = criterion(logits.view(-1, vocab_size), batch.view(-1))
        loss.backward()
        optimizer.step()

        total_loss+=loss.item()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss}')




Epoch [1/2], Loss: 4087.267250061035
Epoch [2/2], Loss: 1798.0610418319702


In [14]:
generated_sequence = generate_text_cuda(model, prompt=16000,temperature=1)  # You can provide any integer as the starting prompt
print('Generated sequence:', generated_sequence)

Generated sequence: [16000, 3336, 8741, 17984, 23637, 7876, 523, 662, 8904, 13293, 2372, 12859, 18424, 2793, 7584, 4385, 6303, 16826, 19586, 20903, 5091, 12546, 44129, 1365, 14923, 4441, 257, 45338, 5716, 26718, 2756, 1363, 17413, 32288, 16247, 3394, 3812, 76, 5975, 4133, 43270, 18147, 11798, 503, 8598, 39246, 29874, 263, 15055, 14186, 3395, 32293, 10219, 6961, 7763, 39530, 2222, 38032, 6875, 18385, 1381, 22459, 1957, 1904, 8494, 27522, 15331, 4290, 11501, 1367, 3379, 47649, 30383, 1415, 20197, 9, 326, 3452, 1716, 4906, 869, 3265, 41495, 7918, 21487, 9194, 1957, 6556, 12356, 7810, 1069, 13, 554, 22584, 780, 1219, 14607, 23781, 502, 2236, 1365]


In [15]:

tokenizer.decode(generated_sequence)

' Message THE Nation observers dialect urban so pre substantial achievement threat�icable lower puts supposed 1990 willingness Marines finances occurred disagree Kodi better rumors creating a kins treated segregation price home steppingregor dignity Russian towardm surprise resourcesrocal Cuban industries out mountain DeskArthurer triple precious Met benign initiative proposal contained Dyn bringperia declared tuitionats preservation localuse solve Thought monetary citizens wisdom 11iableAuthentITIES14 Revenue* that latest becometype call populationmeat myth utilized producing local motiv talented colleaguesex. Infact becauseoh reasoning um me shall better'

In [16]:

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 20464465


In [17]:
import pynvml
import time

In [18]:

def get_gpu_temperature():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming you have only one GPU
    gpu_temperature = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    pynvml.nvmlShutdown()
    return gpu_temperature


In [19]:

temperature = get_gpu_temperature()
print(f"GPU Temperature: {temperature} degrees Celsius")

GPU Temperature: 70 degrees Celsius


In [20]:
temperature

70

In [21]:
lr = 5e-4
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=lr)


In [22]:

epoch = 0
num_epochs = 30

while epoch < num_epochs:

    temperature = get_gpu_temperature()

    if temperature < 75:

        total_loss = 0.0

        for batch in dataloader:

            batch=batch.cuda().unsqueeze(0)
            optimizer.zero_grad()
            logits = model(batch)
            loss = criterion(logits.view(-1, vocab_size), batch.view(-1))
            loss.backward()
            optimizer.step()

            total_loss+=loss.item()
        
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss}')
        epoch+=1
    
    else:
        print(f"Sleeping...{temperature}")
        time.sleep(30)

    

Epoch [1/30], Loss: 703.6633914113045
Epoch [2/30], Loss: 177.0269646421075
Sleeping...75
Epoch [3/30], Loss: 49.81680866423994
Epoch [4/30], Loss: 9.126902840391267
Sleeping...75
Epoch [5/30], Loss: 1.4451938063430134
Epoch [6/30], Loss: 0.7366683402215131
Sleeping...76
Epoch [7/30], Loss: 0.43585958385665435
Epoch [8/30], Loss: 0.2698445964997518
Sleeping...75
Epoch [9/30], Loss: 0.17099054516802425
Epoch [10/30], Loss: 0.10983816627049237
Sleeping...76
Epoch [11/30], Loss: 0.07112167431114358
Epoch [12/30], Loss: 0.04627244730909297
Sleeping...75
Epoch [13/30], Loss: 0.030197119576769182
Epoch [14/30], Loss: 0.019742071815471718
Sleeping...75
Epoch [15/30], Loss: 0.012926153644002625
Epoch [16/30], Loss: 0.008474889194530988
Sleeping...76
Epoch [17/30], Loss: 0.0055655126657256915
Epoch [18/30], Loss: 0.0036631661948831606
Epoch [19/30], Loss: 0.002419059901399123
Sleeping...77
Epoch [20/30], Loss: 0.001604369345784562
Epoch [21/30], Loss: 0.0010699926142478944
Sleeping...75
Epoch [

In [23]:
generated_sequence = generate_text_cuda(model, prompt=16000,temperature=1)  # You can provide any integer as the starting prompt
print('Generated sequence:', generated_sequence)

Generated sequence: [16000, 39981, 7622, 33019, 9839, 17093, 384, 19773, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262, 262]


In [24]:
tokenizer.decode(generated_sequence)

' Message assassinated keeps slows tie unrest se Mohammed the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the'

In [25]:
torch.save(model.state_dict(), 'model.pt')