In [13]:
import torch
torch.cuda.empty_cache()
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import Dataset
import os

In [14]:
class TextDataset(Dataset):
    """Fixed-length token blocks cut from a single UTF-8 text file.

    The whole file is read and encoded with ``tokenizer.encode`` in one call,
    then sliced into consecutive, non-overlapping windows of ``block_size``
    token ids. Tokens left over after the last full window are dropped.

    Args:
        file_path: Path of the text file to read.
        tokenizer: Object exposing ``encode(text)`` that returns a sliceable
            sequence of token ids.
        block_size: Number of token ids in every example.
    """

    def __init__(self, file_path, tokenizer, block_size):
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()

        # One encode call over the full corpus; windows are sliced from it below.
        token_ids = tokenizer.encode(raw_text)

        # Window starts are 0, block_size, 2 * block_size, ... and stop before
        # any window that would run past the end of the token sequence.
        final_start = len(token_ids) - block_size
        self.examples = [
            token_ids[start:start + block_size]
            for start in range(0, final_start + 1, block_size)
        ]

        print(f"Loaded {len(self.examples)} examples.")

    def __len__(self):
        """Return how many full blocks were extracted."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return block ``i`` as a ``torch.long`` tensor."""
        return torch.tensor(self.examples[i], dtype=torch.long)

In [15]:
# Pick the compute device: CUDA when PyTorch reports it available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [16]:

# Name of the pretrained checkpoint; the same identifier is passed to both
# GPT2Tokenizer.from_pretrained and GPT2LMHeadModel.from_pretrained below.
model_name = "gpt2"  # You can change this to "gpt2-medium", "gpt2-large", or "gpt2-xl" if you have more memory


In [17]:

# Tokenizer: load from the chosen checkpoint and reuse the end-of-sequence
# token as the padding token
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Model: load from the same checkpoint and point its config's pad id at the
# tokenizer's end-of-sequence id so both agree on padding
model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id

# Move the model onto the selected device (as the cell's last expression,
# this also displays the module repr)
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [18]:
# Configure LoRA
# r is the adapter rank, lora_alpha the adapter scaling factor and
# lora_dropout the dropout probability applied on the adapter path.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # Targets are selected by module name; in the GPT-2 repr printed above
    # "c_proj" appears in both the attention and MLP sub-blocks, so this
    # presumably adapts both — verify with model.print_trainable_parameters().
    target_modules=["c_attn", "c_proj"],
    # The targeted GPT-2 projections are Conv1D modules, which store weights
    # as (fan_in, fan_out). Declaring that explicitly avoids relying on PEFT's
    # warn-and-override fallback for Conv1D targets.
    fan_in_fan_out=True,
)

# Wrap the model with LoRA
# NOTE: this rebinds `model` to the wrapped model, so re-running this cell
# without reloading the base model would wrap an already wrapped model.
model = get_peft_model(model, peft_config)




In [19]:
# Load and preprocess the data
file_path = "trump_speeches_combined_processed.txt"
block_size = 128  # Reduced block size to save memory
dataset = TextDataset(file_path, tokenizer, block_size)

# Set up the trainer
# Each optimizer step accumulates gradient_accumulation_steps batches of
# per_device_train_batch_size sequences, i.e. 8 * 4 = 32 sequences per device.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=50,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=5e-5,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=100,
    # Request fp16 mixed precision only when running on CUDA. The notebook
    # falls back to CPU when no GPU is available, and TrainingArguments
    # rejects fp16=True in that situation.
    fp16=(device.type == "cuda"),
    optim="adamw_torch"
)

# mlm=False selects causal (next-token) language-modeling batches rather than
# masked-language-modeling ones.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
# NOTE(review): `model` is the LoRA-wrapped model here, so save_pretrained
# presumably writes only the adapter weights/config rather than a full GPT-2
# checkpoint — confirm the contents of output_dir.
output_dir = "./fine_tuned_gpt2"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Load the fine-tuned model
# NOTE(review): loading an adapter directory through
# GPT2LMHeadModel.from_pretrained relies on the transformers <-> PEFT
# integration; confirm the installed versions support it (otherwise merge the
# adapter into the base model before saving).
fine_tuned_model = GPT2LMHeadModel.from_pretrained(output_dir)
fine_tuned_model.to(device)
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

Token indices sequence length is longer than the specified maximum sequence length for this model (1022810 > 1024). Running this sequence through the model will result in indexing errors


Loaded 7990 examples.


Step,Training Loss
100,3.6681
200,3.3361
300,3.2518
400,3.2193
500,3.1828
600,3.1739
700,3.1447
800,3.1401
900,3.1279
1000,3.1103


In [20]:
def generate_text(model, tokenizer, max_length=100):
    """Sample one sequence from ``model``, seeded with only the BOS token.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Object exposing ``bos_token_id``, ``eos_token_id`` and
            ``decode(...)``.
        max_length: Value forwarded to ``generate`` as ``max_length``.

    Returns:
        The decoded first returned sequence, with special tokens skipped and
        surrounding whitespace stripped.
    """
    target_device = model.device

    # The prompt is a 1x1 batch holding just the beginning-of-sequence id,
    # with an all-ones attention mask of the same shape.
    prompt_ids = torch.tensor([[tokenizer.bos_token_id]]).to(target_device)
    prompt_mask = torch.ones_like(prompt_ids).to(target_device)

    # Sampling configuration, collected in one place and forwarded verbatim.
    sampling_options = dict(
        attention_mask=prompt_mask,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Gradients are not needed for generation.
    with torch.no_grad():
        sequences = model.generate(prompt_ids, **sampling_options)

    # Decode the first (only) sequence, skipping special tokens such as BOS.
    decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)
    return decoded.strip()

# Generate text
# Draw one sample (generate_text's default max_length of 100) from the
# reloaded fine-tuned model and tokenizer, then print it.
generated_text = generate_text(fine_tuned_model, fine_tuned_tokenizer)
print(generated_text)

"we're gonna build a wall. " this is a lie. it's an attack on america, and i don't know if it works. but we have to do it. and we will. we're going to rebuild our border, we are going, "we have a good border.
" and when we do that, it will be so easy. right? right. you look at what's happening with the fake news, you know what? it just doesn't happen
