In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
class TextDataset(Dataset):
    column_names = ["input_ids", "attention_mask"]  # Updated to include attention_mask

    def __init__(self, file_path, tokenizer, block_size):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Tokenize the entire text
        tokenized = tokenizer(text, truncation=True, max_length=block_size, padding="max_length", return_tensors="pt")
        self.input_ids = tokenized["input_ids"][0]
        self.attention_mask = tokenized["attention_mask"][0]
        
        # Create chunks of block_size
        self.examples = []
        for i in range(0, len(self.input_ids) - block_size + 1, block_size):
            self.examples.append({
                "input_ids": self.input_ids[i:i + block_size],
                "attention_mask": self.attention_mask[i:i + block_size]
            })

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

In [3]:
# Determine the device
if torch.cuda.is_available():
    device = torch.device("cuda")
# elif torch.backends.mps.is_available():
#     device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cpu


In [4]:
# Load the model and tokenizer
model_name = "meta-llama/Meta-Llama-3.1-8B"
access_token = os.environ.get('ACCESS_TOKEN')
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token).to(device)

Loading checkpoint shards: 100%|██████████| 4/4 [01:09<00:00, 17.32s/it]


In [5]:
# Handle padding token for LLaMA
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [6]:
# Load and preprocess the data
file_path = "trump_speeches_combined_processed.txt"
block_size = 512
dataset = TextDataset(file_path, tokenizer, block_size)

In [7]:
# Set up the trainer
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=2e-5,
    warmup_steps=500,
    fp16=(device.type == "cuda"),  # Use fp16 only on CUDA GPUs
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    logging_dir='./logs',
    logging_steps=100,
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_llama")
tokenizer.save_pretrained("./fine_tuned_llama")

RuntimeError: MPS backend out of memory (MPS allocated: 18.00 GB, other allocations: 816.00 KB, max allowed: 18.13 GB). Tried to allocate 224.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Generate text without prompting
def generate_text(model, tokenizer, max_length=100):
    input_ids = tokenizer.encode("", return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(device)
    
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
    )
    
    return tokenizer.decode(output[0], skip_special_tokens=True)

: 

In [None]:
# Load the fine-tuned model
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_llama").to(device)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_llama")

# Generate text
generated_text = generate_text(fine_tuned_model, fine_tuned_tokenizer)
print(generated_text)

: 

: 

: 