In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
import numpy as np
from tqdm import tqdm

In [10]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Collection: Download Warren Buffett's annual letters to shareholders

# Data Preprocessing
with open("/content/sample_data/WarrenBuffet.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [11]:
# Model Training
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the text
tokenized_text = tokenizer.encode(text)

# Define dataset
class TextDataset(Dataset):
    def __init__(self, tokenized_text, block_size):
        self.examples = []
        for i in range(0, len(tokenized_text) - block_size + 1, block_size):
            self.examples.append(tokenized_text[i:i+block_size])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)

Token indices sequence length is longer than the specified maximum sequence length for this model (81615 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
block_size = 128
train_dataset = TextDataset(tokenized_text, block_size)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Load pre-trained GPT2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Fine-tune the model
model.to(device)
model.train()
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
num_epochs = 3

In [13]:
for epoch in range(num_epochs):
    for batch in tqdm(train_loader):
        inputs, labels = batch.to(device), batch.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

100%|██████████| 160/160 [00:09<00:00, 16.66it/s]
100%|██████████| 160/160 [00:09<00:00, 16.62it/s]
100%|██████████| 160/160 [00:09<00:00, 16.78it/s]


In [14]:
# Evaluation
model.eval()
eval_loss = 0
eval_steps = 0
eval_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [15]:
for batch in tqdm(eval_loader):
    inputs, labels = batch.to(device), batch.to(device)
    with torch.no_grad():
        outputs = model(inputs, labels=labels)
        eval_loss += outputs.loss
    eval_steps += 1

eval_loss /= eval_steps
perplexity = torch.exp(eval_loss)
print("Perplexity:", perplexity)

100%|██████████| 160/160 [00:02<00:00, 54.88it/s]

Perplexity: tensor(9.4722, device='cuda:0')





In [16]:
def generate_text(model, tokenizer, prompt_text, length=100, temperature=1.0):
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape, device=device)
    output = model.generate(input_ids, attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id, do_sample=True, max_length=length, temperature=temperature)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [17]:
prompt = "Warren Buffett believes"
generated_text = generate_text(model, tokenizer, prompt, length=200, temperature=0.7)
print("Generated Text:\n", generated_text)

Generated Text:
 Warren Buffett believes Berkshire's competitive advantage over the competition is important, and that we have to continue to use it. 

• We are a leading provider of supplemental insurance to all of our customers. If we were to lose some, we would need 
to take other insurers - one of whom would be a big loss - to replace it. We must also 
evaluate whether or not our current policies will adequately cover the cost of the cost-free premium we impose on 
our customers. 

• We get credit for the investment that we make. If we lose money on a Berkshire product, we pay the premiums to 
another company, who then adds us the money to pay for the risk we take on our customers. 

• When we qualify for certain types of supplemental insurance, we pay the premium for the 
contingent service, which is a far more expensive and expensive service than supplemental insurance. If we 
continue to operate under


In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Collection: Download Warren Buffett's annual letters to shareholders

# Data Preprocessing
with open("/content/sample_data/WarrenBuffet.txt", "r", encoding="utf-8") as file:
    text = file.read()

In [19]:
# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Numerical Encoding
tokenized_text = tokenizer.encode(text)

# Padding and Truncation
max_length = 512  # Maximum length for GPT-2 models
tokenized_text = tokenized_text[:max_length]  # Truncate to maximum length

if len(tokenized_text) < max_length:
    tokenized_text += [tokenizer.pad_token_id] * (max_length - len(tokenized_text))  # Pad if needed

# Define dataset
class TextDataset(Dataset):
    def __init__(self, tokenized_text, block_size):
        self.examples = []
        for i in range(0, len(tokenized_text) - block_size + 1, block_size):
            self.examples.append(tokenized_text[i:i+block_size])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)


Token indices sequence length is longer than the specified maximum sequence length for this model (81615 > 1024). Running this sequence through the model will result in indexing errors


In [20]:
block_size = 128
train_dataset = TextDataset(tokenized_text, block_size)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Load pre-trained GPT2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Fine-tune the model
model.to(device)
model.train()
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

In [21]:
for epoch in range(num_epochs):
    for batch in tqdm(train_loader):
        inputs, labels = batch.to(device), batch.to(device)
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

100%|██████████| 1/1 [00:00<00:00, 23.29it/s]
100%|██████████| 1/1 [00:00<00:00, 21.00it/s]
100%|██████████| 1/1 [00:00<00:00, 18.77it/s]


In [22]:
# Evaluation
model.eval()
eval_loss = 0
eval_steps = 0
eval_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [23]:
for batch in tqdm(eval_loader):
    inputs, labels = batch.to(device), batch.to(device)
    with torch.no_grad():
        outputs = model(inputs, labels=labels)
        eval_loss += outputs.loss
    eval_steps += 1

eval_loss /= eval_steps
perplexity = torch.exp(eval_loss)
print("Perplexity:", perplexity.item())

100%|██████████| 1/1 [00:00<00:00, 69.20it/s]

Perplexity: 20.70240020751953





In [24]:
def generate_text(model, tokenizer, prompt_text, length=100, temperature=1.0):
    input_ids = tokenizer.encode(prompt_text, return_tensors="pt").to(device)
    attention_mask = torch.ones(input_ids.shape, device=device)
    output = model.generate(input_ids, attention_mask=attention_mask, pad_token_id=tokenizer.eos_token_id, do_sample=True, max_length=length, temperature=temperature)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [25]:
# Text Generation
prompt = "Warren Buffett believes"
generated_text = generate_text(model, tokenizer, prompt, length=200, temperature=0.7)
print("Generated Text:\n", generated_text)

Generated Text:
 Warren Buffett believes the market is a better bet than anyone else to see the U.S. economy recover in the next decade because of rising inequality.

"It's a tough year for the U.S. economy, because inequality has fallen in real terms and it's gotten worse over the last few years," Buffett said. "There's more than enough capital in the U.S. economy to support a sustained recovery."

He said that although the economy has grown faster than the U.S. economy since 2009, the number of workers has been less than half the amount they are today.

Still, Buffett said that he is confident that "there is momentum on the horizon."

"The U.S. economy is moving forward at a pace that we've never seen before," he said.


The second generated text is more impressive due to several high-impact design choices:

*   Relevance to Warren Buffett's Business Philosophy: The text discusses Berkshire Hathaway's competitive advantage, a topic that aligns closely with Warren Buffett's investment philosophy. Buffett often emphasizes the importance of identifying and capitalizing on a company's competitive strengths, making this discussion highly relevant.


*   Clarity and Coherence: The text is clear and logically organized, presenting three key points about Berkshire's competitive advantage and how it contributes to the company's success. Each point is succinctly explained, maintaining coherence throughout. Use of Bullet Points: The use of bullet points enhances readability and makes the text more visually appealing. Bullet points allow for easy identification of key points and emphasize the importance of each aspect of Berkshire's competitive advantage.

*   Incorporation of Specific Examples: The text provides specific examples to illustrate Berkshire's competitive advantage, such as its leading position in supplemental insurance and its ability to obtain credit for investments made. These examples add credibility and concreteness to the discussion.


*   Engagement with Shareholders: The text addresses the importance of evaluating current policies and making strategic decisions to maintain Berkshire's competitive position. This engagement with shareholders reflects Buffett's communication style in his annual letters, where he often provides insights into the company's operations and strategy.