<a href="https://colab.research.google.com/github/IrfanSadik13/sherlock_gpt/blob/main/sherlock_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:

import requests

# Dictionary of all Sherlock Holmes stories with their Gutenberg URLs
sherlock_stories = {
    "A Study in Scarlet": "https://www.gutenberg.org/files/1661/1661-0.txt",
    "The Sign of the Four": "https://www.gutenberg.org/files/2097/2097-0.txt",
    "The Adventures of Sherlock Holmes": "https://www.gutenberg.org/files/1661/1661-0.txt",
    "The Memoirs of Sherlock Holmes": "https://www.gutenberg.org/files/834/834-0.txt",
    "The Hound of the Baskervilles": "https://www.gutenberg.org/files/2852/2852-0.txt",
    "The Return of Sherlock Holmes": "https://www.gutenberg.org/files/108/108-0.txt",
    "The Valley of Fear": "https://www.gutenberg.org/files/22357/22357-0.txt",
    "His Last Bow": "https://www.gutenberg.org/files/2852/2852-0.txt",
    "The Case-Book of Sherlock Holmes": "https://www.gutenberg.org/files/221/221-0.txt"
}

# Output file to store all collected text
output_file = "sherlock_holmes_complete.txt"

# Fetch and save stories
with open(output_file, "w", encoding="utf-8") as f:
    for title, url in sherlock_stories.items():
        print(f"Fetching {title}...")
        response = requests.get(url)
        if response.status_code == 200:
            f.write(f"\n\n=== {title} ===\n\n")
            f.write(response.text)
        else:
            print(f"Failed to fetch {title}")

print(f"All Sherlock Holmes stories saved to {output_file}")

Fetching A Study in Scarlet...
Fetching The Sign of the Four...
Fetching The Adventures of Sherlock Holmes...
Fetching The Memoirs of Sherlock Holmes...
Fetching The Hound of the Baskervilles...
Fetching The Return of Sherlock Holmes...
Fetching The Valley of Fear...
Failed to fetch The Valley of Fear
Fetching His Last Bow...
Fetching The Case-Book of Sherlock Holmes...
All Sherlock Holmes stories saved to sherlock_holmes_complete.txt


In [14]:

import re

# Load the collected text file
input_file = "sherlock_holmes_complete.txt"
output_file = "sherlock_cleaned.txt"

def clean_text(text):
    # Remove Gutenberg headers and footers
    text = re.sub(r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .*? \*\*\*", "", text, flags=re.DOTALL)
    text = re.sub(r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .*? \*\*\*", "", text, flags=re.DOTALL)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text)

    # Normalize text (optional)
    text = text.lower()

    return text.strip()

# Read the file and clean it
with open(input_file, "r", encoding="utf-8") as f:
    raw_text = f.read()

cleaned_text = clean_text(raw_text)

# Save the cleaned text
with open(output_file, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"Preprocessed text saved to {output_file}")

Preprocessed text saved to sherlock_cleaned.txt


In [15]:

!pip install tokenizers

from tokenizers import Tokenizer, trainers, models, pre_tokenizers

# Load text
with open("sherlock_cleaned.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Train a new Byte-Pair Encoding (BPE) tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=8000, min_frequency=2)

# Train tokenizer
tokenizer.train_from_iterator([text], trainer)

# Test tokenization
encoded = tokenizer.encode("Mr. Sherlock Holmes was sitting in his chair.")
print("Tokens:", encoded.ids)
print("Decoded:", tokenizer.decode(encoded.ids))

# Save tokenizer
tokenizer.save("sherlock_tokenizer.json")
print("Tokenizer saved!")

[31mERROR: Operation cancelled by user[0m[31m
[0mTokens: [49, 12, 216, 388, 466, 183, 120, 1364, 86, 129, 702, 12]
Decoded: r . her lock ol mes was sitting in his chair .
Tokenizer saved!


In [None]:

import torch
import numpy as np

# Load tokenized data
with open("sherlock_tokenizer.json", "r") as f:
    from tokenizers import Tokenizer
    tokenizer = Tokenizer.from_file("sherlock_tokenizer.json")

# Load the cleaned text
with open("sherlock_cleaned.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Tokenize the text
encoded_text = tokenizer.encode(text).ids

# Define sequence length
SEQ_LEN = 256  # You can adjust this based on GPU memory

# Create input-output pairs
input_sequences = []
output_sequences = []

for i in range(len(encoded_text) - SEQ_LEN):
    input_sequences.append(encoded_text[i:i+SEQ_LEN])
    output_sequences.append(encoded_text[i+1:i+SEQ_LEN+1])

# Convert to PyTorch tensors
X = torch.tensor(input_sequences, dtype=torch.long)
Y = torch.tensor(output_sequences, dtype=torch.long)

# Save as PyTorch dataset
torch.save((X, Y), "sherlock_train_data.pt")

print("Training data prepared and saved!")

In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F

class MiniGPT(nn.Module):
    def __init__(self, vocab_size, d_model=256, n_heads=4, num_layers=4, seq_len=256):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(seq_len, d_model)

        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        seq_len = x.shape[1]
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0)

        x = self.embedding(x) + self.pos_embedding(positions)
        x = self.transformer(x)
        x = self.fc_out(x)

        return x

# Load tokenizer
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("sherlock_tokenizer.json")
VOCAB_SIZE = tokenizer.get_vocab_size()

# Define model
model = MiniGPT(VOCAB_SIZE)

print("Model initialized!")

Model initialized!




In [None]:

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

# Load dataset
X, Y = torch.load("sherlock_train_data.pt")

# Set device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move data to GPU
X, Y = X.to(device), Y.to(device)

# Create DataLoader
BATCH_SIZE = 32
dataset = TensorDataset(X, Y)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize model & move to GPU
VOCAB_SIZE = 8000  # Make sure to match vocab size with tokenizer
model = MiniGPT(VOCAB_SIZE).to(device)

# Define optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Mixed precision scaler
scaler = torch.cuda.amp.GradScaler()

# Training loop
EPOCHS = 5  # Adjust as needed
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():  # Mixed precision
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, VOCAB_SIZE), targets.view(-1))

        # Scale loss & backprop
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")

# Save trained model
torch.save(model.state_dict(), "sherlock_gpt_model.pth")
print("✅ Model training complete & saved!")

  X, Y = torch.load("sherlock_train_data.pt")
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():  # Mixed precision


In [None]:

!nvidia-smi