# Rebuild Gutenberg Poetry Corpus with Proper Lineation

**Purpose:** Download and parse 1,191 Project Gutenberg poetry texts with proper line breaks, stanza markers, and structure.

**Output:** Clean JSONL corpus saved to Google Drive

**Time:** ~1-2 hours for 1,191 texts

---

## Instructions

1. Upload `gutenberg_ids.txt` to Colab Files
2. Run all cells in order
3. Output will be saved to Google Drive: `MyDrive/AI and Poetry/Historical Embeddings/gutenberg_poetry_corpus_clean.jsonl`

## Step 1: Mount Google Drive

In [None]:
# Mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Printed only after drive.mount() has returned without raising.
print("✓ Google Drive mounted")

## Step 2: Upload Gutenberg ID List

Upload `gutenberg_ids.txt` using the Files panel on the left, or run this cell to upload:

In [None]:
import os

from google.colab import files

# Only prompt for an upload when the ID list is not already in the working
# directory, so this cell works whether the file was added through the Files
# panel or is uploaded here.
if os.path.exists('gutenberg_ids.txt'):
    uploaded = {}
    print("gutenberg_ids.txt already present - skipping upload")
else:
    print("Upload gutenberg_ids.txt:")
    uploaded = files.upload()
    # Fail with a clear message if the expected file name did not arrive
    # (e.g. the dialog was cancelled or a differently named file was chosen).
    if not os.path.exists('gutenberg_ids.txt'):
        raise FileNotFoundError(
            "gutenberg_ids.txt was not found after upload; "
            "upload a file with exactly that name and re-run this cell"
        )

# Verify: read the non-empty lines as the list of Gutenberg IDs.
with open('gutenberg_ids.txt', 'r') as f:
    ids = [line.strip() for line in f if line.strip()]

print(f"\n✓ Loaded {len(ids)} Gutenberg IDs")

## Step 3: Install Dependencies

In [None]:
# IPython shell escape: install the two third-party packages imported by the
# rebuilder cell (requests and tqdm); -q keeps pip's output quiet.
!pip install -q requests tqdm

print("✓ Dependencies installed")

## Step 4: Define Corpus Rebuilder

In [None]:
import json
import requests
import time
from pathlib import Path
from tqdm import tqdm
import logging

# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before configuring it. Without it, basicConfig() does nothing when
# the root logger already has handlers, and the INFO-level progress messages
# emitted during the rebuild would never be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)


class GutenbergCorpusRebuilder:
    """Rebuild a line-level corpus from original Project Gutenberg texts.

    Each text is downloaded, stripped of the Gutenberg header/footer, split
    into lines, and written as one JSON object per line (JSONL).

    Attributes:
        failed_downloads: Gutenberg IDs for which no URL pattern returned a
            usable plain-text body after all retries.
    """

    # Start markers are tried in this order; the first one present is used.
    _START_MARKERS = (
        "*** START OF THIS PROJECT GUTENBERG",
        "*** START OF THE PROJECT GUTENBERG",
        "***START OF THE PROJECT GUTENBERG",
        "*END*THE SMALL PRINT",
    )

    # The text is cut at whichever of these occurs earliest.
    _END_MARKERS = (
        "*** END OF THIS PROJECT GUTENBERG",
        "*** END OF THE PROJECT GUTENBERG",
        "***END OF THE PROJECT GUTENBERG",
        "End of the Project Gutenberg",
        "End of Project Gutenberg",
    )

    def __init__(self):
        self.failed_downloads = []

    @staticmethod
    def _decode_body(response):
        """Return the response body as text, preferring UTF-8.

        ``response.text`` relies on the charset declared by the server and
        can produce mojibake when none is declared, so the raw bytes are
        decoded as UTF-8 first (dropping a leading BOM) and ``response.text``
        is used only when the bytes are not valid UTF-8.
        """
        try:
            return response.content.decode('utf-8-sig')
        except UnicodeDecodeError:
            return response.text

    def download_text(self, gutenberg_id, max_retries=3):
        """Download the plain text of one Gutenberg book.

        Every URL pattern is tried on each attempt; between attempts the
        method sleeps ``2 ** attempt`` seconds.

        Args:
            gutenberg_id: Numeric Project Gutenberg ID.
            max_retries: Number of passes over the URL patterns.

        Returns:
            The decoded text, or ``None`` if nothing usable was retrieved
            (the ID is then appended to ``self.failed_downloads``).
        """
        url_patterns = [
            f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}.txt",
            f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}-0.txt",
            f"https://www.gutenberg.org/files/{gutenberg_id}/{gutenberg_id}.txt",
        ]

        for attempt in range(max_retries):
            for url in url_patterns:
                try:
                    response = requests.get(url, timeout=30)
                except requests.RequestException as exc:
                    # Network-level failure: note it and try the next URL.
                    logger.debug(f"Request failed for {url}: {exc}")
                    continue

                if response.status_code != 200:
                    continue

                text = self._decode_body(response)
                # An HTML page served with status 200 is not the book text.
                if not text[:256].lstrip().upper().startswith('<!DOCTYPE'):
                    return text

            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)

        logger.warning(f"Failed to download ID {gutenberg_id}")
        self.failed_downloads.append(gutenberg_id)
        return None

    def clean_gutenberg_text(self, text):
        """Remove the Gutenberg header and footer from ``text``.

        Everything up to and including the line containing the start marker
        is dropped, so the marker's own trailing text (book title, ``***``)
        does not leak into the result. The text is then cut at the earliest
        end marker, and surrounding whitespace is stripped. Text without any
        marker is returned stripped but otherwise unchanged.
        """
        for marker in self._START_MARKERS:
            _, found, tail = text.partition(marker)
            if found:
                # Discard the remainder of the marker line as well.
                text = tail.partition('\n')[2]
                break

        end_positions = [
            pos
            for pos in (text.find(marker) for marker in self._END_MARKERS)
            if pos != -1
        ]
        if end_positions:
            text = text[:min(end_positions)]

        return text.strip()

    def parse_poetry_lines(self, text, gutenberg_id):
        """Split a raw Gutenberg text into per-line records.

        The header/footer is removed first, then the body is split on
        ``'\\n'``. Blank lines are kept (flagged with ``is_blank``) so stanza
        breaks survive.

        Args:
            text: Raw downloaded text, including Gutenberg boilerplate.
            gutenberg_id: ID stored on every record.

        Returns:
            A list of dicts with keys ``line`` (whitespace-stripped content),
            ``gutenberg_id``, ``line_num`` (1-based position within the
            cleaned text, not within the original file) and ``is_blank``.
        """
        body = self.clean_gutenberg_text(text)

        records = []
        for line_num, raw_line in enumerate(body.split('\n'), 1):
            content = raw_line.strip()
            records.append({
                'line': content,
                'gutenberg_id': gutenberg_id,
                'line_num': line_num,
                'is_blank': not content,
            })
        return records

    def process_all(self, id_file, output_jsonl):
        """Download and parse every ID in ``id_file`` into a JSONL corpus.

        Args:
            id_file: Path to a text file with one integer Gutenberg ID per
                non-empty line.
            output_jsonl: Destination path; overwritten if it exists. Its
                parent directory is created when missing.

        Returns:
            A ``(processed, total_lines)`` tuple: the number of texts written
            and the number of JSONL records written.

        Raises:
            ValueError: If a non-empty line of ``id_file`` is not an integer.
        """
        # utf-8-sig tolerates a BOM at the start of the ID file.
        with open(id_file, 'r', encoding='utf-8-sig') as f:
            gutenberg_ids = [int(line.strip()) for line in f if line.strip()]

        logger.info(f"Processing {len(gutenberg_ids)} Gutenberg texts")

        output_path = Path(output_jsonl)
        # Create the destination folder up front so a missing directory does
        # not abort the run before any text is written.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        processed = 0
        total_lines = 0

        with open(output_jsonl, 'w', encoding='utf-8') as outfile:
            for gid in tqdm(gutenberg_ids, desc="Processing texts"):
                text = self.download_text(gid)

                if text is None:
                    continue

                lines = self.parse_poetry_lines(text, gid)

                for line_data in lines:
                    outfile.write(json.dumps(line_data) + '\n')
                total_lines += len(lines)

                processed += 1

                if processed % 50 == 0:
                    logger.info(f"Progress: {processed}/{len(gutenberg_ids)} texts, {total_lines:,} lines")

                time.sleep(0.5)  # Rate limiting

        logger.info("="*60)
        logger.info("Processing complete!")
        logger.info(f"Texts processed: {processed}/{len(gutenberg_ids)}")
        logger.info(f"Total lines: {total_lines:,}")
        logger.info(f"Failed downloads: {len(self.failed_downloads)}")

        if self.failed_downloads:
            # Saved next to the corpus so the failed IDs can be retried later.
            failed_file = str(output_path.parent / "failed_downloads.txt")
            with open(failed_file, 'w', encoding='utf-8') as f:
                for fid in self.failed_downloads:
                    f.write(f"{fid}\n")
            logger.info(f"Failed IDs saved to: {failed_file}")

        logger.info(f"Clean corpus saved to: {output_jsonl}")

        return processed, total_lines


# Confirmation that this cell ran and the class definition above is in scope.
print("✓ Rebuilder class defined")

## Step 5: Run Corpus Rebuild

**This will take 1-2 hours.** You can close the browser tab and it will keep running.

Check back later to see progress.

In [None]:
# Input ID list and the Drive location the rebuilt corpus is written to.
id_file = 'gutenberg_ids.txt'
output_file = '/content/drive/MyDrive/AI and Poetry/Historical Embeddings/gutenberg_poetry_corpus_clean.jsonl'

rebuilder = GutenbergCorpusRebuilder()

# Announce the long-running job before starting it.
print(
    "Starting corpus rebuild...",
    "This will take 1-2 hours for 1,191 texts.",
    "You can close the browser - it will keep running.\n",
    sep="\n",
)

# Download, parse and write every text; returns the summary counters.
processed, total_lines = rebuilder.process_all(id_file, output_file)

# Final summary framed by two 60-character rules.
print(f"\n{'=' * 60}")
print(
    "REBUILD COMPLETE!",
    f"Processed: {processed} texts",
    f"Total lines: {total_lines:,}",
    f"Output: {output_file}",
    sep="\n",
)
print('=' * 60)

## Step 6: Verify Output

Print the first few lines of the rebuilt corpus (compare them by eye with the original corrupted corpus) and count the total number of lines:

In [None]:
import json

output_file = '/content/drive/MyDrive/AI and Poetry/Historical Embeddings/gutenberg_poetry_corpus_clean.jsonl'

print("First 10 lines of rebuilt corpus:\n")

# Preview and count in a single pass over the file. Counting in Python avoids
# depending on an external `wc` binary and on the shape of its output.
total_count = 0
with open(output_file, 'r', encoding='utf-8') as f:
    for total_count, line in enumerate(f, 1):
        if total_count > 10:
            # Past the preview: keep iterating only to count the lines.
            continue
        data = json.loads(line)
        blank = "[BLANK LINE]" if data['is_blank'] else ""
        print(f"{total_count}. ID {data['gutenberg_id']} Line {data['line_num']}: {data['line'][:60]} {blank}")

# Count total lines
print(f"\nTotal lines in rebuilt corpus: {total_count}")

## Step 7: Optional - Compress Output

Compress the JSONL file to save space:

In [None]:
import gzip
import os
import shutil

input_file = '/content/drive/MyDrive/AI and Poetry/Historical Embeddings/gutenberg_poetry_corpus_clean.jsonl'
output_gz = input_file + '.gz'

print(f"Compressing {input_file}...")

# Stream the JSONL file into the gzip archive without loading it into memory.
with open(input_file, 'rb') as f_in, gzip.open(output_gz, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Check sizes (converted from bytes to MB)
orig_size = os.path.getsize(input_file) / (1024**2)
comp_size = os.path.getsize(output_gz) / (1024**2)

print(f"\nOriginal: {orig_size:.1f} MB")
print(f"Compressed: {comp_size:.1f} MB")
# An empty corpus file (e.g. every download failed) would otherwise raise
# ZeroDivisionError when computing the ratio.
if orig_size > 0:
    print(f"Compression: {(1 - comp_size/orig_size)*100:.1f}%")
else:
    print("Compression: n/a (input file is empty)")
print(f"\n✓ Saved to: {output_gz}")