In [None]:
import requests
import os
import json

# Load the raw Gutenberg training text (too large to upload to GitHub) and
# split it into one entry per book.
with open("./datasets/train_100M/gutenberg.train", "r", encoding="utf-8") as f:
    all_books = f.read()
lines = all_books.split("\n")

# Every book starts with a header line beginning with "= = = "; remember where
# each header sits so the file can be cut into per-book slices.
beginning_indices = [i for i, line in enumerate(lines) if line.startswith("= = = ")]

# split up the books: gutenberg id (str) -> book text
texts = {}
# A book ends where the next header begins; the last book ends at end-of-file.
# (Pairing each start with an explicit end keeps the final book, which a
# "look at header i+1" loop silently drops.)
end_indices = beginning_indices[1:] + [len(lines)]
for start, end in zip(beginning_indices, end_indices):
    # The id is the first token after the 8-character header prefix
    # (presumably "= = = PG" -- TODO confirm against the data file).
    header_tokens = lines[start][8:].split()
    gutenberg_id = header_tokens[0] if header_tokens else ""
    if not gutenberg_id.isdigit():
        print("Id-length: ", len(gutenberg_id), "for id:", gutenberg_id)
        continue
    # Take every line up to (not including) the next header; rstrip removes the
    # trailing space left by blank separator lines at the end of a book.
    texts[gutenberg_id] = " ".join(lines[start + 1:end]).rstrip()
print(f"Found {len(texts)} books in the Gutenberg dataset.")
print("Example book text:", texts.get("52018", "")[:500])  # Print first 500 characters of a sample book

# Metadata per book: id -> { 'author': str, 'title': str, 'subjects': list, 'bookshelves': list }.
# A 'text' entry is attached at the bottom of this block for every id that is also in `texts`.
books_with_metadata = {}
METADATA_PATH = "./datasets/gutenberg/dataset_books_with_metadata.json"

# if metadata is locally available, load it
if os.path.exists(METADATA_PATH):
    with open(METADATA_PATH, "r", encoding="utf-8") as f:
        books_with_metadata = json.load(f)
    print(f"Loaded {len(books_with_metadata)} books with metadata from local file.")
    print("Example book metadata:", books_with_metadata.get("52018", {}))

# otherwise, fetch metadata from Gutendex
else:
    # Create the cache directory before the slow fetch loop, so a missing
    # directory cannot throw away all fetched metadata at the final write.
    os.makedirs(os.path.dirname(METADATA_PATH), exist_ok=True)

    # Fetch metadata for each book from Gutendex
    for count, gutenberg_id in enumerate(texts, start=1):
        if count % 50 == 0:
            print(f"Processing book ID {gutenberg_id}... ({count}/{len(texts)})")
        try:
            response = requests.get(f"https://gutendex.com/books/{gutenberg_id}", timeout=30)
            response.raise_for_status()
            metadata = response.json()

            # Only the first listed author is kept; fall back to "Unknown".
            authors = metadata.get("authors", [])
            author = authors[0].get("name", "Unknown") if authors else "Unknown"

            books_with_metadata[gutenberg_id] = {
                'author': author,
                'title': metadata["title"],
                'subjects': metadata["subjects"],
                'bookshelves': metadata["bookshelves"]
            }
        except requests.exceptions.RequestException as e:
            # Best effort: a failed lookup is reported and the book is left out,
            # so the cached file may cover fewer books than `texts`.
            print(f"Failed to get metadata for book {gutenberg_id}: {e}")
        except Exception as e:
            print(f"Error processing book {gutenberg_id}: {e}")
    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(books_with_metadata, f, ensure_ascii=False, indent=4)
    print(len(books_with_metadata), "books with metadata found.")
    # Keys are the string ids taken from `texts`, so look the example up by string.
    print("Example book metadata:", books_with_metadata.get("52018", {}))

# Attach the book text to each metadata entry whose id is present in the corpus.
# Entries without a matching id get no 'text' key.
for gutenberg_id, book in books_with_metadata.items():
    if gutenberg_id in texts:
        book['text'] = texts[gutenberg_id]


In [None]:
# Tally, per bookshelf and per subject, how many books carry the label and how
# many whitespace-separated words those books contain in total.
all_bookshelves = set()
all_subjects = set()
bookshelve_counts = {}
bookshelve_word_counts = {}
subject_counts = {}
subject_word_counts = {}
for book in books_with_metadata.values():
    # 'text' is only attached to entries whose id is in the loaded corpus, so a
    # cached metadata entry can lack it. Count such a book as 0 words rather
    # than raising KeyError; its labels are still recorded so that
    # all_subjects / all_bookshelves cover every label in the metadata.
    word_count = len(book.get("text", "").split())
    for bookshelve in book["bookshelves"]:
        all_bookshelves.add(bookshelve)
        bookshelve_counts[bookshelve] = bookshelve_counts.get(bookshelve, 0) + 1
        bookshelve_word_counts[bookshelve] = bookshelve_word_counts.get(bookshelve, 0) + word_count
    for subject in book["subjects"]:
        all_subjects.add(subject)
        subject_counts[subject] = subject_counts.get(subject, 0) + 1
        subject_word_counts[subject] = subject_word_counts.get(subject, 0) + word_count

print(f"Total unique bookshelves: {len(all_bookshelves)}")
print(f"Total unique subjects: {len(all_subjects)}")


In [None]:

# Alphabetical listings of every bookshelf and subject label.
bookshelf_list = sorted(all_bookshelves)
subject_list = sorted(all_subjects)

# Rankings by number of books, largest first.
sorted_bookshelves = sorted(bookshelve_counts.items(), key=lambda entry: entry[1], reverse=True)
sorted_subjects = sorted(subject_counts.items(), key=lambda entry: entry[1], reverse=True)

# Show the 10 labels of each kind that are attached to the most books.
print("\nMost common bookshelves:")
for shelf_name, n_books in sorted_bookshelves[:10]:
    print(f"{shelf_name}: {n_books} books")
print("\nMost common subjects:")
for subject_name, n_books in sorted_subjects[:10]:
    print(f"{subject_name}: {n_books} books")

# Rankings by total word count, largest first.
sorted_bookshelves_by_length = sorted(bookshelve_word_counts.items(), key=lambda entry: entry[1], reverse=True)
sorted_subjects_by_length = sorted(subject_word_counts.items(), key=lambda entry: entry[1], reverse=True)

# Show the 10 bookshelves with the most words.
print("\nLongest bookshelves:")
for shelf_name, n_words in sorted_bookshelves_by_length[:10]:
    print(f"{shelf_name}: {n_words} words")

# For subjects, list every one with at least 500,000 words, numbered from 1.
# The list is sorted descending, so stop at the first subject below the cutoff.
print("\nLongest subjects:")
for rank, (subject_name, n_words) in enumerate(sorted_subjects_by_length, start=1):
    if n_words < 500_000:
        break
    print(f"{rank}: {subject_name}: {n_words} words")



In [None]:
# create a mapping from subjects to the ids of the books that list them
subject_to_book_map = {subject: [] for subject in all_subjects}

# Count, for every pair of subjects, how many books list both.
# Pairs are counted per book (each book contributes its own subject pairs)
# instead of intersecting book sets for every possible pair of subjects: the
# result is the same, but the work scales with the subjects per book rather
# than with the square of the total number of subjects.
pair_counts = {}
for book_id, book in books_with_metadata.items():
    for subject in book["subjects"]:
        # setdefault keeps this working even if all_subjects is out of date.
        subject_to_book_map.setdefault(subject, []).append(book_id)

    # De-duplicate so a book counts once per pair; sort so each pair is keyed
    # as (alphabetically smaller, alphabetically larger).
    book_subjects = sorted(set(book["subjects"]))
    for position, subject1 in enumerate(book_subjects):
        for subject2 in book_subjects[position + 1:]:
            pair = (subject1, subject2)
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

# Store pairs in alphabetical order so that ties in the ranking below are
# broken alphabetically (sorted() is stable, also with reverse=True).
subject_pairs = dict(sorted(pair_counts.items()))

print("\nMost common subject pairs:")
sorted_subject_pairs = sorted(subject_pairs.items(), key=lambda x: x[1], reverse=True)
for (subject1, subject2), count in sorted_subject_pairs[:10]:
    print(f"{subject1} & {subject2}: {count} common books")

In [None]:
def create_dataset_from_list(books, filename, word_limit):
    """Download Project Gutenberg books and write them into one ``.train`` file.

    Each book is fetched from ``https://www.gutenberg.org/cache/epub/{id}/pg{id}.txt``
    and appended to ``./datasets/gutenberg/genres/{filename}.train`` as a header
    line ``= = = {gutenberg_id} {name}`` followed by the book text. An existing
    file with that name is removed first, and the output directory is created
    if it does not exist.

    Books are written whole, in the given order, until the next one would push
    the total past ``word_limit``. That book is cut down to the remaining
    budget (its words re-joined with single spaces) and processing stops.

    Args:
        books: iterable of ``(gutenberg_id, name)`` tuples.
        filename: stem of the output file (``.train`` is appended).
        word_limit: maximum total number of whitespace-separated words to write.

    Returns:
        None. A book whose download or write fails is reported with ``print``
        and skipped; the remaining books are still processed.
    """
    output_dir = "./datasets/gutenberg/genres"
    output_path = f"{output_dir}/{filename}.train"

    # Without the directory every append below would fail (and be swallowed by
    # the generic handler), leaving no dataset at all.
    os.makedirs(output_dir, exist_ok=True)
    if os.path.exists(output_path):
        os.remove(output_path)

    word_count = 0
    for gutenberg_id, name in books:
        remaining = word_limit - word_count
        if remaining <= 0:
            # Budget already used up (or non-positive to begin with): stop
            # before downloading, and never slice with a negative bound.
            print(f"Word limit reached. Stopping.")
            print(f"Wrote {word_count} words to {filename}.train with word limit {word_limit}.")
            break
        try:
            response = requests.get(f"https://www.gutenberg.org/cache/epub/{gutenberg_id}/pg{gutenberg_id}.txt", timeout=30)
            response.raise_for_status()
            text = response.text
            words = text.split()

            truncated = len(words) > remaining
            if truncated:
                print(f"Word limit reached. Stopping.")
                # write the remaining number of words to hit the word limit exactly
                words = words[:remaining]
                text = " ".join(words)

            with open(output_path, "a", encoding="utf-8") as f:
                f.write(f"= = = {gutenberg_id} {name}\n")
                f.write(text + "\n")
            # Only count words once they are actually on disk.
            word_count += len(words)

            if truncated:
                print(f"Wrote {word_count} words to {filename}.train with word limit {word_limit}.")
                break
        except requests.exceptions.RequestException as e:
            print(f"Failed to download book {gutenberg_id}: {e}")
        except Exception as e:
            print(f"Error processing book {gutenberg_id}: {e}")

In [None]:
# Example input for create_dataset_from_list: (Project Gutenberg id, title) tuples.
# The id is used to build the download URL; the title is only written into the
# "= = = {id} {title}" header line of the output file.
# NOTE(review): id 36 appears three times under three different titles, so the
# same download is written to the dataset three times and at most one of those
# titles can be right. The ids and the author names in the trailing comments
# look unverified -- confirm each against gutenberg.org before relying on them.
example_list = [
    (84, "Frankenstein; Or, The Modern Prometheus"),                      # Mary Shelley
    (35, "The Time Machine"),                                            # H. G. Wells
    (36, "The Island of Doctor Moreau"),                                 # H. G. Wells
    (36, "The War of the Worlds"),                                       # H. G. Wells
    (5230, "The Invisible Man"),                                         # H. G. Wells
    (12163, "The Sleeper Awakes"),                                       # H. G. Wells (aka When the Sleeper Wakes)
    (36, "When the Sleeper Wakes"),                                      # duplicate ID but alt title
    (1887, "A Princess of Mars"),                                        # Edgar Rice Burroughs
    (164, "Twenty Thousand Leagues Under the Sea"),                      # Jules Verne
    (147, "A Journey to the Centre of the Earth"),                       # Jules Verne
    (120, "The Mysterious Island"),                                      # Jules Verne
    (16, "From the Earth to the Moon"),                                  # Jules Verne
    (523, "Flatland: A Romance of Many Dimensions"),                     # Edwin A. Abbott
    (766, "Triplanetary"),                                               # E. E. "Doc" Smith
    (526, "First Lensman"),                                              # E. E. Smith
    (34217, "The Lost World"),                                           # Sir Arthur Conan Doyle
    (121, "The Strange Case of Dr. Jekyll and Mr. Hyde"),                # Stevenson (proto-SF/horror)
    (4080, "2 B R 0 2 B"),                                               # Kurt Vonnegut
    (73886, "Scanners Live in Vain"),                                    # Cordwainer Smith
    (111, "Caesar’s Column")                                             # Ignatius Donnelly
]

# Build ./datasets/gutenberg/genres/sci-fi.train, capped at 100,000 words.
create_dataset_from_list(example_list, "sci-fi", 100_000)