# 1. Get a list of English-language fiction on Gutenberg
First [download Gutenberg's metadata catalogue](https://www.gutenberg.org/wiki/Gutenberg:Feeds). Unzip it into a folder and set `catalogue_dir` below to that folder's location.

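Then run this notebook. It will produce a dataframe showing the catalogue number, title, authors and subjects (keywords) for all the books available in English that have 'fiction' somewhere in their subject keywords. The dataframe is also saved as `english_fiction.pkl` in the folder that contains `catalogue_dir`.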

In [None]:
import pickle
import pandas as pd

from tqdm import tqdm
from pathlib import Path
from bs4 import BeautifulSoup

# Folder holding the unzipped Gutenberg metadata catalogue
catalogue_dir = Path.cwd().parent.joinpath("data", "gutenberg_catalogue")

# Collect the path of every .rdf metadata file found anywhere beneath that folder
files = list(catalogue_dir.rglob("*.rdf"))
len(files)

In [None]:
# Find books in English with the word 'fiction' in their subject tags.
#
# Each .rdf file in `files` is parsed on its own; matching books are collected
# as one dict per book, turned into a DataFrame and pickled next to the
# catalogue folder.
EBOOK_PREFIX = "ebooks/"

results = []
for rdf_path in tqdm(files):
    # Keep the file open only for as long as BeautifulSoup needs to read it
    with open(rdf_path, encoding="utf8") as f:
        doc = BeautifulSoup(f, "xml")

    # Get the language and subject data for this file. This runs for every
    # file in the catalogue, so subject nodes that carry no rdf:value child
    # are skipped instead of raising AttributeError on `.text`.
    subject_values = (s.find("rdf:value") for s in doc.find_all("subject"))
    subjects = [value.text for value in subject_values if value is not None]
    languages = [a.text.strip() for a in doc.find_all("language")]

    # Filter to English-language fiction
    if "en" not in languages or "fiction" not in " ".join(subjects).lower():
        continue

    # Remove the literal "ebooks/" prefix. str.lstrip("ebooks/") would strip
    # any leading run of the characters e, b, o, k, s and '/', which is only
    # correct by accident while the ids are purely numeric.
    catalogue_number = doc.find("ebook").attrs["rdf:about"]
    if catalogue_number.startswith(EBOOK_PREFIX):
        catalogue_number = catalogue_number[len(EBOOK_PREFIX):]

    results.append({
        "catalogue_number": catalogue_number,
        "title": doc.find("dcterms:title").text,
        "authors": [name.text for name in doc.find_all("name")],
        "subjects": subjects,
    })

books = pd.DataFrame(results)
books.to_pickle(catalogue_dir.parent / "english_fiction.pkl")

books.head()