In [1]:
# In order to build the extended catalogue with additional fields, download the book rdf files from one of the Project Gutenberg mirrors:
# https://www.gutenberg.org/MIRRORS.ALL

# SETTINGS
CATALOG_RDF_DIR = None

In [None]:
from subprocess import call
import json
import os
import shutil
from time import strftime
import sys
import urllib.request
import defusedxml.ElementTree as parser
import re
import pandas as pd

In [None]:
LINE_BREAK_PATTERN = re.compile(r'[ \t]*[\n\r]+[ \t]*')
NAMESPACES = {
    'dc': 'http://purl.org/dc/terms/',
    'dcam': 'http://purl.org/dc/dcam/',
    'marcrel': 'http://id.loc.gov/vocabulary/relators/',
    'pg': 'http://www.gutenberg.org/2009/pgterms/',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
}


def safe_unicode(arg, *args, **kwargs):
    """ Coerce argument to Unicode if it's not already. """
    return arg if isinstance(arg, str) else str(arg, *args, **kwargs)

def fix_subtitles(title):
    """
    This formats subtitles with (semi)colons instead of new lines. The first
    subtitle is introduced with a colon, and the rest are introduced with
    semicolons.

    >>> fix_subtitles(u'First Across ...\r\nThe Story of ... \r\n'
    ... 'Being an investigation into ...')
    u'First Across ...: The Story of ...; Being an investigation into ...'
    """

    new_title = LINE_BREAK_PATTERN.sub(': ', title, 1)
    return LINE_BREAK_PATTERN.sub('; ', new_title)

def get_book(id, xml_file_path):
    """ Based on https://gist.github.com/andreasvc/b3b4189120d84dec8857 """

    # Parse the XML.
    document = None
    try:
        document = parser.parse(xml_file_path)
    except:
        raise Exception('The XML file could not be parsed.')

    # Get the book node.
    root = document.getroot()
    book = root.find('{%(pg)s}ebook' % NAMESPACES)

    result = {
        'id': int(id),
        'title': None,
        'authors': [],
        'translators': [],
        'type': None,
        'subjects': [],
        'languages': [],
        'formats': {},
        'downloads': None,
        'bookshelves': [],
        'copyright': None
    }

    # Authors
    creators = book.findall('.//{%(dc)s}creator' % NAMESPACES)
    for creator in creators:
        author = {'birth': None, 'death': None}
        name = creator.find('.//{%(pg)s}name' % NAMESPACES)
        if name is None:
            continue
        author['name'] = safe_unicode(name.text, encoding='UTF-8')
        birth = creator.find('.//{%(pg)s}birthdate' % NAMESPACES)
        if birth is not None:
            author['birth'] = int(birth.text)
        death = creator.find('.//{%(pg)s}deathdate' % NAMESPACES)
        if death is not None:
            author['death'] = int(death.text)
        result['authors'] += [author]

    # Translators
    translator_elements = book.findall('.//{%(marcrel)s}trl' % NAMESPACES)
    for translator_element in translator_elements:
        translator = {'birth': None, 'death': None}
        name = translator_element.find('.//{%(pg)s}name' % NAMESPACES)
        if name is None:
            continue
        translator['name'] = safe_unicode(name.text, encoding='UTF-8')
        birth = translator_element.find('.//{%(pg)s}birthdate' % NAMESPACES)
        if birth is not None:
            translator['birth'] = int(birth.text)
        death = translator_element.find('.//{%(pg)s}deathdate' % NAMESPACES)
        if death is not None:
            translator['death'] = int(death.text)
        result['translators'] += [translator]

    # Title
    title = book.find('.//{%(dc)s}title' % NAMESPACES)
    if title is not None:
        result['title'] = fix_subtitles(
            safe_unicode(title.text, encoding='UTF-8')
        )

    # Subjects
    result['subjects'] = set()
    for subject in book.findall('.//{%(dc)s}subject' % NAMESPACES):
        subject_type = subject.find('.//{%(dcam)s}memberOf' % NAMESPACES)
        if subject_type is None:
            continue
        subject_type = subject_type.get('{%(rdf)s}resource' % NAMESPACES)
        value = subject.find('.//{%(rdf)s}value' % NAMESPACES)
        value = value.text
        if subject_type in ('%(dc)sLCSH' % NAMESPACES):
            result['subjects'].add(value)
    result['subjects'] = list(result['subjects'])
    result['subjects'].sort()

    # Book Shelves
    result['bookshelves'] = set()
    for bookshelf in book.findall('.//{%(pg)s}bookshelf' % NAMESPACES):
        value = bookshelf.find('.//{%(rdf)s}value' % NAMESPACES)
        if value is not None:
            result['bookshelves'].add(value.text)
    result['bookshelves'] = list(result['bookshelves'])

    # Copyright
    rights = book.find('.//{%(dc)s}rights' % NAMESPACES)
    if rights.text.startswith('Public domain in the USA.'):
        result['copyright'] = False
    elif rights.text.startswith('Copyrighted.'):
        result['copyright'] = True
    else:
        result['copyright'] = None

    # Formats (preferring image URLs to `noimages` URLs)
    for file in book.findall('.//{%(pg)s}file' % NAMESPACES):
        content_type = file.find('{%(dc)s}format//{%(rdf)s}value' % NAMESPACES)
        if (
            content_type.text not in result['formats']
            or 'noimages' in result['formats'][content_type.text]
        ):
            url = file.get('{%(rdf)s}about' % NAMESPACES)
            result['formats'][content_type.text] = url

    # Type
    book_type = book.find(
        './/{%(dc)s}type//{%(rdf)s}value' % NAMESPACES
    )
    result['type'] = 'Text' if book_type is None else book_type.text

    # Languages
    languages = book.findall(
        './/{%(dc)s}language//{%(rdf)s}value' % NAMESPACES
    )
    result['languages'] = [language.text for language in languages] or []

    # Download Count
    download_count = book.find('.//{%(pg)s}downloads' % NAMESPACES)
    if download_count is not None:
        result['downloads'] = int(download_count.text)

    return result

In [None]:
def put_catalog_in_db():
    catalog = []
    book_ids = []
    for directory_item in os.listdir(CATALOG_RDF_DIR):
        item_path = os.path.join(CATALOG_RDF_DIR, directory_item)
        if os.path.isdir(item_path):
            try:
                book_id = int(directory_item)
            except ValueError:
                # Ignore the item if it's not a book ID number.
                pass
            else:
                book_ids.append(book_id)
    book_ids.sort()
    book_directories = [str(id) for id in book_ids]

    for directory in book_directories:
        id = int(directory)

        if (id > 0) and (id % 500 == 0):
            print('    %d' % id)

        book_path = os.path.join(
            CATALOG_RDF_DIR,
            directory,
            'pg' + directory + '.rdf'
        )

        catalog.append(get_book(id, book_path))
    return catalog


In [None]:
print(put_catalog_in_db())

[{'id': 1, 'title': 'The Declaration of Independence of the United States of America', 'authors': [{'birth': 1743, 'death': 1826, 'name': 'Jefferson, Thomas'}], 'translators': [], 'type': 'Text', 'subjects': ['United States -- History -- Revolution, 1775-1783 -- Sources', 'United States. Declaration of Independence'], 'languages': ['en'], 'formats': {'text/html': 'https://www.gutenberg.org/ebooks/1.html.images', 'application/epub+zip': 'https://www.gutenberg.org/ebooks/1.epub3.images', 'application/x-mobipocket-ebook': 'https://www.gutenberg.org/ebooks/1.kf8.images', 'text/plain; charset=us-ascii': 'https://www.gutenberg.org/ebooks/1.txt.utf-8', 'application/rdf+xml': 'https://www.gutenberg.org/ebooks/1.rdf', 'image/jpeg': 'https://www.gutenberg.org/cache/epub/1/pg1.cover.medium.jpg', 'application/octet-stream': 'https://www.gutenberg.org/cache/epub/1/pg1-h.zip'}, 'downloads': 2249, 'bookshelves': ['United States Law', 'American Revolutionary War', 'Politics'], 'copyright': False}, {'i

In [None]:
print(str(put_catalog_in_db()[0]['authors']))

[{'birth': 1743, 'death': 1826, 'name': 'Jefferson, Thomas'}]
[{'birth': 1743, 'death': 1826, 'name': 'Jefferson, Thomas'}]


In [None]:
def convert_db_to_simple_dict(db):
    result = {
        'id': int,
        'title': None,
        'authors': None,
        'first_author_birth_year': None,
        'first_author_death_year': None,
        'translators': str,
        'type': None,
        'subjects': str,
        'languages': str,
        'formats': str,
        'downloads': None,
        'bookshelves': str,
        'copyright': None
    }

    for book in db:
      result['id'] = db['id']
      result['title'] = db['title']

      authors = []
      for author in db['authors']:
        if 'name' in author:
          authors.append(author['name'])
      result['authors'] = '; '.join(authors)

      if len(db['authors']) > 0:
        result['first_author_birth_year'] = str(db['authors'][0]['birth']) if db['authors'][0]['birth'] is not None else None
        result['first_author_death_year'] = str(db['authors'][0]['death']) if db['authors'][0]['death'] is not None else None

      translators = []
      for translator in db['translators']:
        if 'name' in translator:
          translators.append(translator['name'])
      result['translators'] = ', '.join(translators)

      result['type'] = db['type']
      result['subjects'] = '; '.join(db['subjects'])
      result['languages'] = '; '.join(db['languages'])
      result['formats'] = '; '.join(list(db['formats'].keys()))
      result['downloads'] = db['downloads']
      result['bookshelves'] = '; '.join(db['bookshelves'])
      result['copyright'] = db['copyright']

      return result

In [None]:
def simple_dict_to_csv(dict):
  pd.DataFrame(dict).to_csv('extended_pg_catalog.csv', index=False)

In [None]:
db = put_catalog_in_db()

In [None]:
simple_db = [convert_db_to_simple_dict(book) for book in db]
simple_dict_to_csv(simple_db)