In [1]:
import pandas as pd
import glob
import os
import gutenberg
from gutenberg.cleanup import strip_headers
import sqlite3 as sql
import requests
import urllib
import json
from collections import Counter
from ast import literal_eval

In [2]:
df = pd.read_hdf('pg-text-3-filenames.hdf')

In [3]:
class DBPedia(): 
    """ Looks up metadata about a book from DBPedia, 
    using fuzzy matching on the title and author. """
    def __init__(self, title=None, author=None, resource=None):
        if resource==None and (title==None or author==None): 
            print('Must specify either the resource URI stub, or title and author.')
            raise
        if resource is not None: 
            resource = urllib.parse.unquote(resource)
            self.resourceURI = "http://dbpedia.org/resource/" + resource
        else: 
            response = self.getMeta(title, author)
            parsed = self.parseMeta(response)
            self.resourceURI = self.getResourceURI(parsed)
    
        if self.resourceURI is not None: 
            resourceResponse = self.queryDBP(self.resourceURI)
            resourceParsed = self.parseMeta(resourceResponse)
            self.info = self.getInfo(resourceParsed)
            if self.info is not None: 
                self.pubDate = self.getPubDate()
                self.subjects = self.getSubjects()
                self.literaryGenres = self.getLiteraryGenres()
            else: 
                self._abort()
        else: 
            self._abort()
        
    def _abort(self): 
        self.info = None 
        self.pubDate = None
        self.subjects = None
        self.literaryGenres = None
    
    def getMeta(self, title, author): 
        query = """select distinct ?book
                where {
                  ?book a dbo:Book .
                  ?book dbo:author ?author .
                  ?book dbp:name ?name .
                  FILTER (regex(?name, "%s"))
                  FILTER (regex(?author, "%s"))
                } 
                LIMIT 100""" % (title, author)
        response = requests.get("http://dbpedia.org/sparql", 
                                params=[('query', query), 
                                        ('format', 'application/json')])
        return response

    def parseMeta(self, dbpResponse): 
        try: 
            parsed = json.loads(dbpResponse.text)
        except: 
            print("Couldn't decode JSON: ", dbpResponse.text)
            return None
        return parsed

    def getResourceURI(self, jsonMeta): 
        try: 
            bindings = jsonMeta['results']['bindings']
        except KeyError: 
            print("Couldn't find the appropriate results in the JSON: ", jsonMeta)
            return None
        try: 
            uri = bindings[0]['book']['value']
        except IndexError: 
            print("Couldn't find a resource URI. There is no 0th element in the JSON: ", jsonMeta)
            return None
        return uri
    
    def queryDBP(self, resourceURI):
        query = 'DESCRIBE <%s>' % resourceURI
        params = [('default-graph-uri', 'http://dbpedia.org'), 
                  ('query', query), ('format', 'application/json')]
        r = requests.get("http://dbpedia.org/sparql", params=params)
        return r

    def getInfo(self, jsonData):
        try: 
            data = jsonData[self.resourceURI]
            return data
        except KeyError: 
            print('Strangely, there is no entry in items for this item. Jsondata: ', jsonData)
        
    
    def getPubDate(self): 
        try: 
            releaseDate = self.info['http://dbpedia.org/ontology/releaseDate']
        except KeyError: 
            print('No releaseDate in self.info ontology. Trying property.')
            print('Available info: ', self.info.keys())
            try: 
                releaseDate = self.info['http://dbpedia.org/property/releaseDate']
            except KeyError: 
                print('No releaseDate in self.info property. Giving up.')
                return None
        try: 
            firstRelease = releaseDate[0]
        except IndexError: 
            print('There is no 0th release, apparently: ', releaseDate)
            return None
        try: 
            date = firstRelease['value']
        except KeyError: 
            print('Strangely, no actual date in releaseDate:', releaseDate)
            return None
        return date
    
    def getSubjects(self): 
        subjects = []
        try: 
            subjs = self.info['http://purl.org/dc/terms/subject']
        except KeyError: 
            print('No subject in self.info.')
            return None
        for subj in subjs: 
            subject = subj['value'].split('/')[-1]
            if subject.startswith('Category:'): 
                subject = subject[9:]
            subjects.append(subject)
        return subjects
    
    def getLiteraryGenres(self): 
        literaryGenres = []
        try: 
            genres = self.info['http://dbpedia.org/ontology/literaryGenre']
        except KeyError: 
            print('No literary genre in self.info.')
            return None
        for genre in genres: 
            literaryGenre = genre['value'].split('/')[-1]
            literaryGenres.append(literaryGenre)
        return literaryGenres

In [4]:
uly = DBPedia('Ulysses', 'Joyce')

In [6]:
uly.pubDate

'1922-02-02'

In [7]:
pp = DBPedia("Pilgrim's Progress", "Bunyan")

No releaseDate in self.info ontology. Trying property.
Available info:  dict_keys(['http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#label', 'http://www.w3.org/2000/01/rdf-schema#comment', 'http://www.w3.org/2002/07/owl#sameAs', 'http://www.w3.org/ns/prov#wasDerivedFrom', 'http://dbpedia.org/ontology/abstract', 'http://dbpedia.org/ontology/author', 'http://xmlns.com/foaf/0.1/name', 'http://dbpedia.org/property/name', 'http://xmlns.com/foaf/0.1/depiction', 'http://xmlns.com/foaf/0.1/isPrimaryTopicOf', 'http://dbpedia.org/ontology/thumbnail', 'http://purl.org/dc/terms/subject', 'http://dbpedia.org/ontology/wikiPageID', 'http://dbpedia.org/ontology/wikiPageRevisionID', 'http://dbpedia.org/ontology/wikiPageExternalLink', 'http://dbpedia.org/property/author'])
No releaseDate in self.info property. Giving up.
No literary genre in self.info.


In [77]:
df[df.subjects.notnull()].subjects.loc[2]

"['Government -- United States']"

In [9]:
twi = DBPedia(resource="The_Twilight_of_the_Gods_and_Other_Tales")

No releaseDate in self.info ontology. Trying property.
Available info:  dict_keys(['http://www.w3.org/1999/02/22-rdf-syntax-ns#type', 'http://www.w3.org/2000/01/rdf-schema#label', 'http://www.w3.org/2000/01/rdf-schema#comment', 'http://www.w3.org/2002/07/owl#sameAs', 'http://www.w3.org/ns/prov#wasDerivedFrom', 'http://dbpedia.org/ontology/abstract', 'http://dbpedia.org/ontology/author', 'http://dbpedia.org/ontology/country', 'http://dbpedia.org/ontology/language', 'http://dbpedia.org/ontology/literaryGenre', 'http://dbpedia.org/ontology/mediaType', 'http://dbpedia.org/ontology/numberOfPages', 'http://dbpedia.org/ontology/publisher', 'http://xmlns.com/foaf/0.1/name', 'http://dbpedia.org/property/name', 'http://xmlns.com/foaf/0.1/depiction', 'http://xmlns.com/foaf/0.1/isPrimaryTopicOf', 'http://dbpedia.org/ontology/thumbnail', 'http://purl.org/dc/terms/subject', 'http://dbpedia.org/ontology/wikiPageID', 'http://dbpedia.org/ontology/wikiPageRevisionID', 'http://dbpedia.org/ontology/wikiPa

In [10]:
def getResource(wikiPagesRaw): 
    try: 
        wikiList = literal_eval(wikiPagesRaw)
    except ValueError: 
        return None
    for item in wikiList: 
        if 'en.wikipedia.org' in item: 
            return item.split('/')[-1]
    return None

In [35]:
def sanitizeTitle(title): 
    # Grab the first part of the title, before any colon. 
    if ':' in title: 
        title = title.split(':')[0]
    # If there are quotation marks, remove everything after. 
    if '"' in title: 
        title = title.split('"')[0]
    # If there are parentheses, remove everything after.
    if "(" in title: 
        title = title.split('(')[0]
    return title

In [47]:
def lookupStuff(start=0):
    times = 0
    for i, row in df.iterrows(): 
        if i<start: 
            continue
        print('------------ Processing row %s -------------' % i)
        resource = getResource(row.wikipedia)
        times += 1
        if resource is not None: 
            print('Getting resource: ', resource)
            book = DBPedia(resource=resource)
        else: 
            print('Looking up by title and author.')
            if row.title is not None: 
                title = sanitizeTitle(row.title)
                print('Using title: ', title)
            else: 
                continue
            author = row.author
            if author is not None: 
                author = row.author.split(',')[0]
                author = sanitizeTitle(author) # sanitize in the same way as titles
                print('Using author: ', author)
                book = DBPedia(title = title, author=author)
            else: 
                book = None
        if book is not None: 
            pubdate = book.pubDate
            print('Pubdate: ', pubdate)
            if pubdate is not None: 
                df.set_value(i, 'wp_publication_date', pubdate)
            subjects = book.subjects
            print('Subjects: ', subjects)
            if subjects is not None: 
                df.set_value(i, 'wp_subjects', str(subjects))
            literaryGenres = book.literaryGenres
            print('Literary genres: ', literaryGenres)
            if literaryGenres is not None: 
                df.set_value(i, 'wp_literary_genres', str(literaryGenres))
            if book.info is not None: 
                df.set_value(i, 'wp_info', str(book.info))

In [58]:
lookupStuff(start=52456)

------------ Processing row 52456 -------------
Looking up by title and author.
Using title:  Uncle Sam Abroad
Using author:  Conner
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52457 -------------
Looking up by title and author.
Using title:  The Field Book
Using author:  Maxwell
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52458 -------------
Looking up by title and author.
Using title:  Pohjalla
Using author:  Gorky
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52479 -------------
Looking up by title and author.
Using title:  Jessie Trim
Using author:  Farjeon
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52480 -------------
Looking up by title and author.
Using title:  From Palmerston to Disraeli 
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52481 -------

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52499 -------------
Looking up by title and author.
Using title:  Sir Robert's Fortune
Using author:  Oliphant
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52500 -------------
Looking up by title and author.
Using title:  Persialaisia kirjeitä
Using author:  Montesquieu
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52501 

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52522 -------------
Looking up by title and author.
Using title:  The Writings of Thomas Jefferson, Vol. 5 
Using author:  Jefferson
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52523 -------------
Looking up by title and author.
Using title:  Progetto filosofico di una completa riforma del culto e dell'educazione politico-morale del popolo ebreo
Using author:  Fernando
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bi

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52544 -------------
Looking up by title and author.
Using title:  Pikku lordi
Using author:  Burnett
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52545 -------------
Looking up by title and author.
Using title:  Sotavanhuksen joulu
Using author:  Topelius
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52546 -------------
L

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52566 -------------
Looking up by title and author.
Using title:  Rescue the Perishing
Using author:  Seibert
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52567 -------------
Looking up by title and author.
Using title:  The Book of Clever Beasts
Using author:  Reed
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52568 ----

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52588 -------------
Looking up by title and author.
Using title:  American Unitarian Hymn Writers and Hymns
Using author:  Foote
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52589 -------------
Looking up by title and author.
Using title:  James Oliver Curwood, Disciple of the Wilds
Using author:  Swiggett
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  No

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52611 -------------
Looking up by title and author.
Using title:  Die dreizehn Bücher der deutschen Seele
Using author:  Schäfer
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52612 -------------
Looking up by title and author.
Using title:  Journael ofte gedenckwaerdige beschrijvinghe van de OostIndische Reyse van Willem Ysbrantsz. Bontekoe
Using author:  Bontekoe
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings'

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52633 -------------
Looking up by title and author.
Using title:  Sydän
Using author:  De Amicis
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52634 -------------
Looking up by title and author.
Using title:  The Witches' Dream Book; and Fortune Teller
Using author:  Noe
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52635 

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52656 -------------
Looking up by title and author.
Using title:  The Merman and The Figure-Head
Using author:  Guernsey
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52657 -------------
Looking up by title and author.
Using title:  A Soldier of the Legion
Using author:  Manington
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing 

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52679 -------------
Looking up by title and author.
Using title:  Abroad and at Home; Practical Hints for Tourists
Using author:  Morris
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52680 -------------
Looking up by title and author.
Using title:  Belgiens Volkscharakter, Belgiens Kunst
Using author:  Bredt
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  N

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52698 -------------
Looking up by title and author.
Using title:  The Southern Literary Messenger, Vol. I., No. 5, January, 1835
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52699 -------------
Looking up by title and author.
Using title:  Erotopægnion, sive Priapeia Veterum et Recentiorum
Using author:  Noel
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects: 

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52721 -------------
Looking up by title and author.
Using title:  The American Missionary — Volume 33, No. 5, May, 1879
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52722 -------------
Looking up by title and author.
Using title:  Memoirs of the life, exile, and conversations of the Emperor Napoleon. 
Using author:  Cases
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  No

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52742 -------------
Looking up by title and author.
Using title:  Marjorie Dean Macy's Hamilton Colony
Using author:  Chase
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52743 -------------
Looking up by title and author.
Using title:  The Bride of the Sun
Using author:  Leroux
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52765 -------------
Looking up by title and author.
Using title:  The Younger Sister, Vol. I.
Using author:  Hubback
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52766 -------------
Looking up by title and author.
Using title:  The Younger Sister, Volume II.
Using author:  Hubback
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52787 -------------
Looking up by title and author.
Using title:  Graham's Magazine, Vol. XXXVII, No. 5, November 1850
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52788 -------------
Looking up by title and author.
Using title:  When She Came Home from College
Using author:  McNeely
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  No

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52809 -------------
Looking up by title and author.
Using title:  Historical Record of the Twelfth, or the East Suffolk, Regiment of Foot, Containing an Account of the Formation of the Regiment in 1685, and of Its Subsequent Services to 1847
Using author:  Cannon
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52810 -------------
Looking up by title and author.
Using title:  Letters to Children
Using author:  Bridgman
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'resu

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52831 -------------
Looking up by title and author.
Using title:  Essays
Using author:  Spencer
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52832 -------------
Looking up by title and author.
Using title:  The Development of Certain Tendencies in Modern Opera
Using author:  Browne
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processin

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52853 -------------
Looking up by title and author.
Using title:  The Light that Lies
Using author:  McCutcheon
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52854 -------------
Looking up by title and author.
Using title:  Shot With Crimson
Using author:  McCutcheon
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52855 ----

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52876 -------------
Looking up by title and author.
Using title:  Tom Pinder, Foundling
Using author:  Sykes
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52877 -------------
Looking up by title and author.
Using title:  Squire Arden; volume 2 of 3
Using author:  Oliphant
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52878

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52898 -------------
Looking up by title and author.
Using title:  English Lands Letters and Kings
Using author:  Mitchell
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52899 -------------
Looking up by title and author.
Using title:  The New Forest
Using author:  Wise
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52900 ---

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52920 -------------
Looking up by title and author.
Using title:  Ariosto, Shakespeare, Corneille
Using author:  Croce
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52921 -------------
Looking up by title and author.
Using title:  London at Night
Using author:  Carter
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52922 ---

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52943 -------------
Looking up by title and author.
Using title:  The History and Romance of Crime—Oriental Prisons
Using author:  Griffiths
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52944 -------------
Looking up by title and author.
Using title:  Latvia & Russia
Using author:  Bergs
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Pro

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52965 -------------
Looking up by title and author.
Using title:  The Principles of Chemistry, Volume II
Using author:  Mendeleyev
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52966 -------------
Looking up by title and author.
Using title:  American War Ballads and Lyrics, Vol. 2 
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52987 -------------
Looking up by title and author.
Using title:  The Irish Penny Journal, Vol. 1 No. 13, September 26, 1840
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 52988 -------------
Looking up by title and author.
Using title:  The Vivisectors' Directory
------------ Processing row 52989 -------------
Looking up by title and author.
Using title:  The American Missionary — Volume 33, No. 7, July, 1879
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head'

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53011 -------------
Looking up by title and author.
Using title:  Frank Reade Jr.'s Air Wonder, The 
Using author:  Senarens
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53012 -------------
Looking up by title and author.
Using title:  A Plain and Literal Translation of the Arabian Nights Entertainments, now entituled The Book of the Thousand Nights and a Night 
Using author:  Burton
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'order

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53033 -------------
Looking up by title and author.
Using title:  Proclamation of the Twelve Apostles of the Church of Jesus Christ of Latter-Day Saints
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53034 -------------
Looking up by title and author.
Using title:  A History of the Peninsula War
Using author:  Oman
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjec

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53054 -------------
Looking up by title and author.
Using title:  Copyright Renewals
Using author:  Congress
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53055 -------------
Looking up by title and author.
Using title:  Marks' first lessons in geometry
Using author:  Marks
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 530

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53077 -------------
Looking up by title and author.
Using title:  Norwich; A Sketch-Book
Using author:  Cole
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53078 -------------
Looking up by title and author.
Using title:  Paris; A Sketch-Book
Using author:  Béjot
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53079 ---------

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53100 -------------
Looking up by title and author.
Using title:  Abridgement of the Debates of Congress, from 1789 to 1856, Vol. 3 
Using author:  Various
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53101 -------------
Looking up by title and author.
Using title:  Queens of the Renaissance
Using author:  Ryley
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genre

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53122 -------------
Looking up by title and author.
Using title:  A Blundering Boy
Using author:  Munro
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53123 -------------
Looking up by title and author.
Using title:  Die Harpyen von Madrit, oder die Postkutsche
Using author:  Solórzano
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Process

Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53144 -------------
Looking up by title and author.
Using title:  Köyhäin aarteet
Using author:  Maeterlinck
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Literary genres:  None
------------ Processing row 53145 -------------
Looking up by title and author.
Using title:  Schilderungen des Treibens im Leben und Handel in den Vereinigten Staaten und Havana.
Using author:  Ries
Couldn't find a resource URI. There is no 0th element in the JSON:  {'head': {'link': [], 'vars': ['book']}, 'results': {'distinct': False, 'ordered': True, 'bindings': []}}
Pubdate:  None
Subjects:  None
Li

TypeError: argument of type 'float' is not iterable

In [59]:
df.to_hdf('pg-text-4-dbpedia.hdf', 'pg')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block1_values] [items->['LCC', 'author', 'authoryearofbirth', 'authoryearofdeath', 'downloads', 'formats', 'languages', 'lcsh', 'title', 'type', '_repo', '_version', 'alternative_title', 'contributor', 'covers', 'creator', 'description', 'edition_identifiers', 'edition_note', 'gutenberg_bookshelf', 'gutenberg_issued', 'gutenberg_type', 'identifiers', 'jmdate', 'subjects', 'language_note', 'production_note', 'publication_date', 'publication_note', 'publisher', 'rights', 'rights_url', 'series_note', 'summary', 'tableOfContents', 'titlepage_image', 'url', 'wikipedia', 'filename', 'wp_publication_date', 'wp_subjects', 'wp_info', 'wp_literary_genres']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [86]:
df[df.wp_subjects.notnull()].wp_subjects.loc[11]

"['Fictional_subterranea', 'Alice_in_Wonderland', 'High_fantasy_novels', '1865_novels', '1860s_fantasy_novels', 'Literature_featuring_anthropomorphic_characters', 'Macmillan_Publishers_books', 'Size_change_in_fiction', 'Animals_in_media', 'British_novels_adapted_into_films', 'D._Appleton_&_Company_books', 'Cultural_depictions_of_Benjamin_Disraeli', 'Novels_adapted_into_radio_programs', 'Novels_adapted_into_television_programs', 'Novels_adapted_into_operas', 'Novels_adapted_into_plays', 'Novels_adapted_into_comics']"

In [78]:
allSubjs = []
for i, row in df.iterrows(): 
    subjs = row['subjects']
    try: 
        subjs = literal_eval(subjs)
    except: 
        continue
    for subj in subjs: 
        allSubjs.append(subj)
    if type(row.wp_subjects) == list: 
        for subj in row.wp_subjects: 
            allSubjs.append(subj)
        

In [87]:
allWPSubjs = []
for i, row in df.iterrows(): 
    subjs = row['wp_subjects']
    try: 
        subjs = literal_eval(subjs)
    except: 
        continue
    for subj in subjs: 
        allWPSubjs.append(subj)

In [89]:
Counter(allWPSubjs).most_common(50)

[('Novels_first_published_in_serial_form', 247),
 ('British_novels_adapted_into_films', 109),
 ('19th-century_American_novels', 99),
 ('Victorian_novels', 96),
 ('British_novels', 93),
 ('Novels_adapted_into_plays', 92),
 ('English_novels', 88),
 ('20th-century_American_novels', 85),
 ('American_science_fiction_novels', 63),
 ('American_novels_adapted_into_films', 59),
 ('19th-century_novels', 58),
 ('Historical_novels', 57),
 ('Debut_novels', 51),
 ('Novels_set_in_London', 47),
 ('1910s_fantasy_novels', 44),
 ("American_children's_novels", 42),
 ('Harper_&_Brothers_books', 41),
 ('Macmillan_Publishers_books', 40),
 ('American_fantasy_novels', 40),
 ('1915_novels', 39),
 ('Novels_by_H._Rider_Haggard', 39),
 ('Chapman_&_Hall_books', 38),
 ('Gothic_novels', 37),
 ('Novels_adapted_into_comics', 36),
 ('Novels_adapted_into_films', 36),
 ('Picaresque_novels', 36),
 ('Novels_adapted_into_television_programs', 34),
 ("British_children's_novels", 34),
 ('Novels_about_orphans', 33),
 ("Children

In [79]:
Counter(allSubjs).most_common(50)

[('Fiction', 53),
 ('French literature', 31),
 ('Science fiction', 19),
 ('Poetry', 16),
 ('Love stories', 16),
 ('Adventure stories', 16),
 ('Short stories', 15),
 ('Didactic fiction', 13),
 ('Classical literature', 13),
 ('Historical fiction', 12),
 ('Fantasy fiction', 12),
 ('Psychological fiction', 11),
 ('England -- Fiction', 11),
 ('Western stories', 11),
 ('Detective and mystery stories', 10),
 ('Triangles (Interpersonal relations) -- Fiction', 8),
 ('Essays', 8),
 ('Humorous stories', 8),
 ('Domestic fiction', 7),
 ('Young women -- Fiction', 7),
 ('Pastoral fiction', 6),
 ('Bildungsromans', 6),
 ('Orphans -- Fiction', 6),
 ('Wessex (England) -- Fiction', 5),
 ('War stories', 5),
 ('Mars (Planet) -- Fiction', 5),
 ('London (England) -- Fiction', 5),
 ('Farm life -- Fiction', 4),
 ('Brothers and sisters -- Fiction', 4),
 ('Religion', 4),
 ('Revenge -- Fiction', 4),
 ('Tarzan (Fictitious character) -- Fiction', 4),
 ('Satire', 4),
 ('Married people -- Fiction', 4),
 ('Sea stories'

In [64]:
len(df[df['wp_info'].notnull()])

1898

In [158]:
df = df.rename(columns={"subjects": "wikiSubjects"})

In [159]:
df.to_sql('pg', conn, if_exists='append')