In [1]:
import pandas as pd
from nltk.metrics import edit_distance, fractional_presence, jaccard_distance
import re

In [2]:
# Quick check of the pattern used to detect multi-volume titles.
# Raw string so that \s reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
re.search(r'Volume\s[0-9]', 'Shakespeare Volume 1')

<_sre.SRE_Match object; span=(12, 20), match='Volume 1'>

In [3]:
# Connection URL for the SQLite metadata database; it is passed as the
# connection argument to every pd.read_sql call in this notebook.
# NOTE(review): this is an absolute path on one machine -- adjust it before
# running the notebook anywhere else.
database = 'sqlite:////home/jon/Code/gitenberg-scrape/pg-text-7.db'

In [4]:
# One row per author who has more than 10 rows in `meta` whose LCC contains
# "PR" and whose languages contain "en", together with that row count.
# String literals use single quotes: a double-quoted "..." is an identifier
# in standard SQL and is only accepted as a string by SQLite through a
# legacy fallback that can be disabled.
query = """select author, authoryearofbirth, authoryearofdeath, count(1) as books
           from meta
           where LCC like '%PR%' and languages like '%en%'
           group by author having books > 10"""
authorList = pd.read_sql(query, database)

In [5]:
# Collect the index labels of rows whose author field is empty, then remove
# those rows. Rows for 'Anonymous' are not filtered out here.
droplist = [label
            for label, name in zip(authorList.index, authorList['author'])
            if len(name) < 1]
authorList = authorList.drop(droplist)

In [6]:
# Show the authors ranked from most to fewest books.
authorList.sort_values(by='books', ascending=False)

Unnamed: 0,author,authoryearofbirth,authoryearofdeath,books
83,"Lytton, Edward Bulwer Lytton, Baron",1803,1873,209
115,"Shakespeare, William",1564,1616,174
68,"Jacobs, W. W. (William Wymark)",1863,1943,111
92,"Meredith, George",1828,1909,109
119,"Stevenson, Robert Louis",1850,1894,78
40,"Dickens, Charles",1812,1870,74
126,"Trollope, Anthony",1815,1882,69
79,"Le Queux, William",1864,1927,62
113,"Scott, Walter",1771,1832,57
43,"Doyle, Arthur Conan",1859,1930,57


In [7]:
def toInt(numStr):
    """Convert a year stored as text into an int.

    Parameters:
        numStr: the value to convert, normally a string such as '1874'.
            None, the empty string and whitespace-only strings all mean
            "unknown".

    Returns:
        The integer value, or None when the value is unknown.

    Raises:
        ValueError: if the text is non-empty but is not a valid integer.
    """
    # A SQL NULL arrives as None, which has no len(); treat it as unknown.
    if numStr is None:
        return None
    text = str(numStr).strip()
    return int(text) if text else None
    
def _pubYear(pubDate):
    """Return the four-digit year that starts a gr_pubDate value, or None.

    None is returned for non-string values (e.g. a SQL NULL) and for strings
    that do not begin with four digits, so callers can treat both cases as
    "no usable publication date".
    """
    if not isinstance(pubDate, str):
        return None
    match = re.match(r'[0-9]{4}', pubDate)
    return int(match.group()) if match else None


def _isExcluded(title, pubDate, dob, dod):
    """Return True when a title should be left out of an author's bibliography."""
    # Remove multi-volume works,
    # since they're usually accompanied by a complete version anyway.
    if re.search(r'Volume\s[0-9]', title) is not None:
        return True

    # This removes titles like "The Complete Works of...,"
    # which often contain works found elsewhere in the corpus.
    if "Works of" in title:
        return True

    year = _pubYear(pubDate)
    if year is None:
        # Scrap those without dates for now.
        # TODO: look these up on OpenLibrary or equivalent API
        return True

    # Sanity check on the date of publication: it must fall within the
    # author's lifetime. The check is only possible when both the birth and
    # the death year are known; otherwise the title is kept.
    if dob is not None and dod is not None:
        return year < dob or year > dod
    return False


def getTitles(authorRow):
    """Fetch and filter the titles recorded for one author.

    Parameters:
        authorRow: a row of the author table, providing the fields `author`,
            `authoryearofbirth` and `authoryearofdeath` (years as text, with
            an empty string meaning unknown).

    Returns:
        A DataFrame with one row per retained title. On top of the queried
        columns it carries `date` (publication year), `age` (author's age at
        publication) and `yearsToDeath` (years between publication and the
        author's death). `age` and `yearsToDeath` are missing when the
        corresponding birth/death year is unknown.

    Titles are dropped when they duplicate another title, name a numbered
    volume, contain "Works of", have no usable publication year, or were
    published outside the author's lifetime.
    """
    author = authorRow.author
    dob = toInt(authorRow.authoryearofbirth)
    dod = toInt(authorRow.authoryearofdeath)

    # The author name is bound as a parameter instead of being pasted into
    # the SQL text, so names containing quote characters cannot break (or
    # alter) the query.
    query = ("select id, title, author, authoryearofbirth, authoryearofdeath, gr_pubDate "
             "from meta "
             "where author = ? and LCC like '%PR%' and languages like '%en%'")
    df = pd.read_sql(query, database, params=(author,))
    df = df.drop_duplicates(subset='title', keep='last')

    droplist = [i for i, row in df.iterrows()
                if _isExcluded(row['title'], row['gr_pubDate'], dob, dod)]
    # .copy() gives an independent frame, so adding columns below is safe.
    df = df.drop(droplist).copy()

    df['date'] = df['gr_pubDate'].apply(_pubYear)
    df['authoryearofbirth'] = df['authoryearofbirth'].apply(toInt)
    df['authoryearofdeath'] = df['authoryearofdeath'].apply(toInt)

    # Unknown years are stored as None, which cannot take part in arithmetic;
    # coerce them to NaN so the differences come out as missing values
    # instead of raising.
    birth = pd.to_numeric(df['authoryearofbirth'], errors='coerce')
    death = pd.to_numeric(df['authoryearofdeath'], errors='coerce')
    df['age'] = df['date'] - birth
    df['yearsToDeath'] = death - df['date']
    return df

In [8]:
# Spot-check getTitles on a single author: the row of authorList whose
# index label is 91.
getTitles(authorList.loc[91])

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,gr_pubDate,date,age,yearsToDeath
0,222.0,The Moon and Sixpence,"Maugham, W. Somerset (William Somerset)",1874,1965,1919--,1919,45,46
1,351.0,Of Human Bondage,"Maugham, W. Somerset (William Somerset)",1874,1965,1915--,1915,41,50
3,16517.0,Liza of Lambeth,"Maugham, W. Somerset (William Somerset)",1874,1965,1897--,1897,23,68
4,26854.0,The Trembling of a Leaf: Little Stories of the...,"Maugham, W. Somerset (William Somerset)",1874,1965,1921--,1921,47,44
8,34860.0,East of Suez: A Play in Seven Scenes,"Maugham, W. Somerset (William Somerset)",1874,1965,1922-1-1,1922,48,43
12,42395.0,The Circle: A Comedy in Three Acts,"Maugham, W. Somerset (William Somerset)",1874,1965,1921--,1921,47,44


In [9]:
# Map each author to their filtered title table, keeping only authors with
# more than 7 retained titles. getTitles runs a database query, so it is
# evaluated once per author in the inner generator and the result is reused
# for both the size filter and the dictionary value.
authorTitlesDict = {
    author: titles
    for author, titles in (
        (authorRow.author, getTitles(authorRow))
        for i, authorRow in authorList.iterrows()
    )
    if len(titles) > 7
}

In [10]:
# Inspect the filtered title table kept for one author.
authorTitlesDict['Shakespeare, William']

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,gr_pubDate,date,age,yearsToDeath
1,1041.0,Shakespeare's Sonnets,"Shakespeare, William",1564,1616,1592--,1592,28,24
2,1045.0,Venus and Adonis,"Shakespeare, William",1564,1616,1592--,1592,28,24
5,1102.0,The Third Part of King Henry the Sixth,"Shakespeare, William",1564,1616,1591--,1591,27,25
8,1105.0,The Sonnets,"Shakespeare, William",1564,1616,1592--,1592,28,24
15,1112.0,The Tragedy of Romeo and Juliet,"Shakespeare, William",1564,1616,1595--,1595,31,21
20,1117.0,Second Part of King Henry IV,"Shakespeare, William",1564,1616,1597--,1597,33,19
30,1127.0,"The Tragedy of Othello, Moor of Venice","Shakespeare, William",1564,1616,1603--,1603,39,13
32,1129.0,The Tragedy of Macbeth,"Shakespeare, William",1564,1616,1606--,1606,42,10
41,1500.0,"King Henry VI, First Part","Shakespeare, William",1564,1616,1590--,1590,26,26
42,1501.0,"History of King Henry the Sixth, Second Part","Shakespeare, William",1564,1616,1590--,1590,26,26


In [11]:
# Stack every author's title table into a single DataFrame.
# Each table keeps its own row labels, so labels can repeat across authors.
masterList = pd.concat(list(authorTitlesDict.values()))

In [12]:
# Display the combined bibliography.
masterList

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,gr_pubDate,date,age,yearsToDeath
0,4340.0,The British Barbarians,"Allen, Grant",1848,1899,1895--,1895,47,4
1,4396.0,The Woman Who Did,"Allen, Grant",1848,1899,1895--,1895,47,4
5,5869.0,Michael's Crag,"Allen, Grant",1848,1899,1893--,1893,45,6
6,6010.0,What's Bred in the Bone,"Allen, Grant",1848,1899,1891--,1891,43,8
7,6060.0,Philistia,"Allen, Grant",1848,1899,1884--,1884,36,15
8,13876.0,The Great Taboo,"Allen, Grant",1848,1899,1891--,1891,43,8
9,18788.0,Post-Prandial Philosophy,"Allen, Grant",1848,1899,1894--,1894,46,5
10,30970.0,Miss Cayley's Adventures,"Allen, Grant",1848,1899,1899--,1899,51,0
13,43688.0,"Wednesday the Tenth, A Tale of the South Pacific","Allen, Grant",1848,1899,1890--,1890,42,9
0,14005.0,The Ladies Delight,Anonymous,,,1732--,1732,,


In [13]:
# Save the combined bibliography as a CSV file.
# NOTE(review): assumes the 'bibliography/' directory already exists relative
# to the notebook's working directory -- confirm before running.
masterList.to_csv('bibliography/everyone.csv')