In [112]:
import requests
import json
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Base URL of the corpus-DB API; the request cells in this notebook build
# their endpoints by appending a path to it.
baseURL = "http://localhost:8000" # For local testing
# Alternative endpoint (uncomment to use it instead of the local server):
#baseURL = "http://corpus-db.org" 

First, let's get a list of all available Library of Congress subject headings (i.e. sub-corpora). 

In [5]:
# Ask the API for every subject heading and keep the raw response body as text.
allSubjects = requests.get("%s/api/subjects" % baseURL).text

In [7]:
# Decode the JSON response text fetched above into Python objects.
subjects = json.loads(allSubjects)

List the 15 most common subjects. 

In [11]:
# Show the first 15 entries; each one is a [subject, count] pair.
# NOTE(review): these are the "most common" subjects only if the API returns
# the list sorted by descending count — confirm.
subjects[:15]

[['Fiction', 1920],
 ['Short stories', 1581],
 ['Science fiction', 1283],
 ['Adventure stories', 766],
 ['Historical fiction', 634],
 ['Poetry', 633],
 ['Love stories', 619],
 [', ', 557],
 ['English wit and humor -- Periodicals', 555],
 ['Conduct of life -- Juvenile fiction', 554],
 ['Detective and mystery stories', 537],
 ['Western stories', 432],
 ['Man-woman relationships -- Fiction', 389],
 ['England -- Fiction', 309],
 ['Friendship -- Juvenile fiction', 307]]

Let's compare "detective and mystery stories" with general fiction. First, get metadata for all the detective stories, and all the general fiction. 

In [12]:
# Metadata records for every work filed under "Detective and mystery stories".
detectiveMeta = json.loads(
    requests.get("%s/api/subject/Detective and mystery stories" % baseURL).text
)

In [16]:
# Metadata records for every work filed under the general "Fiction" subject.
fictionMeta = json.loads(
    requests.get("%s/api/subject/Fiction" % baseURL).text
)

Now let's narrow by Library of Congress Classification, keeping only works whose class code contains PR (English literature), to make sure that we're only dealing with British novels. 

In [31]:
# Keep only the works whose 'LCC' field contains the class code 'PR'.
britishFictionMeta = [
    work
    for work in fictionMeta
    if 'PR' in work['LCC']
]
britishDetectiveMeta = [
    work
    for work in detectiveMeta
    if 'PR' in work['LCC']
]

Let's take a look. Grab a sample of 10 random works from each. 

In [41]:
# Draw a reproducible sample of (up to) 10 British detective works.
# A dedicated, seeded generator is used so that re-running the notebook picks
# the same rows without touching the global `random` state; the sample size is
# clamped so that a corpus with fewer than 10 matches does not raise ValueError.
# Columns are left unnamed (0 = id, 1 = author, 2 = title) because a later cell
# selects the ids with detectiveMetaSample[0].
detectiveMetaSample = pd.DataFrame(
    [(item['id'], item['author'], item['title'])
     for item in random.Random(42).sample(
         britishDetectiveMeta, min(10, len(britishDetectiveMeta)))])

In [42]:
# Display the sampled detective works (columns: 0 = id, 1 = author, 2 = title).
detectiveMetaSample

Unnamed: 0,0,1,2
0,17040.0,"Oppenheim, E. Phillips (Edward Phillips)",The Survivor
1,37820.0,"Morrison, Arthur",Chronicles of Martin Hewitt
2,5308.0,"Fletcher, J. S. (Joseph Smith)",The Paradise Mystery
3,7687.0,"Lytton, Edward Bulwer Lytton, Baron",Lucretia — Volume 03
4,26447.0,"Tracy, Louis",The Strange Case of Mortimer Fenley
5,29173.0,"Le Queux, William",The White Lie
6,244.0,"Doyle, Arthur Conan",A Study in Scarlet
7,17063.0,"Oppenheim, E. Phillips (Edward Phillips)",A Lost Leader
8,1872.0,"Milne, A. A. (Alan Alexander)",The Red House Mystery
9,9808.0,"Jepson, Edgar",The Loudwater Mystery


In [106]:
# Draw a reproducible sample of (up to) 10 British general-fiction works.
# A dedicated, seeded generator is used so that re-running the notebook picks
# the same rows without touching the global `random` state; the sample size is
# clamped so that a corpus with fewer than 10 matches does not raise ValueError.
# Columns are left unnamed (0 = id, 1 = author, 2 = title) because a later cell
# selects the ids with fictionMetaSample[0].
fictionMetaSample = pd.DataFrame(
    [(item['id'], item['author'], item['title'])
     for item in random.Random(42).sample(
         britishFictionMeta, min(10, len(britishFictionMeta)))])

In [107]:
# Display the sampled fiction works (columns: 0 = id, 1 = author, 2 = title).
fictionMetaSample

Unnamed: 0,0,1,2
0,2126.0,"Rohmer, Sax",The Quest of the Sacred Slipper
1,43529.0,"Fenwick, E. (Eliza)","Secresy; or, Ruin on the Rock"
2,42324.0,"Shelley, Mary Wollstonecraft","Frankenstein; Or, The Modern Prometheus"
3,2525.0,"Jerome, Jerome K. (Jerome Klapka)","John Ingerfield, and Other Stories"
4,1918.0,"Haggard, H. Rider (Henry Rider)",Long Odds
5,40333.0,"Cleland, Robert",A Rich Man's Relatives (Vol. 3 of 3)
6,10318.0,"Godwin, William",Damon and Delia: A Tale
7,12398.0,"Richardson, Samuel",Clarissa Harlowe; or the history of a young la...
8,46467.0,"Blackmore, R. D. (Richard Doddridge)","The Remarkable History of Sir Thomas Upmore, b..."
9,5795.0,"Yeats, W. B. (William Butler)",The Secret Rose


Not terrible.

In [108]:
def getFulltext(bookID):
    """Fetch the full-text record(s) for one book from the corpus-DB API.

    Parameters
    ----------
    bookID : str, int or float
        Identifier of the book, as found in the 'id' field of the metadata.
        It is passed through str() before being placed in the URL, so values
        read back out of a DataFrame column work even when they are no longer
        plain strings.

    Returns
    -------
    The decoded JSON body of GET <baseURL>/api/id/<bookID>/fulltext.
    Later cells index the result as result[0]['text'].

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the response body is not valid JSON.
    """
    response = requests.get(baseURL + '/api/id/' + str(bookID) + '/fulltext')
    # Surface the HTTP status for a missing book or an unavailable server
    # instead of failing later with an opaque decoding or indexing error.
    response.raise_for_status()
    return json.loads(response.text)

In [87]:
# Download the full text of every sampled detective work (ids are in column 0).
detectiveFulltexts = list(map(getFulltext, detectiveMetaSample[0]))

In [109]:
# Download the full text of every sampled fiction work (ids are in column 0).
fictionFulltexts = list(map(getFulltext, fictionMetaSample[0]))

In [92]:
# Keep only the 'text' field of the first record returned for each book.
detectiveTexts = [
    records[0]['text']
    for records in detectiveFulltexts
]

In [110]:
# Keep only the 'text' field of the first record returned for each book.
fictionTexts = [
    records[0]['text']
    for records in fictionFulltexts
]

In [113]:
# Instantiate scikit-learn's TfidfVectorizer with no arguments (library defaults).
tfidf = TfidfVectorizer()