# Finding instances from one corpus in Hathi

This shows how to churn through two corpora to find copies of one in the other.


In [None]:
import SRP
import numpy as np
from scipy.spatial.distance import cdist

In [None]:
%load_ext autoreload
%autoreload 2

This assumes that you've already created the txtlab file described in the notebook "Hash a corpus of text files into SRP space"

In [None]:
# Load the Gutenberg SRP vectors from pg-srp-vecs.bin. Later cells index the
# result as guten['matrix'] and guten['names'].
# NOTE(review): unit_length=True presumably L2-normalizes each row -- confirm
# against the SRP documentation.
guten = SRP.Vector_file("pg-srp-vecs.bin", dims = 640, precision = 4, mode = "r").to_matrix(unit_length = True)

In [None]:
# Inspect the dimensions of the loaded Gutenberg matrix.
guten['matrix'].shape

In [None]:
def hathi_chunker(max_size=1000, dims = 640):
    """Stream the Hathi vector file in fixed-size, length-normalized chunks.

    Args:
        max_size: Maximum number of vectors per yielded chunk.
        dims: Number of leading dimensions of each vector to keep.

    Yields:
        ``(ids, rows)`` tuples where ``ids`` is a list of identifiers and
        ``rows`` is a float32 array with one row per identifier, each row
        divided by its own Euclidean norm. Every chunk has ``max_size`` rows
        except possibly the last, which holds whatever is left over.
    """
    hathi = SRP.Vector_file("/home/bschmidt/vector_models/hathi.bin")
    id_cache = []
    row_cache = np.zeros((max_size, dims), "<f4")
    for htid, row in hathi:
        vector = row[:dims]
        row_cache[len(id_cache)] = vector / np.linalg.norm(vector)
        id_cache.append(htid)
        if len(id_cache) == max_size:
            yield (id_cache, row_cache)
            # Start fresh buffers so the consumer's chunk is never overwritten.
            id_cache = []
            row_cache = np.zeros((max_size, dims), "<f4")
    if id_cache:
        # Flush the final partial chunk; without this the trailing vectors
        # (fewer than max_size of them) would never be compared. Trim the
        # unused zero rows so ids and rows stay aligned.
        yield (id_cache, row_cache[:len(id_cache)])
            


Use an array to store the neighbors of each index: store the top `knn` items (30 in the cell below) to start.

In [None]:
# Transpose once up front; the search loop multiplies every Hathi chunk
# against this with np.dot.
trans_guten = guten['matrix'].transpose()

In [None]:
# Scratch check of np.argpartition on a small array (kth=3); the result is not
# referenced by any other visible cell.
np.argpartition(np.array([1,2,3,4,3,2,1]), 3)

In [None]:
hathi_chunks = hathi_chunker()

knn = 30
# One candidate list per Gutenberg text, pre-filled with knn placeholders so
# there is always a "weakest" entry to compare against. Each list is kept
# sorted from most to least similar.
neighbors = [[(0, "nothing")] * knn for _ in guten["names"]]

n_chunked = 0

for ids, rows in hathi_chunks:
    n_chunked += 1
    # Dot product of every Hathi row in the chunk with every Gutenberg row.
    pairwise = np.dot(rows, trans_guten)
    # Only pairs above this similarity floor are considered at all.
    close = np.where(pairwise > 0.78)
    for hathi_i, guten_i in zip(*close):
        sim = pairwise[hathi_i, guten_i]
        candidates = neighbors[guten_i]
        weakest = candidates[-1][0]
        if sim > weakest and weakest <= .9:
            # Evict the weakest entry -- but only if it is not itself a
            # >.9 match, since every match that close must be kept.
            candidates[-1] = (sim, ids[hathi_i])
            candidates.sort(reverse=True)
        elif sim > .9:
            # Catch everything that close: grow the list rather than evict.
            candidates.append((sim, ids[hathi_i]))
            candidates.sort(reverse=True)
    if n_chunked % 100 == 0:
        # Progress report; the ",000" assumes the default chunk size of 1000.
        print("checked {},000 in hathi\r".format(n_chunked))

In [None]:
# Number of Gutenberg texts whose top-ranked neighbor has similarity above .9.
sum(1 for candidates in neighbors if candidates[0][0] > .9)

In [None]:
# Peek at the top-ranked (similarity, Hathi id) entry for the Gutenberg text at index 1.
neighbors[1][0]

In [None]:
# Title of the Gutenberg text at index 3. guten_title is defined in a later
# cell of this notebook, so that cell has to be run before this one.
guten_title(guten['names'][3])

In [None]:
# Imported here so this cell does not depend on a later cell having run.
import pandas as pd

# Gutenberg metadata; the "id" and "title" columns are the ones used below.
pgmeta = pd.read_csv("pg-meta.csv")
# Item assignment is required to create a column; attribute assignment
# (pgmeta.gid = ...) only sets a Python attribute on the DataFrame object.
pgmeta["gid"] = pgmeta["id"].astype("int64")
# Key the lookup on the integer ids, because callers look titles up with int(id).
pglookup = dict(zip(pgmeta["gid"].tolist(), pgmeta["title"].tolist()))

In [None]:
import pandas as pd

# Flatten the per-text neighbor lists into one row per (Gutenberg id, Hathi id)
# pair and write them to crosswalk.csv.
oput = []
for guten_i, matches in enumerate(neighbors):
    guten_id = guten['names'][guten_i]
    for rank, (dist, htid) in enumerate(matches):
        # Always keep the top entry (even an unfilled placeholder, so every
        # Gutenberg text gets a row); keep lower ranks only when they hold a
        # real match. Compare by value: `is not 0` tests object identity.
        if rank == 0 or dist != 0:
            oput.append({"gid": guten_id, "htid": htid, "dist": dist})
pd.DataFrame(oput).to_csv("crosswalk.csv")

## What's not matched?

24,000 out of 38,000 are being matched. What's being missed? A random sample.

In [None]:
import random

# Print the titles of a reproducible random sample of Gutenberg texts whose
# best neighbor is still the zero-similarity placeholder, i.e. texts with no
# Hathi match.
printed = 0
random.seed(0)
# Never ask for more samples than there are texts.
sample_size = min(200, len(neighbors))
for sample_i in random.sample(range(len(neighbors)), sample_size):
    matches = neighbors[sample_i]
    # Only the top-ranked entry decides whether a text is unmatched.
    if not matches or matches[0][0] != 0:
        continue
    guten_id = guten['names'][sample_i]
    print(" * " + pglookup[int(guten_id)])
    printed += 1
    if printed > 50:
        # Stop the whole scan once the cap is passed.
        break
            


What's *not* scanned in Hathi is as interesting as what is. Many of these are journals where the one or two copies in Hathi probably just don't have good OCR--or, possibly, are bound in a different way than the Gutenberg editions.

Some are crazily small texts that exist elsewhere in PG: "The Bible, King James version, Book 52: 1 Thessalonians"

Some are specifically PG texts. Things like just the number e to a gazillion decimal places: http://www.gutenberg.org/files/127/127.txt 

Others are translations into non-English languages, especially of the sci-fi that prevails in PG. 'https://nl.wikipedia.org/wiki/Naar_het_middelpunt_der_aarde'

In [None]:
import urllib2
import ujson as json
from IPython.display import HTML

# Not referenced by the visible code; kept as a module-level name.
guten_cache = {}


def guten_title(id, force = False):
    """Return the Gutenberg title recorded in ``pglookup`` for *id*.

    Args:
        id: Gutenberg identifier; anything ``int()`` accepts.
        force: Accepted but not used by this function.

    Raises:
        KeyError: If the integer id is not a key of ``pglookup``.
    """
    numeric_id = int(id)
    title = pglookup[numeric_id]
    return title


# Quick check against one entry of the loaded corpus.
guten_title(guten['names'][14])

In [None]:
import urllib2
import ujson as json
from IPython.display import HTML

# Create the cache only if it does not exist yet, so that re-running this cell
# keeps previously fetched records but the first call never hits a NameError.
try:
    hathi_cache
except NameError:
    hathi_cache = {}


def jsonify(id, force = False):
    """Fetch the brief Hathi catalog record for a volume id, with caching.

    Args:
        id: Hathi volume id in filename-safe form; "+" and "=" are mapped
            back to ":" and "/" before the request is made.
        force: When true, ignore any cached record and fetch again.

    Returns:
        The parsed JSON response. Results are stored in the module-level
        ``hathi_cache`` dict keyed by the id exactly as passed in.
    """
    import contextlib

    if id in hathi_cache and not force:
        return hathi_cache[id]
    url = "http://catalog.hathitrust.org/api/volumes/brief/htid/%s.json" % id.replace("+", ":").replace("=", "/")
    # closing() guarantees the HTTP response is released even if the read fails.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        payload = response.read()
    hathi_cache[id] = json.loads(payload)
    return hathi_cache[id]

def descend(record):
    """Return the first entry under ``record['records']``.

    Parses a Hathi API call response.

    Args:
        record: Mapping with a ``'records'`` key whose value is a dict.

    Returns:
        The value stored under the first key of ``record['records']``.

    Raises:
        IndexError: If ``record['records']`` is empty; the offending record
            is printed first to help debugging.
    """
    records = record['records']
    try:
        # next(iter(...)) works on both Python 2 and 3, unlike keys()[0].
        return next(iter(records.values()))
    except StopIteration:
        print(record)
        raise IndexError("no records in Hathi API response")
        
def pretty_print(htid,text):
    """Build an HTML list item linking to a Hathi volume.

    Args:
        htid: Hathi volume id; used for the catalog lookup and the link URL.
        text: Caption shown under the link; non-ASCII characters are dropped.

    Returns:
        An ``IPython.display.HTML`` object. It is empty when the catalog
        response has no records or the record cannot be formatted.

    Raises:
        Exception: Any error other than ``IndexError`` raised while fetching
            the record is re-raised after a blank line is printed.
    """
    output_string = u""
    try:
        a = descend(jsonify(htid))
        a['url'] = u"https://babel.hathitrust.org/cgi/pt?id=" + htid
        try:
            # encode/decode round trip drops non-ASCII characters while
            # keeping the value as text on both Python 2 and Python 3.
            output_string += u"<li><a href={}>{} ({})</a><br>{}</li>".format(
                a['url'],
                a['titles'][0].encode("ascii", "ignore").decode("ascii"),
                a['publishDates'][0],
                text.encode("ascii", "ignore").decode("ascii"))
        except Exception:
            # Best effort: show the raw record instead of failing the display.
            print(a)
    except IndexError:
        # Report the id that had no catalog record.
        print("no index {}".format(htid))
    except Exception:
        print("")
        raise
    return HTML(output_string)

class Hathi_Book(object):
    """A Hathi volume with its catalog description and an optional caption.

    Instances render as an HTML list item in IPython via ``_repr_html_``.
    """

    def __init__(self,htid,text=""):
        """Fetch the catalog record for *htid* and remember the caption *text*.

        Args:
            htid: Hathi volume id.
            text: Caption to show under the link; either text or a UTF-8
                encoded byte string.
        """
        self.htid = htid
        self.desc = descend(jsonify(htid))
        self.text = text

    def _repr_html_(self):
        """Return an ``<li>`` element linking to the volume, with the caption."""
        # The URL is also stored on the description dict.
        self.desc['url'] = u"https://babel.hathitrust.org/cgi/pt?id=" + self.htid
        # Drop non-ASCII characters from the title but keep it as text, so it
        # does not render as b'...' on Python 3.
        title = self.desc['titles'][0].encode("ascii", "ignore").decode("ascii")
        caption = self.text
        if isinstance(caption, bytes):
            # Only byte strings need decoding; text passes through untouched.
            caption = caption.decode("utf-8", "ignore")
        return u"<li><a href={}>{} ({})</a><br>{}</li>".format(
            self.desc['url'], title, self.desc['publishDates'][0], caption)

    def title(self):
        """Return the first title listed in the catalog description."""
        return self.desc['titles'][0]
    
# Smoke test: constructing a Hathi_Book looks the volume up via jsonify (a
# Hathi catalog API request unless the record is cached), then returns its title.
Hathi_Book("inu.30000026383574","Some sample text to go with, ❤").title()

This is code to debug the matches that I find. It's involved in the way that research code can be.

Essentially, though, it spends most of its time on data cleaning and cutoff. The big challenge is 
that I don't want it to flag for me as a problem when Hathi has a "The Works of Charles Dickens, vol 3" 
and the textlab has "Great Expectations."

So it doesn't bother to compare matches for uninformative Hathi titles.

Then it does some string replacement to normalize words or strings like "and", "roman", and "œ":
finally, it can compare the titles from Hathi to see if they're the same as those in the textlab. If not,
it prints to console suggesting that we check up.

In many cases, this reveals problems in the original data: the textlab called a book "The Vicar of Wrexham", but it's actually *The Vicar of Wrexhill*. The machine is a decent proofreader!

In [None]:
import IPython

# Collect every (similarity, Gutenberg title, Hathi id) triple whose
# similarity exceeds .85, across all Gutenberg texts.
nearly = [
    (dist, guten_title(guten["names"][i]), hathi)
    for i, neighbor in enumerate(neighbors)
    for dist, hathi in neighbor
    if dist > .85
]

In [None]:
# Best matches first: the tuples compare on their first element, the similarity.
nearly.sort(reverse=True)
# State read by the title-checking loop in a later cell.
seen = set()
last_dist = 0

In [None]:
import IPython.display

What are the fourteen books with the most perfect matches? Sanity check.

In [None]:
# Render the 14 highest-similarity matches as linked Hathi entries.
# NOTE(review): "{:02f}" means minimum width 2 with the default six decimals;
# "{:.2f}" may have been intended -- confirm.
# The loop variable `i` is left bound afterwards and is reused by the next cell.
for i in range(14):
    IPython.display.display(Hathi_Book(nearly[i][2], u"similarity of {:02f} to {}".format(nearly[i][0], nearly[i][1]).encode("utf-8")))


In [None]:
# Re-display a single match. `i` is whatever an earlier cell left bound (the
# last index of the preceding display loop if that cell ran to completion).
IPython.display.display(Hathi_Book(nearly[i][2], u"similarity of {:02f} to {}".format(nearly[i][0], nearly[i][1]).encode("utf-8")))

In [None]:
import sys

# Substrings that mark a Hathi title as a collected-works volume (or another
# title that cannot be checked by name). They are lowercased here because they
# are matched against the lowercased Hathi title.
_WORK_MARKERS = frozenset(marker.lower() for marker in [
    u"sämmtliche", u"Novels and tales", u"works of", "novels of",
    u"Werke", u"Gesammelte", u"Romane und Erzählungen", "werke", "Romane", u"Erzählungen",
    u"Works", u"Life and works", u"v.", u"O︠e︡uvres", u"complètes", u"complètes", "gesammelt", u"Sämmtliche",
    u"OEuvres", "The writings of", "Tales and novels", u"Œuvres", "Waverley novels", u"Erzählungen",
    u"Oeuvres", "gesammelte Romane", "Standard novels", "uvres comple", u"sämtliche", u"sämliche", "Samtliche",
    "Deutsche Literatur", "prose tales", "Romans", "ovels of", "'s works",
    "in philology", "Agora",  # These are both 20C journals I can't check to see if they published an old novel.
    "Dichtungen und Schriften",
])

# Applied in order to both titles before comparing them. Order matters:
# " —" must run before spaces are stripped, and ",roman" must run after spaces
# are stripped but before "man" -> "men" and before commas are stripped;
# otherwise those two rules can never match.
_TITLE_REPLACEMENTS = [
    (u"'", u""),
    (u"œ", u"oe"),
    (u"the", u""),
    (u" —", u""),
    (u" ", u""),
    (u",roman", u""),
    (u"è", u"e"),
    (u"-", u""),
    (u"é", u"e"),
    (u"man", u"men"),
    (u"dela", u""),
    (u"de", u""),
    (u",", u""),
    (u":", u""),
    (u";", u""),
    (u"ß", u"ss"),
]


def _to_text(value):
    """Return *value* as text, decoding UTF-8 byte strings and passing text through."""
    if isinstance(value, bytes):
        return value.decode("utf-8", "ignore")
    return value


def _normalize_title(title):
    """Lowercase *title* and apply the replacement rules so titles can be compared."""
    normalized = title.lower()
    for find, replace in _TITLE_REPLACEMENTS:
        normalized = normalized.replace(find, replace)
    return normalized


# Walk the matches from most to least similar and print every pair whose
# titles do not look alike, so they can be checked by hand. A "." is written
# for each pair whose titles agree.
seen = set()
last_dist = 0
for dist, name, hathi in nearly:
    # Announce how many distinct titles were seen when crossing each cutoff.
    if dist < .9 and last_dist >= .9:
        print("***seen {} at .1 distance, the conservative cutoff.".format(len(seen)))
    if dist < .82 and last_dist >= .82:
        print("***seen {} at .18 distance, the hand-picked cutoff for best performance at this task".format(len(seen)))
    last_dist = dist
    if name in seen:
        # The first match for a book is the best.
        continue
    try:
        hathi_title = _to_text(Hathi_Book(hathi).title())
    except Exception:
        # Skip volumes whose catalog record cannot be fetched or parsed.
        continue
    lowered_title = hathi_title.lower()
    if any(marker in lowered_title for marker in _WORK_MARKERS):
        # Don't make me check "Works v. 4"
        continue
    seen.add(name)
    mcgill_title = _to_text(name)
    mt = _normalize_title(mcgill_title)
    ht = _normalize_title(hathi_title)
    # Titles agree when the first 15 normalized characters of one occur in the other.
    if mt[:15] in ht or ht[:15] in mt:
        sys.stdout.write(".")
        continue
    print(u"\n{} is {:0.4f} from {} ({})".format(mcgill_title, dist, hathi_title, hathi))
    
