Merge 9bb7f5f into e362491
thatandromeda committed Jun 27, 2018
2 parents e362491 + 9bb7f5f commit 0e5140f
Showing 20 changed files with 831 additions and 184 deletions.
3 changes: 2 additions & 1 deletion .codeclimate.yml
@@ -1,3 +1,4 @@
-exclude_paths:
+exclude_patterns:
 - "tests.py"
 - "urls.py"
+- "migrations/"
1 change: 1 addition & 0 deletions .gitignore
@@ -9,6 +9,7 @@ notes/
model/
*.model
*.npy
refs.p

# But we need to keep a few (chosen for smallness) so that we can run tests on
# Travis.
1 change: 1 addition & 0 deletions Pipfile
@@ -5,6 +5,7 @@ name = "pypi"

[dev-packages]
requests = "*"
refextract = {git = "https://github.com/MITLibraries/refextract.git"}

[packages]
django = {version = "~=1.11"}
359 changes: 180 additions & 179 deletions Pipfile.lock

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions docs/developer.md
@@ -1,6 +1,33 @@
# Documentation
This document is for people who want to stand up an instance of Hamlet on localhost in order to write code. It assumes you are generally familiar with setting up development environments (for instance, that you can install Python dependencies and stand up a local Postgres instance).

## Standing up Hamlet
You will need:
* git
* pipenv (https://pipenv.readthedocs.io/en/latest/install/#installing-pipenv)
* the Heroku CLI
* postgres

* Get the code and dependencies
* `git clone https://github.com/MITLibraries/hamlet.git`
* `cd hamlet`
* `pipenv install`
* If you want to do neural net training or run OCR on source files, there are additional non-pip dependencies; see below.
* `pipenv shell`
* Set up your postgres database
* Create a database
* The name of this database should be `hamlet`, or else set an environment variable `DJANGO_DB` with its name
* Create a database user
* The name of this user should be `hamlet`, or else set an environment variable `DJANGO_DB_USER` with its name
* Grant all privileges on your database to your user
* Set an environment variable `DJANGO_DB_PASSWORD` with your database user's password (see the sketch after this list for how these variables might be consumed)
* `python manage.py migrate`
* Ask Andy or Andromeda for a data dump to populate the db.
* Set an environment variable `DJANGO_SETTINGS_MODULE=hamlet.settings.local`
* `python manage.py createsuperuser` and follow the prompts - this will let you log in at `/admin`
* `python manage.py collectstatic --noinput`
* `heroku local`
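
How these `DJANGO_DB*` variables are consumed is up to the settings module, which is not part of this diff. A minimal sketch of what `hamlet.settings.local` might do with them (the engine, host, and port values here are assumptions):

```python
import os

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql',
        'NAME': os.environ.get('DJANGO_DB', 'hamlet'),
        'USER': os.environ.get('DJANGO_DB_USER', 'hamlet'),
        'PASSWORD': os.environ.get('DJANGO_DB_PASSWORD', ''),
        'HOST': 'localhost',
        'PORT': '5432',
    }
}
```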

## Tests
Run tests with `python manage.py test --settings=hamlet.settings.test`.

Empty file added hamlet/citations/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions hamlet/citations/admin.py
@@ -0,0 +1,12 @@
from django.contrib import admin

from .models import Citation


class CitationAdmin(admin.ModelAdmin):
model = Citation
search_fields = ('author', 'year', 'doi', 'thesis__title', 'raw_ref')
list_display = ('thesis', 'raw_ref')


admin.site.register(Citation, CitationAdmin)
214 changes: 214 additions & 0 deletions hamlet/citations/extract_refs.py
@@ -0,0 +1,214 @@
import multiprocessing as mp
import os
import pickle
import string

from refextract import extract_references_from_string

from django.conf import settings

localpath = os.path.join(settings.PROJECT_DIR, 'neural', 'files', 'main')


# Categorizing references as good or bad ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _enough_punctuation(ref):
# Determine whether a string has a high enough percentage of punctuation
# to be a plausible citation.
#
# Citations have a much higher percentage of punctuation than normal
# free text, as citation formats dictate that they are mostly short strings
# (like years and personal names) set off by commas, parentheses, etc.
# This test has a moderate false positive rate but an extremely low false
# negative rate.
punct = len([c for c in ref if c in string.punctuation])
total = len(ref)
    if not total:
        # An empty string can't be a citation (and would divide by zero).
        return False
    return round(100 * punct / total) >= 10
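
# For illustration (example strings invented): a formatted citation such as
# '[3] Doe, J. (1999). "On Things," J. Stuff, 12(3), 45-67.' is roughly 30%
# punctuation, while ordinary prose usually sits below 5%, so the 10% cutoff
# separates the two with room to spare.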


def _classify_ref(handle, refdict, goodrefs, badrefs):
# Determine whether a given candidate citation should be categorized as
# good or bad.
#
# :param handle: The filename of the current thesis.
# :type handle: str
# :param refdict: A dictionary representing a single candidate citation.
# :type refdict: dict
# :param goodrefs: The current believed-good citations.
# :type goodrefs: dict. Its keys are handles; its values are lists of
# dicts.
# :param badrefs: The current believed-bad citations.
# :type badrefs: dict. Per goodrefs.
    # :rtype: tuple of two dicts, (goodrefs, badrefs).
try:
ref = refdict['raw_ref'][0]
except KeyError:
return goodrefs, badrefs

    if all([len(ref) <= 500,
            len(ref) >= 30,
            not ref.lower().startswith('table'),
            not ref.lower().startswith('figure'),
            _enough_punctuation(ref),
            ref.upper() != ref]):
goodrefs.setdefault(handle, []).append(refdict)
else:
badrefs.setdefault(handle, []).append(refdict)

return goodrefs, badrefs


def _verify_reftuple_format(reftuples):
# Ensure that reftuples has the data structure we expect.
# This is a smoke test - it's just checking the first member, not all of
# them.
assert isinstance(reftuples, list)
first_tuple = reftuples[0]
assert isinstance(first_tuple, tuple)
assert isinstance(first_tuple[0], str)
assert isinstance(first_tuple[1], list)
assert isinstance(first_tuple[1][0], dict)
assert 'raw_ref' in first_tuple[1][0].keys()


def _find_candidate_refs(reftuples):
# Given citation data associated with various filenames, classify the
# citations as good or bad.
#
    # :param reftuples: list of tuples of (filename, list of dicts of citation
# data).
# :rtype: two dicts. Each dict has filenames as keys; its values are lists
# of dicts of citation data.
if not reftuples:
return {}, {}

goodrefs = {}
badrefs = {}

for reftuple in reftuples:
try:
handle = reftuple[0]
reflist = reftuple[1]
for refdict in reflist:
goodrefs, badrefs = \
_classify_ref(handle, refdict, goodrefs, badrefs)
except TypeError:
pass

return goodrefs, badrefs


def _reprocess_bad(good, bad):
for handle in bad.keys():
filteredbad = [refdict for refdict in bad[handle]
if refdict and len(refdict['raw_ref'][0]) < 200]

for x in range(0, len(filteredbad) - 1):
testref = '{} {}'.format(filteredbad[x]['raw_ref'][0],
filteredbad[x + 1]['raw_ref'][0])
if len(testref) < 200:
ref = extract_references_from_string(testref)

                local_good, local_bad = _find_candidate_refs([(handle, ref)])
                # local_good and local_bad are keyed by handle; merge their
                # lists into the accumulated per-handle lists.
                good.setdefault(handle, []).extend(local_good.get(handle, []))
                bad.setdefault(handle, []).extend(local_bad.get(handle, []))

return good, bad


def _find_good_refs(reftuples):
# Given a list of tuples of (filename, list of citation data dicts),
# find the probably good citation data.
#
# :param reftuples: A list of 2-tuples. Each tuple is (filename, list of
# candidate references). Each candidate reference is a dict.
# :rtype: dict. The keys of the dict are filenames; the values are lists of
# dicts of believed-good citation data.
_verify_reftuple_format(reftuples)

good_candidates, bad_candidates = _find_candidate_refs(reftuples)

# At this point the goodrefs are very good. The badrefs include some
# goodrefs - particularly in cases where refs have been split over multiple
# lines. Let's try to extract those.
new_good, new_bad = _reprocess_bad(good_candidates, bad_candidates)

print("Theses with good candidates: %d" % len(new_good))
print("Theses with bad candidates: %d" % len(new_bad))

print("Total good candidates: %d" %
sum([len(x) for x in new_good.values()]))
print("Total bad candidates: %d" %
sum([len(x) for x in new_bad.values()]))

return new_good


# Extracting references from files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _extract_refs(handle):
# Extract potential references from ends of files. Return a tuple of
# the file handle and the candidate references (a list of dicts).
#
# We don't want to parse the beginning, because if the extractor sees
# "References" or "Bibliography" in the table of contents, it may conclude
# it has found a reference section and parse the entire file for
# references, which is unacceptably time-consuming.
#
# :param handle: name of a file containing thesis text (not the full path,
# just the name).
# :type handle: str.
# :rtype: tuple or None
filepath = os.path.join(localpath, handle)
if os.path.isfile(filepath):
try:
popenstring = 'tail -n 1000 {}'.format(filepath)
end_of_file = os.popen(popenstring).read()
# The 'is_only_references' flag for this function, when False, is
# supposed to increase accuracy for text that may contain things
# other than the reference section (as ours does). It doesn't
# seem to work, however.
refs = extract_references_from_string(end_of_file)
if refs:
return (handle, refs)
        except Exception:
            # refextract can fail on malformed input; skip any file it
            # can't handle.
            pass


# The main function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def extract_good_refs(maxfiles):
# Extracts believed-good citations from files and returns them.
    # Along the way it pickles the extracted but unprocessed refs to simplify
    # debugging and future processing (the extraction is the time-consuming
    # step, so we persist its results).
#
# :param maxfiles: The maximum number of files to extract data from.
# (Extracting citations from all 43K+ theses takes days, even parallelized,
# so a shorter option is provided for testing purposes.)
# :type maxfiles: int
# :rtype: dict. The keys of the dict are filenames; the values are lists of
# dicts of believed-good citation data.

# Extract refs from files ----------------
pool = mp.Pool(mp.cpu_count())
if not maxfiles:
file_list = os.listdir(localpath)
else:
file_list = os.listdir(localpath)[:maxfiles]

results = pool.map(_extract_refs, file_list)

# Even at 8 cores this data takes days to extract, so persist it now -
# that way if there are any problems with the subsequent steps, you can
# recover.
    with open('refs.p', 'wb') as f:
        pickle.dump(results, f)

pool.close()
pool.join()

# Find good refs -------------------------
good = _find_good_refs(results)

return good
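
A short usage sketch (illustrative; the handle format `1721.1-<id>.txt` comes from populate_citations.py below, and the sample values are invented):

from hamlet.citations.extract_refs import extract_good_refs

# Process only the first 50 files; a falsy maxfiles processes everything,
# which takes days even parallelized.
good = extract_good_refs(50)

# `good` maps filenames to lists of citation dicts, e.g.
# {'1721.1-39087.txt': [{'raw_ref': ['Smith, J. ...'], 'year': ['1999']}, ...]}

# The raw extraction results are also pickled to refs.p, so later stages can
# be rerun without repeating the extraction:
import pickle
with open('refs.p', 'rb') as f:
    results = pickle.load(f)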
Empty file added hamlet/citations/management/__init__.py
Empty file.
Empty file added hamlet/citations/management/commands/__init__.py
Empty file.
77 changes: 77 additions & 0 deletions hamlet/citations/management/commands/populate_citations.py
@@ -0,0 +1,77 @@
import re

from django.core.management.base import BaseCommand
from django.utils.timezone import now

from hamlet.citations.extract_refs import extract_good_refs
from hamlet.citations.models import Citation
from hamlet.theses.models import Thesis


class Command(BaseCommand):
help = 'Processes citations from theses and adds to the database'

def add_arguments(self, parser):
parser.add_argument('maxfiles', type=int)

def handle(self, *args, **options):
start_time = now()

maxfiles = options['maxfiles']
        pattern = re.compile(r'1721\.1-(\d+)\.txt')
base_fields = Citation._meta.get_fields()
fields = [f.name for f in base_fields
if f.name not in ['thesis', 'raw_ref', 'id']]

good = extract_good_refs(maxfiles)

total_attempts = 0
total_created = 0

for handle, refs in good.items():
identifier = pattern.match(handle).group(1)
t = Thesis.objects.get(identifier=identifier)

for item in refs:
total_attempts += 1
raw_ref = item.get('raw_ref')
if not raw_ref:
continue

# raw_ref is produced as a one-element list instead of a string.
raw_ref = raw_ref[0]

# Strip off any reference numbers left over from the
# bibliography.
raw_ref = re.sub(r'^\[\d+\] ', '', raw_ref)

c, _ = Citation.objects.get_or_create(thesis=t,
raw_ref=raw_ref)
for field in fields:
value = item.get(field)

# Save blanks, not nulls.
if not value:
value = ''
else:
value = value[0]

setattr(c, field, value)

try:
c.save()
total_created += 1
                except Exception:
print('FAIL')
for field in fields:
value = item.get(field)
print(field)
print(value)

end_time = now()
elapsed = (end_time - start_time).seconds

self.stdout.write(
self.style.SUCCESS(' %d citations created' % total_created))
self.stdout.write(
self.style.SUCCESS('%d seconds elapsed' % elapsed))
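
The command takes a single positional `maxfiles` argument, so a small trial run is `python manage.py populate_citations 100` (the count here is arbitrary). It can also be driven through Django's standard management API, for instance from a test:

from django.core.management import call_command

# Arguments are parsed exactly as they would be on the command line.
call_command('populate_citations', 100)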
34 changes: 34 additions & 0 deletions hamlet/citations/migrations/0001_initial.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.10 on 2018-03-19 20:00
from __future__ import unicode_literals

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

initial = True

dependencies = [
('theses', '0007_remove_thesis__vector'),
]

operations = [
migrations.CreateModel(
name='Citation',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('doi', models.CharField(max_length=66)),
('journal', models.TextField()),
('url', models.URLField()),
('author', models.TextField()),
('title', models.TextField()),
('isbn', models.CharField(max_length=20)),
('publisher', models.CharField(max_length=32)),
('year', models.CharField(max_length=4)),
('raw_ref', models.TextField()),
('thesis', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='theses.Thesis')),
],
),
]
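
With this schema, citations can be queried across the `thesis` foreign key. A couple of illustrative queries (the filter value is invented; `citation_set` is Django's default reverse accessor, since the model declares no related_name):

from hamlet.citations.models import Citation
from hamlet.theses.models import Thesis

# All citations extracted from theses whose titles mention robots.
refs = Citation.objects.filter(thesis__title__icontains='robot')

# All citations belonging to a single thesis.
thesis = Thesis.objects.first()
cites = thesis.citation_set.all()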
