Merge 9bb7f5f into e362491
thatandromeda committed Jun 27, 2018
2 parents e362491 + 9bb7f5f commit 0e5140f
Showing 20 changed files with 831 additions and 184 deletions.
3 changes: 2 additions & 1 deletion .codeclimate.yml
@@ -1,3 +1,4 @@
-exclude_paths:
+exclude_patterns:
 - "tests.py"
 - "urls.py"
+- "migrations/"
1 change: 1 addition & 0 deletions .gitignore
@@ -9,6 +9,7 @@ notes/
model/
*.model
*.npy
refs.p

# But we need to keep a few (chosen for smallness) so that we can run tests on
# Travis.
1 change: 1 addition & 0 deletions Pipfile
@@ -5,6 +5,7 @@ name = "pypi"

[dev-packages]
requests = "*"
refextract = {git = "https://github.com/MITLibraries/refextract.git"}

[packages]
django = {version = "~=1.11"}
359 changes: 180 additions & 179 deletions Pipfile.lock

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions docs/developer.md
@@ -1,6 +1,33 @@
# Documentation
This document is for people who want to stand up an instance of Hamlet on localhost in order to write code. It assumes you are generally familiar with setting up development environments (for instance, that you can install Python dependencies and stand up a local Postgres instance).

## Standing up Hamlet
You will need:
* git
* pipenv (https://pipenv.readthedocs.io/en/latest/install/#installing-pipenv)
* the Heroku CLI
* postgres

* Get the code and dependencies
* `git clone https://github.com/MITLibraries/hamlet.git`
* `cd hamlet`
* `pipenv install`
* If you want to do neural net training or run OCR on source files, there are additional non-pip dependencies; see below.
* `pipenv shell`
* Set up your postgres database
* Create a database
* The name of this database should be `hamlet`, or else set an environment variable `DJANGO_DB` with its name
* Create a database user
* The name of this user should be `hamlet`, or else set an environment variable `DJANGO_DB_USER` with its name
* Grant all privileges on your database to your user
* Set an environment variable `DJANGO_DB_PASSWORD` with your database user's password (see the sketch after this list for how these variables might be consumed)
* `python manage.py migrate`
* Ask Andy or Andromeda for a data dump to populate the db.
* Set an environment variable `DJANGO_SETTINGS_MODULE=hamlet.settings.local`
* `python manage.py createsuperuser` and follow the prompts - this will let you log in at `/admin`
* `python manage.py collectstatic --noinput`
* `heroku local`
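
How these `DJANGO_DB*` variables are consumed is up to the settings module, which is not part of this diff. A minimal sketch of what `hamlet.settings.local` might do with them (the engine, host, and port values here are assumptions):

```python
import os

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.postgresql',
        'NAME': os.environ.get('DJANGO_DB', 'hamlet'),
        'USER': os.environ.get('DJANGO_DB_USER', 'hamlet'),
        'PASSWORD': os.environ.get('DJANGO_DB_PASSWORD', ''),
        'HOST': 'localhost',
        'PORT': '5432',
    }
}
```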

## Tests
Run tests with `python manage.py test --settings=hamlet.settings.test`.

Empty file added hamlet/citations/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions hamlet/citations/admin.py
@@ -0,0 +1,12 @@
from django.contrib import admin

from .models import Citation


class CitationAdmin(admin.ModelAdmin):
model = Citation
search_fields = ('author', 'year', 'doi', 'thesis__title', 'raw_ref')
list_display = ('thesis', 'raw_ref')


admin.site.register(Citation, CitationAdmin)
214 changes: 214 additions & 0 deletions hamlet/citations/extract_refs.py
@@ -0,0 +1,214 @@
import multiprocessing as mp
import os
import pickle
import string

from refextract import extract_references_from_string

from django.conf import settings

localpath = os.path.join(settings.PROJECT_DIR, 'neural', 'files', 'main')


# Categorizing references as good or bad ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _enough_punctuation(ref):
# Determine whether a string has a high enough percentage of punctuation
# to be a plausible citation.
#
# Citations have a much higher percentage of punctuation than normal
# free text, as citation formats dictate that they are mostly short strings
# (like years and personal names) set off by commas, parentheses, etc.
# This test has a moderate false positive rate but an extremely low false
# negative rate.
punct = len([c for c in ref if c in string.punctuation])
total = len(ref)
    if not total:
        # An empty string can't be a citation (and would divide by zero).
        return False
    return round(100 * punct / total) >= 10
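
# For illustration (example strings invented): a formatted citation such as
# '[3] Doe, J. (1999). "On Things," J. Stuff, 12(3), 45-67.' is roughly 30%
# punctuation, while ordinary prose usually sits below 5%, so the 10% cutoff
# separates the two with room to spare.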


def _classify_ref(handle, refdict, goodrefs, badrefs):
# Determine whether a given candidate citation should be categorized as
# good or bad.
#
# :param handle: The filename of the current thesis.
# :type handle: str
# :param refdict: A dictionary representing a single candidate citation.
# :type refdict: dict
# :param goodrefs: The current believed-good citations.
# :type goodrefs: dict. Its keys are handles; its values are lists of
# dicts.
# :param badrefs: The current believed-bad citations.
# :type badrefs: dict. Per goodrefs.
    # :rtype: tuple of two dicts, (goodrefs, badrefs).
try:
ref = refdict['raw_ref'][0]
except KeyError:
return goodrefs, badrefs

    if all([len(ref) <= 500,
            len(ref) >= 30,
            not ref.lower().startswith('table'),
            not ref.lower().startswith('figure'),
            _enough_punctuation(ref),
            ref.upper() != ref]):
goodrefs.setdefault(handle, []).append(refdict)
else:
badrefs.setdefault(handle, []).append(refdict)

return goodrefs, badrefs


def _verify_reftuple_format(reftuples):
# Ensure that reftuples has the data structure we expect.
# This is a smoke test - it's just checking the first member, not all of
# them.
assert isinstance(reftuples, list)
first_tuple = reftuples[0]
assert isinstance(first_tuple, tuple)
assert isinstance(first_tuple[0], str)
assert isinstance(first_tuple[1], list)
assert isinstance(first_tuple[1][0], dict)
assert 'raw_ref' in first_tuple[1][0].keys()


def _find_candidate_refs(reftuples):
# Given citation data associated with various filenames, classify the
# citations as good or bad.
#
    # :param reftuples: list of tuples of (filename, list of dicts of citation
# data).
# :rtype: two dicts. Each dict has filenames as keys; its values are lists
# of dicts of citation data.
if not reftuples:
return {}, {}

goodrefs = {}
badrefs = {}

for reftuple in reftuples:
try:
handle = reftuple[0]
reflist = reftuple[1]
for refdict in reflist:
goodrefs, badrefs = \
_classify_ref(handle, refdict, goodrefs, badrefs)
except TypeError:
pass

return goodrefs, badrefs


def _reprocess_bad(good, bad):
for handle in bad.keys():
filteredbad = [refdict for refdict in bad[handle]
if refdict and len(refdict['raw_ref'][0]) < 200]

for x in range(0, len(filteredbad) - 1):
testref = '{} {}'.format(filteredbad[x]['raw_ref'][0],
filteredbad[x + 1]['raw_ref'][0])
if len(testref) < 200:
ref = extract_references_from_string(testref)

                local_good, local_bad = _find_candidate_refs([(handle, ref)])
                # local_good and local_bad are keyed by handle; merge their
                # lists into the accumulated per-handle lists.
                good.setdefault(handle, []).extend(local_good.get(handle, []))
                bad.setdefault(handle, []).extend(local_bad.get(handle, []))

return good, bad


def _find_good_refs(reftuples):
# Given a list of tuples of (filename, list of citation data dicts),
# find the probably good citation data.
#
# :param reftuples: A list of 2-tuples. Each tuple is (filename, list of
# candidate references). Each candidate reference is a dict.
# :rtype: dict. The keys of the dict are filenames; the values are lists of
# dicts of believed-good citation data.
_verify_reftuple_format(reftuples)

good_candidates, bad_candidates = _find_candidate_refs(reftuples)

# At this point the goodrefs are very good. The badrefs include some
# goodrefs - particularly in cases where refs have been split over multiple
# lines. Let's try to extract those.
new_good, new_bad = _reprocess_bad(good_candidates, bad_candidates)

print("Theses with good candidates: %d" % len(new_good))
print("Theses with bad candidates: %d" % len(new_bad))

print("Total good candidates: %d" %
sum([len(x) for x in new_good.values()]))
print("Total bad candidates: %d" %
sum([len(x) for x in new_bad.values()]))

return new_good


# Extracting references from files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def _extract_refs(handle):
# Extract potential references from ends of files. Return a tuple of
# the file handle and the candidate references (a list of dicts).
#
# We don't want to parse the beginning, because if the extractor sees
# "References" or "Bibliography" in the table of contents, it may conclude
# it has found a reference section and parse the entire file for
# references, which is unacceptably time-consuming.
#
# :param handle: name of a file containing thesis text (not the full path,
# just the name).
# :type handle: str.
# :rtype: tuple or None
filepath = os.path.join(localpath, handle)
if os.path.isfile(filepath):
try:
popenstring = 'tail -n 1000 {}'.format(filepath)
end_of_file = os.popen(popenstring).read()
# The 'is_only_references' flag for this function, when False, is
# supposed to increase accuracy for text that may contain things
# other than the reference section (as ours does). It doesn't
# seem to work, however.
refs = extract_references_from_string(end_of_file)
if refs:
return (handle, refs)
        except Exception:
            # refextract can fail on malformed input; skip any file it
            # can't handle.
            pass


# The main function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def extract_good_refs(maxfiles):
# Extracts believed-good citations from files and returns them.
    # Along the way it pickles the extracted but unprocessed refs to simplify
    # debugging and future processing (the extraction is the time-consuming
    # step, so we persist its results).
#
# :param maxfiles: The maximum number of files to extract data from.
# (Extracting citations from all 43K+ theses takes days, even parallelized,
# so a shorter option is provided for testing purposes.)
# :type maxfiles: int
# :rtype: dict. The keys of the dict are filenames; the values are lists of
# dicts of believed-good citation data.

# Extract refs from files ----------------
pool = mp.Pool(mp.cpu_count())
if not maxfiles:
file_list = os.listdir(localpath)
else:
file_list = os.listdir(localpath)[:maxfiles]

results = pool.map(_extract_refs, file_list)

# Even at 8 cores this data takes days to extract, so persist it now -
# that way if there are any problems with the subsequent steps, you can
# recover.
    with open('refs.p', 'wb') as f:
        pickle.dump(results, f)

pool.close()
pool.join()

# Find good refs -------------------------
good = _find_good_refs(results)

return good
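
A short usage sketch (illustrative; the handle format `1721.1-<id>.txt` comes from populate_citations.py below, and the sample values are invented):

from hamlet.citations.extract_refs import extract_good_refs

# Process only the first 50 files; a falsy maxfiles processes everything,
# which takes days even parallelized.
good = extract_good_refs(50)

# `good` maps filenames to lists of citation dicts, e.g.
# {'1721.1-39087.txt': [{'raw_ref': ['Smith, J. ...'], 'year': ['1999']}, ...]}

# The raw extraction results are also pickled to refs.p, so later stages can
# be rerun without repeating the extraction:
import pickle
with open('refs.p', 'rb') as f:
    results = pickle.load(f)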
Empty file added hamlet/citations/management/__init__.py
Empty file.
Empty file added hamlet/citations/management/commands/__init__.py
Empty file.
77 changes: 77 additions & 0 deletions hamlet/citations/management/commands/populate_citations.py
@@ -0,0 +1,77 @@
import re

from django.core.management.base import BaseCommand
from django.utils.timezone import now

from hamlet.citations.extract_refs import extract_good_refs
from hamlet.citations.models import Citation
from hamlet.theses.models import Thesis


class Command(BaseCommand):
help = 'Processes citations from theses and adds to the database'

def add_arguments(self, parser):
parser.add_argument('maxfiles', type=int)

def handle(self, *args, **options):
start_time = now()

maxfiles = options['maxfiles']
        pattern = re.compile(r'1721\.1-(\d+)\.txt')
base_fields = Citation._meta.get_fields()
fields = [f.name for f in base_fields
if f.name not in ['thesis', 'raw_ref', 'id']]

good = extract_good_refs(maxfiles)

total_attempts = 0
total_created = 0

for handle, refs in good.items():
identifier = pattern.match(handle).group(1)
t = Thesis.objects.get(identifier=identifier)

for item in refs:
total_attempts += 1
raw_ref = item.get('raw_ref')
if not raw_ref:
continue

# raw_ref is produced as a one-element list instead of a string.
raw_ref = raw_ref[0]

# Strip off any reference numbers left over from the
# bibliography.
raw_ref = re.sub(r'^\[\d+\] ', '', raw_ref)

c, _ = Citation.objects.get_or_create(thesis=t,
raw_ref=raw_ref)
for field in fields:
value = item.get(field)

# Save blanks, not nulls.
if not value:
value = ''
else:
value = value[0]

setattr(c, field, value)

try:
c.save()
total_created += 1
                except Exception:
print('FAIL')
for field in fields:
value = item.get(field)
print(field)
print(value)

end_time = now()
elapsed = (end_time - start_time).seconds

self.stdout.write(
self.style.SUCCESS(' %d citations created' % total_created))
self.stdout.write(
self.style.SUCCESS('%d seconds elapsed' % elapsed))
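
The command takes a single positional `maxfiles` argument, so a small trial run is `python manage.py populate_citations 100` (the count here is arbitrary). It can also be driven through Django's standard management API, for instance from a test:

from django.core.management import call_command

# Arguments are parsed exactly as they would be on the command line.
call_command('populate_citations', 100)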
34 changes: 34 additions & 0 deletions hamlet/citations/migrations/0001_initial.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.10 on 2018-03-19 20:00
from __future__ import unicode_literals

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

initial = True

dependencies = [
('theses', '0007_remove_thesis__vector'),
]

operations = [
migrations.CreateModel(
name='Citation',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('doi', models.CharField(max_length=66)),
('journal', models.TextField()),
('url', models.URLField()),
('author', models.TextField()),
('title', models.TextField()),
('isbn', models.CharField(max_length=20)),
('publisher', models.CharField(max_length=32)),
('year', models.CharField(max_length=4)),
('raw_ref', models.TextField()),
('thesis', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='theses.Thesis')),
],
),
]
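
With this schema, citations can be queried across the `thesis` foreign key. A couple of illustrative queries (the filter value is invented; `citation_set` is Django's default reverse accessor, since the model declares no related_name):

from hamlet.citations.models import Citation
from hamlet.theses.models import Thesis

# All citations extracted from theses whose titles mention robots.
refs = Citation.objects.filter(thesis__title__icontains='robot')

# All citations belonging to a single thesis.
thesis = Thesis.objects.first()
cites = thesis.citation_set.all()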
