Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
20 changed files
with
831 additions
and
184 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
exclude_paths: | ||
exclude_patterns: | ||
- "tests.py" | ||
- "urls.py" | ||
- "migrations/" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from django.contrib import admin | ||
|
||
from .models import Citation | ||
|
||
|
||
@admin.register(Citation)
class CitationAdmin(admin.ModelAdmin):
    """Admin options for Citation records.

    The decorator registers this class for ``Citation`` on the default admin
    site.
    """

    model = Citation
    # Columns shown in the change list.
    list_display = ('thesis', 'raw_ref')
    # Fields searched by the admin search box; ``thesis__title`` follows the
    # foreign key to the related thesis.
    search_fields = ('author', 'year', 'doi', 'thesis__title', 'raw_ref')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
import multiprocessing as mp | ||
import os | ||
import pickle | ||
import string | ||
|
||
from refextract import extract_references_from_string | ||
|
||
from django.conf import settings | ||
|
||
# Directory that is listed for thesis text files and that file names
# ("handles") are joined onto before reading; built from settings.PROJECT_DIR.
localpath = os.path.join(settings.PROJECT_DIR, 'neural', 'files', 'main')
|
||
|
||
# Categorizing references as good or bad ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
def _enough_punctuation(ref): | ||
# Determine whether a string has a high enough percentage of punctuation | ||
# to be a plausible citation. | ||
# | ||
# Citations have a much higher percentage of punctuation than normal | ||
# free text, as citation formats dictate that they are mostly short strings | ||
# (like years and personal names) set off by commas, parentheses, etc. | ||
# This test has a moderate false positive rate but an extremely low false | ||
# negative rate. | ||
punct = len([c for c in ref if c in string.punctuation]) | ||
total = len(ref) | ||
if round(100 * punct / total) >= 10: | ||
return True | ||
else: | ||
return False | ||
|
||
|
||
def _classify_ref(handle, refdict, goodrefs, badrefs): | ||
# Determine whether a given candidate citation should be categorized as | ||
# good or bad. | ||
# | ||
# :param handle: The filename of the current thesis. | ||
# :type handle: str | ||
# :param refdict: A dictionary representing a single candidate citation. | ||
# :type refdict: dict | ||
# :param goodrefs: The current believed-good citations. | ||
# :type goodrefs: dict. Its keys are handles; its values are lists of | ||
# dicts. | ||
# :param badrefs: The current believed-bad citations. | ||
# :type badrefs: dict. Per goodrefs. | ||
# :rtype: dict | ||
# :rtype: dict | ||
try: | ||
ref = refdict['raw_ref'][0] | ||
except KeyError: | ||
return goodrefs, badrefs | ||
|
||
if all([len(ref) <= 500, | ||
len(ref) >= 30], | ||
not ref.lower().startswith('table'), | ||
not ref.lower().startswith('figure'), | ||
_enough_punctuation(ref), | ||
ref.upper() != ref): | ||
goodrefs.setdefault(handle, []).append(refdict) | ||
else: | ||
badrefs.setdefault(handle, []).append(refdict) | ||
|
||
return goodrefs, badrefs | ||
|
||
|
||
def _verify_reftuple_format(reftuples): | ||
# Ensure that reftuples has the data structure we expect. | ||
# This is a smoke test - it's just checking the first member, not all of | ||
# them. | ||
assert isinstance(reftuples, list) | ||
first_tuple = reftuples[0] | ||
assert isinstance(first_tuple, tuple) | ||
assert isinstance(first_tuple[0], str) | ||
assert isinstance(first_tuple[1], list) | ||
assert isinstance(first_tuple[1][0], dict) | ||
assert 'raw_ref' in first_tuple[1][0].keys() | ||
|
||
|
||
def _find_candidate_refs(reftuples): | ||
# Given citation data associated with various filenames, classify the | ||
# citations as good or bad. | ||
# | ||
# :param reftuples: list of tuples of (filename, list of dictd of citation | ||
# data). | ||
# :rtype: two dicts. Each dict has filenames as keys; its values are lists | ||
# of dicts of citation data. | ||
if not reftuples: | ||
return {}, {} | ||
|
||
goodrefs = {} | ||
badrefs = {} | ||
|
||
for reftuple in reftuples: | ||
try: | ||
handle = reftuple[0] | ||
reflist = reftuple[1] | ||
for refdict in reflist: | ||
goodrefs, badrefs = \ | ||
_classify_ref(handle, refdict, goodrefs, badrefs) | ||
except TypeError: | ||
pass | ||
|
||
return goodrefs, badrefs | ||
|
||
|
||
def _reprocess_bad(good, bad): | ||
for handle in bad.keys(): | ||
filteredbad = [refdict for refdict in bad[handle] | ||
if refdict and len(refdict['raw_ref'][0]) < 200] | ||
|
||
for x in range(0, len(filteredbad) - 1): | ||
testref = '{} {}'.format(filteredbad[x]['raw_ref'][0], | ||
filteredbad[x + 1]['raw_ref'][0]) | ||
if len(testref) < 200: | ||
ref = extract_references_from_string(testref) | ||
|
||
local_good, local_bad = _find_candidate_refs([(handle, ref)]) | ||
good.setdefault(handle, []).append(local_good) | ||
bad.setdefault(handle, []).append(local_bad) | ||
|
||
return good, bad | ||
|
||
|
||
def _find_good_refs(reftuples):
    """Pick out the probably-good citation data and print summary counts.

    :param reftuples: A list of 2-tuples. Each tuple is (filename, list of
        candidate references). Each candidate reference is a dict.
    :returns: dict whose keys are filenames and whose values are lists of
        believed-good citation data.
    :rtype: dict
    """
    _verify_reftuple_format(reftuples)

    first_pass_good, first_pass_bad = _find_candidate_refs(reftuples)

    # After the first pass the good refs are very good, but the bad refs
    # still hide some good ones - particularly refs that were split over
    # multiple lines. The second pass tries to recover those.
    new_good, new_bad = _reprocess_bad(first_pass_good, first_pass_bad)

    print("Theses with good candidates: %d" % len(new_good))
    print("Theses with bad candidates: %d" % len(new_bad))

    good_total = sum(len(refs) for refs in new_good.values())
    print("Total good candidates: %d" % good_total)
    bad_total = sum(len(refs) for refs in new_bad.values())
    print("Total bad candidates: %d" % bad_total)

    return new_good
|
||
|
||
# Extracting references from files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
def _extract_refs(handle):
    """Extract potential references from the end of one thesis file.

    Only the last 1000 lines are parsed. We don't want to parse the
    beginning, because if the extractor sees "References" or "Bibliography"
    in the table of contents, it may conclude it has found a reference
    section and parse the entire file for references, which is unacceptably
    time-consuming.

    :param handle: name of a file containing thesis text (not the full path,
        just the name).
    :type handle: str
    :returns: ``(handle, refs)`` where ``refs`` is whatever the extractor
        returned, or None when the file is missing, nothing was extracted,
        or reading/extraction failed (the failure is logged).
    :rtype: tuple or None
    """
    import logging
    import subprocess

    filepath = os.path.join(localpath, handle)
    if not os.path.isfile(filepath):
        return None

    try:
        # An argument list (no shell) keeps spaces and shell metacharacters
        # in file names from breaking or hijacking the command.
        end_of_file = subprocess.check_output(
            ['tail', '-n', '1000', '--', filepath],
            universal_newlines=True)
        # The 'is_only_references' flag for this function, when False, is
        # supposed to increase accuracy for text that may contain things
        # other than the reference section (as ours does). It doesn't
        # seem to work, however.
        refs = extract_references_from_string(end_of_file)
    except Exception:
        # Best effort: one unreadable file must not abort the whole batch,
        # but the failure should be visible.
        logging.getLogger(__name__).exception(
            'Could not extract references from %s', filepath)
        return None

    if refs:
        return (handle, refs)
    return None
|
||
|
||
# The main function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
def extract_good_refs(maxfiles):
    """Extract believed-good citations from files and return them.

    Along the way the extracted but unprocessed refs are pickled to
    ``refs.p`` in the current working directory, to simplify debugging and
    future processing (the extraction is the time-consuming step, so we
    persist its results).

    :param maxfiles: The maximum number of files to extract data from; a
        falsy value means all files. (Extracting citations from all 43K+
        theses takes days, even parallelized, so a shorter option is
        provided for testing purposes.)
    :type maxfiles: int
    :returns: dict whose keys are filenames and whose values are lists of
        dicts of believed-good citation data. Empty when no file yielded
        any references.
    :rtype: dict
    """
    # Extract refs from files ----------------
    file_list = os.listdir(localpath)
    if maxfiles:
        file_list = file_list[:maxfiles]

    # The context manager tears the pool down even if extraction fails.
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(_extract_refs, file_list)

    # Even at 8 cores this data takes days to extract, so persist it now -
    # that way if there are any problems with the subsequent steps, you can
    # recover.
    with open("refs.p", "wb") as outfile:
        pickle.dump(results, outfile)

    # Find good refs -------------------------
    # Files without references come back as None; drop them so the later
    # steps only see (filename, refs) tuples.
    extracted = [result for result in results if result is not None]
    if not extracted:
        return {}

    return _find_good_refs(extracted)
Empty file.
Empty file.
77 changes: 77 additions & 0 deletions
77
hamlet/citations/management/commands/populate_citations.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import re | ||
|
||
from django.core.management.base import BaseCommand | ||
from django.utils.timezone import now | ||
|
||
from hamlet.citations.extract_refs import extract_good_refs | ||
from hamlet.citations.models import Citation | ||
from hamlet.theses.models import Thesis | ||
|
||
|
||
class Command(BaseCommand):
    """Extract citations from thesis files and store them as Citation rows."""

    help = 'Processes citations from theses and adds to the database'

    def add_arguments(self, parser):
        """Register the required positional integer argument ``maxfiles``."""
        parser.add_argument('maxfiles', type=int)

    def handle(self, *args, **options):
        """Run the extraction and save every usable citation.

        Each filename returned by ``extract_good_refs`` is matched against
        the expected name pattern to find the thesis identifier. Files that
        do not match, or whose thesis cannot be looked up, are reported and
        skipped rather than aborting the run. Counts and the elapsed time
        are written to stdout at the end.
        """
        start_time = now()

        maxfiles = options['maxfiles']
        # Raw string: same pattern as before, without invalid string escapes.
        pattern = re.compile(r'1721.1\-(\d+)\.txt')
        fields = [f.name for f in Citation._meta.get_fields()
                  if f.name not in ('thesis', 'raw_ref', 'id')]

        good = extract_good_refs(maxfiles)

        total_attempts = 0
        total_created = 0

        for filename, refs in good.items():
            match = pattern.match(filename)
            if match is None:
                self.stderr.write(self.style.WARNING(
                    'Skipping %s: unexpected file name' % filename))
                continue

            try:
                t = Thesis.objects.get(identifier=match.group(1))
            except (Thesis.DoesNotExist, Thesis.MultipleObjectsReturned) as e:
                self.stderr.write(self.style.WARNING(
                    'Skipping %s: %s' % (filename, e)))
                continue

            for item in refs:
                total_attempts += 1
                raw_ref = item.get('raw_ref')
                if not raw_ref:
                    continue

                # raw_ref is produced as a one-element list instead of a
                # string.
                raw_ref = raw_ref[0]

                # Strip off any reference numbers left over from the
                # bibliography.
                raw_ref = re.sub(r'^\[\d+\] ', '', raw_ref)

                c, _ = Citation.objects.get_or_create(thesis=t,
                                                      raw_ref=raw_ref)
                for field in fields:
                    value = item.get(field)
                    # Save blanks, not nulls. Present values are lists, of
                    # which only the first element is stored.
                    setattr(c, field, value[0] if value else '')

                try:
                    c.save()
                except Exception as e:
                    # Keep going: one citation that cannot be saved should
                    # not lose the rest. Report what was being saved.
                    self.stderr.write(self.style.ERROR('FAIL: %s' % e))
                    for field in fields:
                        self.stderr.write(
                            '%s: %r' % (field, item.get(field)))
                else:
                    total_created += 1

        end_time = now()
        # total_seconds() keeps whole days; .seconds would drop them on runs
        # longer than 24 hours.
        elapsed = int((end_time - start_time).total_seconds())

        self.stdout.write(
            self.style.SUCCESS('%d citations attempted' % total_attempts))
        self.stdout.write(
            self.style.SUCCESS(' %d citations created' % total_created))
        self.stdout.write(
            self.style.SUCCESS('%d seconds elapsed' % elapsed))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# -*- coding: utf-8 -*- | ||
# Generated by Django 1.11.10 on 2018-03-19 20:00 | ||
from __future__ import unicode_literals | ||
|
||
from django.db import migrations, models | ||
import django.db.models.deletion | ||
|
||
|
||
# Initial migration for this app: creates the Citation model.
class Migration(migrations.Migration):

    initial = True

    # Must run after this migration of the theses app, because Citation has
    # a foreign key to theses.Thesis.
    dependencies = [
        ('theses', '0007_remove_thesis__vector'),
    ]

    operations = [
        migrations.CreateModel(
            name='Citation',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # Parsed citation fields; none of them is declared nullable.
                ('doi', models.CharField(max_length=66)),
                ('journal', models.TextField()),
                ('url', models.URLField()),
                ('author', models.TextField()),
                ('title', models.TextField()),
                ('isbn', models.CharField(max_length=20)),
                ('publisher', models.CharField(max_length=32)),
                ('year', models.CharField(max_length=4)),
                # The unparsed citation text.
                ('raw_ref', models.TextField()),
                # Deleting a thesis deletes its citations (CASCADE).
                ('thesis', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='theses.Thesis')),
            ],
        ),
    ]
Oops, something went wrong.