Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
24 changed files
with
452 additions
and
117 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
113 changes: 113 additions & 0 deletions
113
hamlet/citations/management/commands/delete_garbage_citations.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
import string | ||
|
||
from django.core.management.base import BaseCommand | ||
|
||
from hamlet.citations.models import Citation | ||
|
||
|
||
class Command(BaseCommand): | ||
help = 'Deletes citations which are probably garbage' | ||
citation_fields = Citation._meta.get_fields() | ||
|
||
def _percentage_nonpunctuation(self, mystring): | ||
''' | ||
Returns the % of mystring which is not punctuation (expressed as | ||
a decimal between 0 and 1). | ||
''' | ||
# The string with all its punctuation removed. | ||
nonpunc = mystring.translate(str.maketrans('', '', string.punctuation)) | ||
|
||
return len(nonpunc) / len(mystring) | ||
|
||
def _percentage_lowercase(self, mystring): | ||
''' | ||
Returns the % of mystring which is not capital letters (expressed as | ||
a decimal between 0 and 1). | ||
Technically this isn't the same as being lowercase (characters could be | ||
digits or spaces), but it's less confusing to write into inequalities | ||
than "_percentage_noncapital". | ||
''' | ||
# The string with all its capitals removed. | ||
lowercase = mystring.translate( | ||
str.maketrans('', '', string.ascii_uppercase)) | ||
|
||
return len(lowercase) / len(mystring) | ||
|
||
def _nonempty_field_count(self, citation): | ||
''' | ||
Returns the number of nonempty fields in a citation. (Should always | ||
be at least 2 - raw_ref and thesis.) | ||
''' | ||
return len([x for x in self.citation_fields | ||
if getattr(citation, x.name)]) | ||
|
||
def handle(self, *args, **options): | ||
orig_count = Citation.objects.count() | ||
deleted = 0 | ||
loopcount = 0 | ||
for c in Citation.objects.all()[0:orig_count]: | ||
# Things with too high a percentage of punctuation are probably | ||
# equations or figure captions or tables. Things with too little | ||
# punctuation can't be well-formed citations. | ||
nonpunct = self._percentage_nonpunctuation(c.raw_ref) | ||
if nonpunct < 0.8 or nonpunct > 0.98: | ||
c.delete() | ||
deleted += 1 | ||
continue | ||
|
||
# Things with too high a percentage of lowercase letters are | ||
# probably text fragments not caught by the previous check - | ||
# but they may also be citations containing URLs (which, unlike | ||
# journal and article titles, tend to be entirely lowercase). | ||
lowercase = self._percentage_lowercase(c.raw_ref) | ||
if lowercase > 0.97 and 'http' not in c.raw_ref: | ||
c.delete() | ||
deleted += 1 | ||
continue | ||
|
||
# Things with too low a percentage of lowercase letters are also | ||
# garbage - figure captions, OCR errors, etc. However, this | ||
# percentage must be quite low, because some people put author | ||
# names in all caps. | ||
if lowercase < 0.6: | ||
c.delete() | ||
deleted += 1 | ||
continue | ||
|
||
# These are equations, figures, etc. | ||
if any([ | ||
'>' in c.raw_ref, | ||
'<' in c.raw_ref, | ||
'%' in c.raw_ref | ||
]) and 'http' not in c.raw_ref: | ||
c.delete() | ||
deleted += 1 | ||
continue | ||
|
||
# Random garbage | ||
if all([ | ||
c.raw_ref[0] not in string.ascii_uppercase, | ||
c.raw_ref[0] != '"', | ||
self._nonempty_field_count(c) < 4 | ||
]): | ||
c.delete() | ||
deleted += 1 | ||
continue | ||
|
||
# Remaining citations are probably mostly okay. They may be | ||
# fragmentary, but that's still useful; people can likely still | ||
# track them down. | ||
|
||
# Progress indicator | ||
loopcount += 1 | ||
if loopcount % 100 == 0: | ||
self.stdout.write(self.style.WARNING( | ||
'%d citations processed' % loopcount)) | ||
self.stdout.write(self.style.WARNING( | ||
'%d citations deleted' % deleted)) | ||
|
||
self.stdout.write( | ||
self.style.NOTICE(' %d citations deleted' % deleted)) | ||
self.stdout.write( | ||
self.style.SUCCESS('%d citations remain' % (orig_count - deleted))) |
21 changes: 21 additions & 0 deletions
21
hamlet/citations/management/commands/remove_reference_numbering.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import re | ||
|
||
from django.core.management.base import BaseCommand | ||
|
||
from hamlet.citations.models import Citation | ||
|
||
|
||
class Command(BaseCommand): | ||
help = 'Removes reference numbers from the beginnings of citations' | ||
|
||
def handle(self, *args, **options): | ||
for c in Citation.objects.all(): | ||
if re.match(r'\d+\.', c.raw_ref): | ||
c.raw_ref = re.sub(r'^\d+\. ', '', c.raw_ref).lstrip() | ||
c.save() | ||
if re.match(r'\[\d+\].', c.raw_ref): | ||
c.raw_ref = re.sub(r'\[\d+\].', '', c.raw_ref).lstrip() | ||
c.save() | ||
if re.match(r'\[\d+\]', c.raw_ref): | ||
c.raw_ref = re.sub(r'\[\d+\]', '', c.raw_ref).lstrip() | ||
c.save() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# -*- coding: utf-8 -*- | ||
# Generated by Django 1.11.13 on 2018-06-29 13:47 | ||
from __future__ import unicode_literals | ||
|
||
from django.db import migrations | ||
|
||
|
||
class Migration(migrations.Migration): | ||
|
||
dependencies = [ | ||
('citations', '0002_auto_20180319_2044'), | ||
] | ||
|
||
operations = [ | ||
migrations.AlterModelOptions( | ||
name='citation', | ||
options={'ordering': ['raw_ref']}, | ||
), | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,3 +25,6 @@ class Citation(models.Model): | |
|
||
def __str__(self): | ||
return self.raw_ref | ||
|
||
class Meta: | ||
ordering = ['raw_ref'] |
20 changes: 20 additions & 0 deletions
20
hamlet/citations/templates/citations/lit_review_buddy.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{% extends "base.html" %} | ||
|
||
{% block content %} | ||
<h2>Your literature review buddy</h2> | ||
|
||
<p> | ||
Upload an article draft, thesis chapter-in-progress, et cetera and find out what works have been cited by similar MIT theses. | ||
</p> | ||
|
||
<p> | ||
That's right: we'll do a first-pass lit review for you. | ||
</p> | ||
|
||
{% include "upload_form.html" %} | ||
|
||
<p class="copy-sup"> | ||
Note: Citations have been automatically extracted from fulltext and vary in accuracy. As ever, use your judgment. | ||
</p> | ||
|
||
{% endblock %} |
33 changes: 33 additions & 0 deletions
33
hamlet/citations/templates/citations/lit_review_outcomes.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
{% extends "base.html" %} | ||
|
||
{% block content %} | ||
<h2>Your literature review buddy</h2> | ||
|
||
{% if not total_suggestions %} | ||
<p> | ||
Unfortunately we couldn't find anything for you. However, your <a href="https://libraries.mit.edu/research-support/">friendly local reference librarians</a> can help. | ||
</p> | ||
{% else %} | ||
<p class="copy-sup"> | ||
These citations have been extracted automatically from OCRed text files; | ||
expect messiness! | ||
</p> | ||
|
||
{% for thesis in suggestions %} | ||
{% if thesis.citation_set.all %} | ||
<div class="panel panel-info"> | ||
<div class="panel-body"> | ||
<i>from</i> <a href="{{ thesis.get_absolute_url }}">{{ thesis.title }}</a>: | ||
<ul class="list-unbulleted"> | ||
{% for citation in thesis.citation_set.all %} | ||
<li> | ||
{{ citation }} | ||
</li> | ||
{% endfor %} | ||
</ul> | ||
</div> | ||
</div> | ||
{% endif %} | ||
{% endfor %} | ||
{% endif %} | ||
{% endblock %} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import os | ||
from unittest import skip | ||
|
||
from django.conf import settings | ||
from django.core.urlresolvers import reverse | ||
from django.test import Client, TestCase, override_settings | ||
|
||
|
||
@override_settings(COMPRESS_ENABLED=False) | ||
class ViewTests(TestCase): | ||
|
||
def setUp(self): | ||
self.client = Client() | ||
# If you forgot to define the URL, the test suite will fail here. | ||
self.url = reverse('citations:lit_review_buddy') | ||
self.fix_path = os.path.join( | ||
settings.BASE_DIR, 'hamlet/theses/fixtures') | ||
|
||
def test_page_loads(self): | ||
response = self.client.get(self.url) | ||
assert response.status_code == 200 | ||
|
||
def test_front_page_has_widget(self): | ||
response = self.client.get(reverse('home')) | ||
assert self.url in response.content.decode('utf-8') | ||
|
||
def test_render_on_success(self): | ||
''' | ||
Check that we render the correct template with the correct context on | ||
a successful post. | ||
''' | ||
url = reverse('citations:lit_review_buddy') | ||
with open(os.path.join(self.fix_path, '1721.1-33360.txt'), 'rb') as fp: | ||
response = self.client.post(url, | ||
{"file": fp, "captcha_0": "sometext", "captcha_1": "PASSED"}) | ||
|
||
assert response.status_code == 200 | ||
assert 'suggestions' in response.context | ||
assert 'total_suggestions' in response.context | ||
template_names = [t.name for t in response.templates] | ||
assert 'citations/lit_review_outcomes.html' in template_names | ||
|
||
# This is failing, even though performing the same behavior in the | ||
# browser, with the test settings file, works. It looks like infer_vector | ||
# maybe doesn't return the same thing each time (!) and so this test can | ||
# fail even when a very similar one in hamlet/theses/tests/test_views.py | ||
# succeeds. | ||
@skip | ||
def test_citations_found(self): | ||
''' | ||
Check that we get the expected suggestions on a successful post. | ||
''' | ||
citation = "Dr. Orhan Soykan. Power Sources for Implantable Medical Devices. Medical Device Manufacturing & Technology, 2002." # noqa | ||
url = reverse('citations:lit_review_buddy') | ||
with open(os.path.join(self.fix_path, '1721.1-33360.txt'), 'rb') as fp: | ||
response = self.client.post(url, | ||
{"file": fp, "captcha_0": "sometext", "captcha_1": "PASSED"}) | ||
|
||
assert citation in response.content.decode('utf-8') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
from django.conf.urls import url | ||
|
||
from . import views | ||
|
||
urlpatterns = [ | ||
url(r'^lit_review_buddy/$', | ||
views.LitReviewBuddyView.as_view(), name='lit_review_buddy'), | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
from django.shortcuts import render | ||
from django.views.generic.edit import FormView | ||
|
||
from hamlet.common.document import factory | ||
from hamlet.common.forms import UploadFileForm | ||
from hamlet.common.inferred_vectors import get_similar_documents | ||
|
||
|
||
class LitReviewBuddyView(FormView): | ||
template_name = 'citations/lit_review_buddy.html' | ||
form_class = UploadFileForm | ||
|
||
def form_valid(self, form): | ||
context = {} | ||
doc = factory(self.request.FILES['file']) | ||
simdocs = get_similar_documents(doc) | ||
context['suggestions'] = simdocs | ||
context['total_suggestions'] = sum([ | ||
doc.citation_set.count() for doc in simdocs | ||
]) | ||
return render(self.request, 'citations/lit_review_outcomes.html', | ||
context) |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from captcha.fields import CaptchaField | ||
|
||
from django import forms | ||
from django.core.exceptions import ValidationError | ||
from django.core.validators import FileExtensionValidator | ||
from django.utils.deconstruct import deconstructible | ||
|
||
|
||
# By analogy with django.core.validators.FileExtensionValidator source. | ||
@deconstructible | ||
class MimetypeValidator: | ||
message = "The MIME type is not valid (it appears to be '%(mimetype)s'). Allowed MIME types are: '%(allowed_mimetypes)s'." # noqa | ||
|
||
def __init__(self, allowed_mimetypes=None): | ||
if allowed_mimetypes is not None: | ||
allowed_mimetypes = [allowed_mimetype.lower() for allowed_mimetype in allowed_mimetypes] # noqa | ||
self.allowed_mimetypes = allowed_mimetypes | ||
|
||
def __call__(self, value): | ||
if value.content_type not in self.allowed_mimetypes: | ||
raise ValidationError(self.message % | ||
{'mimetype': value.content_type, | ||
'allowed_mimetypes': ', '.join(self.allowed_mimetypes)}) | ||
|
||
|
||
@deconstructible | ||
class FileSizeValidator: | ||
message = 'The file is too large (%(size)s KB). The maximum file size is %(allowed_size)s KB.' # noqa | ||
|
||
def __init__(self, max_size=2 * 1024 * 1024): | ||
self.max_size = max_size | ||
|
||
def __call__(self, value): | ||
if len(value) >= self.max_size: | ||
raise ValidationError(self.message % | ||
{'size': round(len(value) / 1024), | ||
'allowed_size': round(self.max_size / 1024)}) | ||
|
||
|
||
class UploadFileForm(forms.Form): | ||
allowed_extensions = ['txt', 'docx'] | ||
allowed_mimetypes = ['text/plain', | ||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'] # noqa | ||
max_size = 4 * 1024 * 1024 | ||
|
||
file = forms.FileField( | ||
validators=[FileSizeValidator(max_size), | ||
FileExtensionValidator(allowed_extensions), | ||
MimetypeValidator(allowed_mimetypes)], | ||
widget=forms.ClearableFileInput(attrs={'class': 'field field-upload'}), | ||
help_text='.txt or .docx only.') | ||
captcha = CaptchaField(help_text='Sorry, no spammers.') |
Oops, something went wrong.