Skip to content

Commit

Permalink
examiner: Replace md5 with sha1 in PDF hasher
Browse files Browse the repository at this point in the history
  • Loading branch information
JakobGM committed Dec 15, 2018
1 parent e768ec0 commit d71a418
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 14 deletions.
25 changes: 25 additions & 0 deletions examiner/migrations/0007_auto_20181215_2305.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.2 on 2018-12-15 22:05
from __future__ import unicode_literals

import django.core.validators
from django.db import migrations, models


class Migration(migrations.Migration):
    """Replace the ``md5_hash`` column on ``scrapedpdf`` with ``sha1_hash``.

    The old field is dropped outright and the new one is added as a
    nullable, unique 40-character column guarded by a lowercase-hex
    regex validator.
    """

    dependencies = [
        ('examiner', '0006_remove_scrapedpdf_filetype'),
    ]

    operations = [
        # Drop the MD5 column; its values are not carried over by this
        # migration.
        migrations.RemoveField(
            model_name='scrapedpdf',
            name='md5_hash',
        ),
        # Add the SHA1 column as nullable.
        # NOTE(review): any rows already in the table end up with a NULL
        # sha1_hash after this step -- confirm they are backfilled (or the
        # table is empty) before a later migration makes the column
        # non-nullable.
        migrations.AddField(
            model_name='scrapedpdf',
            name='sha1_hash',
            field=models.CharField(
                help_text='Unik sha1 hash relativt til filinnhold.',
                max_length=40,
                null=True,
                unique=True,
                validators=[
                    django.core.validators.RegexValidator(
                        message='Not a valid SHA1 hash string.',
                        regex='^[0-9a-f]{40}$',
                    ),
                ],
            ),
        ),
    ]
21 changes: 21 additions & 0 deletions examiner/migrations/0008_auto_20181215_2306.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.2 on 2018-12-15 22:06
from __future__ import unicode_literals

import django.core.validators
from django.db import migrations, models


class Migration(migrations.Migration):
    """Redefine ``scrapedpdf.sha1_hash`` without ``null=True``.

    The field keeps its length, uniqueness, help text and regex
    validator; only the explicit ``null=True`` from the previous
    migration is no longer passed.
    """

    dependencies = [
        ('examiner', '0007_auto_20181215_2305'),
    ]

    operations = [
        # NOTE(review): this presumably fails on rows whose sha1_hash is
        # still NULL -- confirm existing rows are populated beforehand.
        migrations.AlterField(
            model_name='scrapedpdf',
            name='sha1_hash',
            field=models.CharField(
                help_text='Unik sha1 hash relativt til filinnhold.',
                max_length=40,
                unique=True,
                validators=[
                    django.core.validators.RegexValidator(
                        message='Not a valid SHA1 hash string.',
                        regex='^[0-9a-f]{40}$',
                    ),
                ],
            ),
        ),
    ]
20 changes: 12 additions & 8 deletions examiner/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from django.contrib.auth.models import User
from django.core.files.base import ContentFile
from django.core.validators import URLValidator
from django.core.validators import RegexValidator, URLValidator
from django.db import models
from django.utils import timezone

Expand All @@ -29,11 +29,15 @@ class ScrapedPdf(models.Model):
default=None,
help_text=_('Filinnhold i rent tekstformat.'),
)
md5_hash = models.CharField(
max_length=32,
sha1_hash = models.CharField(
max_length=40,
unique=True,
null=False,
help_text=_('Unik md5 hash relativt til filinnhold.'),
help_text=_('Unik sha1 hash relativt til filinnhold.'),
validators=[RegexValidator(
regex='^[0-9a-f]{40}$',
message='Not a valid SHA1 hash string.',
)],
)
created_at = models.DateTimeField(editable=False)
updated_at = models.DateTimeField()
Expand Down Expand Up @@ -182,14 +186,14 @@ def backup_file(self) -> None:
self.save()
return

md5 = hashlib.md5(response.content).hexdigest()
sha1 = hashlib.sha1(response.content).hexdigest()
content_file = ContentFile(response.content)

try:
file_backup = ScrapedPdf.objects.get(md5_hash=md5)
file_backup = ScrapedPdf.objects.get(sha1_hash=sha1)
except ScrapedPdf.DoesNotExist:
file_backup = ScrapedPdf(md5_hash=md5)
file_backup.file.save(name=md5, content=content_file)
file_backup = ScrapedPdf(sha1_hash=sha1)
file_backup.file.save(name=sha1, content=content_file)
file_backup.save()

self.scraped_pdf = file_backup
Expand Down
12 changes: 6 additions & 6 deletions examiner/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,12 @@ def test_file_backup(tmpdir, settings):

# The downloaded file should be hashed and the result stored
assert exam_url.dead_link is False
expected_md5_hash = 'adc7a2fa473be1b091f7324aa4067c8a'
expected_sha1_hash = '4dc828ea76ab618be6d72d135af13c40de3b9ce6'
file_backup = exam_url.scraped_pdf
assert file_backup.md5_hash == expected_md5_hash
assert file_backup.sha1_hash == expected_sha1_hash

# And the stored file should be named according to its hash
assert file_backup.file.name == 'examiner/FileBackup/' + expected_md5_hash
assert file_backup.file.name == 'examiner/FileBackup/' + expected_sha1_hash

# The directory for file backups should now contain one file
backup_directory = Path(settings.MEDIA_ROOT / 'examiner/FileBackup/')
Expand Down Expand Up @@ -238,9 +238,9 @@ def test_string_content():
"""FileBackup PDFs should be parsable."""
pdf_path = Path(__file__).parent / 'data' / 'matmod_exam_des_2017.pdf'
pdf_content = ContentFile(pdf_path.read_bytes())
md5 = 'a8c5b61d8e750db6e719937a251e93b9'
pdf_backup = ScrapedPdf(md5_hash=md5)
pdf_backup.file.save(md5, content=pdf_content)
sha1 = 'a8c5b61d8e750db6e719937a251e93b9'
pdf_backup = ScrapedPdf(sha1_hash=sha1)
pdf_backup.file.save(sha1, content=pdf_content)
pdf_backup.read_text()
pdf_backup.save()

Expand Down

0 comments on commit d71a418

Please sign in to comment.