Skip to content

Commit 327d496

Browse files
committed
add file list func
1 parent 45b0f26 commit 327d496

File tree

8 files changed

+235
-16
lines changed

8 files changed

+235
-16
lines changed

MIT-LCS-TR-834.pdf

227 KB
Binary file not shown.

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ requests = "*"
1212
structlog = "*"
1313
attr = "*"
1414
click = "*"
15+
lxml = "*"
1516

1617
[requires]
1718
python_version = "3.7"

Pipfile.lock

Lines changed: 48 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Untitled.html

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

dsaps/models.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import datetime
22
from functools import partial
3+
import os
34
import operator
45
import requests
56
import time
67

78
import attr
9+
from lxml import html
810
import structlog
911

1012
op = operator.attrgetter('name')
@@ -123,6 +125,28 @@ class MetadataEntry(BaseRecord):
123125
language = Field()
124126

125127

128+
def build_file_list_local(directory, file_extension):
    """Map file names to full paths for matching files under a directory.

    Walks ``directory`` recursively (top-down) and keeps every file whose
    name ends with ``file_extension``. Keys are bare file names, so when two
    subdirectories hold a file with the same name the one visited later
    replaces the earlier entry.
    """
    return {
        name: os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory, topdown=True)
        for name in names
        if name.endswith(file_extension)
    }
137+
138+
139+
def build_file_list_remote(directory_url, file_extension):
    """Build list of files in a remote directory listing.

    Fetches the HTML page at ``directory_url`` and collects every link in it
    whose target ends with ``file_extension``.

    Args:
        directory_url: URL of the listing page. Link targets are appended to
            it verbatim, so it presumably needs to end with a slash --
            confirm against callers.
        file_extension: Suffix a link target must end with to be included.

    Returns:
        A dict mapping each matching link target to ``directory_url``
        followed by that target.
    """
    file_list = {}
    response = requests.get(directory_url)
    page = html.fromstring(response.content)
    # iterlinks() yields (element, attribute, link, pos) tuples; only the
    # link target is needed here.
    for _element, _attribute, link, _pos in page.iterlinks():
        if link.endswith(file_extension):
            file_list[link] = f'{directory_url}{link}'
    return file_list
148+
149+
126150
def elapsed_time(start_time, label):
127151
"""Calculate elapsed time."""
128152
td = datetime.timedelta(seconds=time.time() - start_time)

retrieve_remote_files.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import datetime
2+
import json
3+
import requests
4+
5+
import structlog
6+
7+
# Module-level logger used by every function in this script.
logger = structlog.get_logger()
8+
9+
# Name and email interpolated into the 'Submitted by ... on' provenance
# note prefix built at the bottom of this script.
user_full_name = 'Test Name'
10+
email = 'test@test.com'
11+
12+
# Base URL passed to post_items_to_coll at the bottom of this script.
base_url = 'http://publications.csail.mit.edu/lcs/pubs/pdf/'
13+
# NOTE(review): this module-level name shadows the builtin ``id``.
id = 'MIT-LCS-TR-834'
14+
15+
16+
def download_remote_file(base_url, id):
    """Fetch a remote file and return its raw bytes.

    The URL is built from ``base_url`` and ``id``, logged, and requested with
    GET. The response body is returned on HTTP 200; for any other status
    'File not found' is logged and None is returned.
    """
    # NOTE(review): the run of question marks looks like a placeholder for
    # the real file extension -- confirm the intended suffix.
    target = f'{base_url}{id}.??????????????????????????'
    logger.info(target)
    reply = requests.get(target)
    if reply.status_code == 200:
        return reply.content
    logger.info('File not found')
    return None
25+
26+
27+
# Sample collection metadata: one record per item, each carrying the file
# identifier used to match files and the metadata entries to post.
metadata_json = [
    {
        'file_identifier': '123',
        'metadata': [
            {'key': 'dc.title', 'value': 'Test title', 'language': 'en_US'},
        ],
    },
]
31+
32+
33+
def post_items_to_coll(self, base_url, coll_id, metadata_json, file_list):
    """Post items to specified collection.

    Args:
        self: Object providing ``header`` and ``cookies`` for the requests.
        base_url: URL prefix; ``coll_id`` and '/items' are appended to it.
        coll_id: Identifier of the target collection.
        metadata_json: Either a path to a JSON file holding a list of item
            records, or that list already parsed. Each record needs the
            keys 'file_identifier' and 'metadata'.
        file_list: Iterable of file names (e.g. the dict built by the file
            list helpers). An item is posted only if its file identifier
            occurs inside at least one of these names.

    Yields:
        Each posted item record, with 'item_url' set to the 'link' value of
        the response.
    """
    if isinstance(metadata_json, list):
        # Already-parsed records (the script in this file passes a list).
        collection_metadata = metadata_json
    else:
        # Context manager so the file handle is closed after parsing.
        with open(metadata_json) as metadata_file:
            collection_metadata = json.load(metadata_file)
    for item_metadata in collection_metadata:
        file_identifier = item_metadata['file_identifier']
        data = json.dumps(item_metadata['metadata'])
        # Skip items that have no matching file to attach.
        if not any(file_identifier in name for name in file_list):
            continue
        logger.info(file_identifier)
        post = requests.post(f'{base_url}{coll_id}/items',
                             headers=self.header, cookies=self.cookies,
                             data=data).json()
        logger.info(json.dumps(post))
        item_metadata['item_url'] = post['link']
        yield item_metadata
51+
52+
53+
def post_bitstreams_to_item(self, item_metadata, file_list):
    """Post bitstreams to specified item.

    Args:
        self: Object providing ``header_file_upload`` and ``cookies`` for
            the requests.
        item_metadata: Item record with the keys 'file_identifier' and
            'item_url'.
        file_list: Dict mapping file names to file paths. Every entry whose
            name starts with the item's file identifier is uploaded.

    Yields:
        The decoded JSON response of each bitstream upload.
    """
    file_identifier = item_metadata['file_identifier']
    item_url = item_metadata['item_url']
    for file_key, bitstream in file_list.items():
        if not file_key.startswith(file_identifier):
            continue
        # Text after the last '/' (the whole string when there is none).
        file_name = bitstream[bitstream.rfind('/') + 1:]
        # Context manager so the upload handle is closed once the request
        # has completed, instead of leaking one handle per bitstream.
        with open(bitstream, 'rb') as data:
            post = requests.post(f'{item_url}/bitstreams?name='
                                 + file_name,
                                 headers=self.header_file_upload,
                                 cookies=self.cookies, data=data).json()
        logger.info(json.dumps(post))
        yield post
67+
68+
69+
def create_prov_note(self, item_url, prov_note_prefix):
    """Build a provenance metadata entry for an item.

    The note text is the given prefix, the current UTC timestamp, the
    number of bitstreams returned by the item's '/bitstreams' endpoint, and
    one description per bitstream appended by retrieve_bitstream_metadata.
    Returns a dict with the keys 'key', 'language' and 'value'.
    """
    timestamp = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
    note_text = f'{prov_note_prefix} {timestamp} '
    response = requests.get(f'{item_url}/bitstreams', headers=self.header,
                            cookies=self.cookies)
    bitstreams = response.json()
    note_text += f'(GMT). No. of bitstreams: {len(bitstreams)}'
    for bitstream in bitstreams:
        note_text = retrieve_bitstream_metadata(bitstream, note_text)
    return {
        'key': 'dc.description.provenance',
        'language': 'en_US',
        'value': note_text,
    }
84+
85+
86+
def retrieve_bitstream_metadata(bitstream, prov_note_value):
    """Append one bitstream's name, size and checksum to a provenance note.

    Args:
        bitstream: Dict with the keys 'name', 'sizeBytes' and 'checkSum',
            where 'checkSum' is a dict holding 'value' and
            'checkSumAlgorithm'.
        prov_note_value: Note text built so far.

    Returns:
        ``prov_note_value`` extended with
        ' <name>: <size> bytes, checkSum: <checksum> (<algorithm>)'.
    """
    file_name = bitstream['name']
    size = bitstream['sizeBytes']
    checksum = bitstream['checkSum']['value']
    algorithm = bitstream['checkSum']['checkSumAlgorithm']
    description = f' {file_name}: {size} bytes, checkSum: {checksum}'
    description += f' ({algorithm})'
    return prov_note_value + description
95+
96+
97+
def add_prov_note(self, item_url, avail_prov_note, subm_prov_note):
    """Send both provenance notes to the item's metadata endpoint.

    The two notes are serialized as a JSON list and sent with PUT to
    '<item_url>/metadata'; the response object is logged. Returns None.
    """
    payload = json.dumps([avail_prov_note, subm_prov_note])
    response = requests.put(
        f'{item_url}/metadata',
        headers=self.header,
        cookies=self.cookies,
        data=payload,
    )
    logger.info(response)
103+
104+
105+
# content = build_file_list_remote(directory_url, file_extension)
106+
# print(content)
107+
# file = download_remote_file(base_url, id)
108+
# if type(file) is not None:
109+
# with open(f'{id}.pdf', 'wb') as f:
110+
# f.write(file)
111+
# logger.info(f'{f.name} downloaded')
112+
113+
114+
# directory_url = 'http://publications.csail.mit.edu/lcs/pubs/pdf/'
115+
# file_extension = 'pdf'
116+
# comm_handle = '1721.1/121959'
117+
# coll_name = 'Test Collection'
118+
# file_list = build_file_list_remote(directory_url, file_extension)
119+
# coll_id = post_coll_to_comm(base_url, comm_handle, coll_name)
120+
# Script body: post every item, attach its bitstreams, then add the two
# provenance notes ("made available" and "submitted by") to the item.
# NOTE(review): as written this section does not look runnable:
#   - coll_id and file_list are only assigned in the commented-out lines
#     above, so they are undefined here.
#   - The functions called below all declare a leading ``self`` parameter,
#     but the calls omit it, so every argument is shifted by one position.
#   - post_bitstreams_to_item is a generator function; rebinding ``item`` to
#     its result makes the following item['item_url'] lookups fail.
#   - add_prov_note has no return statement, so ``post`` is always None.
items = post_items_to_coll(base_url, coll_id, metadata_json, file_list)
121+
for item in items:
122+
item = post_bitstreams_to_item(item, file_list)
123+
avail_prov_note = create_prov_note(item['item_url'],
124+
'Made available in DSpace on')
125+
subm_prov_note = create_prov_note(item['item_url'],
126+
f'Submitted by {user_full_name} '
127+
+ f'({email}) on')
128+
post = add_prov_note(item['item_url'], avail_prov_note, subm_prov_note)

sample.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"><html><head><title>Index of /pdf</title></head><body><h1>Index of /pdf</h1><table><tr><th>Name</th><th>Last modified</th><th>Size</th></tr><tr><td><a href="999.pdf">999.pdf</a></td><td>2001-02-16 11:59 </td><td>107K</td></tr></table></body></html>

tests/test_models.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import os
2+
13
import attr
24
import pytest
35
import requests_mock
@@ -60,6 +62,7 @@ def test_filtered_item_search(client):
6062

6163

6264
def test__pop_inst(client):
65+
"""Test _pop_inst function."""
6366
class_type = models.Collection
6467
rec_obj = {'name': 'Test title', 'type': 'collection', 'items': []}
6568
rec_obj = client._pop_inst(class_type, rec_obj)
@@ -68,7 +71,36 @@ def test__pop_inst(client):
6871

6972

7073
def test__build_uuid_list(client):
    """Check that _build_uuid_list collects the uuids of child records."""
    record = {'items': [{'uuid': '1234'}]}
    uuids = client._build_uuid_list(record, 'items')
    assert '1234' in uuids
79+
80+
81+
def test_build_file_list_local():
    """Test build_file_list_local function."""
    # Local import so this test does not depend on the module import block.
    import tempfile

    file_extension = 'pdf'
    # A temporary directory avoids clashing with an existing 'test_temp'
    # in the working directory and is removed even if the call under test
    # raises.
    with tempfile.TemporaryDirectory() as directory:
        # Create an empty file and close the handle right away.
        with open(os.path.join(directory, '999.pdf'), 'w'):
            pass
        file_list = models.build_file_list_local(directory, file_extension)
    assert '999.pdf' in file_list
91+
92+
93+
def test_build_file_list_remote():
    """Check build_file_list_remote against a mocked directory listing."""
    # Minimal index page containing a single link to 999.pdf.
    listing_page = (
        '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"><html>'
        '<head><title>Index of /pdf</title></head><body><h1>Index of /'
        'pdf</h1><table><tr><th>Name</th><th>Last modified</th><th>'
        'Size</th></tr><tr><td><a href="999.pdf">999.pdf</a></td><td>'
        '2001-02-16 11:59 </td><td>107K</td></tr></table></body></html>'
    )
    directory_url = 'mock://test.com/pdfs/'
    with requests_mock.Mocker() as mocker:
        mocker.get(directory_url, text=listing_page)
        found = models.build_file_list_remote(directory_url, 'pdf')
        assert '999.pdf' in found

0 commit comments

Comments
 (0)