diff --git a/data/organisations/data.json b/data/organisations/data.json index 8baa3fa2..e5d5d48a 100644 --- a/data/organisations/data.json +++ b/data/organisations/data.json @@ -51,6 +51,7 @@ ], "isShared": true, "isDedicated": true, + "serverName": "fredi.hepvs.ch", "platformName": "FREDI", "files": [ { @@ -64,6 +65,7 @@ "name": "Haute École Pédagogique BEJUNE", "isShared": true, "isDedicated": true, + "serverName": "roar.hep-bejune.ch", "platformName": "# ROAR\n#### Répertoire ouvert et archives BEJUNE", "files": [ { @@ -113,6 +115,7 @@ ], "isShared": true, "isDedicated": true, + "serverName": "folia.unifr.ch", "platformName": "# FOLIA\n#### Fribourg Open Library and Archive", "files": [ { diff --git a/pyproject.toml b/pyproject.toml index fcbfd4c1..c0d91708 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,7 @@ resources = "sonar.resources.cli:resources" imports = "sonar.modules.cli.imports:imports" fixtures = "sonar.modules.cli.fixtures:fixtures" utils = "sonar.modules.cli.utils:utils" +sitemap = "sonar.modules.sitemap.cli.sitemap:sitemap" [tool.poetry.plugins."invenio_base.apps"] sonar = "sonar.ext:Sonar" @@ -108,6 +109,7 @@ shibboleth_authenticator = "sonar.modules.shibboleth_authenticator.views.client: theme = "sonar.theme.views:blueprint" validation = "sonar.modules.validation.views:blueprint" users = "sonar.modules.users.views:blueprint" +sitemap = "sonar.modules.sitemap.views:blueprint" [tool.poetry.plugins."invenio_base.api_blueprints"] deposits = "sonar.modules.deposits.rest:api_blueprint" diff --git a/sonar/config.py b/sonar/config.py index ac522bf5..42d5edd1 100644 --- a/sonar/config.py +++ b/sonar/config.py @@ -183,6 +183,11 @@ def _(x): 'documents-stats': { 'task': ('sonar.modules.stats.tasks.collect_stats'), 'schedule': crontab(minute=0, hour=1), # Every day at 01:00 UTC, + }, + # Sitemap + 'sitemap': { + 'task': 'sonar.modules.sitemap.tasks.sitemap_generate_task', + 'schedule': crontab(minute=0, hour=2), # Every day at 02:00 UTC, } } 
CELERY_BROKER_HEARTBEAT = 0 diff --git a/sonar/config_sonar.py b/sonar/config_sonar.py index 98cc2a06..11a78997 100644 --- a/sonar/config_sonar.py +++ b/sonar/config_sonar.py @@ -17,11 +17,17 @@ """Specific configuration SONAR.""" +SONAR_APP_SERVER_NAME = 'sonar.rero.ch' + SONAR_APP_API_URL = 'https://localhost:5000/api/' SONAR_APP_ANGULAR_URL = 'https://localhost:5000/manage/' """Link to angular integrated app root.""" +SONAR_APP_PRODUCTION_STATE = False + +SONAR_APP_SITEMAP_ENTRY_SIZE = 10000 + SONAR_APP_LANGUAGES_MAP = { 'aar': 'aa', 'abk': 'ab', diff --git a/sonar/modules/organisations/api.py b/sonar/modules/organisations/api.py index 961d4056..4a8b8489 100644 --- a/sonar/modules/organisations/api.py +++ b/sonar/modules/organisations/api.py @@ -84,6 +84,19 @@ def get_shared_or_dedicated_list(self): ['pid', 'name', 'isShared', 'isDedicated']).execute().hits + def get_organisation_pid_by_server_name(self, server_name): + """Get the organisation pid by server_name (None if no match).""" + if hits := self.filter('term', serverName=server_name) \ + .source(['pid']).execute().hits: + return hits[0].pid + + def get_dedicated_list(self): + """Get the list of dedicated organisations. + + :returns: Iterator of dedicated organisations. 
+ """ + return self.filter('term', isDedicated=True).execute().hits + class OrganisationRecord(SonarRecord): """Organisation record class.""" diff --git a/sonar/modules/organisations/jsonschemas/organisations/organisation-v1.0.0_src.json b/sonar/modules/organisations/jsonschemas/organisations/organisation-v1.0.0_src.json index 3c40e5a6..27224558 100644 --- a/sonar/modules/organisations/jsonschemas/organisations/organisation-v1.0.0_src.json +++ b/sonar/modules/organisations/jsonschemas/organisations/organisation-v1.0.0_src.json @@ -146,6 +146,29 @@ "hideExpression": "!field.model.isShared" } }, + "serverName": { + "title": "Server name (without http)", + "description": "Organisation server name for dedicated.", + "type": "string", + "form": { + "hideExpression": "!field.model.isDedicated", + "expressionProperties": { + "templateOptions.required": "field.model.isDedicated" + }, + "validation": { + "validators": { + "uniqueValueKeysInObject": { + "keys": [ + "serverName" + ] + } + }, + "messages": { + "uniqueValueKeysInObjectMessage": "This domain name must be unique." + } + } + } + }, "allowedIps": { "title": "Allowed IP addresses", "description": "List of IP addresses or ranges that allow access to private files (access: embargoed or restricted), which are accessible only within the organisation. Note: the bibliographic record (metadata) is always public. 
Enter one rule per line.", @@ -441,6 +464,7 @@ "footer", "isShared", "isDedicated", + "serverName", "allowedIps", "platformName", "documentsCustomField1", diff --git a/sonar/modules/organisations/mappings/v7/organisations/organisation-v1.0.0.json b/sonar/modules/organisations/mappings/v7/organisations/organisation-v1.0.0.json index 892932f6..30edaa79 100644 --- a/sonar/modules/organisations/mappings/v7/organisations/organisation-v1.0.0.json +++ b/sonar/modules/organisations/mappings/v7/organisations/organisation-v1.0.0.json @@ -54,6 +54,9 @@ "isDedicated": { "type": "boolean" }, + "serverName": { + "type": "keyword" + }, "allowedIps": { "type": "text" }, diff --git a/sonar/modules/organisations/marshmallow/json.py b/sonar/modules/organisations/marshmallow/json.py index 0624b608..e501d9e8 100644 --- a/sonar/modules/organisations/marshmallow/json.py +++ b/sonar/modules/organisations/marshmallow/json.py @@ -47,6 +47,7 @@ class OrganisationMetadataSchemaV1(StrictKeysMixin): footer = fields.List(fields.Dict()) isShared = fields.Boolean() isDedicated = fields.Boolean() + serverName = fields.Str(dump_only=True) allowedIps = SanitizedUnicode() platformName = SanitizedUnicode() documentsCustomField1 = fields.Dict() diff --git a/sonar/modules/sitemap/__init__.py b/sonar/modules/sitemap/__init__.py new file mode 100644 index 00000000..bd8b87fa --- /dev/null +++ b/sonar/modules/sitemap/__init__.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2022 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Sitemap Modules.""" diff --git a/sonar/modules/sitemap/cli/sitemap.py b/sonar/modules/sitemap/cli/sitemap.py new file mode 100644 index 00000000..4bc4f395 --- /dev/null +++ b/sonar/modules/sitemap/cli/sitemap.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2022 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Sitemap cli.""" + + +import click +from flask import current_app +from flask.cli import with_appcontext + +from sonar.modules.sitemap.sitemap import sitemap_generate + + +@click.group() +def sitemap(): + """Sitemap.""" + +@sitemap.command() +@click.option( + '-d', '--server-name', 'server_name', required=True, default=None) +@with_appcontext +def generate(server_name): + """Generate a sitemap. + + :param: server_name: organisation server name. 
+ """ + sitemap_generate( + server_name, + current_app.config.get('SONAR_APP_SITEMAP_ENTRY_SIZE', 10000) + ) + click.secho(f'Generate sitemap for {server_name}', fg='green') diff --git a/sonar/modules/sitemap/sitemap.py b/sonar/modules/sitemap/sitemap.py new file mode 100644 index 00000000..6fb33695 --- /dev/null +++ b/sonar/modules/sitemap/sitemap.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2022 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Sitemap.""" + +import glob +import math +import os +from datetime import datetime + +from flask import current_app, url_for + +from sonar.modules.documents.api import DocumentSearch +from sonar.modules.organisations.api import OrganisationSearch + + +def sitemap_generate(server_name, size=10000): + """Generate a sitemap. + + :param: server_name: organisation server name. + :param: size: size of the set of sitemap urls. 
+ """ + # Find Organisation by server name and set view and server name + search = DocumentSearch() + if org_pid := OrganisationSearch() \ + .get_organisation_pid_by_server_name(server_name): + search = search.filter('term', organisation__pid=org_pid) + else: + server_name = current_app.config.get('SONAR_APP_SERVER_NAME') + org_pid = current_app.config.get('SONAR_APP_DEFAULT_ORGANISATION') + + with current_app.test_request_context(f'https://{server_name}'): + files_splitted = 0 + file_name = 'sitemap.xml' + folder = ['sitemap'] + if org_pid != current_app.config.get('SONAR_APP_DEFAULT_ORGANISATION'): + folder.append(org_pid) + sitemap_folder = os.path.join(current_app.static_folder, *folder) + sitemap_file = os.path.join(sitemap_folder, file_name) + # Create a destination directory + _create_folder_or_remove_files(sitemap_folder) + + if count := search.count(): + # Elasticsearch query for current organisation + hits = search \ + .sort({'_updated': 'asc'}) \ + .params(preserve_order=True) \ + .source(['pid', '_updated']) \ + .scan() + + if count > size: + # In multiple files mode, generate the index + files_splitted = math.ceil(count / size) + _generate_index_sitemap(sitemap_file, org_pid, files_splitted) + + _generate_sitemap( + sitemap_folder, sitemap_file, file_name, org_pid, + files_splitted, hits, size) + + +def _create_folder_or_remove_files(sitemap_folder): + """Create a folder or remove all files. + + :param: sitemap_folder: folder path. 
+ """ + # Create a destination directory + if not os.path.isdir(sitemap_folder): + # Recursive + os.makedirs(sitemap_folder) + + # Remove all XML files from the current folder + for file in glob.glob(f'{sitemap_folder}/*.xml'): + os.remove(file) + + +def _get_url_sets(hits, max, org_pid, last=True): + """Get url sets.""" + n = 0 + for hit in hits: + yield { + 'loc': url_for( + 'invenio_records_ui.doc', + view=org_pid, + pid_value=hit.pid, + _external=True), + 'lastmod': datetime.fromisoformat(hit._updated) \ + .strftime('%Y-%m-%d') + } + n += 1 + if not last and n == max: + break + + +def _generate_index_sitemap(sitemap_file, org_pid, files_splitted): + """Generate sitemap index for more than one file of urls. + + :param: sitemap_file: sitemap file path. + :param: org_pid: organisation pid. + :param: files_splitted: Number of indexes to generate. + """ + def get_splitted_files(): + for i in range(1, files_splitted+1): + yield { + "loc": url_for( + 'sitemap.sitemap_index', + view=org_pid, + index=i, + _external=True) + } + template = current_app.jinja_env.get_template( + 'sonar/sitemap_index.xml') + rv = template.stream(sitemaps=get_splitted_files()) + rv.dump(sitemap_file) + + +def _generate_sitemap( + sitemap_folder, sitemap_file, file_name, org_pid, files_splitted, hits, + size): + """Generate the sitemap file(s). + + :param: sitemap_folder: destination folder. + :param: sitemap_file: file path. + :param: file_name: file name. + :param: org_pid: Organisation pid. + :param: files_splitted: Number of sitemap files to generate. + :param: hits: search ES query scan. + :param: size: Size of the set. 
+ """ + # Get the template + template = current_app.jinja_env.get_template('sonar/sitemap.xml') + + if files_splitted > 1: + # Multiple files + file = file_name.split('.') + for i in range(1, files_splitted+1): + file_path = os.path.join(sitemap_folder, f'{file[0]}_{i}.xml') + rv = template.stream( + urlsets=_get_url_sets(hits, size, org_pid, files_splitted == i)) + rv.enable_buffering(100) + rv.dump(file_path) + else: + # Single file + rv = template.stream(urlsets=_get_url_sets(hits, size, org_pid)) + rv.enable_buffering(100) + rv.dump(sitemap_file) diff --git a/sonar/modules/sitemap/tasks.py b/sonar/modules/sitemap/tasks.py new file mode 100644 index 00000000..5baa719d --- /dev/null +++ b/sonar/modules/sitemap/tasks.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2022 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Sitemap tasks.""" + +from celery import shared_task +from flask import current_app + +from sonar.modules.organisations.api import OrganisationSearch +from sonar.modules.sitemap.sitemap import sitemap_generate + + +@shared_task(ignore_result=True) +def sitemap_generate_task(): + """Generate sitemap. + + Used as celery task. "ignore_result" flag means that we don't want to + get the status and/or the result of the task, execution is faster. 
+ """ + size = current_app.config.get('SONAR_APP_SITEMAP_ENTRY_SIZE', 10000) + # Generate dedicated organisations sitemaps + orgs = OrganisationSearch().get_dedicated_list() + for org in orgs: + if server_name := getattr(org, 'serverName', None): + sitemap_generate(server_name, size) + + # Generate global sitemap + sitemap_generate( + current_app.config.get('SONAR_APP_DEFAULT_ORGANISATION'), size) diff --git a/sonar/modules/sitemap/templates/sonar/sitemap.xml b/sonar/modules/sitemap/templates/sonar/sitemap.xml new file mode 100644 index 00000000..e47c6656 --- /dev/null +++ b/sonar/modules/sitemap/templates/sonar/sitemap.xml @@ -0,0 +1,9 @@ +<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + {%- for urlset in urlsets %} + <url> + <loc>{{ urlset.loc }}</loc> + <lastmod>{{ urlset.lastmod }}</lastmod> + </url> + {%- endfor %} +</urlset> diff --git a/sonar/modules/sitemap/templates/sonar/sitemap_index.xml b/sonar/modules/sitemap/templates/sonar/sitemap_index.xml new file mode 100644 index 00000000..4d1f022f --- /dev/null +++ b/sonar/modules/sitemap/templates/sonar/sitemap_index.xml @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + {%- for sitemap in sitemaps %} + <sitemap> + <loc>{{ sitemap.loc }}</loc> + </sitemap> + {%- endfor %} +</sitemapindex> diff --git a/sonar/modules/sitemap/views.py b/sonar/modules/sitemap/views.py new file mode 100644 index 00000000..8a510dfa --- /dev/null +++ b/sonar/modules/sitemap/views.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2022 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +"""Sitemap views.""" + +import os + +from flask import Blueprint, Response, abort, current_app + +blueprint = Blueprint('sitemap', + __name__, + template_folder='templates', + url_prefix='/') + + +@blueprint.route('/<view>/sitemap.xml') +def sitemap(view): + """Get the sitemap file.""" + return response_file([view, 'sitemap.xml']) + + +@blueprint.route('/<view>/sitemap_<int:index>.xml') +def sitemap_index(view, index): + """Get a numbered sitemap file referenced by the sitemap index.""" + return response_file([view, f'sitemap_{index}.xml']) + + +def response_file(file_path): + """Generate the file path and load file.""" + def stream_file(): + """Stream file.""" + with open(sitemap_file, 'r', encoding='utf-8', buffering=100000) as f: + yield from f + + # The default organisation's files are generated directly in the sitemap + # folder (no per-organisation sub-folder), so drop the view part. + default_view = current_app.config.get('SONAR_APP_DEFAULT_ORGANISATION') + if file_path and file_path[0] == default_view: + file_path = file_path[1:] + sitemap_folder = os.path.join(current_app.static_folder, 'sitemap') + sitemap_file = os.path.join(sitemap_folder, *file_path) + if not os.path.isdir(sitemap_folder) or not os.path.isfile(sitemap_file): + abort(404) + return Response(stream_file(), mimetype='application/xml') diff --git a/sonar/theme/templates/sonar/robot.txt b/sonar/theme/templates/sonar/robot.txt new file mode 100644 index 00000000..89958e75 --- /dev/null +++ b/sonar/theme/templates/sonar/robot.txt @@ -0,0 +1,9 @@ +{%- if not state -%} +User-agent: * +Disallow: / +{%- else -%} +User-agent: * +Allow: / + +Sitemap: {{ sitemap }} +{% endif %} diff --git a/sonar/theme/views.py b/sonar/theme/views.py index e8a1a433..c234fbed 100644 --- a/sonar/theme/views.py +++ b/sonar/theme/views.py @@ -26,6 +26,7 @@ import re from datetime import datetime +from urllib.parse import urlparse import dateutil.parser import pytz @@ -44,6 +45,7 @@ from sonar.modules.deposits.permissions import DepositPermission from sonar.modules.documents.api import DocumentRecord from sonar.modules.documents.permissions import DocumentPermission +from sonar.modules.organisations.api import OrganisationSearch from sonar.modules.organisations.permissions import OrganisationPermission from sonar.modules.permissions import can_access_manage_view 
from sonar.modules.subdivisions.permissions import \ @@ -65,6 +67,35 @@ def init_view(): current_menu.submenu('settings').submenu('admin').hide() +# Crawlers only request /robots.txt; /robot.txt is kept for compatibility. +@blueprint.route('/robots.txt') +@blueprint.route('/robot.txt') +def robot_txt(): + """Generate dynamically robot.txt.""" + if not current_app.config.get('SONAR_APP_PRODUCTION_STATE'): + # If we are not in production status, we disable all robots + return current_app.response_class( + response=render_template('sonar/robot.txt', state=False), + status=200, + mimetype='text/plain') + url_data = urlparse(request.url) + scheme = url_data.scheme + server_name = url_data.netloc.split(':')[0] + if org_pid := OrganisationSearch() \ + .get_organisation_pid_by_server_name(server_name): + sitemap = f'{scheme}://{server_name}/{org_pid}/sitemap.xml' + else: + view = current_app.config.get('SONAR_APP_DEFAULT_ORGANISATION') + sitemap = f'{scheme}://{url_data.netloc}/{view}/sitemap.xml' + return current_app.response_class( + response=render_template( + 'sonar/robot.txt', + state=current_app.config.get('SONAR_APP_PRODUCTION_STATE'), + sitemap=sitemap), + status=200, + mimetype='text/plain') + + @blueprint.route('/users/profile') @blueprint.route('/users/profile/') @login_required diff --git a/tests/conftest.py b/tests/conftest.py index 31952e0e..da7120a7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -142,6 +142,10 @@ def app_config(app_config): app_config['CELERY_BROKER_URL'] = 'memory://' app_config['CELERY_TASK_ALWAYS_EAGER'] = True app_config['CELERY_TASK_EAGER_PROPAGATES'] = True + + # Config + app_config['SONAR_APP_SERVER_NAME'] = 'sonar.rero.ch' + app_config['SONAR_APP_DEFAULT_ORGANISATION'] = 'global' return app_config @@ -472,9 +476,8 @@ def _make_document(organisation='org', with_file=False, pid=None): if organisation: make_organisation(organisation) document_json['organisation'] = [{ - '$ref': - 'https://sonar.ch/api/organisations/org' - }] + '$ref': 'https://sonar.ch/api/organisations/org'}] + if pid: document_json['pid'] = pid diff --git 
a/tests/ui/sitemap/test_sitemap.py b/tests/ui/sitemap/test_sitemap.py new file mode 100644 index 00000000..433d93a5 --- /dev/null +++ b/tests/ui/sitemap/test_sitemap.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2022 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Test sitemap.""" + +import os +import shutil +import xml.etree.ElementTree as ET +from datetime import date + +from sonar.modules.documents.api import DocumentRecord +from sonar.modules.sitemap.sitemap import sitemap_generate + + +def test_sitemap(app, db, organisation, document): + """Test sitemap generator.""" + # Set current directory on static_folder config + path = os.path.dirname(os.path.realpath(__file__)) + app.static_folder = path + # namespace of sitemap + namespace = '{http://www.sitemaps.org/schemas/sitemap/0.9}' + + sitemap_generate('org.domain.com', 10) + + sitemap_file = os.path.join(path, 'sitemap', 'sitemap.xml') + assert os.path.isfile(sitemap_file) + + # Check the data in the XML file + tree = ET.parse(sitemap_file) + url = tree.findall(f'{namespace}url')[0] + assert 'https://sonar.rero.ch/global/documents/1' == \ + url.find(f'{namespace}loc').text + assert date.today().strftime("%Y-%m-%d") == \ + url.find(f'{namespace}lastmod').text + + + # ------- test for a dedicated organisation + organisation['isDedicated'] = True + organisation['serverName'] = 'org.domain.com' + organisation.commit() + 
organisation.reindex() + db.session.commit() + + sitemap_generate('org.domain.com', 10) + + sitemap_file = os.path.join( + path, 'sitemap', organisation['pid'], 'sitemap.xml') + assert os.path.isfile(sitemap_file) + + # Control data into the xml file + tree = ET.parse(sitemap_file) + url = tree.findall(f'{namespace}url')[0] + assert 'https://org.domain.com/org/documents/1' == \ + url.find(f'{namespace}loc').text + assert date.today().strftime("%Y-%m-%d") == \ + url.find(f'{namespace}lastmod').text + + + # ------- Generate multiple files with index sitemap + document.pop('pid', None) + document.pop('_oai', None) + document['identifiedBy'] = [{ + 'value': 'R003415714', + 'type': 'bf:Local', + 'source': 'RERO' + }] + doc = DocumentRecord.create(document) + doc.reindex() + db.session.commit() + + sitemap_generate('org.domain.com', 1) + + sitemap_index = os.path.join( + path, 'sitemap', organisation['pid'], 'sitemap.xml') + assert os.path.isfile(sitemap_index) + + tree = ET.parse(sitemap_index) + sitemaps = tree.findall(f'{namespace}sitemap') + for n, sitemap in enumerate(sitemaps, start=1): + assert f'https://org.domain.com/org/sitemap_{n}.xml' == \ + sitemap.find(f'{namespace}loc').text + + for i in range(1, 3): + sitemap_file = os.path.join( + path, 'sitemap', organisation['pid'], f'sitemap_{i}.xml') + assert os.path.isfile(sitemap_file) + tree = ET.parse(sitemap_file) + url = tree.findall(f'{namespace}url')[0] + assert f'https://org.domain.com/org/documents/{i}' == \ + url.find(f'{namespace}loc').text + assert date.today().strftime("%Y-%m-%d") == \ + url.find(f'{namespace}lastmod').text + + # Remove folder after test + sitemap_folder = os.path.join(path, 'sitemap') + if os.path.isdir(sitemap_folder): + shutil.rmtree(sitemap_folder, ignore_errors=True) diff --git a/tests/ui/sitemap/test_sitemap_views.py b/tests/ui/sitemap/test_sitemap_views.py new file mode 100644 index 00000000..058a53f5 --- /dev/null +++ b/tests/ui/sitemap/test_sitemap_views.py @@ -0,0 +1,73 @@ +# 
-*- coding: utf-8 -*- +# +# Swiss Open Access Repository +# Copyright (C) 2022 RERO +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +"""Test sitemap views.""" + +import mock +import pytest +from flask import Response, url_for + + +def response_urlset(): + """Return a mocked sitemap urlset response.""" + return Response(""" + <?xml version="1.0" encoding="UTF-8"?> + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>http://www.domain.com/document/1</loc> + <lastmod>2022-01-01</lastmod> + </url> + </urlset> + """) + + +def response_sitemapindex(): + """Return a mocked sitemap index response.""" + return Response(""" + <?xml version="1.0" encoding="UTF-8"?> + <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <sitemap> + <loc>http://www.domain.com/sitemap_1.xml</loc> + </sitemap> + </sitemapindex> + """) + + +def test_sitemap_file_error(app, client): + """File Not Found exception if the file doesn't exist.""" + with pytest.raises(Exception): + url = url_for('sitemap.sitemap', view='global') + client.get(url) + + + +@mock.patch('sonar.modules.sitemap.views.response_file', + mock.MagicMock(return_value=response_urlset())) +def test_sitemap_file(app, client): + """Test entrypoint for sitemap urlset.""" + url = url_for('sitemap.sitemap', view='global') + res = client.get(url) + assert res.status_code == 200 + + +@mock.patch('sonar.modules.sitemap.views.response_file', + mock.MagicMock(return_value=response_sitemapindex())) +def test_sitemap_index_file(app, client): + """Test entrypoint for sitemap index.""" + url = url_for('sitemap.sitemap_index', view='global', index=1) + res = client.get(url) + assert res.status_code == 200 diff --git a/tests/ui/test_views.py b/tests/ui/test_views.py index 71426f47..9f81b766 100644 --- a/tests/ui/test_views.py +++ b/tests/ui/test_views.py @@ 
-35,6 +35,22 @@ def test_error(client): assert client.get(url_for('sonar.error')) +def test_robot_txt(app): + """Test the robot file.""" + with app.test_client() as client: + url = url_for('sonar.robot_txt') + app.config.update(SONAR_APP_PRODUCTION_STATE=False) + res = client.get(url) + assert res.status_code == 200 + assert b'User-agent: *\nDisallow: /' in res.data + + app.config.update(SONAR_APP_PRODUCTION_STATE=True) + res = client.get(url) + assert res.status_code == 200 + assert b'User-agent: *\nAllow: /\n\n'\ + b'Sitemap: http://localhost/global/sitemap.xml' in res.data + + def test_admin_record_page(app, admin, user_without_role): """Test admin page redirection to defaults.""" with app.test_client() as client: