Skip to content

Commit

Permalink
add sitemap functionality
Browse files Browse the repository at this point in the history
* Adds a new field serverName on organisation resource.
* Implements sitemap generation for global and dedicated organisations.
* Implements dynamic generation of the robots.txt file.
* Closes rero#798.

Co-Authored-by: Bertrand Zuchuat <bertrand.zuchuat@rero.ch>
  • Loading branch information
Garfield-fr committed Jun 28, 2022
1 parent a9d5dc2 commit 2b40174
Show file tree
Hide file tree
Showing 22 changed files with 648 additions and 4 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/continuous-integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ jobs:
pip install --upgrade coveralls
- name: Run Test
run: poetry run ./scripts/test
# run: poetry run ./scripts/test
run: poetry run pytest -vvs --no-cov --disable-warnings --disable-pytest-warnings tests/ui/sitemap

- name: Code Coverage
env:
Expand Down
3 changes: 3 additions & 0 deletions data/organisations/data.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
],
"isShared": true,
"isDedicated": true,
"serverName": "fredi.hepvs.ch",
"platformName": "FREDI",
"files": [
{
Expand All @@ -64,6 +65,7 @@
"name": "Haute École Pédagogique BEJUNE",
"isShared": true,
"isDedicated": true,
"serverName": "roar.hep-bejune.ch",
"platformName": "# ROAR\n#### Répertoire ouvert et archives BEJUNE",
"files": [
{
Expand Down Expand Up @@ -113,6 +115,7 @@
],
"isShared": true,
"isDedicated": true,
"serverName": "folia.unifr.ch",
"platformName": "# FOLIA\n#### Fribourg Open Library and Archive",
"files": [
{
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ resources = "sonar.resources.cli:resources"
imports = "sonar.modules.cli.imports:imports"
fixtures = "sonar.modules.cli.fixtures:fixtures"
utils = "sonar.modules.cli.utils:utils"
sitemap = "sonar.modules.sitemap.cli.sitemap:sitemap"

[tool.poetry.plugins."invenio_base.apps"]
sonar = "sonar.ext:Sonar"
Expand All @@ -108,6 +109,7 @@ shibboleth_authenticator = "sonar.modules.shibboleth_authenticator.views.client:
theme = "sonar.theme.views:blueprint"
validation = "sonar.modules.validation.views:blueprint"
users = "sonar.modules.users.views:blueprint"
sitemap = "sonar.modules.sitemap.views:blueprint"

[tool.poetry.plugins."invenio_base.api_blueprints"]
deposits = "sonar.modules.deposits.rest:api_blueprint"
Expand Down Expand Up @@ -186,6 +188,7 @@ subdivisions = "sonar.modules.subdivisions.jsonresolvers"
[tool.poetry.plugins."invenio_celery.tasks"]
documents = "sonar.modules.documents.tasks"
stats = "sonar.modules.stats.tasks"
sitemap = "sonar.modules.sitemap.tasks"

[tool.poetry.plugins."invenio_admin.views"]
stats = "sonar.modules.stats.admin:stats_adminview"
Expand Down
5 changes: 5 additions & 0 deletions sonar/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,11 @@ def _(x):
'documents-stats': {
'task': ('sonar.modules.stats.tasks.collect_stats'),
'schedule': crontab(minute=0, hour=1), # Every day at 01:00 UTC,
},
# Sitemap
'sitemap': {
'task': 'sonar.modules.sitemap.tasks.sitemap_generate_task',
'schedule': crontab(minute=0, hour=2), # Every day at 02:00 UTC,
}
}
CELERY_BROKER_HEARTBEAT = 0
Expand Down
6 changes: 6 additions & 0 deletions sonar/config_sonar.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,17 @@

"""Specific configuration SONAR."""

# Server name of the global instance. Sitemap generation falls back to it
# when no organisation matches the requested server name.
SONAR_APP_SERVER_NAME = 'sonar.rero.ch'

SONAR_APP_API_URL = 'https://localhost:5000/api/'

SONAR_APP_ANGULAR_URL = 'https://localhost:5000/manage/'
"""Link to angular integrated app root."""

SONAR_APP_PRODUCTION_STATE = False

# Number of documents above which sitemap generation splits its output into
# several files plus an index; read by the ``sitemap generate`` command.
SONAR_APP_SITEMAP_ENTRY_SIZE = 10000

SONAR_APP_LANGUAGES_MAP = {
'aar': 'aa',
'abk': 'ab',
Expand Down
13 changes: 13 additions & 0 deletions sonar/modules/organisations/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,19 @@ def get_shared_or_dedicated_list(self):
['pid', 'name', 'isShared',
'isDedicated']).execute().hits

def get_organisation_pid_by_server_name(self, server_name):
    """Look up the pid of the organisation bound to a server name.

    :param server_name: value to match against the ``serverName`` field.
    :returns: pid of the first matching organisation, or ``None`` when the
        search returns no hit.
    """
    # Only the pid is read from the hit, so restrict the source to it.
    query = self.filter('term', serverName=server_name).source(['pid'])
    matches = query.execute().hits
    if not matches:
        return None
    return matches[0].pid

def get_dedicated_list(self):
    """Fetch the organisations flagged as dedicated.

    :returns: hits of the executed search, restricted to organisations whose
        ``isDedicated`` field is true.
    """
    dedicated_only = self.filter('term', isDedicated=True)
    response = dedicated_only.execute()
    # NOTE(review): a plain execute() presumably returns a single page of
    # results - confirm the page size covers all dedicated organisations.
    return response.hits


class OrganisationRecord(SonarRecord):
"""Organisation record class."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,29 @@
"hideExpression": "!field.model.isShared"
}
},
"serverName": {
"title": "Server name (without http)",
"description": "Organisation server name for dedicated.",
"type": "string",
"form": {
"hideExpression": "!field.model.isDedicated",
"expressionProperties": {
"templateOptions.required": "field.model.isDedicated"
},
"validation": {
"validators": {
"uniqueValueKeysInObject": {
"keys": [
"serverName"
]
}
},
"messages": {
"uniqueValueKeysInObjectMessage": "This domain name must be unique."
}
}
}
},
"allowedIps": {
"title": "Allowed IP addresses",
"description": "List of IP addresses or ranges that allow access to private files (access: embargoed or restricted), which are accessible only within the organisation. Note: the bibliographic record (metadata) is always public. Enter one rule per line.",
Expand Down Expand Up @@ -441,6 +464,7 @@
"footer",
"isShared",
"isDedicated",
"serverName",
"allowedIps",
"platformName",
"documentsCustomField1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
"isDedicated": {
"type": "boolean"
},
"serverName": {
"type": "keyword"
},
"allowedIps": {
"type": "text"
},
Expand Down
1 change: 1 addition & 0 deletions sonar/modules/organisations/marshmallow/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class OrganisationMetadataSchemaV1(StrictKeysMixin):
footer = fields.List(fields.Dict())
isShared = fields.Boolean()
isDedicated = fields.Boolean()
serverName = fields.Str(dump_only=True)
allowedIps = SanitizedUnicode()
platformName = SanitizedUnicode()
documentsCustomField1 = fields.Dict()
Expand Down
18 changes: 18 additions & 0 deletions sonar/modules/sitemap/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2022 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Sitemap Modules."""
45 changes: 45 additions & 0 deletions sonar/modules/sitemap/cli/sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2022 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Sitemap cli."""


import click
from flask import current_app
from flask.cli import with_appcontext

from sonar.modules.sitemap.sitemap import sitemap_generate


# Root command group of the sitemap CLI; sub-commands are attached to it
# with ``@sitemap.command()``.
@click.group()
def sitemap():
    """Sitemap."""

@sitemap.command()
@click.option(
    '-d', '--server-name', 'server_name', required=True, default=None)
@with_appcontext
def generate(server_name):
    """Generate a sitemap.
    :param: server_name: organisation server name.
    """
    # The docstring above is kept verbatim: click exposes it as the help
    # text of the command.
    # Fall back to 10000 entries per file when the setting is absent.
    entry_size = current_app.config.get('SONAR_APP_SITEMAP_ENTRY_SIZE', 10000)
    sitemap_generate(server_name, entry_size)
    click.secho(f'Generate sitemap for {server_name}', fg='green')
159 changes: 159 additions & 0 deletions sonar/modules/sitemap/sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
#
# Swiss Open Access Repository
# Copyright (C) 2022 RERO
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Sitemap."""

import glob
import math
import os
from datetime import datetime

from flask import current_app, url_for

from sonar.modules.documents.api import DocumentSearch
from sonar.modules.organisations.api import OrganisationSearch


def sitemap_generate(server_name, size=10000):
    """Write the sitemap file(s) of the site served under a server name.

    When an organisation is registered for ``server_name`` only its documents
    are listed and the files go to ``<static>/sitemap/<org pid>``. Otherwise
    the global server name and the default organisation are used and the
    files go to ``<static>/sitemap``.

    :param server_name: organisation server name.
    :param size: maximum number of urls per sitemap file; above it the urls
        are split over several files referenced by an index.
    """
    config = current_app.config
    default_org_pid = config.get('SONAR_APP_DEFAULT_ORGANISATION')

    # Find the organisation by server name and set view and server name
    query = DocumentSearch()
    org_pid = OrganisationSearch() \
        .get_organisation_pid_by_server_name(server_name)
    if org_pid:
        query = query.filter('term', organisation__pid=org_pid)
    else:
        server_name = config.get('SONAR_APP_SERVER_NAME')
        org_pid = default_org_pid

    # A request context bound to the target host is needed to build
    # external urls.
    with current_app.test_request_context(f'https://{server_name}'):
        file_name = 'sitemap.xml'
        path_parts = ['sitemap']
        if org_pid != default_org_pid:
            path_parts.append(org_pid)
        sitemap_folder = os.path.join(current_app.static_folder, *path_parts)
        sitemap_file = os.path.join(sitemap_folder, file_name)

        # Make sure the destination exists and holds no previous xml file
        _create_folder_or_remove_files(sitemap_folder)

        total = query.count()
        if not total:
            # No document: nothing is written
            return

        # Elasticsearch query for the current organisation
        hits = query \
            .sort({'_updated': 'asc'}) \
            .params(preserve_order=True) \
            .source(['pid', '_updated']) \
            .scan()

        files_splitted = 0
        if total > size:
            # In multiple files mode, generate the index
            files_splitted = math.ceil(total / size)
            _generate_index_sitemap(sitemap_file, org_pid, files_splitted)

        _generate_sitemap(
            sitemap_folder, sitemap_file, file_name, org_pid,
            files_splitted, hits, size)


def _create_folder_or_remove_files(sitemap_folder):
    """Ensure the sitemap folder exists and contains no xml file.

    The folder (and any missing parent) is created when absent. Every
    ``*.xml`` file directly inside it is then removed; other files and
    sub-folders are left untouched.

    :param sitemap_folder: folder path.
    """
    # ``exist_ok`` avoids the check-then-create race of a separate isdir().
    os.makedirs(sitemap_folder, exist_ok=True)

    # Escape the folder so that glob metacharacters in the path ("[", "*",
    # "?") are matched literally; only the "*.xml" part is a wildcard.
    pattern = os.path.join(glob.escape(sitemap_folder), '*.xml')
    for xml_file in glob.glob(pattern):
        os.remove(xml_file)


def _get_url_sets(hits, max, org_pid, last=True):
    """Yield one sitemap url entry per search hit.

    :param hits: iterable of hits exposing ``pid`` and ``_updated``.
    :param max: number of entries after which to stop when ``last`` is false.
    :param org_pid: organisation pid, passed as the ``view`` url argument.
    :param last: when true, never stop early and go through all ``hits``.
    :returns: generator of dicts with the keys ``loc`` and ``lastmod``.
    """
    for position, hit in enumerate(hits, start=1):
        document_url = url_for(
            'invenio_records_ui.doc',
            view=org_pid,
            pid_value=hit.pid,
            _external=True)
        updated = datetime.fromisoformat(hit._updated)
        yield {'loc': document_url, 'lastmod': updated.strftime('%Y-%m-%d')}
        # Stop right after the last entry of the set, before another element
        # is pulled from ``hits``.
        if position == max and not last:
            break


def _generate_index_sitemap(sitemap_file, org_pid, files_splitted):
    """Write the sitemap index referencing every split sitemap file.

    :param sitemap_file: path of the index file to write.
    :param org_pid: organisation pid, passed as the ``view`` url argument.
    :param files_splitted: number of sitemap files to reference, numbered
        from 1.
    """
    # Lazily built entries, one per split file
    index_entries = (
        {
            "loc": url_for(
                'sitemap.sitemap_index',
                view=org_pid,
                index=number,
                _external=True)
        }
        for number in range(1, files_splitted + 1)
    )
    template = current_app.jinja_env.get_template('sonar/sitemap_index.xml')
    template.stream(sitemaps=index_entries).dump(sitemap_file)


def _generate_sitemap(
        sitemap_folder, sitemap_file, file_name, org_pid, files_splitted, hits,
        size):
    """Render the url entries into one or several sitemap files.

    :param sitemap_folder: destination folder of the numbered files.
    :param sitemap_file: path of the file written in single file mode.
    :param file_name: file name whose part before the first dot prefixes the
        numbered files.
    :param org_pid: organisation pid.
    :param files_splitted: number of sitemap files to generate; a value of 1
        or less selects the single file mode.
    :param hits: search hits, shared by all the generated files.
    :param size: number of urls per file in multiple files mode.
    """
    template = current_app.jinja_env.get_template('sonar/sitemap.xml')

    if files_splitted <= 1:
        # Single file: every hit goes to ``sitemap_file``
        stream = template.stream(urlsets=_get_url_sets(hits, size, org_pid))
        stream.enable_buffering(100)
        stream.dump(sitemap_file)
        return

    # Multiple files: <base>_1.xml ... <base>_<files_splitted>.xml
    base_name = file_name.partition('.')[0]
    for number in range(1, files_splitted + 1):
        # The last file takes all the remaining hits
        is_last = files_splitted == number
        chunk_path = os.path.join(sitemap_folder, f'{base_name}_{number}.xml')
        stream = template.stream(
            urlsets=_get_url_sets(hits, size, org_pid, is_last))
        stream.enable_buffering(100)
        stream.dump(chunk_path)
Loading

0 comments on commit 2b40174

Please sign in to comment.