# Notebook objectives

authors: Thomas Rosnet, Alban Gaignard

Aim: prototyping the FAIR-signposting recommendations to ease the findability of web landing page metadata : https://signposting.org/FAIR/apples-hackathon/

In [88]:
import sys
parentdir = ".."
sys.path.insert(0, parentdir)

import requests
from os import path
from tqdm.notebook import tqdm
import pandas as pd
import time

from metrics.WebResource import WebResource
from metrics.FAIRMetricsFactory import FAIRMetricsFactory
from metrics.AbstractFAIRMetrics import AbstractFAIRMetrics

# Retrieving signposting metadata : 

- [ ] `cite-as`, 
- [ ] `describedby`, 
- [ ] `item`, 
- [ ] `linkset`

In [89]:
pangaea_LP = WebResource("https://doi.pangaea.de/10.1594/PANGAEA.932827")

http://schema.org/


In [90]:
print(pangaea_LP.get_headers())

{'Server': 'nginx/1.21.6', 'Date': 'Tue, 22 Mar 2022 18:09:44 GMT', 'Content-Type': 'text/html;charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Set-Cookie': 'pansessid=d22684ac8c343ff68765763ba4366bf2; Path=/; Domain=.pangaea.de; Secure; HttpOnly', 'Vary': 'Cookie, Authorization, Accept, Accept-Encoding', 'Cache-Control': 'public', 'X-CID': 'd22684ac8c343ff68765763ba4366bf2', 'Content-Encoding': 'gzip', 'Link': '<https://doi.org/10.1594/PANGAEA.932827>;rel="cite-as", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=metadata_jsonld>;rel="describedby";type="application/ld+json", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_ris>;rel="describedby";type="application/x-research-info-systems", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_bibtex>;rel="describedby";type="application/x-bibtex", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=zip>;rel="item";type="application/zip", <https://orcid.org/0000-0001-5427-0151>;r

In [91]:
print(pangaea_LP.get_html_requests())



In [92]:
print(pangaea_LP.get_html_selenium())

<html lang="en"><head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1, user-scalable=no">
<link rel="stylesheet" href="//fonts.googleapis.com/css?family=Open+Sans:400,600,400italic,700,700italic,600italic,300,300italic,800,800italic">
<link rel="stylesheet" href="//www.pangaea.de/assets/v.623cd1d08e15a2ba35cc9bdc15d50a4a/bootstrap-24col/css/bootstrap.min.css">
<link rel="stylesheet" href="//www.pangaea.de/assets/v.623cd1d08e15a2ba35cc9bdc15d50a4a/css/pangaea.css">
<!--[if lte IE 9]>
<style>#topics-pulldown-wrapper label:after { display:none; }</style>
<![endif]-->
<link rel="shortcut icon" href="//www.pangaea.de/assets/v.623cd1d08e15a2ba35cc9bdc15d50a4a/favicon.ico">
<link rel="icon" href="//www.pangaea.de/assets/v.623cd1d08e15a2ba35cc9bdc15d50a4a/favicon.ico" type="image/vnd.microsoft.icon">
<link rel="image_src" type="image/png" href="https://www.pangaea.de/assets/social-icons/pangaea-share.png">
<meta prope

# 1. Checking the headers for "links"

In [93]:
def retrieve_links_from_headers(landing_page):
    links_col = []
    decsribed_by_col = []
    cite_as_col = []
    item_col = []
    headers = landing_page.get_headers()
    for k in headers.keys():
        if 'link' in k.lower() : 
            l_header = headers[k]
            print(l_header)
            links = l_header.split(',')
            for link in links : 
                #print("----")
                links_col.append(link)
                tokens = link.split(';')
                #print(tokens)
                for t in tokens :
                    if 'rel="describedby"' in t:
                        decsribed_by_col.append(link)
                    elif 'rel="item"' in t:
                        item_col.append(link)
                    elif 'rel="cite-as"' in t:
                        cite_as_col.append(link)
            
    return cite_as_col, decsribed_by_col, item_col

In [94]:
cite_as, decsribed_by, items = retrieve_links_from_headers(pangaea_LP)

<https://doi.org/10.1594/PANGAEA.932827>;rel="cite-as", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=metadata_jsonld>;rel="describedby";type="application/ld+json", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_ris>;rel="describedby";type="application/x-research-info-systems", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_bibtex>;rel="describedby";type="application/x-bibtex", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=zip>;rel="item";type="application/zip", <https://orcid.org/0000-0001-5427-0151>;rel="author", <https://orcid.org/0000-0001-7322-5145>;rel="author", <https://orcid.org/0000-0003-0967-8945>;rel="author"


In [95]:
cite_as

['<https://doi.org/10.1594/PANGAEA.932827>;rel="cite-as"']

In [96]:
decsribed_by

[' <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=metadata_jsonld>;rel="describedby";type="application/ld+json"',
 ' <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_ris>;rel="describedby";type="application/x-research-info-systems"',
 ' <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_bibtex>;rel="describedby";type="application/x-bibtex"']

In [97]:
items

[' <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=zip>;rel="item";type="application/zip"']

# 2. Checkink the HTML content for "links"

In [98]:
from lxml import html

tree = html.fromstring(pangaea_LP.get_html_selenium())

txt = tree.xpath('//link[contains(@rel, "describedby")]')
for t in txt:
    print(t.values())
    
txt = tree.xpath('//link[contains(@rel, "cite-as")]')
for t in txt:
    print(t.values())

txt = tree.xpath('//link[contains(@rel, "item")]')
for t in txt:
    print(t.values())

#//a[@rel]

['describedby', 'https://doi.pangaea.de/10.1594/PANGAEA.932827?format=metadata_jsonld', 'application/ld+json']
['describedby', 'https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_ris', 'application/x-research-info-systems']
['describedby', 'https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_bibtex', 'application/x-bibtex']
['cite-as', 'https://doi.org/10.1594/PANGAEA.932827']
['item', 'https://doi.pangaea.de/10.1594/PANGAEA.932827?format=zip', 'application/zip']


In [215]:
http_link_header = """
<https://doi.org/10.1594/PANGAEA.932827>;rel="cite-as", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=metadata_jsonld>;rel="describedby";type="application/ld+json", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_ris>;rel="describedby";type="application/x-research-info-systems", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=citation_bibtex>;rel="describedby";type="application/x-bibtex", <https://doi.pangaea.de/10.1594/PANGAEA.932827?format=zip>;rel="item";type="application/zip", <https://orcid.org/0000-0001-5427-0151>;rel="author", <https://orcid.org/0000-0001-7322-5145>;rel="author", <https://orcid.org/0000-0003-0967-8945>;rel="author"
""" 

from pyparsing import Word, alphas, alphanums, Group, Combine, Forward, ZeroOrMore, Optional, oneOf, QuotedString, Suppress
import pyparsing as pp
from pyparsing import pyparsing_common as ppc

"""
The ABNF for the field value is:

     Link       = #link-value
     link-value = "<" URI-Reference ">" *( OWS ";" OWS link-param )
     link-param = token BWS [ "=" BWS ( token / quoted-string ) ]
"""

#Definitions of literals
lt = Suppress("<")
gt = Suppress(">")
equals = Suppress("=")
comma = Suppress(",")
semicol = Suppress(";")
url = ppc.url.setName("a_url")

#token = Word(alphas+"_"+"-", alphanums+"_"+"-")
tchar = "!#$%&*+-.^_`|~" + pp.nums + pp.alphas
token = Word(tchar).setName("token")

quoted_token = Suppress('"') + token + Suppress('"') \
        ^ Suppress("'") + token + Suppress("'") \
        ^ Suppress("<") + token + Suppress(">") 

link_param = token + Optional(equals + (token ^ quoted_token)) 

link_value = lt + url + gt + semicol + ZeroOrMore(semicol + link_param)

In [216]:
#field.parseString("<mqsd>")
#field.parseString("'mqsd'")
field.parseString('"mqsd"')

ParseResults(['mqsd'], {})

In [217]:
#ppc.url.runTests('http://wikipedia.org')
url.parseString('http://wikipedia.org')

ParseResults(['http://wikipedia.org'], {'scheme': 'http', 'auth': None, 'host': 'wikipedia.org', 'port': None, 'path': None, 'query': None, 'fragment': None})

In [218]:
lp = link_param.parseString("rel = cite-as")
print(lp)

lp = link_param.parseString('rel =" cite-as "')
print(lp)

lp = link_param.parseString("rel='cite-as'")
print(lp)

['rel', 'cite-as']
['rel', 'cite-as']
['rel', 'cite-as']


In [238]:
#print(type(url))

u = url 
u.parseString('https://doi.org/10.1594/PANGAEA.932827')


try:
    u.parseString('https://doi.org/10.1594/PANGAEA.932827;')
except pp.ParseException as pe:
    print(pe.explain(depth=8))
