In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import re
import sys
import string
import csv
from os.path import splitext
from os.path import basename
import unicodedata
from binascii import unhexlify
from string import Template 
import datetime

In [2]:
# Root URL of the registry; passed to authenticate() and read by post_file()
# when building the upload URL.
base_url = 'http://test.wmocodes.info'

In [3]:
def notPrintable(text):
    """Tell whether ``text`` holds a character from a Unicode "C*" category.

    The first offending character is reported on stdout (hex code point and
    1-based position) before returning.

    Returns:
        True as soon as such a character is found, False if none is present.
    """
    for position, char in enumerate(text, start=1):
        # Major class "C" covers control, format, surrogate, private-use and
        # unassigned code points.
        if unicodedata.category(char).startswith("C"):
            code_hex = format(ord(char), "02x")
            print("problem character:{} at {}".format(code_hex, position))
            return True

    return False

def processFile(filename,name,allfile=None):
    """Download one code-table CSV and write it out as a Turtle (.ttl) file.

    The CSV is fetched from ``table_url + filename``. Its first row is the
    header; every following row must provide the columns ``notation``,
    ``description`` and ``name``. Each valid row becomes one ``item_tpl``
    block and a member of the collection rendered with ``collection_tpl``.
    The result is written to ``<filename without extension>.ttl`` in the
    current directory. The file is only created once the whole CSV has been
    processed, so a failed download or a fatal row no longer leaves an empty
    .ttl file behind.

    Relies on the module-level names ``table_url``, ``table_urls``,
    ``table_descriptions``, ``header_tpl``, ``collection_tpl`` and ``item_tpl``.

    Args:
        filename: CSV file name; must contain the table number
            (e.g. ``4-01-03`` or ``1-02``), which selects the entries of
            ``table_urls`` / ``table_descriptions``.
        name: Human-readable table name. Kept for interface compatibility;
            it is not used when producing the output.
        allfile: Optional writable text file. When given, it receives the
            collection block and the item blocks (without the prefix header).

    Returns:
        False if no table number can be extracted from ``filename``,
        True once the .ttl file has been written.

    Raises:
        SystemExit: a row has no notation (missing or ``NA``) or repeats a
            notation that was already accepted.
        KeyError: the table number is not in ``table_urls`` /
            ``table_descriptions``, or a required column is missing.
        UnicodeDecodeError: the downloaded payload is not valid UTF-8.
        urllib.error.URLError: the download failed.
    """
    codelist_url = table_url + filename
    outfilename = splitext(filename)[0]+'.ttl'

    m = re.search(r"(\d+-\d+-?\d?\d?)",filename)
    if not m:
        print("problem with filename {}.. cannot determine URL".format(filename))
        return False
    codelist_nr = m.group(1)

    # Named container_url so the module-level ``base_url`` (registry host) is
    # not shadowed inside this function.
    container_url = table_urls[codelist_nr]
    base_name = container_url.split('/')[-1]
    base_description = table_descriptions[codelist_nr]

    notationStr = 'notation'
    descriptionStr = 'description'
    labelStr = 'name'

    suspectCharactersNotation = " ,:!?="

    # Close the HTTP response deterministically.
    with urllib.request.urlopen(codelist_url) as webpage:
        data = webpage.read()

    # The payload must be UTF-8; a UnicodeDecodeError is left to the caller.
    # (The former ``data.decode('latin1')`` probe was dropped: decoding bytes
    # as latin-1 can never fail, so it checked nothing.)
    # 'utf-8-sig' additionally strips a leading BOM so it cannot end up glued
    # to the first header name.
    text = data.decode('utf-8-sig')
    ttlfilecsvreader = csv.reader( text.splitlines() ,  delimiter=',', quotechar='"' )

    members = [] # accumulate the members of the collection
    strbuffer = ""
    keys = None
    for count, row in enumerate(ttlfilecsvreader, start=1):
        if keys is None:
            # first row is the header
            keys = row
            continue
        if not row:
            # blank line in the CSV: nothing to convert
            continue

        # Double quotes would terminate the Turtle string literal.
        row = [rr.replace("\"", "\'") for rr in row]
        d = dict(zip(keys, row))

        if notationStr not in d or d[notationStr] == 'NA':
            print("ERROR notation not defined in line %d" % (count))
            print(d)
            sys.exit(1)

        notation = d[notationStr].strip()
        description = d[descriptionStr].strip()
        label = d[labelStr].strip()

        member = "{}/{}".format(base_name,notation)

        if member in members:
            print("Two registers with the same notation: "+notation)
            sys.exit(1)
        if notPrintable(description):
            print("ERROR unprintable character in definition in line %d" % (count))
            # was d[definitionStr]: an undefined name that raised NameError
            print(d[descriptionStr])
            continue
        # was notPrintable(name): checked the table name instead of the row label
        if notPrintable(label):
            print("ERROR unprintable character in label in line %d" % (count))
            continue
        if any(c in suspectCharactersNotation for c in notation):
            print("ERROR suspect character in Notation in line %d" % (count))
            continue

        try:
            notation.encode('latin1')
        except UnicodeEncodeError:
            print("notation {} contains non-latin1 characters".format(notation))
            continue

        members.append(member)

        strbuffer += item_tpl.substitute( container=base_name , notation=notation , description=description , label=label )
        strbuffer += "\n"

    members_str = " , ".join( "<{}>".format(member) for member in members )
    collection_ttl = collection_tpl.substitute( container=base_name , description=base_description , label=base_name, members=members_str ) + "\n"

    # Written only now, and through a context manager, so the handle is
    # always closed.
    with open(outfilename, 'w', encoding='utf-8') as ttlfile:
        ttlfile.write( header_tpl.substitute() + "\n" )
        ttlfile.write( collection_ttl )
        ttlfile.write( strbuffer )

    if allfile:
        allfile.write( collection_ttl )
        allfile.write( strbuffer )

    return True

In [4]:
# Load the code-table registry from wmdr-tables.csv.
# Column 0 is the table number used as key, column 1 the table description
# and column 2 the table URL.
table_urls = {}
table_descriptions = {}
# newline='' is what the csv module requires of file objects so that quoted
# fields containing line breaks are parsed correctly.
with open('wmdr-tables.csv','r', newline='') as f:
    csvreader = csv.reader(f)

    for line in csvreader:
        if not line:
            # csv.reader yields [] for blank lines; indexing them would fail
            continue
        table_urls[line[0]] = line[2]
        table_descriptions[line[0]] = line[1]
                

In [13]:
# Prefix declarations written at the top of every generated .ttl file.
# Contains no $placeholders, so it is rendered with substitute() without arguments.
header_tpl = Template("""@prefix dct:   <http://purl.org/dc/terms/> .
@prefix rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos:  <http://www.w3.org/2004/02/skos/core#> .
@prefix ldp:   <http://www.w3.org/ns/ldp#> .
@prefix reg:   <http://purl.org/linked-data/registry#> .
@prefix xsd:   <http://www.w3.org/2001/XMLSchema#> .
@prefix owl:   <http://www.w3.org/2002/07/owl#> .
""")

# One skos:Concept per code-table row.
# Placeholders: $container, $notation, $label, $description.
item_tpl = Template("""<$container/$notation>
    a                skos:Concept ;
    rdfs:label       "$label" ;
    dct:description  "$description"@en ;
    skos:notation    "$notation" .
    
    """)
    
# The skos:Collection that lists all concepts of one table.
# Placeholders: $container, $description, $members (processFile also passes
# label=..., which this template does not use).
collection_tpl = Template("""<$container>
    a                      skos:Collection  ;
    rdfs:label             "$description" ;
    dct:description        "WMO $description" ;
    skos:member            $members .
    """)

# Fetch the readme that lists all code tables and convert each linked CSV.
url = 'https://raw.githubusercontent.com/wmo-im/wmds/master/tables_en/readme.md'
#url = 'https://raw.githubusercontent.com/wmo-im/wmds/1.0RC3-en/tables_en/readme.md'

table_url = 'https://raw.githubusercontent.com/wmo-im/wmds/master/tables_en/'
#table_url = 'https://raw.githubusercontent.com/wmo-im/wmds/1.0RC3-en/tables_en/'

# Fail loudly on HTTP errors instead of silently parsing an error page, and
# do not hang forever on a stalled connection.
r = requests.get(url, timeout=60)
r.raise_for_status()

# Markdown link followed by free text: [filename](target) name
# Raw string: the pattern is full of backslashes that are not valid Python
# string escapes.
link_re = re.compile(r'\[(.*)\]\(([^\s]*)\)(.*)')

for line in r.text.splitlines():
    m = link_re.search(line)
    if not m:
        continue

    filename = m.group(1)
    name = m.group(3)

    try:
        # processFile returns False when it could not handle the file; only
        # report "ok" for a real success.
        if processFile(filename,name):
            print("{} ok".format(filename))
        else:
            print("{} skipped".format(filename))
    # SystemExit is caught on purpose: processFile aborts a table with
    # sys.exit(). KeyboardInterrupt is deliberately NOT caught so the run can
    # still be interrupted.
    except (Exception, SystemExit) as be:
        print("problem processing {} {} {}".format(filename,name,be))


skipping 1-01-01
1-01-01.csv ok
skipping 1-01-02
1-01-02.csv ok
skipping 1-01-03
1-01-03.csv ok
skipping 1-01-04
1-01-04.csv ok
skipping 1-01-05
1-01-05.csv ok
1-02.csv ok
1-04.csv ok
1-05.csv ok
2-01.csv ok
2-02.csv ok
3-01.csv ok
3-02.csv ok
3-04.csv ok
3-08.csv ok
3-09.csv ok
4-01-01.csv ok
4-01-02.csv ok
4-01-03.csv ok
4-01-04.csv ok
4-01-05.csv ok
4-01-06.csv ok
4-01-07.csv ok
4-02.csv ok
4-03-01.csv ok
4-03-02.csv ok
4-03-03.csv ok
4-03-04.csv ok
4-04.csv ok
4-05.csv ok
4-06.csv ok
4-07.csv ok
5-01.csv ok
5-02-01.csv ok
5-02-05.csv ok
5-04.csv ok
5-05.csv ok
5-08-01.csv ok
5-08-02.csv ok
5-08-03.csv ok
5-14.csv ok
5-15.csv ok
6-03.csv ok
7-06.csv ok
7-07.csv ok
7-10.csv ok
8-02.csv ok
8-03-01.csv ok
8-03-02.csv ok
8-04.csv ok
8-05.csv ok
9-02.csv ok
10-01.csv ok
11-01.csv ok
11-02.csv ok
11-03.csv ok


In [14]:
def authenticate(session, base, userid, pss):
    """Log ``session`` in by posting the credentials to the API login endpoint.

    Args:
        session: object with a ``post(url, data=...)`` method (a
            ``requests.Session`` in this notebook).
        base: root URL of the service; ``/system/security/apilogin`` is appended.
        userid: value sent as the ``userid`` form field.
        pss: value sent as the ``password`` form field.

    Returns:
        The same ``session`` object that was passed in.

    Raises:
        ValueError: the login request did not answer with HTTP 200.
    """
    login_url = '{}/system/security/apilogin'.format(base)
    credentials = {'userid': userid, 'password': pss}

    auth = session.post(login_url, data=credentials)
    if auth.status_code != 200:
        raise ValueError('auth failed')

    return session

def post_file(session, postfile, container, status, bulk=False):
    """POST the Turtle file ``postfile`` to a container below ``base_url``.

    Args:
        session: object with ``post`` and ``get`` methods (a
            ``requests.Session`` in this notebook).
        postfile: path of the file to upload; it is read as UTF-8.
        container: path below the module-level ``base_url``; ``'.'`` (or an
            empty string) addresses the root ``/``.
        status: value of the ``status`` query parameter.
        bulk: when true, the query string is prefixed with ``batch-managed``.

    Returns:
        The same ``session`` object that was passed in.

    Raises:
        ValueError: the response code is above 299, unless it is a 403 and a
            GET on the same URL answers 200.
    """
    # The request declares charset=UTF-8, so decode the file explicitly as
    # UTF-8 instead of the platform default encoding (which would corrupt or
    # reject non-ASCII content on systems whose locale is not UTF-8).
    with open(postfile, 'r', encoding='utf-8') as pf:
        pdata = pf.read().encode('utf-8')

    params = {'status':status}
    if bulk:
        # Imported here because the file only imports urllib.request at the top.
        from urllib.parse import urlencode
        params = 'batch-managed&' + urlencode(params)

    # '.' means the registry root; every other value becomes '/<container>'.
    container = '/' if container == '.' else '/' + container

    url = "{u}{c}".format(u=base_url,c=container)
    res = session.post(url,
                      headers={'Content-type':'text/turtle; charset=UTF-8'},
                      data=pdata,
                      params=params)

    if res.status_code > 299:
        if res.status_code == 403:
            # A 403 is tolerated when the target URL can be fetched anyway.
            exists = session.get(url)
            if exists.status_code != 200:
                raise ValueError('Http response code indicates failure\n{}'.format(res.status_code))
        else:
            raise ValueError('Http response code indicates failure: {} - {}'.format(res.status_code,res.text))

    return session

In [15]:
import os
from getpass import getpass

import requests

# Credentials must never be stored in the notebook: the password is taken
# from the environment or asked for interactively.
# NOTE(review): the API key that used to be hardcoded here has been exposed
# and should be revoked / rotated.
data = {
  'userid': os.environ.get('WMDR_REGISTRY_USERID', 'https://api.github.com/users/kurt-hectic'),
  'password': os.environ.get('WMDR_REGISTRY_PASSWORD') or getpass('Registry API password: ')
}

# Log in once; the authenticated session is reused for all uploads.
session = requests.Session()
session = authenticate(session, base_url, data['userid'], data['password'])

In [16]:
from os import listdir
from os.path import isfile, join

In [17]:
# Names of all Turtle files in the current working directory.
newlist = list(filter(lambda entry: entry.endswith(".ttl"), listdir(".")))

In [18]:
# Upload each Turtle file to the "wmdr" container with status "experimental"
# in bulk mode; a rejected upload is reported and the loop moves on.
for ttl_name in newlist:
    try:
        post_file(session, ttl_name, "wmdr", "experimental", bulk=True)
    except ValueError as failure:
        print("issue with {} : {}".format(ttl_name, failure))

In [None]:
# Scratch check of the latin-1 test that processFile applies to notations.
# NOTE(review): if the first character is U+03BC (GREEK SMALL LETTER MU) this
# raises UnicodeEncodeError; U+00B5 (MICRO SIGN) would encode fine - confirm
# which one the code tables use.
"μmol_mol-1".encode("latin1")