In [1]:
from numpy.core.numeric import NaN
#-*- coding: UTF-8 -*-
#@title Code { vertical-output: true, display-mode: "form" }
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import pandas as pd 
import io
import sys
import ast, re
import base64
import json
from dateutil.parser import parse
from contextlib import redirect_stderr
from csv import Sniffer
import chardet 
%matplotlib notebook

!pip install Owlready2
from owlready2 import *

# According to the original author, Owlready2 can fail on the first load of an
# OWL file whose imports are serialized as Turtle, and a second attempt then
# succeeds. Retry exactly once; only ordinary errors trigger the retry so that
# Ctrl-C / SystemExit still abort immediately.
try:
  mseo=get_ontology("https://purl.matolab.org/mseo/mid").load()
except Exception:
  mseo=get_ontology("https://purl.matolab.org/mseo/mid").load()

# Unit ontologies that get_unit() searches for unit labels and symbols.
cco_mu=get_ontology("http://www.ontologyrepository.com/CommonCoreOntologies/Mid/UnitsOfMeasureOntology/").load()
qudt=get_ontology('http://www.qudt.org/qudt/owl/1.0.0/unit.owl').load()

# JSON-LD @context placed at the top of every generated metadata document:
# the CSVW vocabulary plus the prefixes used in the emitted terms.
json_ld_context=["http://www.w3.org/ns/csvw", {
    "cco": "http://www.ontologyrepository.com/CommonCoreOntologies/",
    "mseo": mseo.base_iri,
    "xsd": "http://www.w3.org/2001/XMLSchema#"}
    ]

def get_encoding(file_data):
  """Return the 'encoding' entry of chardet's detection result for file_data."""
  return chardet.detect(file_data)['encoding']

def get_column_separator(file_data, encoding_name=None, sample_size=512):
  """Guess the column delimiter of a delimited text file.

  Args:
    file_data: raw file content as bytes.
    encoding_name: codec used to decode file_data. When omitted, the current
      value of the module-level ``encoding`` dropdown is used, which is what
      this function always did before the parameter existed.
    sample_size: number of decoded characters handed to the sniffer.

  Returns:
    The delimiter character reported by csv.Sniffer.

  Raises:
    csv.Error: if the sniffer cannot determine a delimiter.
    UnicodeDecodeError / LookupError: if file_data cannot be decoded.
  """
  if encoding_name is None:
    # Backward-compatible default: fall back to the UI selection.
    encoding_name = encoding.value
  sample = file_data.decode(encoding_name)[:sample_size]
  return Sniffer().sniff(sample).delimiter

def _collect_bad_line_messages(text, separator_string):
  """Parse text with pandas and return all bad-line notices it emitted as one string."""
  import warnings  # local import: only this helper needs it

  stderr_buffer = io.StringIO()
  # Depending on the pandas version, skipped lines are reported either on
  # stderr or as ParserWarning, so both channels are captured.
  with redirect_stderr(stderr_buffer), warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    try:
      pd.read_csv(io.StringIO(text), sep=separator_string, header=None,
                  on_bad_lines='warn')
    except TypeError:
      # Older pandas releases do not know on_bad_lines; use the legacy flags.
      pd.read_csv(io.StringIO(text), sep=separator_string, header=None,
                  error_bad_lines=False, warn_bad_lines=True)
  warning_text = '\n'.join(str(entry.message) for entry in caught)
  return stderr_buffer.getvalue() + '\n' + warning_text


def get_header_lenght(file_data, separator_string, encoding):
  """Estimate how many leading lines form an additional (non-tabular) header.

  The file is parsed once with the first line defining the expected number of
  fields. pandas reports every later line with more fields as skipped; those
  reports are used as follows (assuming the file ends with a uniform table):
  walking backwards from the last report, the first line at which the
  reported field count changes marks the start of the table. Without such a
  change, the table starts at the first reported line.

  Args:
    file_data: raw file content as bytes.
    separator_string: column separator passed to pandas.
    encoding: codec used to decode file_data.

  Returns:
    (first_head_line, max_columns_additional_header): the number of file
    lines preceding the table and the largest field count seen in those
    lines. (0, 0) when pandas reported no skipped line.
  """
  messages = _collect_bad_line_messages(file_data.decode(encoding), separator_string)
  # Matches the C parser ("expected 2 fields, saw 3") and the python parser
  # ("Expected 2 fields in line 3, saw 3") wording; full integers are captured,
  # so column counts of 10 or more are read correctly.
  notices = [
      tuple(int(number) for number in match)
      for match in re.findall(
          r'Skipping line (\d+): [Ee]xpected (\d+) fields(?: in line \d+)?, saw (\d+)',
          messages)
  ]
  if not notices:
    # no additional header
    return 0, 0
  line_numbers = [notice[0] for notice in notices]
  column_numbers = [notice[2] for notice in notices]
  expected_fields = notices[0][1]

  # Find the last position where the reported column count changes; the table
  # starts at that report. Default: the table starts at the first report.
  table_start_index = 0
  for index in reversed(range(1, len(notices))):
    if column_numbers[index - 1] != column_numbers[index]:
      table_start_index = index
      break
  first_head_line = line_numbers[table_start_index] - 1

  # Widest header line: the field count of line 1 (never reported itself) or
  # any reported line located before the table start.
  max_columns_additional_header = max(
      [expected_fields] + column_numbers[:table_start_index])
  return first_head_line, max_columns_additional_header

def get_num_header_rows_and_dataframe(file_data,separator_string, header_lenght, encoding):
  """Read the table below the additional header and count its column-header rows.

  Starting with one header row, the table is re-read with one more header row
  for as long as every cell of the first data row is classified as 'TEXT' by
  get_value_type.

  Returns:
    (num_header_rows, table_data): the header row count that was accepted and
    the DataFrame produced by that read.
  """
  text_stream = io.StringIO(file_data.decode(encoding))
  header_row_count = 1
  while True:
    # rewind so every attempt parses the file from its beginning
    text_stream.seek(0)
    frame = pd.read_csv(
        text_stream,
        header=list(range(header_row_count)),
        sep=separator_string,
        skiprows=header_lenght,
        encoding=encoding,
    )
    first_row_kinds = [get_value_type(cell) for cell in frame.iloc[0]]
    if not all(kind == 'TEXT' for kind in first_row_kinds):
      return header_row_count, frame
    # an all-text first row is treated as one more header row
    header_row_count += 1

def get_unit(string):
  """Look up string in the loaded unit ontologies.

  The searches run in a fixed order (cco_mu, mseo, qudt; label before symbol)
  and all results are concatenated in that order.

  Returns:
    A dict with the key 'cco:uses_measurement_unit' describing the first hit
    (its IRI and the string form of its is_a list), or an empty dict when
    nothing was found.
  """
  lookups = (
      (cco_mu, 'alternative_label'),
      (cco_mu, 'SI_unit_symbol'),
      (mseo, 'alternative_label'),
      (mseo, 'SI_unit_symbol'),
      (qudt, 'symbol'),
      (qudt, 'abbreviation'),
      (qudt, 'ucumCode'),
  )
  matches = []
  for ontology, annotation in lookups:
    matches.extend(ontology.search(**{annotation: string}))
  if not matches:
    return {}
  first_hit = matches[0]
  return {"cco:uses_measurement_unit": {"@id": str(first_hit.iri), "@type": str(first_hit.is_a)}}

def is_date(string, fuzzy=False):
    """Return True if dateutil can parse string as a date/time, else False.

    Args:
        string: text to test.
        fuzzy: forwarded to dateutil's parse(); when True, unknown tokens in
            the text are ignored by the parser.
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # ValueError (incl. ParserError): not a recognizable date.
        # OverflowError: dateutil raises it when a parsed number is too
        # large for the platform; such a string is not a usable date either.
        return False
    return True

def get_value_type(string):
    """Classify a cell value as 'BLANK', 'INT', 'FLOAT', 'BOOL', 'DATE' or 'TEXT'.

    The value is converted to str, stripped, and every ',' is replaced by '.'
    (decimal comma support) before it is evaluated as a Python literal.

    Returns:
        'BLANK' for an empty string; 'BOOL', 'INT' or 'FLOAT' when the text is
        such a literal; 'DATE' when the text is not valid literal syntax but
        is_date accepts it; 'TEXT' otherwise.
    """
    text = str(string).strip().replace(',', '.')
    if not text:
        return 'BLANK'
    try:
        literal = ast.literal_eval(text)
    except ValueError:
        return 'TEXT'
    except SyntaxError:
        return 'DATE' if is_date(text) else 'TEXT'
    # bool must be tested before int/float: True == 1 == 1.0 in Python, so a
    # membership test against {True, False} would also match 1.0 and 0.0.
    if isinstance(literal, bool):
        return 'BOOL'
    if isinstance(literal, int):
        return 'INT'
    if isinstance(literal, float):
        return 'FLOAT'
    return 'TEXT'

def describe_value(value_string):
  """Build the JSON-LD fragment describing a single header value.

  Returns:
    {} for missing values (pd.isna); a typed 'cco:has_*_value' entry for
    integers, booleans, decimals and dates; for anything else the result of
    get_unit if the text is a known unit, otherwise a 'cco:has_text_value'
    entry.
  """
  if pd.isna(value_string):
    return {}
  # classify once instead of re-evaluating the value for every branch
  value_type = get_value_type(value_string)
  if value_type == 'INT':
    return {'cco:has_integer_value': {'@value':value_string, '@type': 'xsd:integer'}}
  if value_type == 'BOOL':
    # key spelled 'has_boolean_value' (was 'has_bolean_value', which is not a
    # term of the cco vocabulary)
    return {'cco:has_boolean_value': {'@value':value_string, '@type': 'xsd:boolean'}}
  if value_type == 'FLOAT':
    return {'cco:has_decimal_value': {'@value':value_string, '@type': 'xsd:decimal'}}
  if value_type == 'DATE':
    return {'cco:has_datetime_value': {'@value':str(parse(value_string)), '@type': 'xsd:dateTime'}}
  # check if its a unit
  unit_dict = get_unit(value_string)
  if unit_dict:
    return unit_dict
  return {'cco:has_text_value': {'@value':value_string, '@type': 'xsd:string'}}

# Transliteration table: German umlauts and sharp s -> ASCII replacements.
umlaute_dict = {
    '\u00e4': 'ae',  # a with diaeresis
    '\u00f6': 'oe',  # o with diaeresis
    '\u00fc': 'ue',  # u with diaeresis
    '\u00c4': 'Ae',  # A with diaeresis
    '\u00d6': 'Oe',  # O with diaeresis
    '\u00dc': 'Ue',  # U with diaeresis
    '\u00df': 'ss',  # sharp s
}

def make_id(string,namespace=None):
  """Derive an identifier from a label.

  Umlauts are transliterated via umlaute_dict, the text is title-cased, and
  spaces plus all characters outside the pattern's allowed set are removed.

  Returns:
    '<namespace>:<identifier>' when namespace is truthy, else './<identifier>'.
  """
  for umlaut, replacement in umlaute_dict.items():
    string = string.replace(umlaut, replacement)
  identifier = re.sub('[^A-ZÜÖÄa-z0-9]+', '', string.title().replace(" ", ""))
  prefix = namespace + ':' if namespace else './'
  return prefix + identifier

def get_additional_header(file_data,separator,encoding):
  """Read the additional header lines that precede the data table.

  Returns:
    (header_data, header_lenght): a DataFrame indexed by the first column
    ('param') with a 'row' column holding the original row position, and the
    number of header lines. (None, 0) when get_header_lenght finds no
    additional header.
  """
  header_lenght, max_columns_additional_header = get_header_lenght(file_data, separator, encoding)
  if not header_lenght:
    return None, 0

  text_stream = io.StringIO(file_data.decode(encoding))
  frame = pd.read_csv(
      text_stream,
      header=None,
      sep=separator,
      nrows=header_lenght,
      names=range(max_columns_additional_header),
      encoding=encoding,
      skip_blank_lines=False,
  )
  # remember the original row position before the index is replaced
  frame['row'] = frame.index
  frame = frame.rename(columns={0: 'param'}).set_index('param')
  # keep only the first occurrence of every parameter name
  frame = frame[~frame.index.duplicated()]
  # 'row' is always set, so thresh=2 keeps rows with at least one more value
  frame = frame.dropna(thresh=2)
  return frame, header_lenght


def serialize_header(header_data,file_namespace=None):
  """Turn the additional-header DataFrame into a list of JSON-LD dicts.

  Each row becomes one dict typed 'cco:InformationLine' with an '@id' built by
  make_id and the row label as 'label'. The 'row' column is written as
  'mseo:has_row_index'; every other cell is merged in via describe_value, so
  a later cell overwrites keys set by an earlier one.
  """
  info_line_iri = "cco:InformationLine"
  notes = []
  for parm_name, cells in header_data.to_dict(orient='index').items():
    note = {'@id': make_id(parm_name, file_namespace), 'label': parm_name, '@type': info_line_iri}
    for col_name, value in cells.items():
      if col_name != 'row':
        note.update(describe_value(value))
        continue
      note['mseo:has_row_index'] = {"@value": cells['row'], "@type": "xsd:integer"}
    notes.append(note)
  return notes
  

def process_file(file_name,file_data,separator,encoding):
  """Create the CSVW metadata document for one uploaded file.

  Args:
    file_name: name of the uploaded file; stored as 'url' and used to derive
      the metadata file name.
    file_data: raw file content as bytes.
    separator: column separator.
    encoding: codec used to decode file_data.

  Returns:
    (meta_file_name, result): '<file name without its last extension>-metadata.json'
    and the metadata serialized as indented JSON text.
  """
  # ids are emitted relative ('./...') as long as no namespace is set
  file_namespace = None
  metadata_csvw = {"@context": json_ld_context, "url": file_name}

  # read additional header lines and provide them as notes
  header_data, header_lenght = get_additional_header(file_data, separator, encoding)
  if header_lenght:
    metadata_csvw["notes"] = serialize_header(header_data, file_namespace)

  # read the table and determine how many rows describe the columns
  header_lines, table_data = get_num_header_rows_and_dataframe(file_data, separator, header_lenght, encoding)
  metadata_csvw["dialect"] = {"delimiter": separator,
  "skipRows": header_lenght, "headerRowCount": header_lines, "encoding": encoding}

  column_json = []
  for column in table_data.columns:
    if header_lines == 1:
      # single header row: a unit may be the last space-separated token
      title = column
      tokens = title.split(' ')
      unit_json = get_unit(tokens[-1]) if len(tokens) > 1 else {}
    else:
      # several header rows: first level is the title, second the unit;
      # any further levels are ignored instead of breaking the unpacking
      title, unit_str = column[0], column[1]
      unit_json = get_unit(unit_str)
    column_json.append({'titles': title, '@id': make_id(title), "@type": "Column", **unit_json})
  metadata_csvw["tableSchema"] = {"columns": column_json}

  result = json.dumps(metadata_csvw, indent = 4)
  # strip only the last extension so 'run.1.csv' and 'run.2.csv' do not both
  # map to 'run-metadata.json'
  meta_file_name = file_name.rsplit('.', 1)[0] + '-metadata.json'
  return meta_file_name, result

Collecting Owlready2
  Downloading Owlready2-0.34.tar.gz (23.7 MB)
[K     |████████████████████████████████| 23.7 MB 109 kB/s 
[?25hBuilding wheels for collected packages: Owlready2
  Building wheel for Owlready2 (setup.py) ... [?25l[?25hdone
  Created wheel for Owlready2: filename=Owlready2-0.34-cp37-cp37m-linux_x86_64.whl size=20419508 sha256=c11275603b33c6bb8f35bdc6c695dce34616dfb4cd91100ba6f5682a16ccdb12
  Stored in directory: /root/.cache/pip/wheels/ec/bd/94/4b682aa2b2c7a9d3cd5c7e036f7267940a92ef55889de68db6
Successfully built Owlready2
Installing collected packages: Owlready2
Successfully installed Owlready2-0.34


  http://www.ontologyrepository.com/CommonCoreOntologies/connected_with



In [3]:
#@title Dialog { vertical-output: true }
# dialog
# Upload widget: any file type accepted, one file at a time.
uploader = widgets.FileUpload(accept='', multiple=False, description='Upload')
clear = widgets.Button(
    description='Clear!',
    layout=widgets.Layout(width='100px'),
)

def on_clear(_):
  """Click handler of the clear button: empty the uploader and reset its counter."""
  uploader.value.clear()
  uploader._counter = 0

clear.on_click(on_clear)

# File row: label, upload widget and clear button.
file = widgets.HBox([widgets.Label(value="File:"), uploader, clear])
# 'auto' makes the process handler detect the value from the uploaded file.
encoding = widgets.Dropdown(
    options=['auto', 'ISO-8859-1', 'UTF-8', 'ascii', 'latin-1', 'cp273'],
    value='auto',
    description='Encoding:',
    disabled=False,
)
# The regex options spell the backslash of \s explicitly ("\\s"): "\s" is an
# invalid escape sequence in a normal string literal. "\t" stays a real tab
# character, so every option value is unchanged.
separator = widgets.Dropdown(
    options=['auto', ',', ';', '\t', '|', "\\s+", "\\s+|\t+|\\s+\t+|\t+\\s+"],
    value='auto',
    description='separator:',
    disabled=False,
)
settings = widgets.HBox([encoding, separator])
button = widgets.Button(description='Process!', layout=widgets.Layout(width='200px'))
# Output area that receives everything printed/displayed by the process handler.
out = widgets.Output()

def _select_option(dropdown, value):
  """Select value in dropdown, adding it to the options first if it is missing."""
  if value not in dropdown.options:
    # assigning a value that is not among the options is rejected by the widget
    dropdown.options = tuple(dropdown.options) + (value,)
  dropdown.value = value


def on_button_clicked(_):
  """Click handler of the process button.

  Resolves 'auto' encoding/separator from the uploaded file, runs
  process_file, prints the resulting JSON and shows a download link for it.
  All output goes to the ``out`` widget. Processing stops with a message when
  no file was uploaded or when automatic detection fails.
  """
  with out:
    clear_output()
    if not uploader.value.keys():
      print('pls upload a file first')
      return
    input_file = uploader.value[list(uploader.value.keys())[0]]
    file_name = input_file['metadata']['name']
    file_data = input_file['content']
    if encoding.value=='auto':
      detected_encoding = get_encoding(file_data)
      if not detected_encoding:
        print('cant detect encoding, pls manualy select')
        return
      _select_option(encoding, detected_encoding)
    if separator.value=='auto':
      try:
        detected_separator = get_column_separator(file_data)
      except Exception:
        # do not continue with the literal separator 'auto'
        print('cant find separator, pls manualy select')
        return
      _select_option(separator, detected_separator)
    metafile_name, result = process_file(file_name,file_data,separator.value,encoding.value)
    print(result)
    # embed the JSON as a base64 data URI so it can be downloaded client-side
    payload = base64.b64encode(result.encode()).decode()
    html_buttons = '''<html>
    <head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    </head>
    <body>
    <a download="{filename}" href="data:text/json;base64,{payload}" download>
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">Download File</button>
    </a>
    </body>
    </html>
    '''
    html_button = html_buttons.format(payload=payload,filename=metafile_name)
    display(widgets.HTML(html_button))

# Connect the handler, stack button and output area, and render the dialog rows.
button.on_click(on_button_clicked)
process = widgets.VBox([button, out])
display(file, settings, process)

HBox(children=(Label(value='File:'), FileUpload(value={}, description='Upload'), Button(description='Clear!', …

HBox(children=(Dropdown(description='Encoding:', options=('auto', 'ISO-8859-1', 'UTF-8', 'ascii', 'latin-1', '…

VBox(children=(Button(description='Process!', layout=Layout(width='200px'), style=ButtonStyle()), Output()))