In [4]:
from numpy.core.numeric import NaN
#-*- coding: UTF-8 -*-
#@title Code - Run Once To Start { vertical-output: true, display-mode: "form" }
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import pandas as pd 
import io
import sys
import ast, re
import base64
import json
from dateutil.parser import parse
from contextlib import redirect_stderr
from csv import Sniffer
import chardet
from urllib.request import urlopen

%matplotlib notebook

!pip install Owlready2
from owlready2 import *

# Owlready2 can fail on the first load of an ontology whose owl file has
# imports serialized as turtle. Loading a second time succeeds, so retry once.
# Only ordinary errors trigger the retry; KeyboardInterrupt/SystemExit propagate.
try:
  mseo=get_ontology("https://purl.matolab.org/mseo/mid").load()
except Exception:
  mseo=get_ontology("https://purl.matolab.org/mseo/mid").load()

# unit ontologies searched by CSV_Annotator.get_unit
cco_mu=get_ontology("http://www.ontologyrepository.com/CommonCoreOntologies/Mid/UnitsOfMeasureOntology/").load()
qudt=get_ontology('http://www.qudt.org/qudt/owl/1.0.0/unit.owl').load()

class CSV_Annotator():
  def __init__(self, csv_url=''):
      """Store the csv location and prepare the static lookup tables.

      :param csv_url: url of the csv file to annotate; _on_process overwrites
                      it with the content of the url widget when one is given
      """
      self.csv_url = csv_url

      # prefixes used in the generated JSON-LD, placed after the csvw base context
      prefixes = {
          "cco": "http://www.ontologyrepository.com/CommonCoreOntologies/",
          "mseo": mseo.base_iri,
          "label": "http://www.w3.org/2000/01/rdf-schema#label",
          "xsd": "http://www.w3.org/2001/XMLSchema#",
      }
      self.json_ld_context = ["http://www.w3.org/ns/csvw", prefixes]

      # german umlauts and sharp s mapped to their ascii transliteration
      # (used by make_id)
      umlauts = ('\u00e4', '\u00f6', '\u00fc', '\u00c4', '\u00d6', '\u00dc', '\u00df')
      transliterations = ('ae', 'oe', 'ue', 'Ae', 'Oe', 'Ue', 'ss')
      self.umlaute_dict = dict(zip(umlauts, transliterations))

  def _create_initial_widgets(self):
      """Create all widgets of the dialog and connect the button callbacks.

      Sets the attributes url_widget, uploader, clear_button, file_dialog, out,
      encoding, separator, settings and process_button on the instance.
      """
      # NOTE(review): the placeholder mentions a *-metadata.json, but _on_process
      # downloads this url and parses it as the csv file itself - confirm wording.
      self.url_widget=widgets.Text(
          value='',
          placeholder='put ur url to a *-metadata.json here',
          description='Url:',
          disabled=False
          )
      self.uploader = widgets.FileUpload(accept='',  # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
                                          multiple=False,  # True to accept multiple files upload else False
                                          description='Upload'
                                          )
      self.clear_button = widgets.Button(description='Clear!', layout=widgets.Layout(width='100px'))
      self.file_dialog= widgets.HBox([widgets.Label(value="File:"), self.url_widget ,self.uploader,self.clear_button])
      self.clear_button.on_click(self._on_clear)

      self.out = widgets.Output()  # this is the output widget in which the result is displayed
      self.encoding = widgets.Dropdown(
          options=['auto', 'ISO-8859-1', 'UTF-8', 'ascii', 'latin-1','cp273'],
          value='auto',
          description='Encoding:',
          disabled=False,
      )
      # The regex separators need a literal backslash before "s". Writing it as
      # "\\s" avoids the invalid escape sequence "\s" while producing the very
      # same strings as before ("\t" stays a real tab character).
      self.separator = widgets.Dropdown(
          options=['auto', ',',';', '\t', '|', "\\s+","\\s+|\t+|\\s+\t+|\t+\\s+"],
          value='auto',
          description='separator:',
          disabled=False,
      )
      self.settings= widgets.HBox([self.encoding, self.separator])
      self.process_button = widgets.Button(description='Process!', layout=widgets.Layout(width='200px'))
      self.process_button.on_click(self._on_process)
  def _on_clear(self,button):
    self.url_widget.value=''
    self.uploader.value.clear()
    self.uploader._counter = 0

  def _on_process(self,button):
    """Load the selected csv, resolve its dialect and show the generated metadata.

    The file is taken from the url widget if it is filled, otherwise from the
    first uploaded file. Encoding and separator set to 'auto' are detected and
    written back to the dropdowns. The resulting json is printed into the
    output widget together with a download link.

    :param button: the clicked button (not used)
    """
    def select_option(dropdown, choice):
      # A Dropdown rejects values that are not among its options, so a detected
      # value (e.g. 'utf-8' from chardet or an unusual delimiter) is registered
      # as an option before it is selected.
      if choice not in dropdown.options:
        dropdown.options = list(dropdown.options) + [choice]
      dropdown.value = choice

    with self.out:
      clear_output()
      if not (self.url_widget.value or self.uploader.value.keys()):
          print('pls upload a file first or insert a url')
          return
      if self.url_widget.value:
        self.csv_url=self.url_widget.value
        file_name=self.csv_url.split('/')[-1]
        # close the connection as soon as the content is read
        with urlopen(self.csv_url) as response:
          self.file_data = response.read()
      else:
        input_file=self.uploader.value[list(self.uploader.value.keys())[0]]
        self.csv_meta_url=input_file['metadata']['name']
        file_name = input_file['metadata']['name']
        self.file_data = input_file['content']
      if self.encoding.value=='auto':
        detected_encoding=self.get_encoding(self.file_data)
        if not detected_encoding:
          # nothing could be detected, decoding would fail later on
          print('cant detect encoding, pls manualy select')
          return
        select_option(self.encoding, detected_encoding)
      if self.separator.value=='auto':
        try:
          detected_separator=self.get_column_separator(self.file_data)
        except Exception:
          # do not continue with 'auto' as delimiter, it would produce garbage
          print('cant find separator, pls manualy select')
          return
        select_option(self.separator, detected_separator)
      metafile_name, result =self.process_file(file_name,self.file_data,self.separator.value,self.encoding.value)
      print(result)
      # embed the json as base64 data uri so it can be downloaded without a server
      payload = base64.b64encode(result.encode()).decode()
      html_buttons = '''<html>
      <head>
      <meta name="viewport" content="width=device-width, initial-scale=1">
      </head>
      <body>
      <a download="{filename}" href="data:text/json;base64,{payload}" download>
      <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">Download File</button>
      </a>
      </body>
      </html>
      '''
      html_button = html_buttons.format(payload=payload,filename=metafile_name)
      display(widgets.HTML(html_button))

    
  def display_widgets(self):
    """Create the widgets and show them stacked below each other."""
    self._create_initial_widgets()
    # order from top to bottom: file selection, settings, start button, output
    rows = [self.file_dialog, self.settings, self.process_button, self.out]
    dialog_box = widgets.VBox(rows)
    display(dialog_box)

  def get_encoding(self,file_data):
    """
    Guess the text encoding of raw file content with chardet.

    :param file_data:   content of the file we want to parse
    :return:            the 'encoding' entry of the chardet detection result,
                        e.g. utf-8, ascii..
    """
    return chardet.detect(file_data)['encoding']

  def get_column_separator(self,file_data):
    """

    :param file_data: data of the file we want to parse
    :return:          the seperator of the specified data, e.g. ";" or ","
    """
    file_string = io.StringIO(file_data.decode(self.encoding.value))
    sniffer = Sniffer()
    dialect = sniffer.sniff(file_string.read(512))
    return dialect.delimiter

  def get_header_lenght(self,file_data, separator_string, encoding):
    """ 
    This method finds the beginning of a header line inside a csv file.
        Some csv files begin with additional information before
        displaying the actual data-table.

        We want to solve this problem by finding the beginning of the header-line
        (column-descriptors) and read the metainfo and data-table separately.

    :param file_data: content of the file we want to parse
    :param separator_string: csv-separator
    :param encoding: text encoding
    :return: a 2-tuple of (first_head_line, max_columns_additional_header)
                  where
                      first_head_line : index of the header line in the csv file
                      max_columns_additional_header : number of columns in the data-table
    """
    
    # since pandas throws errormessages when encountering a parseerror (meaning when
    # encountering a csv-file with changing column-count for example), we can
    # redirect the error to file_string. Then, we can read and analyze the error-message.
    # This is helpful since we can see in which line the parser expected n columns, but got m instead.

    file_string = io.StringIO(file_data.decode(encoding))  
    f = io.StringIO()
    with redirect_stderr(f):
        df = pd.read_csv(file_string,sep=separator_string,error_bad_lines=False,warn_bad_lines=True,header=None)
    f.seek(0)
    #without utf string code b' 
    warn_str=f.read()[2:-2]

    # split the warnings up
    warnlist=warn_str.split('\\n')[:-1]

    # The warnings we care about are of form 'Skipping line x: expected n columns, got m'
    # readout row index and column count in warnings
    line_numbers=[int(re.search('Skipping line (.+?):', line).group(1)) for line in warnlist]
    
    # get the found number of columns
    column_numbers=[int(line[-1]) for line in warnlist]
    column_numbersm1=column_numbers.copy()
    if not column_numbersm1:
      #no additional header
      return 0,0

    #pop last element, so column_numbers is always lenght +1
    column_numbersm1.pop(-1)

    #assumes that the file ends with a uniform table with constant column count
    #determine changes in counted columns starting from the last line of file
    changed_column_count_line=[line_numbers[index+1] for index in reversed(range(len(column_numbersm1))) if column_numbersm1[index]!=column_numbers[index+1]]
    
    # if there are column count - changes, then the first head-line is the the index
    # of the row of the last change of column count minus 1.
    if changed_column_count_line:

      # additional header has ends in line before the last change of column count
      first_head_line=changed_column_count_line[0]-1
    elif line_numbers:

      # edgecase is that we only have one column-count change, in this case,
      # changed_column_count_line is empty, thus, first_head_line is just the first change
      first_head_line=line_numbers[0]-1
    else:
      first_head_line=0

    # starting from first_head_line, max_columns_additional_header is the
    # maximum number of columns
    max_columns_additional_header=(max(column_numbers[:line_numbers.index(first_head_line+1)-1]))
    return first_head_line, max_columns_additional_header

  def get_num_header_rows_and_dataframe(self,file_data,separator_string, header_lenght, encoding):
    """

    :param file_data: content of the file we want to parse
    :param separator_string: csv-delimiter
    :param header_length: rows of the header
    :param encoding: csv-encoding
    :return: 2-tuple (num_header_rows, table_data)
                  where
                      num_header_rows : number of header rows
                      table_data : pandas DataFrame object containing the tabular information
    """
    
    
    file_string = io.StringIO(file_data.decode(encoding))
    num_header_rows=1

    good_readout=False
    while not good_readout:
      file_string.seek(0)
      table_data = pd.read_csv(file_string,header=list(range(num_header_rows)),sep=separator_string,skiprows=header_lenght,encoding=encoding)    
      
      #test if all text values in first table row -> is a second header row
      all_text=all([self.get_value_type(value)=='TEXT' for column,value in table_data.iloc[0].items()])
      if all_text:
        num_header_rows+=1
        continue
      else:
        good_readout=True
    return num_header_rows, table_data

  def get_unit(self,string):
    """
    Search the loaded ontologies for a unit matching the given string.

    The searches are run in a fixed order (cco_mu, mseo, qudt) and the first
    hit of the combined result is used.

    :param string: label, symbol or code of the unit
    :return: dict with the key "cco:uses_measurement_unit" holding iri and type
             of the first hit, or an empty dict if nothing was found
    """
    lookups = (
        (cco_mu, 'alternative_label'),
        (cco_mu, 'SI_unit_symbol'),
        (mseo, 'alternative_label'),
        (mseo, 'SI_unit_symbol'),
        (qudt, 'symbol'),
        (qudt, 'abbreviation'),
        (qudt, 'ucumCode'),
    )
    hits = []
    for ontology, annotation in lookups:
      hits.extend(ontology.search(**{annotation: string}))
    if not hits:
      return {}
    first_hit = hits[0]
    return {"cco:uses_measurement_unit": {"@id": str(first_hit.iri), "@type": str(first_hit.is_a)}}

  def is_date(self,string, fuzzy=False):
      """
      Check whether the string can be interpreted as a date.

      :param string: text to check
      :param fuzzy: passed on to the dateutil parser
      :return: True if the dateutil parser accepts the string, else False
      """
      try:
          parse(string, fuzzy=fuzzy)
          return True

      # OverflowError is raised by the parser for numbers that are too large
      # for a date; such a string is no date either
      except (ValueError, OverflowError):
          return False

  def get_value_type(self,string):
      string=str(string)
      #remove spaces and replace , with . and
      string=string.strip().replace(',','.')
      if len(string) == 0: return 'BLANK'
      try:
          t=ast.literal_eval(string)
      except ValueError:
          return 'TEXT'
      except SyntaxError:
          if self.is_date(string):
            return 'DATE'
          else:
            return 'TEXT'
      else:
          if type(t) in [int, float, bool]:
            if type(t) is int:
                return 'INT'
            if t in set((True,False)):
                return 'BOOL'
            if type(t) is float:
                return 'FLOAT'
          else:
              return 'TEXT' 

  def describe_value(self,value_string):
    if pd.isna(value_string):
      return {}
    elif self.get_value_type(value_string)=='INT':
      return {'cco:has_integer_value': {'@value':value_string, '@type': 'xsd:integer'}}
    elif self.get_value_type(value_string)=='BOOL':
      return {'cco:has_bolean_value': {'@value':value_string, '@type': 'xsd:boolean'}}
    elif self.get_value_type(value_string)=='FLOAT':
      return {'cco:has_decimal_value': {'@value':value_string, '@type': 'xsd:decimal'}}
    elif self.get_value_type(value_string)=='DATE':
      return {'cco:has_datetime_value': {'@value':str(parse(value_string)), '@type': 'xsd:dateTime'}}
    else:
      # check if its a unit
      unit_dict=self.get_unit(value_string)
      if unit_dict:
        return unit_dict
      else:
        return {'cco:has_text_value': {'@value':value_string, '@type': 'xsd:string'}}

  def make_id(self,string,namespace=None):
    for k in self.umlaute_dict.keys():
          string = string.replace(k, self.umlaute_dict[k])
    if namespace:
      return namespace+':'+re.sub('[^A-ZÜÖÄa-z0-9]+', '', string.title().replace(" ", ""))
    else:
      return './'+re.sub('[^A-ZÜÖÄa-z0-9]+', '', string.title().replace(" ", ""))

  def get_additional_header(self,file_data,separator,encoding):
    """

    :param file_data: content of the file we want to parse
    :param separator: csv-separator
    :param encoding: text encoding
    :return:
    """
    
    # get lenght of additional header
    header_lenght, max_columns_additional_header=self.get_header_lenght(file_data,separator,encoding)

    if header_lenght:
      file_string = io.StringIO(file_data.decode(encoding))
      header_data = pd.read_csv(file_string,header=None,sep=separator,nrows=header_lenght,names=range(max_columns_additional_header),encoding=encoding,skip_blank_lines=False)
      header_data['row']=header_data.index
      header_data.rename(columns={0: 'param'}, inplace=True)
      header_data.set_index('param',inplace=True)
      header_data=header_data[~header_data.index.duplicated()]
      header_data.dropna(thresh=2, inplace=True)
      return header_data, header_lenght
    
    else:
      return None, 0


  def serialize_header(self,header_data,file_namespace=None):
    
    
    params=list()
    info_line_iri="cco:InformationLine"
    for parm_name, data in header_data.to_dict(orient='index').items():
      #describe_value(data['value'])
      para_dict={'@id': self.make_id(parm_name,file_namespace),'label':parm_name,'@type': info_line_iri}
      for col_name, value in data.items():
        #print(parm_name,col_name, value)
        if col_name=='row':
          para_dict['mseo:has_row_index']={"@value": data['row'],"@type": "xsd:integer"}
        else:
          para_dict={**para_dict,**self.describe_value(value)}
      params.append(para_dict)
    #print(params)
    return params
    

  def process_file(self,file_name,file_data,separator,encoding):
    """

    :param file_name: name of the file we want to process
    :param file_data: content of the file
    :param separator: csv-seperator /delimiter
    :param encoding:  text-encoding (e.g. utf-8..)
    :return: a 2-tuple (meta_filename,result)
                  where
                      result :    the resulting metadata on how to
                                  read the file (skiprows, colnames ..)
                                  as a json dump
                      meta_filename :  the name of the metafile we want to write
    """
    
    #init results dict
    data_root_url="https://github.com/Mat-O-Lab/resources/"

    file_namespace=None
    metadata_csvw = dict()
    metadata_csvw["@context"]=self.json_ld_context
    #metadata_csvw["@id"]=file_namespace
    metadata_csvw["url"]=file_name
    # read additional header lines and provide as meta in results dict
    header_data, header_lenght=self.get_additional_header(file_data,separator,encoding)

    if header_lenght:
      #print("serialze additinal header")
      metadata_csvw["notes"]=self.serialize_header(header_data,file_namespace)
    
    # read tabular data structure, and determine number of header lines for column description used
    header_lines, table_data=self.get_num_header_rows_and_dataframe(file_data,separator,header_lenght,encoding)
    
    # describe dialect
    metadata_csvw["dialect"]={"delimiter": separator,
    "skipRows": header_lenght, "headerRowCount": header_lines, "encoding": encoding}
    
    # describe columns
    if header_lines==1:
      # see if there might be a unit string at the end of each title
      # e.g. "E_y (MPa)"
      column_json=list()
      for index, title in enumerate(table_data.columns):
        if len(title.split(' '))>1:
          unit_json=self.get_unit(title.split(' ')[-1])  
        else:
          unit_json={}
        json_str={**{'titles': title,'@id': self.make_id(title), "@type": "Column"},**unit_json}
        column_json.append(json_str)
      metadata_csvw["tableSchema"]={"columns":column_json}

    else:
      column_json=list()
      for index, (title,unit_str) in enumerate(table_data.columns):
        json_str={**{'titles': title,'@id': self.make_id(title), "@type": "Column"},**self.get_unit(unit_str)}
        column_json.append(json_str)
      metadata_csvw["tableSchema"]={"columns":column_json}
    
    result=json.dumps(metadata_csvw, indent = 4)
    meta_file_name = file_name.split(sep='.')[0] + '-metadata.json'
    return meta_file_name, result



In [5]:
#@title Dialog - Run Cell to begin
# Create the annotator and show its dialog: file/url selection, encoding and
# separator settings, the process button and the output area for the result.
dialog = CSV_Annotator()
dialog.display_widgets()

VBox(children=(HBox(children=(Label(value='File:'), Text(value='', description='Url:', placeholder='put ur url…