<a href="https://colab.research.google.com/github/Mat-O-Lab/MSEO/blob/main/tools/csv_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
#@title Code
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import pandas as pd 
import io
import sys
import re
import base64
import json
%matplotlib notebook

def get_header_lenght(file_data, separator_string, encoding):
  """Return the number of lines that precede the tabular part of a CSV file.

  The whole file is parsed once with pandas. When the parser reports a
  tokenizing error because a line has more fields than the first line, the
  reported line is taken to be the column header of the table and every
  line before it is counted as additional header.

  Args:
    file_data: raw file content as bytes.
    separator_string: field delimiter handed to ``pd.read_csv``.
    encoding: name of the codec used to decode ``file_data``.

  Returns:
    The number of additional header lines, or 0 when the file parses
    cleanly, is empty, or the error carries no usable line number.
  """
  file_string = io.StringIO(file_data.decode(encoding))
  try:
    pd.read_csv(file_string, sep=separator_string)
  except (pd.errors.ParserError, pd.errors.EmptyDataError) as error:
    message = str(error)
    if 'Error tokenizing' in message:
      # Example message:
      #   Error tokenizing data. C error: Expected 3 fields in line 17, saw 5
      # -> the column header is at line 17, so 16 lines come before it.
      match = re.search(r'fields in line (\d+),', message)
      if match:
        return int(match.group(1)) - 1
  # No parser error (or none with a line number) -> no additional header.
  return 0

def get_num_header_rows_and_dataframe(file_data,separator_string, header_lenght, encoding):
  """Detect how many rows make up the column header and load the table.

  The table is read repeatedly with 1, 2, ... header rows. As long as every
  column still holds text, the row after the current header is assumed to
  belong to the header as well (e.g. a row of units), and one more header
  row is tried.

  Args:
    file_data: raw file content as bytes.
    separator_string: field delimiter handed to ``pd.read_csv``.
    header_lenght: number of leading lines to skip before the table starts.
    encoding: name of the codec used to decode ``file_data``.

  Returns:
    A tuple ``(num_header_rows, table_data)`` with the detected number of
    header rows and the DataFrame parsed with that header.

  Raises:
    ValueError: if every column is still text after trying 10 header rows.
  """
  max_header_rows = 10
  # Decode once; every attempt below gets a fresh buffer over the same text.
  text = file_data.decode(encoding)

  def is_text_dtype(dtype):
    # Text columns are ``object`` in older pandas and may be a dedicated
    # string dtype in newer releases; treat both as "not yet numeric".
    return pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)

  for num_header_rows in range(1, max_header_rows + 1):
    # The buffer is already decoded, so no ``encoding`` argument is needed.
    table_data = pd.read_csv(
      io.StringIO(text),
      header=list(range(num_header_rows)),
      sep=separator_string,
      skiprows=header_lenght,
    )
    if not all(is_text_dtype(dtype) for dtype in table_data.dtypes):
      return num_header_rows, table_data

  # Previously a bare 0 was returned here, which the caller could not unpack.
  raise ValueError(
    'could not determine the number of column header rows: all columns '
    'are still non-numeric after {} header rows'.format(max_header_rows)
  )

def process_file(file_name,file_data,separator,encoding):
  """Describe a CSV file as CSVW metadata and return it as a JSON string.

  The leading non-tabular lines are read as ``param``/``value``/``unit``
  triples and stored under ``"params"``; the table itself is described
  under ``"dialect"`` and ``"tableSchema"``.

  Args:
    file_name: name of the CSV file; stored as ``"url"`` and used to derive
      the name of the metadata file.
    file_data: raw file content as bytes.
    separator: field delimiter of the file.
    encoding: name of the codec used to decode ``file_data``.

  Returns:
    A tuple ``(meta_file_name, result)``: the suggested metadata file name
    (``<file name without its last extension>-metadata.json``) and the
    metadata serialized as indented JSON.
  """
  # init results dict
  metadata_csvw = {
    "@context": "http://www.w3.org/ns/csvw",
    "url": file_name,
  }
  # get length of additional header
  header_lenght = get_header_lenght(file_data, separator, encoding)
  # read additional header lines and provide them as meta in the results dict
  # (the buffer is already decoded, so read_csv needs no encoding argument)
  header_data = pd.read_csv(
    io.StringIO(file_data.decode(encoding)),
    names=['param', 'value', 'unit'],
    header=None,
    sep=separator,
    nrows=header_lenght,
  )
  header_data.set_index('param', inplace=True)
  # NOTE(review): dropna() discards every parameter row that has any empty
  # cell (e.g. no unit) - confirm this is intended.
  metadata_csvw["params"] = header_data.dropna().to_dict(orient='index')
  # read tabular data structure and determine the number of header lines
  # used for the column description
  header_lines, table_data = get_num_header_rows_and_dataframe(
    file_data, separator, header_lenght, encoding)
  # describe dialect
  metadata_csvw["dialect"] = {
    "delimiter": separator,
    "skipRows": header_lenght,
    "headerRowCount": header_lines,
  }
  # describe columns
  if header_lines == 1:
    columns = [{'titles': column} for column in table_data.columns]
  else:
    # multi-row header: first level is the title, second level the unit
    columns = [
      {'titles': col_tuple[0], 'unitString': col_tuple[1]}
      for col_tuple in table_data.columns
    ]
  metadata_csvw["tableSchema"] = {"columns": columns}
  result = json.dumps(metadata_csvw, indent=4)
  # Strip only the last extension so that "data.v2.csv" becomes
  # "data.v2-metadata.json"; names without a usable stem are kept whole.
  stem, dot, _extension = file_name.rpartition('.')
  base_name = stem if (dot and stem) else file_name
  meta_file_name = base_name + '-metadata.json'
  return meta_file_name, result


In [41]:
#@title Dialog { vertical-output: true }
# dialog
# File picker: a single file of any type.
uploader = widgets.FileUpload(
    accept='',          # no extension filter (e.g. '.txt', '.pdf', 'image/*')
    multiple=False,     # only one file per upload
    description='Upload',
)
file = widgets.HBox(children=[widgets.Label(value="File:"), uploader])

# Parsing options shown next to each other below the file picker.
encoding = widgets.Dropdown(
    options=['ISO-8859-1', 'UTF-8', 'ascii', 'latin-1', 'cp273'],
    value='ISO-8859-1',
    description='Encoding:',
    disabled=False,
)
separator = widgets.Dropdown(
    options=[';', '\t', '|'],
    value='\t',
    description='separator:',
    disabled=False,
)
settings = widgets.HBox(children=[encoding, separator])

# Trigger button and the output area its handler writes into.
button = widgets.Button(
    description='Process!',
    layout=widgets.Layout(width='200px'),
)
out = widgets.Output()

def on_button_clicked(_):
  """Process the uploaded file and show the metadata plus a download link.

  Reads the first uploaded file, runs ``process_file`` with the selected
  separator and encoding, prints the resulting JSON into ``out`` and
  renders an HTML download button that carries the JSON as a base64 data
  URI.

  Args:
    _: the clicked button (unused).
  """
  import html  # local import: only needed to escape the download file name

  # "linking function with output"
  with out:
    # what happens when we press the button
    clear_output()
    uploaded = uploader.value
    if not uploaded:
      print('Please upload a file first.')
      return
    if isinstance(uploaded, dict):
      # ipywidgets 7: dict keyed by file name
      input_file = next(iter(uploaded.values()))
      file_name = input_file['metadata']['name']
      file_data = input_file['content']
    else:
      # ipywidgets 8: tuple of dicts, content is a memoryview
      input_file = uploaded[0]
      file_name = input_file['name']
      file_data = bytes(input_file['content'])
    metafile_name, result = process_file(file_name, file_data, separator.value, encoding.value)
    print(result)
    payload = base64.b64encode(result.encode()).decode()
    html_buttons = '''<html>
    <head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    </head>
    <body>
    <a download="{filename}" href="data:text/json;base64,{payload}" download>
    <button class="p-Widget jupyter-widgets jupyter-button widget-button mod-warning">Download File</button>
    </a>
    </body>
    </html>
    '''
    # The file name comes from the upload, so escape it before placing it
    # inside the HTML attribute. (The original referenced an undefined
    # name here and raised NameError on every click.)
    html_button = html_buttons.format(
      payload=payload,
      filename=html.escape(metafile_name, quote=True),
    )
    display(widgets.HTML(html_button))

# Hook up the click handler, stack the button above its output area and
# render the complete dialog: file picker, settings, then the process box.
button.on_click(on_button_clicked)
process = widgets.VBox(children=[button, out])
display(file, settings, process)

HBox(children=(Label(value='File:'), FileUpload(value={}, description='Upload')))

HBox(children=(Dropdown(description='Encoding:', options=('ISO-8859-1', 'UTF-8', 'ascii', 'latin-1', 'cp273'),…

VBox(children=(Button(description='Process!', layout=Layout(width='200px'), style=ButtonStyle()), Output()))