In [1]:
import pandas
import re



In [2]:
# Path of the CSV file processed by this notebook
filename = './test/data/p08.csv'
# Maximum size, in lines, of the extra information that may be embedded in the file
MAX_EXTRA_SIZE = 500
# Text encoding and field separator used when reading the CSV
encoding = 'utf-8'
delimiter = ','

In [3]:
def find_title(str_line):
    """
    Extract the plot title from a line of text.

    Searches for the marker ``Plot Title: `` and returns the text that
    follows it, up to (but not including) the next double quote or the end
    of the line.

    Parameters
    ----------
    str_line : str
        Line to search.

    Returns
    -------
    str or None
        The title, or None when the marker is missing or nothing follows it.
    """
    # Line terminators are excluded from the capture so that an unquoted
    # title line ("Plot Title: X\n") does not come back with a trailing newline.
    match = re.search(r'(?:Plot Title: )([^"\r\n]+)', str_line)
    if match:
        return match.group(1)
    return None


def find_serial_number(str_line):
    """
    Extract a serial number from a line of text.

    Recognises the markers ``LGR S/N:`` and ``Serial Number:`` followed by
    optional spaces/tabs and a run of digits. The first occurrence in the
    line wins.

    Parameters
    ----------
    str_line : str
        Line to search.

    Returns
    -------
    str or None
        The digits of the serial number (as a string), or None if no marker
        followed by digits is found.
    """
    # Whitespace after the colon is optional for both markers, so that
    # "Serial Number: 123" is accepted just like "LGR S/N: 123".
    match = re.search(r'(?:LGR S/N:|Serial Number:)[ \t]*(\d+)', str_line)
    if match:
        return match.group(1)
    return None

def get_info(file_name):
    """
    Read title, serial number, column names and extra information from a CSV.

    The first line of the file is searched for the title, the second line
    (the column header row) for the serial number. The following lines are
    then scanned: any fields beyond the number of header columns are
    collected as extra information, until a line with exactly as many fields
    as the header is found, the file ends, or MAX_EXTRA_SIZE lines were read.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to inspect.

    Returns
    -------
    tuple
        ``(title, sn, header, extra)`` where ``title`` and ``sn`` are strings
        or None, ``header`` is the list of column names and ``extra`` is the
        concatenated extra text (empty string if there is none).
    """
    # Column names: the header row is the second line, so skip the first one
    header = list(pandas.read_csv(file_name, delimiter=delimiter, header=0,
                                  skiprows=1, nrows=0, encoding=encoding))
    n_cols = len(header)

    extra = []
    # The context manager closes the file even if parsing raises
    with open(file_name, 'rt', encoding=encoding) as fo:
        # Title and serial number, if available
        title = find_title(fo.readline())
        sn = find_serial_number(fo.readline())

        for _ in range(MAX_EXTRA_SIZE):
            line = fo.readline()
            if not line:
                # End of file: nothing more to scan
                break
            # Split on the delimiter, but not when it is inside double quotes
            fields = re.split(delimiter + '(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)', line)
            n_fields = len(fields)
            if n_fields > n_cols:
                # Surplus fields carry the extra information; the line's
                # trailing newline stays attached to the last field
                extra.append(delimiter.join(fields[n_cols:]))
            elif n_fields == n_cols:
                # Same number of fields as the header: the extra information
                # is over, stop looking
                break
            # Fewer fields than the header: probably an invalid line, ignore it

    return title, sn, header, ''.join(extra)

def get_data(file_name):
    """
    Load the data table from a CSV file.

    The first line of the file is skipped and the second is used as the
    header row. Only the columns named in that header are loaded, so any
    additional fields present on data rows are left out of the table.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The table restricted to the header's columns.
    """
    # Read just the header row to learn the column names
    columns = list(pandas.read_csv(file_name, delimiter=delimiter, header=0,
                                   skiprows=1, nrows=0, encoding=encoding))
    # Pass the names as an explicit list of labels to usecols
    return pandas.read_csv(file_name, delimiter=delimiter, header=0,
                           skiprows=1, encoding=encoding, usecols=columns)


def process_data(text):
    """
    Parse the extra-information text into a nested dictionary.

    The section headings expected at each nesting depth are listed from the
    outermost to the innermost level and passed, together with the text, to
    get_all_groups, whose result is returned unchanged.
    """
    section_levels = [
        ['Details'],
        ['Series:', 'Event Type:'],
        ['Devices', 'Deployment Info', 'Series Statistics', 'Filter Parameters'],
        ['Device Info'],
    ]
    return get_all_groups(text, section_levels)

def get_group(text, level):
    """
    Split text into the chunks that start with one of the given headings.

    Each chunk begins at a heading from ``level`` and extends, lazily, to a
    newline that is followed by another heading from ``level`` or by the end
    of the text. Headings are inserted into the pattern as-is (they are not
    escaped), and ``.`` also matches newlines.

    Returns the list of matched chunks, in order of appearance.
    """
    headings = '|'.join(level)
    pattern = '(?:' + headings + ')' + '.+?[\n](?=' + headings + '|$)'
    return re.findall(pattern, text, re.S)

def text_to_dict(text):
    """
    Turn lines of the form ``key: value`` into a dictionary.

    Each line is cut at its first colon; both sides are stripped of
    surrounding whitespace. Lines without a colon are skipped, and a key
    that appears more than once keeps its last value.
    """
    parsed = {}
    for line in text.split('\n'):
        name, colon, value = line.partition(':')
        if colon:
            parsed[name.strip()] = value.strip()
    return parsed

def get_all_groups(text, levels, level_number=0):
    """
    Recursively build a nested dictionary from sectioned text.

    The text is split with get_group using the headings of
    ``levels[level_number]``. For every chunk, the first line becomes the
    key and the remainder is parsed again with the next level's headings.
    When there is no deeper level, or the deeper parse comes back empty, the
    remainder is converted with text_to_dict instead.

    Returns a dict mapping each chunk's first line to its parsed content.
    """
    depth = len(levels)
    chunks = get_group(text, levels[level_number])
    next_level = level_number + 1

    result = {}
    for chunk in chunks:
        heading, body = chunk.split("\n", 1)
        nested = get_all_groups(body, levels, next_level) if next_level < depth else None
        # Fall back to flat key/value parsing when no sub-sections were found
        result[heading] = nested if nested else text_to_dict(body)
    return result


In [4]:
# Load the data table from the CSV file and render it in the notebook
table = get_data(filename)
display(table)

Unnamed: 0,#,"Date Time, GMT-03:00","Temp, °F (LGR S/N: 10440739, SEN S/N: 10440739)","Chuva, mm (LGR S/N: 10440739, SEN S/N: 10440739)","Soma Acum.: Chuva, mm (LGR S/N: 10440739)",Coupler Attached (LGR S/N: 10440739),Host Connected (LGR S/N: 10440739),End Of File (LGR S/N: 10440739)
0,1,02/28/20 10:00:00 AM,89.017,0.0,0.0,,,
1,2,02/28/20 10:05:00 AM,89.569,,0.0,,,
2,3,02/28/20 10:10:00 AM,89.755,,0.0,,,
3,4,02/28/20 10:15:00 AM,90.124,,0.0,,,
4,5,02/28/20 10:20:00 AM,90.495,,0.0,,,
...,...,...,...,...,...,...,...,...
20839,20834,05/08/20 03:35:00 PM,71.512,,0.0,,,
20840,20835,05/08/20 03:40:00 PM,71.168,,0.0,,,
20841,20836,05/08/20 03:45:00 PM,70.995,,0.0,,,
20842,20837,05/08/20 03:45:31 PM,,,,Logged,,


In [5]:
# Extract title, serial number, column names and the extra text from the same file
title, sn, header, extra = get_info(filename)

In [6]:
# Print the extracted values (title, serial number, header, extra information)
print('Titulo: {}'.format(title))
print('Numero de serie: {}'.format(sn))
print('Cabecalho: {}'.format(header))
print('Informacao extra: {}'.format(extra))

Titulo: EH-HP08
Numero de serie: 10440739
Cabecalho: ['#', 'Date Time, GMT-03:00', 'Temp, °F (LGR S/N: 10440739, SEN S/N: 10440739)', 'Chuva, mm (LGR S/N: 10440739, SEN S/N: 10440739)', 'Soma Acum.: Chuva, mm (LGR S/N: 10440739)', 'Coupler Attached (LGR S/N: 10440739)', 'Host Connected (LGR S/N: 10440739)', 'End Of File (LGR S/N: 10440739)']
Informacao extra: Details
Series: Temp, °F
Devices
Device Info
Product: HOBO UA-003-64 Pendant Tempent
Serial Number: 10440739
Version Number: 1.17
Manufacturer: Onset Computer Corp.
Device Memory: 65536
Header Created: 08/03/17 09:31:21 AM GMT-03:00
Deployment Info
Full Series Name: Temperature, °F
Launch Name: EH-HP08
Deployment Number: 17
Launch Time: 02/28/20 09:10:36 AM GMT-03:00
Logging Interval: 00 Hr 05 Min 00 Sec
Launch GMT Offset: -3 Hr 0 Min
Battery at Launch: 3.17 Volts
Launching Program: HOBOware -3.7.12_0425_0948_Windows
Series Statistics
Samples: 20,230
Max: 108.964
Min: 38.629
Avg: 67.761
Std Dev (σ): 11.855
First Sample Time: 02/28

In [7]:
# Parse the extra text into a nested dictionary of sections and show it
process_data(extra)

{'Details': {'Series: Temp, °F': {'Devices': {'Device Info': {'Product': 'HOBO UA-003-64 Pendant Tempent',
     'Serial Number': '10440739',
     'Version Number': '1.17',
     'Manufacturer': 'Onset Computer Corp.',
     'Device Memory': '65536',
     'Header Created': '08/03/17 09:31:21 AM GMT-03:00'}},
   'Deployment Info': {'Full Series Name': 'Temperature, °F',
    'Launch Name': 'EH-HP08',
    'Deployment Number': '17',
    'Launch Time': '02/28/20 09:10:36 AM GMT-03:00',
    'Logging Interval': '00 Hr 05 Min 00 Sec',
    'Launch GMT Offset': '-3 Hr 0 Min',
    'Battery at Launch': '3.17 Volts',
    'Launching Program': 'HOBOware -3.7.12_0425_0948_Windows'},
   'Series Statistics': {'Samples': '20,230',
    'Max': '108.964',
    'Min': '38.629',
    'Avg': '67.761',
    'Std Dev (σ)': '11.855',
    'First Sample Time': '02/28/20 10:00:00 AM GMT-03:00',
    'Last Sample Time': '05/08/20 03:45:00 PM GMT-03:00'}},
  'Series: Chuva, mm': {'Devices': {'Device Info': {'Product': 'HOBO 