In [1]:
import pandas
import re
from gbdhidro.hobo import hobo

In [2]:
filename = '/media/jairo/Dados/Jairo/Projetos/Samuel data/git/GBD-Hidro/src/station_raw_to_netcdf/input/hobo UA-003 precipitacao/EHP02039.csv'
ID = 'EHP-07'
SN = '10364362'
config_file = './stations_info.csv'

In [3]:
title, serial_number, header, extra = hobo.get_info(filename)
print(title, serial_number)
print(header)

EHP-02 20497860
['#', 'Date Time, GMT-03:00', 'Temp, °F (LGR S/N: 20497860, SEN S/N: 20497860)', 'CHUVA, mm (LGR S/N: 20497860, SEN S/N: 20497860)', 'Soma Acum.: CHUVA, mm (LGR S/N: 20497860)', 'Coupler Attached (LGR S/N: 20497860)', 'Host Connected (LGR S/N: 20497860)', 'End Of File (LGR S/N: 20497860)']


In [4]:
# Abre arquivo de configuracao e retira dados importantes
cfgs = pandas.read_csv(config_file)
row = cfgs.loc[cfgs['Plot Title'] == title]
if row.empty:
    # Erro - nao tem nenhuma informacao sobre esse titulo de plot
    print('Buuu - Nao encontrei nada com esse titulo de plot')
station_id = row.iloc[0]['Codigo']
station_sn = row.iloc[0]['Numero de serie']
station_latitude = row.iloc[0]['Latitude [graus]']
station_longitude = row.iloc[0]['Longitude [graus]']
station_altitude = row.iloc[0]['Altitude [m]']
station_variable_col = row.iloc[0]['Coluna variavel']
station_datetime_col = row.iloc[0]['Coluna data/hora']

In [6]:
station_sn

10440731

In [7]:
# checa se arquivo tem o nome e o numero de serie esperado
if title!=ID:
    print('Titulo do arquivo ({}) diferente de id esperado ({})'.format(title,ID))

if serial_number != SN:
    print('Serial number ({}) diferente do esperado ({})'.format(serial_number, SN))

Titulo do arquivo (EHP-02) diferente de id esperado (EHP-07)
Serial number (20497860) diferente do esperado (10364362)


In [8]:
# Le dados
data = hobo.get_data(filename)
data

Unnamed: 0,#,"Date Time, GMT-03:00","Temp, °F (LGR S/N: 20497860, SEN S/N: 20497860)","CHUVA, mm (LGR S/N: 20497860, SEN S/N: 20497860)","Soma Acum.: CHUVA, mm (LGR S/N: 20497860)",Coupler Attached (LGR S/N: 20497860),Host Connected (LGR S/N: 20497860),End Of File (LGR S/N: 20497860)
0,1,02/26/20 12:00:00 AM,,,0.0,,,
1,2,02/26/20 05:00:00 PM,80.911,0.0,,,,
2,3,02/26/20 05:05:00 PM,80.029,,,,,
3,4,02/26/20 05:10:00 PM,79.326,,,,,
4,5,02/26/20 05:15:00 PM,78.800,,,,,
...,...,...,...,...,...,...,...,...
21161,21162,05/08/20 02:30:00 PM,71.339,,,,,
21162,21163,05/08/20 02:35:00 PM,71.339,,,,,
21163,21164,05/08/20 02:37:13 PM,,91.2,,,,
21164,21165,05/08/20 02:37:23 PM,,,,Logged,,


In [13]:
# Titule e sn confirmado, le dados extra
var_col = 'CHUVA, mm (LGR S/N: 20497860, SEN S/N: 20497860)'
v = data[var_col]
v.index = data['Date Time, GMT-03:00']
v = v.dropna()
v

Date Time, GMT-03:00
02/26/20 05:00:00 PM     0.0
03/14/20 04:49:21 PM     0.2
03/14/20 04:50:40 PM     0.4
03/14/20 04:54:14 PM     0.6
03/14/20 05:58:39 PM     0.8
                        ... 
05/05/20 12:12:26 AM    90.6
05/05/20 12:47:59 AM    90.8
05/05/20 12:54:01 PM    91.0
05/08/20 02:37:13 PM    91.2
05/08/20 02:37:43 PM    91.2
Name: CHUVA, mm (LGR S/N: 20497860, SEN S/N: 20497860), Length: 458, dtype: float64

In [24]:
from datetime import datetime
from datetime import timezone, timedelta

#date_str = data['Date Time, GMT-03:00']
date_str = v.index.to_series()
date_time = pandas.to_datetime(date_str, format='%m/%d/%y %I:%M:%S %p')
print(date_str)
gmt_hour_offset = -3
gmt_minute_offset = 0
tzinfo=timezone(timedelta(hours=gmt_hour_offset, minutes=gmt_minute_offset))
index = date_time.dt.tz_localize(tzinfo)


Date Time, GMT-03:00
02/26/20 05:00:00 PM    02/26/20 05:00:00 PM
03/14/20 04:49:21 PM    03/14/20 04:49:21 PM
03/14/20 04:50:40 PM    03/14/20 04:50:40 PM
03/14/20 04:54:14 PM    03/14/20 04:54:14 PM
03/14/20 05:58:39 PM    03/14/20 05:58:39 PM
                                ...         
05/05/20 12:12:26 AM    05/05/20 12:12:26 AM
05/05/20 12:47:59 AM    05/05/20 12:47:59 AM
05/05/20 12:54:01 PM    05/05/20 12:54:01 PM
05/08/20 02:37:13 PM    05/08/20 02:37:13 PM
05/08/20 02:37:43 PM    05/08/20 02:37:43 PM
Name: Date Time, GMT-03:00, Length: 458, dtype: object


In [None]:
def find_title(str_line):
    # Extrai titulo de string
    match = re.search(r'(?:Plot Title: )([^"]+)',str_line)
    if match:
        return match.group(1)
    else:
        return None


def find_serial_number(str_line):
    """
    Extrai numero de serial de string
    """
    match = re.search(r'(?:LGR S/N: |Serial Number:)(\d+)',str_line)
    if match:
        return match.group(1)
    else:
        return None

def get_info(file_name):
    
    # Obtem nome das colunas
    header = list(pandas.read_csv(filename, delimiter=delimiter,  header=0, skiprows=1, nrows=0, encoding=encoding))

    # Extrai titulo e informacoes extras se disponiveis
    fo = open(filename, 'rt', encoding='utf-8')
    title = find_title(fo.readline())
    sn = find_serial_number(fo.readline())

    # Informacoes extras
    n_cols = len(header)
    extra = []
    for i in range(MAX_EXTRA_SIZE):
        # separa nos separadores, mas não se tiver dentro de ""
        fields = re.split(delimiter + '(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)', fo.readline())
        n_fields = len(fields)
        if n_fields > n_cols:
            extra.append(delimiter.join(fields[n_cols:]))
        elif n_fields < n_cols:
            # Provavelmente uma linha invalida. ignora
            pass
        else:
            # numero de campos é igual ao de dados. termina procura por dados extra
            break
    extra = ''.join(extra)
    fo.close()
    
    return title, sn, header, extra

def get_data(file_name):
    # Extrai dados
    header = pandas.read_csv(filename, delimiter=delimiter,  header=0, skiprows=1, nrows=0, encoding=encoding)
    table = pandas.read_csv(filename, delimiter=delimiter, header=0, skiprows=1, encoding=encoding, usecols=header) 
    return table


def process_data(text):
    levels = []
    levels.append(['Details'])
    levels.append(['Series:','Event Type:'])
    levels.append(['Devices', 'Deployment Info', 'Series Statistics', 'Filter Parameters'])
    levels.append(['Device Info'])
    #teste = re.split(r'[\n](?=Details|Series: |Event Type: )',extra)
    #r'(?:Series:|Event Type:).+?[\n](?=Series:|Event Type:|$)'    
    return get_all_groups(text, levels)

def get_group(text, level):
    regex1 = '(?:'
    regex2 = '.+?[\n](?='
    first = True
    for m in level:
        if not first:
            regex1 += '|'
            regex2 += '|'
        else:
            first = False
        regex1 += m
        regex2 += m
        
    regex1 += ')'
    regex2 += '|$)'
    regex = regex1 + regex2
    match = re.compile(regex, re.S)
    return match.findall(text)

def text_to_dict(text):
    fields = text.split('\n')
    d = {}
    for f in fields:
        s = f.split(':', 1)
        if len(s) == 2:
            d.update({s[0].strip(): s[1].strip()})
    return(d)

def get_all_groups(text, levels, level_number=0):
    n_levels = len(levels)
    groups = []
    temp = get_group(text, levels[level_number])  

    output = {}
    level_number += 1
    for l in temp:
        [key, val] = l.split("\n", 1)
        new_val = None
        if level_number < n_levels:
            new_val = get_all_groups(val, levels, level_number)
        if new_val:
            val = new_val
        else:
            val = text_to_dict(val)

        output.update({key: val})
    return output


In [None]:
table = get_data(filename)
display(table)

In [None]:
title, sn, header, extra = get_info(filename)

In [None]:
print('Titulo: {}'.format(title))
print('Numero de serie: {}'.format(sn))
print('Cabecalho: {}'.format(header))
print('Informacao extra: {}'.format(extra))

In [None]:
process_data(extra)

In [34]:
delta = "P3Y6M4DT12H30M5S"
#delta = "P5W"

In [4]:
#p = re.compile(r'(?P<year>\d+)Y(?P<month>\d+)M(?P<day>\d+)', re.IGNORECASE)
 #p = re.compile(r'GMT(?P<hour>[-+]*\d+):*(?P<minute>\d+)*', re.IGNORECASE)

# Formato P12W (week)
def period_iso8601_to_relativetime(text):
    from dateutil.relativedelta import relativedelta
    p = re.compile(r'P(?P<years>\d+(?=Y))*\D*(?P<months>\d+(?=M))*\D*(?P<weeks>\d+(?=W))*\D*(?P<days>\d+(?=D))*\D*T*(?P<hours>\d+(?=H))*\D*(?P<minutes>\d+(?=M))*\D*(?P<seconds>\d+(?=S))*\D*', re.IGNORECASE)
    m = p.search(text)
    years = 0
    months = 0
    weeks = 0
    days = 0
    hours = 0
    minutes = 0
    seconds = 0
    
    if m['years']:
        years = float(m['years'])
    if m['months']:
        months = float(m['months'])
    if m['weeks']:
        weeks = float(m['weeks'])
    if m['days']:
        days = float(m['days'])
    if m['hours']:
        hours = float(m['hours'])
    if m['minutes']:
        minutes = float(m['minutes'])
    if m['seconds']:
        seconds = float(m['seconds'])
        
    delta = relativedelta(
        years = years,
        months = months,
        weeks = weeks,
        days = days,
        hours = hours,
        minutes = minutes,
        seconds = seconds)
    return delta
period_iso8601_to_relativetime("P3Y6M4DT12H30M5S")

relativedelta(years=+3, months=+6, days=+4, hours=+12, minutes=+30, seconds=+5)

In [20]:
m.groups()

AttributeError: 'NoneType' object has no attribute 'groups'

In [25]:
# Formato P12W (week)
delta = 'P5W'
def period_iso8601_to_deltatime(text):
    from dateutil.relativedelta import relativedelta
    w = re.compile(r'^P(?P<week>\d+(?=W))', re.IGNORECASE)
    m = w.search(text)
    if m:
        delta = relativedelta(
            weeks = float(m['weeks']) )
period_iso8601_to_deltatime('P5W')

IndexError: no such group

In [None]:
flot