In [1]:
import pandas as pd
import glob
import os
from pathlib import Path
import h5py
from joblib import Parallel, delayed

In [12]:
%pip install tables

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
replacement_map = {
    '├Ê': 'É',
    '├â': 'Ã',
    '├ü': 'Á',
    '├ç': 'Ç',
    '├ì': 'Í',
    '├Ò': 'é',
    '├│': 'ó',
    '├¡': 'í',
    '├ú': 'ã',
    '├í': 'á',
    '├º': 'ç',
    '├ô': 'Ó',
    '├õ': 'Ô',
    '├Ü': 'Ú',
    '├è': 'Ê',
    '├é': 'Â',
    '├┤': 'ô',
    '├║': 'ú',
    '├ò': 'õ',
    '├¬': 'ê',
    '├ó': 'â',
    '├╡': 'õ',
    '├Õ': 'Ú'
}

def fix_encoding(text, replacement_map):
    """
    Corrects encoding issues in a given text by replacing incorrectly encoded characters
    with their correct counterparts using a provided replacement map.

    Parameters:
        text (str): The input string that may contain incorrectly encoded characters.
        replacement_map (dict): A dictionary where keys are the incorrectly encoded
                                characters and values are the correct characters to
                                replace them with.

    Returns:
        str: The corrected text with all instances of incorrectly encoded characters
             replaced by their correct counterparts.

    Example:
        text = "This is a t├¬st with s├│me encoding probl├¡ms."
        corrected_text = fix_encoding(text, replacement_map)
        print(corrected_text)
        # Output: "This is a têst with sóme encoding problems."

    How It Works:
        1. Iterates over each key-value pair in the `replacement_map`.
        2. Replaces all occurrences of the incorrect character (key) in the input `text`
           with the correct character (value).
        3. Returns the corrected text after all replacements are made.

    Notes:
        - The function is case-sensitive, so ensure the keys in `replacement_map` match
          the exact incorrect characters in the text.
        - The `replacement_map` should be tailored to the specific encoding issues
          present in the text.
    """
    for incorrect, correct in replacement_map.items():
        text = text.replace(incorrect, correct)
    return text

In [6]:
# Function to read a single HDF5 file
def read_hdf_file(file_path, key_table):
    """
    Reads a single HDF5 file and returns the data.

    Parameters:
    file_path (str): The path to the HDF5 file.

    Returns:
    DataFrame: The data contained in the HDF5 file.
    """
    return pd.read_hdf(file_path, key=key_table)

# TELEMETRIA

In [26]:
# telemetria_folder_path = Path("./1 - Organized data gauge/TELEMETRIA/")
telemetria_folder_path = r"/content/drive/MyDrive/QualiBRain/Scripts e Dados/1 - Organized data gauge/TELEMETRIA/"
telemetria_folder_path = r".\1 - Organized data gauge\TELEMETRIA\\"
telemetria_h5_files = glob.glob(os.path.join(telemetria_folder_path, r"*.h5"))
print("quantity of files:", len(telemetria_h5_files), "\n\nexamples:\n", telemetria_h5_files[0], "\n...\n", telemetria_h5_files[-1])

quantity of files: 108 

examples:
 .\1 - Organized data gauge\TELEMETRIA\AC_2021.h5 
...
 .\1 - Organized data gauge\TELEMETRIA\TO_2024.h5


In [21]:
table_key = 'table_info'

# Use parallel processing to read all files
telemetria_list_info = Parallel(n_jobs=-1)(delayed(read_hdf_file)(filename, table_key) for filename in telemetria_h5_files)

# Concatenate and remove duplicates
df_telemetria_gauge_info = pd.concat(telemetria_list_info).drop_duplicates(ignore_index=True)
del telemetria_list_info

# Apply the function to the 'City' column
df_telemetria_gauge_info['city'] = df_telemetria_gauge_info['city'].apply(lambda x: fix_encoding(x, replacement_map)).copy(deep = True)
df_telemetria_gauge_info['name_station'] = df_telemetria_gauge_info['name_station'].apply(lambda x: fix_encoding(x, replacement_map)).copy(deep = True)
df_telemetria_gauge_info = df_telemetria_gauge_info.rename(columns={
    'Responsible': 'responsible'
})
df_telemetria_gauge_info['source'] = 'TELEMETRIA'
df_telemetria_gauge_info = df_telemetria_gauge_info[['gauge_code',	'state', 'city',	'name_station',	'lat',	'long',	'responsible', 'source']]
df_telemetria_gauge_info

Unnamed: 0,gauge_code,state,city,name_station,lat,long,responsible,source
0,12370000,AC,MARECHAL THAUMATURGO,THAUMATURGO,-8.9458,-72.7847,00001 - ANA - Agência Nacional de Águas,TELEMETRIA
1,12500000,AC,CRUZEIRO DO SUL,CRUZEIRO DO SUL,-7.6333,-72.6603,00001 - ANA - Agência Nacional de Águas,TELEMETRIA
2,12510500,AC,TARAUACÁ,PONTE DO RIO LIBERDADE,-7.7994,-72.0211,00001 - ANA - Agência Nacional de Águas,TELEMETRIA
3,12557000,AC,JORDÃO,JORDÃO,-9.1883,-71.9525,00380 - SEMA-AC - Secretaria de Meio Ambiente ...,TELEMETRIA
4,12590000,AC,TARAUACÁ,PONTE DE TARAUACÁ,-8.1519,-70.7456,00001 - ANA - Agência Nacional de Águas,TELEMETRIA
...,...,...,...,...,...,...,...,...
4474,28300100,TO,XAMBIOÁ,UHE TUCURUÍ RIO ARAGUAIA 1,-6.3997,-48.5281,00075 - ELETRONORTE - ELETRONORTE,TELEMETRIA
4475,28308000,TO,WANDERLÂNDIA,CGH LAJES MONTANTE,-6.8006,-48.1358,00683 - ALVORADA - Alvorada Energia S.A,TELEMETRIA
4476,28310080,TO,PIRAQUÊ,CGH LAJES JUSANTE,-6.7822,-48.1525,00683 - ALVORADA - Alvorada Energia S.A,TELEMETRIA
4477,28318000,TO,RIACHINHO,RIO CORDA RIACHINHO,-6.5178,-48.1514,00258 - SEMARH-TO - Secretaria do Meio Ambient...,TELEMETRIA


In [28]:
df_telemetria_gauge_info[df_telemetria_gauge_info['gauge_code'] == '02243468']

Unnamed: 0,gauge_code,state,city,name_station,lat,long,responsible,source


In [8]:
table_key = 'table_data'

# Use parallel processing to read all files
telemetria_list_data = Parallel(n_jobs=-1)(delayed(read_hdf_file)(filename, table_key) for filename in telemetria_h5_files)

# Concatenate and remove duplicates
# df_telemetria_gauge_data = pd.concat(telemetria_list_data).drop_duplicates(ignore_index=True)
df_telemetria_gauge_data = pd.concat(telemetria_list_data)
del telemetria_list_data

# Convert the 'event_date' column to datetime
df_telemetria_gauge_data['datetime'] = pd.to_datetime(df_telemetria_gauge_data['datetime'])

# Set 'datetime' as the index
df_telemetria_gauge_data.set_index('datetime', inplace=True)

# Resample to daily frequency and aggregate 'rain_mm' using sum
df_telemetria_gauge_data = df_telemetria_gauge_data.groupby(['gauge_code']).resample('D')['rain_mm'].sum().reset_index()

df_telemetria_gauge_data['rain_mm'] = df_telemetria_gauge_data['rain_mm'].astype('float64')
df_telemetria_gauge_data['gauge_code'] = df_telemetria_gauge_data['gauge_code'].astype('str')

df_telemetria_gauge_data = df_telemetria_gauge_data[['gauge_code',	'datetime',	'rain_mm']]
df_telemetria_gauge_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,02042051,2021-07-31,0.0
1,02042051,2021-08-01,1.0
2,02042051,2021-08-02,0.0
3,02042051,2021-08-03,0.6
4,02042051,2021-08-04,1.0
...,...,...,...
2908882,88690050,2023-12-27,0.0
2908883,88690050,2023-12-28,0.0
2908884,88690050,2023-12-29,2.0
2908885,88690050,2023-12-30,0.0


In [9]:
df_telemetria_gauge_data.dtypes

Unnamed: 0,0
gauge_code,object
datetime,datetime64[ns]
rain_mm,float64


In [13]:
# df_telemetria_gauge_data.to_hdf('./1 - Organized data gauge/TELEMETRIA_GAUGE_DATA_2021_2023.h5', key='table_data', mode='w', complevel=9, complib='zlib', encoding='utf-8')
df_telemetria_gauge_data.to_hdf('TELEMETRIA_DAILY_2021_2024.h5', key='table_data', mode='w', complevel=9, complib='zlib', encoding='utf-8')
del df_telemetria_gauge_data

NameError: name 'df_telemetria_gauge_data' is not defined

In [None]:
# df_telemetria_gauge_info.to_hdf('./1 - Organized data gauge/TELEMETRIA_GAUGE_DATA_2021_2023.h5', key='table_info', mode='r+', complevel=9, complib='zlib', encoding='utf-8')
df_telemetria_gauge_info.to_hdf('TELEMETRIA_DAILY_2021_2024.h5', key='table_info', mode='r+', complevel=9, complib='zlib', encoding='utf-8')
del df_telemetria_gauge_info

In [None]:
df = pd.read_hdf('TELEMETRIA_DAILY_2021_2024.h5', key='table_info')
df

In [14]:
df = pd.read_hdf('TELEMETRIA_DAILY_2021_2024.h5', key='table_data').sort_values(by=['datetime'])
df

Unnamed: 0,gauge_code,datetime,rain_mm
1086297,51151000,2021-01-01,0.0
2568645,74800000,2021-01-01,0.0
489986,22151000,2021-01-01,0.0
1071301,50090000,2021-01-01,1.6
469854,21780200,2021-01-01,0.0
...,...,...,...
2790086,86102000,2024-12-31,0.0
186394,15552800,2024-12-31,6.2
476106,21836000,2024-12-31,0.0
2264135,65924300,2024-12-31,8.8
