In [1]:
import pandas as pd
import glob
import os
from pathlib import Path
import h5py
from joblib import Parallel, delayed

In [2]:
!pip install tables

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Maps each two-character mojibake sequence found in the station metadata to the
# accented character it stands for. Every key starts with '├' followed by one
# more character, so no key is a prefix of another and replacement order is
# irrelevant.
replacement_map = {
    '├Ê': 'É',
    '├â': 'Ã',
    '├ü': 'Á',
    '├ç': 'Ç',
    '├ì': 'Í',
    '├Ò': 'é',
    '├│': 'ó',
    '├¡': 'í',
    '├ú': 'ã',
    '├í': 'á',
    '├º': 'ç',
    '├ô': 'Ó',
    '├õ': 'Ô',
    '├Ü': 'Ú',
    '├è': 'Ê',
    '├é': 'Â',
    '├┤': 'ô',
    '├║': 'ú',
    # UTF-8 'Õ' (bytes C3 95) read through a DOS code page shows up as '├ò';
    # it must map to the UPPERCASE letter (lowercase 'õ' is handled by '├╡').
    '├ò': 'Õ',
    '├¬': 'ê',
    '├ó': 'â',
    '├╡': 'õ',
    # NOTE(review): 'Ú' is already produced by '├Ü'; if the files were decoded
    # as CP860 this sequence would correspond to 'Ù' -- confirm against the data.
    '├Õ': 'Ú'
}

def fix_encoding(text, replacement_map):
    """
    Repair mis-encoded characters in a string using a replacement map.

    Parameters:
        text (str): The input string that may contain incorrectly encoded characters.
                    Non-string values (e.g. None or NaN from a DataFrame cell) are
                    returned unchanged instead of raising.
        replacement_map (dict): A dictionary where keys are the incorrectly encoded
                                sequences and values are the correct characters to
                                replace them with.

    Returns:
        str: The text with every occurrence of each key replaced by its value, or
             the original object untouched when `text` is not a string.

    Example:
        text = "This is a t├¬st with s├│me encoding probl├¡ms."
        corrected_text = fix_encoding(text, replacement_map)
        print(corrected_text)
        # Output: "This is a têst with sóme encoding problíms."

    Notes:
        - Matching is case-sensitive and literal: keys must match the exact
          incorrect sequences present in the text.
        - Replacements are applied one key at a time in the map's iteration
          order, so a key that is a prefix of another key (or that can be
          produced by an earlier replacement) would make the result depend on
          that order.
    """
    # Missing values and other non-string cells have no .replace(); pass them through
    # so the function can be applied directly to a column that contains NaN/None.
    if not isinstance(text, str):
        return text

    for incorrect, correct in replacement_map.items():
        text = text.replace(incorrect, correct)
    return text

In [5]:
# Function to read a single HDF5 file
def read_hdf_file(file_path, key_table):
    """
    Load one stored object from an HDF5 file through pandas.

    Parameters:
    file_path (str): Location of the HDF5 file to open.
    key_table (str): Key identifying which object inside the file to load.

    Returns:
    Whatever `pd.read_hdf` returns for that key (a DataFrame for the tables
    this notebook reads).
    """
    loaded_table = pd.read_hdf(file_path, key=key_table)
    return loaded_table

# CEMADEN

In [9]:
# Folder with the CEMADEN HDF5 files (named like AC_2021.h5), relative to the
# notebook's working directory. Built with os.path.join so the cell also runs on
# non-Windows systems; the final "" keeps the trailing separator the previous
# hard-coded string had.
# Colab location, if the Drive is mounted:
#   "/content/drive/MyDrive/QualiBRain/Scripts e Dados/1 - Organized data gauge/CEMADEN/"
cemaden_folder_path = os.path.join(".", "1 - Organized data gauge", "CEMADEN", "")

# glob returns files in arbitrary, OS-dependent order; sort for a reproducible order.
cemaden_h5_files = sorted(glob.glob(os.path.join(cemaden_folder_path, "*.h5")))

# Same listing under the old name; kept only in case other cells still use it.
telemetria_h5_files = list(cemaden_h5_files)

# Fail with a clear message instead of an IndexError in the print below.
if not cemaden_h5_files:
    raise FileNotFoundError(f"No .h5 files found in {cemaden_folder_path!r}")

print("quantity of files:", len(cemaden_h5_files), "\n\nexamples:\n", cemaden_h5_files[0], "\n...\n", cemaden_h5_files[-1])

quantity of files: 108 

examples:
 .\1 - Organized data gauge\CEMADEN\AC_2021.h5 
...
 .\1 - Organized data gauge\CEMADEN\TO_2024.h5


In [10]:
table_key = 'table_info'

# Read the 'table_info' table of every file in parallel (one task per file)
cemaden_list_info = Parallel(n_jobs=-1)(delayed(read_hdf_file)(filename, table_key) for filename in cemaden_h5_files)

# Stack all per-file tables into one frame
df_cemaden_gauge_info = pd.concat(cemaden_list_info, ignore_index=True)
del cemaden_list_info

# Repair the mis-encoded text BEFORE de-duplicating: otherwise two rows for the
# same station that differ only by a garbled vs. correct spelling would both survive.
# Non-string cells (e.g. NaN) are left untouched instead of raising.
for text_column in ('city', 'name_station'):
    df_cemaden_gauge_info[text_column] = df_cemaden_gauge_info[text_column].map(
        lambda value: fix_encoding(value, replacement_map) if isinstance(value, str) else value
    )

# Remove rows that are identical across all columns and renumber the index
df_cemaden_gauge_info = df_cemaden_gauge_info.drop_duplicates(ignore_index=True)

df_cemaden_gauge_info['responsible'] = 'CEMADEN'
df_cemaden_gauge_info['source'] = 'CEMADEN'

# Optional column selection (disabled):
# df_cemaden_gauge_info = df_cemaden_gauge_info[['gauge_code', 'state', 'city', 'name_station', 'lat', 'long', 'responsible', 'source']]
df_cemaden_gauge_info

Unnamed: 0,gauge_code,state,city,name_station,lat,long,responsible,source
0,120010401A,AC,BRASILÉIA,Centro,-11.012469,-68.740939,CEMADEN,CEMADEN
1,120032801A,AC,JORDÃO,Centro,-9.190133,-71.950808,CEMADEN,CEMADEN
2,120040101A,AC,RIO BRANCO,AC Oca,-9.972969,-67.806662,CEMADEN,CEMADEN
3,120040102A,AC,RIO BRANCO,Centro,-9.969802,-67.816956,CEMADEN,CEMADEN
4,120070801A,AC,XAPURI,Cageacre,-10.665833,-68.485844,CEMADEN,CEMADEN
...,...,...,...,...,...,...,...,...
5616,170730601A,TO,DUERÉ,Dona Constanci,-11.342960,-49.266640,CEMADEN,CEMADEN
5617,171886501A,TO,SANTA FÉ DO ARAGUAIA,Prefeitura Municipal,-7.148690,-48.690830,CEMADEN,CEMADEN
5618,172020001A,TO,SÃO MIGUEL DO TOCANTINS,Centro Cultural,-5.552650,-47.577580,CEMADEN,CEMADEN
5619,172030901A,TO,SÃO SEBASTIÃO DO TOCANTINS,Portal da Alvorada,-5.259031,-48.201467,CEMADEN,CEMADEN


In [None]:
# Spot check: show the metadata row(s) whose gauge_code equals '02243468'
df_cemaden_gauge_info.loc[df_cemaden_gauge_info['gauge_code'].eq('02243468')]

Unnamed: 0,gauge_code,state,city,name_station,lat,long,responsible,source


In [11]:
table_key = 'table_data'

# Read the 'table_data' table of every file in parallel (one task per file)
cemaden_list_data = Parallel(n_jobs=-1)(delayed(read_hdf_file)(filename, table_key) for filename in cemaden_h5_files)

# Stack all per-file tables; duplicate rows are not removed here
df_cemaden_gauge_data = pd.concat(cemaden_list_data)
del cemaden_list_data

# Parse the 'datetime' column and make it the index (resample needs a datetime-like index)
df_cemaden_gauge_data['datetime'] = pd.to_datetime(df_cemaden_gauge_data['datetime'])
df_cemaden_gauge_data = df_cemaden_gauge_data.set_index('datetime')

# Daily rainfall total per gauge.
# Group by gauge_code only: 'city' and 'state' are not kept in the result, and
# grouping by them would (a) split one gauge into several groups whenever its
# city/state text differs between files, yielding repeated gauge/day rows, and
# (b) silently drop every row whose city or state is missing.
# NOTE: sum() yields 0.0 for days without any reading inside a gauge's date span,
# so gaps look like dry days; use .sum(min_count=1) if they should be NaN instead.
df_cemaden_gauge_data = (
    df_cemaden_gauge_data
    .groupby('gauge_code')
    .resample('D')['rain_mm']
    .sum()
    .reset_index()
)

df_cemaden_gauge_data['rain_mm'] = df_cemaden_gauge_data['rain_mm'].astype('float64')
df_cemaden_gauge_data['gauge_code'] = df_cemaden_gauge_data['gauge_code'].astype('str')
df_cemaden_gauge_data = df_cemaden_gauge_data[['gauge_code', 'datetime', 'rain_mm']]

df_cemaden_gauge_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,110018901A,2021-01-01,6.30
1,110018901A,2021-01-02,1.79
2,110018901A,2021-01-03,0.00
3,110018901A,2021-01-04,16.15
4,110018901A,2021-01-05,1.58
...,...,...,...
4724938,CEMADEN_12,2023-02-24,0.42
4724939,CEMADEN_12,2023-02-25,0.00
4724940,CEMADEN_12,2023-02-26,0.00
4724941,CEMADEN_12,2023-02-27,1.78


In [12]:
# Inspect the dtype of each column of the daily rainfall frame
df_cemaden_gauge_data.dtypes

Unnamed: 0,0
gauge_code,object
datetime,datetime64[ns]
rain_mm,float64


In [13]:
# Store the daily rainfall frame under the 'table_data' key. mode='w' opens the
# output file for writing from scratch, with zlib compression at level 9.
# (Alternative destination used previously:
#  './1 - Organized data gauge/CEMADEN_GAUGE_DATA_2021_2024.h5')
df_cemaden_gauge_data.to_hdf(
    'CEMADEN_DAILY_2021_2024.h5',
    key='table_data',
    mode='w',
    complevel=9,
    complib='zlib',
    encoding='utf-8',
)
# Drop the reference now that the frame has been written
del df_cemaden_gauge_data

In [14]:
# Add the station metadata under the 'table_info' key of the same file.
# mode='r+' opens the existing file for read/write, so the file must already
# have been created by the 'table_data' write.
# (Alternative destination used previously:
#  './1 - Organized data gauge/CEMADEN_GAUGE_DATA_2021_2024.h5')
df_cemaden_gauge_info.to_hdf(
    'CEMADEN_DAILY_2021_2024.h5',
    key='table_info',
    mode='r+',
    complevel=9,
    complib='zlib',
    encoding='utf-8',
)
# Drop the reference now that the frame has been written
del df_cemaden_gauge_info

In [18]:
# Read the stored daily rainfall table back and order it chronologically
df = pd.read_hdf('CEMADEN_DAILY_2021_2024.h5', key='table_data')
df = df.sort_values('datetime')
df

Unnamed: 0,gauge_code,datetime,rain_mm
0,110018901A,2021-01-01,6.30
3731664,410140802A,2021-01-01,0.00
529017,230990401A,2021-01-01,0.40
3733125,410140803A,2021-01-01,0.00
3734952,410165501A,2021-01-01,0.42
...,...,...,...
4458585,431410001A,2024-12-31,0.00
2787036,350410704A,2024-12-31,0.00
945900,261040002A,2024-12-31,0.20
1265730,291480202A,2024-12-31,0.00


In [16]:
# Read the stored station-metadata table back for a visual check
df = pd.read_hdf(
    'CEMADEN_DAILY_2021_2024.h5',
    key='table_info',
)
df

Unnamed: 0,gauge_code,state,city,name_station,lat,long,responsible,source
0,250030401A,PB,ALAGOA GRANDE,Centro,-7.039000,-35.631000,CEMADEN,CEMADEN
1,250050201A,PB,ALAGOINHA,Centro,-6.950520,-35.546540,CEMADEN,CEMADEN
2,250157501C,PB,BARRA DE SANTANA,BARRA DE SANTANA,-7.513027,-36.007575,CEMADEN,CEMADEN
3,250180701A,PB,BAYEUX,São Bento,-7.122000,-34.906000,CEMADEN,CEMADEN
4,250180702A,PB,BAYEUX,Alto da Boa Vista,-7.133000,-34.940000,CEMADEN,CEMADEN
...,...,...,...,...,...,...,...,...
5616,261240601C,PE,SANHARÓ,SANHARÓ,-8.384800,-36.589350,CEMADEN,CEMADEN
5617,261400601U,PE,SERRITA,SERRITA,-7.936444,-39.311444,CEMADEN,CEMADEN
5618,261440201A,PE,SOLIDÃO,SOLIDAO - APAC,-7.602130,-37.652580,CEMADEN,CEMADEN
5619,261480801A,PE,TACARATU,Tacaratu [Secretária de Desenvolvimento Social...,-9.106870,-38.149310,CEMADEN,CEMADEN
