In [1]:
import pandas as pd
import glob
import os
from pathlib import Path
import h5py
from joblib import Parallel, delayed

In [2]:
# PyTables (`tables`) is the backend pandas uses for the read_hdf / to_hdf calls in this notebook
%pip install tables

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Lookup table used by `fix_encoding` to repair mojibake in station text fields.
# Most keys are what a UTF-8 encoded accented letter looks like when its two bytes
# are decoded as code page 437 (the 0xC3 lead byte is rendered as '├'); the value is
# the letter that was originally intended.
# NOTE(review): '├Ê', '├Ò', '├õ' and '├Õ' do not round-trip through cp437; they were
# presumably observed empirically in the data - TODO confirm against the raw files.
replacement_map = {
    '├Ê': 'É',
    '├â': 'Ã',
    '├ü': 'Á',
    '├ç': 'Ç',
    '├ì': 'Í',
    '├Ò': 'é',
    '├│': 'ó',
    '├¡': 'í',
    '├ú': 'ã',
    '├í': 'á',
    '├º': 'ç',
    '├ô': 'Ó',
    '├õ': 'Ô',
    '├Ü': 'Ú',
    '├è': 'Ê',
    '├é': 'Â',
    '├┤': 'ô',
    '├║': 'ú',
    # Bytes C3 95 are UTF-8 for the UPPERCASE 'Õ'; the lowercase 'õ' (C3 B5) is '├╡' below.
    '├ò': 'Õ',
    '├¬': 'ê',
    '├ó': 'â',
    '├╡': 'õ',
    '├Õ': 'Ú'
}

def fix_encoding(text, replacement_map):
    """
    Repair mis-encoded characters in a string using a replacement table.

    Every key of `replacement_map` found in `text` is replaced by its value.

    Parameters:
        text (str): The input string that may contain incorrectly encoded characters.
                    Non-string values (e.g. None or NaN from a missing cell) are
                    returned unchanged instead of raising.
        replacement_map (dict): A dictionary where keys are the incorrectly encoded
                                character sequences and values are the correct
                                characters to replace them with.

    Returns:
        str: The text with all occurrences of the keys replaced by their values
             (or the original object when `text` is not a string).

    Example:
        text = "This is a t├¬st with s├│me encoding probl├¡ms."
        corrected_text = fix_encoding(text, replacement_map)
        print(corrected_text)
        # Output: "This is a têst with sóme encoding problíms."

    Notes:
        - Matching is case-sensitive and literal: keys must match the exact
          incorrect sequences present in the text.
        - Replacements are applied one key after another, in the dictionary's
          iteration order.
    """
    # Missing values and other non-string cells have no `.replace`; pass them through
    # so the function can be applied safely over a whole DataFrame column.
    if not isinstance(text, str):
        return text

    for incorrect, correct in replacement_map.items():
        text = text.replace(incorrect, correct)
    return text

In [5]:
# Loader for one table stored inside a single HDF5 file
def read_hdf_file(file_path, key_table):
    """
    Load one keyed table from an HDF5 file.

    Parameters:
    file_path (str): The path to the HDF5 file.
    key_table (str): The key (group identifier) of the table to read from the file.

    Returns:
    DataFrame: Whatever `pd.read_hdf` returns for that key of the file.
    """
    loaded = pd.read_hdf(file_path, key=key_table)
    return loaded

# INMET

In [7]:
# Folder with the yearly INMET HDF5 files, relative to the notebook's working directory.
# It is built with os.path.join (instead of a hard-coded backslash literal) so the same
# cell works on Windows, Linux and Colab; the trailing "" keeps the trailing separator.
# On Colab, point this at the mounted Drive copy instead, e.g.
# "/content/drive/MyDrive/QualiBRain/Scripts e Dados/1 - Organized data gauge/INMET/".
inmet_folder_path = os.path.join(".", "1 - Organized data gauge", "INMET", "")

# glob returns matches in arbitrary order; sort them so the "first"/"last" examples
# printed below and the order in which files are later concatenated are reproducible.
inmet_h5_files = sorted(glob.glob(os.path.join(inmet_folder_path, "*.h5")))

# Fail with an explicit message rather than an IndexError on the print below
if not inmet_h5_files:
    raise FileNotFoundError(f"No .h5 files found in {inmet_folder_path!r}")

print("quantity of files:", len(inmet_h5_files), "\n\nexamples:\n", inmet_h5_files[0], "\n...\n", inmet_h5_files[-1])


quantity of files: 4 

examples:
 .\1 - Organized data gauge\INMET\2021_D.h5 
...
 .\1 - Organized data gauge\INMET\2024_D.h5


In [8]:
table_key = 'table_info'

# Read the station-metadata table from every file in parallel (one task per file)
inmet_list_info = Parallel(n_jobs=-1)(delayed(read_hdf_file)(filename, table_key) for filename in inmet_h5_files)

# Stack the per-file tables and drop rows repeated verbatim across files
df_inmet_gauge_info = pd.concat(inmet_list_info).drop_duplicates(ignore_index=True)
del inmet_list_info

# Repair mis-encoded characters in the text columns first, so that everything derived
# from them below is already clean.
df_inmet_gauge_info['city'] = df_inmet_gauge_info['city'].apply(fix_encoding, args=(replacement_map,))
df_inmet_gauge_info['gauge_code'] = df_inmet_gauge_info['gauge_code'].apply(fix_encoding, args=(replacement_map,))

# Create the 'name_station' column as "<city> | <gauge_code>"
df_inmet_gauge_info['name_station'] = df_inmet_gauge_info['city'].str.cat(
    df_inmet_gauge_info['gauge_code'].astype(str), sep=' | '
)

# Every station in this table comes from INMET, which is also the responsible agency
df_inmet_gauge_info['source'] = 'INMET'
df_inmet_gauge_info['responsible'] = 'INMET'

# Keep only the exported columns, then de-duplicate again: rows that differed only by
# mojibake or by a column that is not exported are identical at this point.
df_inmet_gauge_info = (
    df_inmet_gauge_info[['gauge_code', 'state', 'city', 'name_station', 'lat', 'long', 'responsible', 'source']]
    .drop_duplicates(ignore_index=True)
)
df_inmet_gauge_info

Unnamed: 0,gauge_code,state,city,name_station,lat,long,responsible,source
0,A001,DF,BRASILIA,BRASILIA | A001,-15.789444,-47.925833,INMET,INMET
1,A042,DF,BRAZLANDIA,BRAZLANDIA | A042,-15.599722,-48.131111,INMET,INMET
2,A045,DF,AGUAS EMENDADAS,AGUAS EMENDADAS | A045,-15.596491,-47.625801,INMET,INMET
3,A046,DF,GAMA (PONTE ALTA),GAMA (PONTE ALTA) | A046,-15.935278,-48.137500,INMET,INMET
4,A047,DF,PARANOA (COOPA-DF),PARANOA (COOPA-DF) | A047,-16.012222,-47.557417,INMET,INMET
...,...,...,...,...,...,...,...,...
969,83179,BA,BARRA,BARRA | 83179,-11.084722,-43.138889,INMET,INMET
970,83364,MT,PADRE RICARDO REMETTER,PADRE RICARDO REMETTER | 83364,-15.776111,-56.071944,INMET,INMET
971,83452,MG,JURAMENTO,JURAMENTO | 83452,-16.775000,-43.667222,INMET,INMET
972,83485,MG,CARBONITA,CARBONITA | 83485,-17.533056,-43.012222,INMET,INMET


In [9]:
# Spot check: select the rows whose gauge code is '02243468' (empty frame when absent)
df_inmet_gauge_info.loc[df_inmet_gauge_info['gauge_code'].eq('02243468')]

Unnamed: 0,gauge_code,state,city,name_station,lat,long,responsible,source


In [12]:
table_key = 'table_data'

# Fan the per-file reads of the observation table out across all available cores
inmet_list_data = Parallel(n_jobs=-1)(
    delayed(read_hdf_file)(h5_path, table_key) for h5_path in inmet_h5_files
)

# Stack the per-file tables as they are (no de-duplication at this stage),
# then release the list of partial frames.
df_inmet_gauge_data = pd.concat(inmet_list_data)
del inmet_list_data

# Pipeline: parse timestamps -> index by time -> per-gauge daily rainfall totals ->
# back to a flat table -> pin the dtypes of the two value columns.
# NOTE(review): `sum()` yields 0.0 for a day that has no (or only missing) readings,
# so gaps inside a gauge's record look like dry days - confirm this is intended.
df_inmet_gauge_data = (
    df_inmet_gauge_data
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')
    .groupby(['gauge_code'])
    .resample('D')['rain_mm']
    .sum()
    .reset_index()
    .astype({'rain_mm': 'float64', 'gauge_code': 'str'})
)

df_inmet_gauge_data

Unnamed: 0,gauge_code,datetime,rain_mm
0,82024,2021-01-01,0.0
1,82024,2021-01-02,0.0
2,82024,2021-01-03,8.0
3,82024,2021-01-04,0.0
4,82024,2021-01-05,0.0
...,...,...,...
1076753,S717,2021-12-27,0.0
1076754,S717,2021-12-28,0.0
1076755,S717,2021-12-29,0.0
1076756,S717,2021-12-30,0.0


In [13]:
# Inspect the dtype of each column of the daily rainfall table
df_inmet_gauge_data.dtypes

Unnamed: 0,0
gauge_code,object
datetime,datetime64[ns]
rain_mm,float64


In [14]:
# Save the daily rainfall table under the key 'table_data'.
# mode='w' starts a fresh file, so this cell must run before the one that adds 'table_info'.
df_inmet_gauge_data.to_hdf(
    path_or_buf='INMET_DAILY_2021_2024.h5',
    key='table_data',
    mode='w',
    complib='zlib',
    complevel=9,
    encoding='utf-8',
)

# The table is on disk now; drop the in-memory copy
del df_inmet_gauge_data

In [15]:
# Add the station-metadata table to the same file under the key 'table_info'.
# mode='r+' opens the existing file for update, so the file written with mode='w' must already exist.
df_inmet_gauge_info.to_hdf(
    path_or_buf='INMET_DAILY_2021_2024.h5',
    key='table_info',
    mode='r+',
    complib='zlib',
    complevel=9,
    encoding='utf-8',
)

# The table is on disk now; drop the in-memory copy
del df_inmet_gauge_info

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['gauge_code', 'state', 'city', 'name_station', 'responsible', 'source'], dtype='object')]

  df_inmet_gauge_info.to_hdf('INMET_DAILY_2021_2024.h5', key='table_info', mode='r+', complevel=9, complib='zlib', encoding='utf-8')


In [16]:
# Read the saved daily table back from disk and order it chronologically to check the write.
# A single sort is enough; the original chained the same sort_values call twice.
df = pd.read_hdf('INMET_DAILY_2021_2024.h5', key='table_data').sort_values(by='datetime')
df

Unnamed: 0,gauge_code,datetime,rain_mm
0,82024,2021-01-01,0.0
272870,A027,2021-01-01,0.0
281636,A034,2021-01-01,0.0
283097,A035,2021-01-01,0.0
284558,A036,2021-01-01,0.0
...,...,...,...
466355,A324,2024-12-31,0.0
65406,82690,2024-12-31,0.0
988346,A883,2024-12-31,0.0
646422,A523,2024-12-31,0.2
