### Imports

In [1]:
import pandas
from ydata_profiling import ProfileReport

In [2]:
def is_valid_year_range(row, date_cols, max_year=2025, min_year=None):
    """Tell whether every non-null date in ``row`` lies inside the allowed year range.

    Missing dates (``NaN``/``NaT``/``None``) are ignored, so a row with no
    dates at all is considered valid.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` produced by ``DataFrame.apply(axis=1)``)
        Object indexable by the names in ``date_cols``. Non-null values must
        expose a ``.year`` attribute (e.g. ``pandas.Timestamp``).
    date_cols : iterable of str
        Names of the date fields to check.
    max_year : int, default 2025
        Largest accepted year (inclusive).
    min_year : int or None, default None
        Smallest accepted year (inclusive). ``None`` disables the lower bound,
        which is the historical behavior of this helper.

    Returns
    -------
    bool
        ``False`` as soon as one non-null date is outside the range,
        ``True`` otherwise.
    """
    for col in date_cols:
        value = row[col]
        if pandas.isna(value):
            # Missing dates never invalidate a row.
            continue
        year = value.year
        if year > max_year:
            return False
        if min_year is not None and year < min_year:
            return False
    return True


### Files Processing
In this step we select the features of interest, rename them, and handle missing values and date formatting (rows containing a date later than the maximum accepted year are dropped). Raw data is read from ../data/bronze/ and the cleaned data is written to ../data/silver/.

In [3]:
# Build the silver layer from the raw (bronze) yearly exports.
# For each file: keep only the mapped columns, parse the date columns, drop
# rows whose dates fail is_valid_year_range, normalize dates to YYYY-MM-DD and
# coded columns to nullable integers, rename the columns and write the result.
# A unified CSV with all years is written at the end.
# Paths are relative to the notebook location. Any failure is reported through
# the except branch below instead of interrupting the notebook.
try:
    files = ["INFLUD21", "INFLUD22", "INFLUD23", "INFLUD24", "INFLUD25"]
    # Raw column name -> silver column name.
    columns_mapping = {"NU_NOTIFIC": "origin_id",
                "DT_NOTIFIC": "data_preenchimento",
                "VACINA_COV": "vacina_covid",
                "VACINA": "vacina_gripe",
                "HOSPITAL": "internado_hospital",
                "DT_INTERNA": "data_internacao_hospital",
                "UTI": "internado_uti",
                "DT_ENTUTI": "data_entrada_uti",
                "DT_SAIDUTI": "data_saida_uti",
                "CLASSI_FIN": "diagnostico_final",
                "EVOLUCAO": "evolucao",
                "DT_EVOLUCA": "data_evolucao",
                "DT_SIN_PRI":"data_primeiro_sintoma"
                }
    time_columns = ["DT_NOTIFIC", "DT_INTERNA", "DT_ENTUTI", "DT_SAIDUTI", "DT_EVOLUCA", "DT_SIN_PRI"]
    integer_columns = ["VACINA_COV", "VACINA", "HOSPITAL", "UTI", "CLASSI_FIN", "EVOLUCAO"]
    features = list(columns_mapping)
    unified_df = pandas.DataFrame()
    # Cleaned yearly frames are collected here and concatenated once at the
    # end; growing a DataFrame with concat inside the loop re-copies all the
    # previous rows on every iteration.
    silver_frames = []
    for file in files:
        # Tolerant parsing options for potentially malformed raw CSVs.
        df = pandas.read_csv(
            f"../data/bronze/{file}.csv",
            low_memory=False,
            on_bad_lines='warn',  # warn about malformed lines and skip them
            encoding='latin1',
            sep=';',
        )
        # Explicit copy: the columns are reassigned below, and assigning into
        # a plain selection goes through pandas' chained-assignment path.
        df = df[features].copy()
        # Unparseable dates become NaT instead of raising.
        # NOTE(review): no explicit format/dayfirst is given, so pandas infers
        # the layout; confirm the raw date format (e.g. day-first) and pin it.
        df[time_columns] = df[time_columns].apply(pandas.to_datetime, errors='coerce')

        # Keep only the rows whose non-null dates pass the year check.
        valid_rows = df.apply(lambda row: is_valid_year_range(row, time_columns), axis=1)
        df = df[valid_rows].copy()

        # Dates are stored as ISO strings; NaT turns into a missing value.
        df[time_columns] = df[time_columns].apply(lambda dates: dates.dt.strftime('%Y-%m-%d'))
        # Nullable integer dtype keeps missing codes as <NA> instead of
        # forcing the whole column to float.
        df[integer_columns] = df[integer_columns].astype("Int64")
        df = df.rename(columns=columns_mapping)
        df.to_csv(f"../data/silver/{file}.csv", index=False, sep=';',na_rep="")
        silver_frames.append(df)
    unified_df = pandas.concat(silver_frames, ignore_index=True)
    unified_df.to_csv("../data/silver/INFLUD21-25.csv", index=False, sep=';')
except Exception as e:
    print(f"ERROR with features selection: {e}")




### Profile Generation
Here we use ydata-profiling to create one profile report per yearly file and store them at ../data/profiles/ for further analysis.

In [None]:

# Generate one ydata-profiling HTML report per yearly silver file.
# Any failure is reported through the except branch below; the loop stops at
# the first file that fails.
try:
    files = ["INFLUD21", "INFLUD22", "INFLUD23", "INFLUD24", "INFLUD25"]
    for file in files:
        df = pandas.read_csv(
            f"../data/silver/{file}.csv",
            low_memory=False,
            on_bad_lines='warn',
            # The silver files are written by DataFrame.to_csv without an
            # explicit encoding, i.e. UTF-8, so they are read back as UTF-8
            # (the raw bronze files are the ones read as latin1).
            encoding='utf-8',
            sep=';',
        )
        print(f"DATA LOADED SUCCESSFULLY FOR FILE: {file}")
        profile = ProfileReport(df=df)
        profile.to_file(output_file=f"../data/profiles/{file}.html")
except Exception as e:
    print(f"ERROR with PROFILE GENERATION: {e}")




DATA LOADED SUCCESSFULLY FOR FILE: INFLUD21


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:18<00:00,  1.45s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

DATA LOADED SUCCESSFULLY FOR FILE: INFLUD22


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:02<00:00,  5.24it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

DATA LOADED SUCCESSFULLY FOR FILE: INFLUD23


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:02<00:00,  5.26it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

DATA LOADED SUCCESSFULLY FOR FILE: INFLUD24


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:02<00:00,  6.15it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

DATA LOADED SUCCESSFULLY FOR FILE: INFLUD25


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:01<00:00,  9.35it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Final Profile Generation
Here we use ydata-profiling to create a single profile for the concatenation of all yearly data. The result is stored at ../data/profiles/ for further analysis.

In [None]:

# Generate the ydata-profiling HTML report for the unified silver file.
# Any failure is reported through the except branch below.
try:
    file = "INFLUD21-25.csv"
    df = pandas.read_csv(
        f"../data/silver/{file}",
        low_memory=False,
        on_bad_lines='warn',
        # The unified silver file is written by DataFrame.to_csv without an
        # explicit encoding, i.e. UTF-8, so it is read back as UTF-8.
        encoding='utf-8',
        sep=';',
    )
    print("DATA LOADED SUCCESSFULLY")
    profile = ProfileReport(df=df)
    # NOTE(review): `file` already ends in ".csv", so the report is written as
    # "INFLUD21-25.csv.html"; kept as is in case something links to that name.
    profile.to_file(output_file=f"../data/profiles/{file}.html")
except Exception as e:
    print(f"ERROR with PROFILE GENERATION: {e}")




DATA LOADED SUCCESSFULLY


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:46<00:00,  3.57s/it]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]