In [1]:
# Import packages

import pandas as pd
import pandas.testing as tm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats

import statsmodels.api as sm
import statsmodels.discrete.discrete_model as dm

from patsy import dmatrices
import statsmodels.graphics.tsaplots as tsa

from scipy.fft import fft, ifft, fftfreq

import numpy as np, scipy.stats as st

import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import acf

import itertools
from itertools import combinations, chain

from scipy.stats import pearsonr

import re

from datetime import datetime

import os

import functions

# Read data 

In [20]:
# Locations of the two input files.
sih_counts_path = '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Data/counts_aih_2025.parquet'
# Alternative source (commented out):
# /home/juliane.oliveira/workspace/OTC_PHC_analysis_and_modelling/counts_aih_gripal.parquet
muni_dictionary_path = '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Data/RELATORIO_DTB_BRASIL_MUNICIPIO.csv'

# Hospitalization data.
df_sih = pd.read_parquet(sih_counts_path)

# Dictionary of municipalities; the CSV is semicolon-delimited.
muni = pd.read_csv(muni_dictionary_path, sep=';')

# Format data for analysis

In [27]:
# Cast the municipality-of-residence code and the epidemiological week/year to
# integers; the municipality code is stored under the merge key `co_ibge`.
df_sih = df_sih.assign(
    co_ibge=lambda d: d['MUNIC_RES'].astype(int),
    epiweek=lambda d: d['epiweek'].astype(int),
    epiyear=lambda d: d['epiyear'].astype(int),
)

# Project helper; the year_week* columns used further down are presumably
# added here -- TODO confirm against functions.year_week_ts.
df_sih = functions.year_week_ts(df_sih, epiweek_col='epiweek', year_col='epiyear')

# Keep only the dictionary columns that are needed after the merge.
muni_keep_cols = [
    'UF', 'Nome_UF',
    'Região Geográfica Imediata', 'Nome Região Geográfica Imediata',
    'Município', 'Código Município Completo', 'Nome_Município',
]
muni = muni[muni_keep_cols]

# The merge key is the first six characters of the full municipality code.
six_digit_code = muni['Código Município Completo'].astype(str).str[:6]
muni = muni.assign(co_ibge=six_digit_code.astype(int))

# Attach the municipality attributes to every hospitalization row.
df_sih = df_sih.merge(muni, left_on='co_ibge', right_on='co_ibge')

# Short names for the immediate-region code and name.
imed_column_names = {
    "Região Geográfica Imediata": "co_imed",
    "Nome Região Geográfica Imediata": "Nome_imed",
}
df_sih = df_sih.rename(columns=imed_column_names)

In [28]:
# Municipality-level series: identifying, time and count columns only.
muni_level_cols = [
    'Nome_Município', 'co_ibge', 'epiweek', 'epiyear', 'n', 'year_week',
    'year_week_ts', 'year_week_str', 'UF', 'Nome_UF',
    'Código Município Completo',
]
df_sih_muni = df_sih[muni_level_cols]

# Immediate-region series: total `n` per region and epidemiological week.
imed_group_keys = [
    'UF', 'Nome_UF', 'co_imed', 'Nome_imed', 'epiweek', 'epiyear',
    'year_week', 'year_week_ts', 'year_week_str',
]
df_sih_imed = df_sih.groupby(imed_group_keys)['n'].sum().reset_index()

In [29]:
# Keep weeks '2022-47' through '2024-52', both ends inclusive.
# NOTE(review): the bounds are compared as strings, so this assumes year_week
# holds zero-padded 'YYYY-WW' labels -- TODO confirm.
in_study_window = df_sih_muni.year_week.between('2022-47', '2024-52')
df_sih_muni = df_sih_muni[in_study_window]

In [30]:
# Keep weeks '2022-47' through '2024-52', both ends inclusive.
# NOTE(review): the bounds are compared as strings, so this assumes year_week
# holds zero-padded 'YYYY-WW' labels -- TODO confirm.
in_study_window = df_sih_imed.year_week.between('2022-47', '2024-52')
df_sih_imed = df_sih_imed[in_study_window]

In [32]:
# Total count per epidemiological year within the filtered window.
df_sih_imed.groupby('epiyear', as_index=False)['n'].sum()

Unnamed: 0,epiyear,n
0,2022,85630
1,2023,871551
2,2024,846297


In [33]:
# Total count per epidemiological year over the full, unfiltered data.
df_sih.groupby('epiyear', as_index=False)['n'].sum()

Unnamed: 0,epiyear,n
0,2018,770430
1,2019,785039
2,2020,1034071
3,2021,1658091
4,2022,961599
5,2023,871551
6,2024,846297
7,2025,391


# Find anomalies in SIH series

## Anomalies in series at city level

In [17]:
# Flag anomalous weeks in each municipality's series.
#
# For every municipality code the loop adds:
#   mean_per_week   - mean of n, rounded to 0 decimals
#   mode            - smallest mode of n (scalar, repeated on every row)
#   median, std     - median and standard deviation of n
#   total_n         - sum of n
#   n_4             - 4-week rolling mean of n in chronological order
#                     (0 for the first three weeks)
#   warning_aih     - True where n > median + 2 * std
#   warning_aih_sum - number of flagged weeks in the municipality
# The per-municipality frames are then stacked into result_warning_aih_muni.
lst = []

for code in df_sih_muni.co_ibge.unique():

    set_muni = df_sih_muni[df_sih_muni.co_ibge == code]

    # Series.mode() returns a Series indexed 0..k-1. Passing it to assign()
    # aligns it on the frame's row labels, which leaves the column NaN (or
    # mis-assigned) on almost every row, so take the first mode as a scalar.
    modes = set_muni.n.mode()
    mode_value = modes.iloc[0] if not modes.empty else np.nan

    # The rolling mean is only meaningful over consecutive weeks, so compute
    # it on chronologically sorted rows. assign() aligns the result back on
    # the index, so the row order of set_muni is unchanged.
    n_4 = (
        set_muni.sort_values(['epiyear', 'epiweek'])
        .n.rolling(window=4).mean().fillna(0)
    )

    # Add new columns
    set_muni = set_muni.assign(
        mean_per_week=round(set_muni.n.mean(), 0),
        mode=mode_value,
        median=set_muni.n.median(),
        std=set_muni.n.std(),
        total_n=set_muni.n.sum(),
        n_4=n_4,
    )

    # Row-wise warning: the week's count exceeds median + 2 standard deviations.
    set_muni['warning_aih'] = (set_muni['n'] > set_muni['median'] + 2*set_muni['std'])

    set_muni = set_muni.assign(warning_aih_sum=set_muni['warning_aih'].sum())

    # Append to the list
    lst.append(set_muni)

result_warning_aih_muni = pd.concat(lst, ignore_index=True)

In [18]:
# Input for the warning-cleaning helper: key, week, count and raw warning.
warning_input_cols = ['co_ibge', 'year_week', 'n', 'warning_aih']
dta1 = result_warning_aih_muni[warning_input_cols]

# Project helper. The columns cleaned_warning, event and warning_final selected
# below are not in dta1, so they are produced by this call.
dta2 = functions.clean_warning_column(dta1, 'co_ibge', 'year_week', 'warning_aih')

# Keep the columns of interest and give the helper's outputs AIH-specific names.
kept_cols = ['co_ibge', 'year_week', 'warning_aih', 'n',
             'cleaned_warning', 'event', 'warning_final']
aih_names = {
    'cleaned_warning': 'warning_aih_without_isolated',
    'event': 'warning_aih_corect_with_consec',
    'warning_final': 'warning_final_aih',
}
dta2 = dta2[kept_cols].rename(columns=aih_names)

In [19]:
# Store the three warning flags as integers.
dta2 = dta2.astype({
    'warning_aih': int,
    'warning_aih_without_isolated': int,
    'warning_final_aih': int,
})
                   

In [20]:
# Write the municipality-level warnings to parquet.
muni_warnings_path = '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Results/data_manuscript_warning_aih_muni2.parquet'
# Alternative destination (commented out):
# /home/juliane.oliveira/workspace/Data/data_manuscript_warning_aih_muni.parquet
dta2.to_parquet(muni_warnings_path)

## Anomalies in series at immediate region level

In [34]:
# Flag anomalous weeks in each immediate region's series.
#
# For every immediate-region code the loop adds:
#   mean_per_week   - mean of n, rounded to 0 decimals
#   mode            - smallest mode of n (scalar, repeated on every row)
#   median, std     - median and standard deviation of n
#   total_n         - sum of n
#   n_4             - 4-week rolling mean of n in chronological order
#                     (0 for the first three weeks)
#   warning_aih     - True where n > median + 2 * std
#   warning_aih_sum - number of flagged weeks in the region
# The per-region frames are then stacked into result_warning_aih_imed.
lst = []

for code in df_sih_imed.co_imed.unique():

    set_imed = df_sih_imed[df_sih_imed.co_imed == code]

    # Series.mode() returns a Series indexed 0..k-1. Passing it to assign()
    # aligns it on the frame's row labels, which leaves the column NaN (or
    # mis-assigned) on almost every row, so take the first mode as a scalar.
    modes = set_imed.n.mode()
    mode_value = modes.iloc[0] if not modes.empty else np.nan

    # df_sih_imed comes from a groupby whose keys list epiweek before epiyear,
    # so rows within a region are ordered by week first and years interleave.
    # Roll over chronologically sorted rows; assign() aligns the result back
    # on the index, so the row order of set_imed is unchanged.
    n_4 = (
        set_imed.sort_values(['epiyear', 'epiweek'])
        .n.rolling(window=4).mean().fillna(0)
    )

    # Add new columns
    set_imed = set_imed.assign(
        mean_per_week=round(set_imed.n.mean(), 0),
        mode=mode_value,
        median=set_imed.n.median(),
        std=set_imed.n.std(),
        total_n=set_imed.n.sum(),
        n_4=n_4,
    )

    # Row-wise warning: the week's count exceeds median + 2 standard deviations.
    set_imed['warning_aih'] = (set_imed['n'] > set_imed['median'] + 2*set_imed['std'])

    set_imed = set_imed.assign(warning_aih_sum=set_imed['warning_aih'].sum())

    # Append to the list
    lst.append(set_imed)

result_warning_aih_imed = pd.concat(lst, ignore_index=True)

In [35]:
# Display the column labels of the immediate-region result.
result_warning_aih_imed.columns

Index(['UF', 'Nome_UF', 'co_imed', 'Nome_imed', 'epiweek', 'epiyear',
       'year_week', 'year_week_ts', 'year_week_str', 'n', 'mean_per_week',
      dtype='object')

In [36]:
# Input for the warning-cleaning helper: key, week, count and raw warning.
warning_input_cols = ['co_imed', 'year_week', 'n', 'warning_aih']
dta1 = result_warning_aih_imed[warning_input_cols]

# Project helper. The columns cleaned_warning, event and warning_final selected
# below are not in dta1, so they are produced by this call.
dta2 = functions.clean_warning_column(dta1, 'co_imed', 'year_week', 'warning_aih')

# Keep the columns of interest and give the helper's outputs AIH-specific names.
kept_cols = ['co_imed', 'year_week', 'warning_aih', 'n',
             'cleaned_warning', 'event', 'warning_final']
aih_names = {
    'cleaned_warning': 'warning_aih_without_isolated',
    'event': 'warning_aih_corect_with_consec',
    'warning_final': 'warning_final_aih',
}
dta2 = dta2[kept_cols].rename(columns=aih_names)

# Store the three warning flags as integers.
dta2 = dta2.astype({
    'warning_aih': int,
    'warning_aih_without_isolated': int,
    'warning_final_aih': int,
})

In [37]:
# Number of region-weeks carrying a final warning.
dta2['warning_final_aih'].sum()

556

In [38]:
# Display the immediate-region warning table.
dta2

Unnamed: 0,co_imed,year_week,warning_aih,n,warning_aih_without_isolated,warning_aih_corect_with_consec,warning_final_aih
92,110001,2022-47,0,47,0,0,0
95,110001,2022-48,0,51,0,0,0
98,110001,2022-49,0,51,0,0,0
101,110001,2022-50,0,59,0,0,0
104,110001,2022-51,0,32,0,0,0
...,...,...,...,...,...,...,...
55798,530001,2024-48,0,193,0,0,0
55801,530001,2024-49,0,192,0,0,0
55804,530001,2024-50,0,160,0,0,0
55807,530001,2024-51,0,151,0,0,0


In [35]:
# Write the immediate-region warnings to parquet.
imed_warnings_path = '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Results/data_manuscript_warning_aih_imed2.parquet'
# Alternative destination (commented out):
# /home/juliane.oliveira/workspace/Data/data_manuscript_warning_aih_imed.parquet
dta2.to_parquet(imed_warnings_path)