In [1]:
# Read packages

import pandas as pd
import pandas.testing as tm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats

import statsmodels.api as sm
import statsmodels.discrete.discrete_model as dm

from patsy import dmatrices
import statsmodels.graphics.tsaplots as tsa

from scipy.fft import fft, ifft, fftfreq

import numpy as np, scipy.stats as st

import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import acf

import itertools
from itertools import combinations, chain

from scipy.stats import pearsonr

import re

from datetime import datetime

import os

import functions

# Read data 

In [76]:
# Directory holding the input files. Set the AIH_DATA_DIR environment variable
# to point somewhere else (e.g. on another machine); when it is unset the
# original local checkout is used, so the default behaviour is unchanged.
DATA_DIR = os.environ.get(
    'AIH_DATA_DIR',
    '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Data',
)

# Hospitalization (AIH) counts.
# A previous location for this input was
# /home/juliane.oliveira/workspace/OTC_PHC_analysis_and_modelling/counts_aih_gripal.parquet
df_sih = pd.read_parquet(os.path.join(DATA_DIR, 'counts_aih_2025.parquet'))

# Dictionary of municipalities (semicolon-separated CSV).
muni = pd.read_csv(os.path.join(DATA_DIR, 'RELATORIO_DTB_BRASIL_MUNICIPIO.csv'), sep=';')

# Format data for analysis

In [77]:
# Cast the municipality code (MUNIC_RES) and the epidemiological week/year to integers.
df_sih = df_sih.assign(
    co_ibge=df_sih['MUNIC_RES'].astype(int),
    epiweek=df_sih['epiweek'].astype(int),
    epiyear=df_sih['epiyear'].astype(int),
)

# Project helper from the local `functions` module; presumably it derives the
# year_week* columns selected in later cells — TODO confirm in functions.py.
df_sih = functions.year_week_ts(df_sih, epiweek_col='epiweek', year_col='epiyear')

# Keep only the dictionary columns needed here.
muni_columns = [
    'UF', 'Nome_UF',
    'Região Geográfica Imediata', 'Nome Região Geográfica Imediata',
    'Município', 'Código Município Completo', 'Nome_Município',
]
muni = muni[muni_columns]

# Join key: the first six characters of the complete municipality code, as an integer.
muni = muni.assign(co_ibge=muni['Código Município Completo'].astype(str).str[:6].astype(int))

# Inner join on co_ibge (rows without a match in the dictionary are dropped),
# then give the immediate-region columns short names.
df_sih = (
    df_sih
    .merge(muni, on='co_ibge')
    .rename(columns={
        "Região Geográfica Imediata": "co_imed",
        "Nome Região Geográfica Imediata": "Nome_imed",
    })
)

In [78]:
# Municipality-level view: one row per record of df_sih, restricted to the
# identifying, time and count columns.
muni_level_cols = [
    'Nome_Município', 'co_ibge', 'epiweek', 'epiyear', 'n',
    'year_week', 'year_week_ts', 'year_week_str',
    'UF', 'Nome_UF', 'Código Município Completo',
]
df_sih_muni = df_sih[muni_level_cols]

# Immediate-region view: total count `n` per region and week.
imed_group_keys = [
    'UF', 'Nome_UF', 'co_imed', 'Nome_imed',
    'epiweek', 'epiyear', 'year_week', 'year_week_ts', 'year_week_str',
]
df_sih_imed = (
    df_sih
    .groupby(imed_group_keys)['n']
    .sum()
    .reset_index()
)

In [79]:
# Keep the study window 2022-47 .. 2024-52 (both ends included).
# NOTE(review): year_week is compared as text, which is chronological only if
# the week part is zero-padded — confirm against functions.year_week_ts.
df_sih_muni = df_sih_muni.loc[df_sih_muni['year_week'].between('2022-47', '2024-52')]

In [80]:
# Keep the study window 2022-47 .. 2024-52 (both ends included).
# NOTE(review): year_week is compared as text, which is chronological only if
# the week part is zero-padded — confirm against functions.year_week_ts.
df_sih_imed = df_sih_imed.loc[df_sih_imed['year_week'].between('2022-47', '2024-52')]

In [81]:
# Total count per epidemiological year within the filtered region-level frame.
df_sih_imed.groupby('epiyear', as_index=False)['n'].sum()

Unnamed: 0,epiyear,n
0,2022,85630
1,2023,871551
2,2024,846297


In [82]:
# Total count per epidemiological year in the full (unfiltered) frame, for comparison.
df_sih.groupby('epiyear', as_index=False)['n'].sum()

Unnamed: 0,epiyear,n
0,2018,770430
1,2019,785039
2,2020,1034071
3,2021,1658091
4,2022,961599
5,2023,871551
6,2024,846297
7,2025,391


# Find anomalies in SIH series

## Anomalies in series at immediate region level

In [83]:
# Step 2: Median (p50) of the weekly count `n` for each immediate region.
p50_by_region = (
    df_sih_imed
    .groupby('co_imed')['n']
    .median()
    .rename('p50')
    .reset_index()
)


In [84]:
# Step 3: Attach each region's median (p50) to every weekly row.
# Any p50 column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce p50_x / p50_y and the row['p50'] lookup in
# the anomaly-flagging step would fail. On a first run the result is unchanged.
df_sih_imed = (
    df_sih_imed
    .drop(columns='p50', errors='ignore')
    .merge(p50_by_region, on='co_imed', how='left')
)

In [85]:
# Step 4: Apply anomaly detection logic
def flag_anomaly(row):
    """Return 1 if the row's count exceeds its median-based threshold, else 0.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['n']`` (observed count) and ``row['p50']``
        (reference median).

    Returns
    -------
    int
        0 when either value is missing. Otherwise 1 when ``n`` is strictly
        greater than the threshold for the tier ``p50`` falls in, else 0:

        * p50 < 50            -> n > 2 * p50 and n > 10
        * 50 <= p50 < 100     -> n > p50 + 50% of p50
        * 100 <= p50 < 250    -> n > p50 + 40% of p50
        * 250 <= p50 < 500    -> n > p50 + 30% of p50
        * 500 <= p50 < 1000   -> n > p50 + 20% of p50
        * p50 >= 1000         -> n > p50 + 10% of p50
    """
    count, median = row['n'], row['p50']

    # Missing data never raises a warning.
    if pd.isna(count) or pd.isna(median):
        return 0

    # Lowest tier: additionally require more than 10 cases in absolute terms.
    if median < 50:
        return int(count > (2 * median) and count > 10)

    # Remaining bounded tiers as (exclusive upper bound of p50, relative excess);
    # the first tier whose bound exceeds the median applies.
    bounded_tiers = ((100, 0.5), (250, 0.4), (500, 0.3), (1000, 0.2))
    for upper_bound, excess in bounded_tiers:
        if median < upper_bound:
            return int(count > (median + excess * median))

    # p50 >= 1000
    return int(count > (median + 0.1 * median))

# Step 5: Evaluate flag_anomaly on every row and store the 0/1 result as warning_aih.
df_sih_imed = df_sih_imed.assign(
    warning_aih=df_sih_imed.apply(flag_anomaly, axis=1)
)


In [86]:
# Number of region-weeks flagged by the raw threshold rule.
df_sih_imed.loc[:, 'warning_aih'].sum()

2618

In [87]:
# Input for the warning post-processing: region, week, count and raw flag.
warning_input_cols = ['co_imed', 'year_week', 'n', 'warning_aih']
dta1 = df_sih_imed[warning_input_cols]

# Post-process the raw flags with the project helper, keep the columns of
# interest, give the helper's outputs AIH-specific names and store the flag
# columns as integers.
# NOTE(review): the renamed 'event' column is not cast to int like the other
# flag columns — confirm whether that is intentional.
dta2 = (
    functions.clean_warning_column(dta1, 'co_imed', 'year_week', 'warning_aih')
    [['co_imed', 'year_week', 'warning_aih', 'n',
      'cleaned_warning', 'event', 'warning_final']]
    .rename(columns={
        'cleaned_warning': 'warning_aih_without_isolated',
        'event': 'warning_aih_corect_with_consec',
        'warning_final': 'warning_final_aih',
    })
    .astype({
        'warning_aih': int,
        'warning_aih_without_isolated': int,
        'warning_final_aih': int,
    })
)

In [88]:
# Total of the final (post-processed) warning flags.
dta2['warning_final_aih'].sum()

482

In [89]:
# Display dta2: one row per immediate region and week, with the count, the raw
# flag and the post-processed warning columns (long frames are shown truncated).
dta2

Unnamed: 0,co_imed,year_week,warning_aih,n,warning_aih_without_isolated,warning_aih_corect_with_consec,warning_final_aih
92,110001,2022-47,0,47,0,0,0
95,110001,2022-48,0,51,0,0,0
98,110001,2022-49,0,51,0,0,0
101,110001,2022-50,0,59,0,0,0
104,110001,2022-51,0,32,0,0,0
...,...,...,...,...,...,...,...
55798,530001,2024-48,0,193,0,0,0
55801,530001,2024-49,0,192,0,0,0
55804,530001,2024-50,0,160,0,0,0
55807,530001,2024-51,0,151,0,0,0


In [91]:
# Number of final warnings per immediate region.
warning_region = (
    dta2
    .groupby('co_imed', as_index=False)
    .agg(sum_war=('warning_final_aih', 'sum'))
)


In [93]:
# Regions that never raised a final warning.
warning_region.loc[warning_region['sum_war'].eq(0)]

Unnamed: 0,co_imed,sum_war
7,120002,0
17,130007,0
25,140004,0
27,150002,0
31,150006,0
...,...,...
503,520017,0
504,520018,0
505,520019,0
506,520020,0


In [90]:
# Directory where results are written. Set the AIH_RESULTS_DIR environment
# variable to redirect the output (e.g. on another machine); when it is unset
# the original local checkout is used, so the default behaviour is unchanged.
RESULTS_DIR = os.environ.get(
    'AIH_RESULTS_DIR',
    '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Results',
)

# Persist the region-level warning table.
# A previous output location was
# /home/juliane.oliveira/workspace/Data/data_manuscript_warning_aih_imed.parquet
dta2.to_parquet(os.path.join(RESULTS_DIR, 'data_manuscript_warning_aih_imed2_def2.parquet'))