In [43]:
# Load packages

import pandas as pd
import pandas.testing as tm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.discrete.discrete_model as dm

from patsy import dmatrices
import statsmodels.graphics.tsaplots as tsa


from scipy.fft import fft, ifft, fftfreq

import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import acf

import itertools
from itertools import combinations, chain

from scipy.stats import pearsonr

import re

from datetime import datetime

import pymannkendall as mk

import math

from scipy.stats import friedmanchisquare

from sklearn.metrics import r2_score

from pmdarima.preprocessing import FourierFeaturizer
from pmdarima.datasets import load_wineind
from sklearn.linear_model import LinearRegression

# Our functions
import functions

import early_warning_detection_functions

import trend_timeseries

import function_single_serie

import warnings
warnings.filterwarnings("ignore")


# Read data

In [44]:
# Load the input dataset from a parquet snapshot (the file name carries the date 2025_05_09).
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
df = pd.read_parquet('/opt/storage/refined/aesop/visualization/aesop_2025_05_09_mun_new.parquet')

In [45]:
# Keep only the weeks from '2022-40' onwards, i.e. since OTC data started to be sent.
# The comparison is lexicographic on strings, so it relies on year_week being
# zero-padded 'YYYY-WW' text — TODO confirm the column format.

df = df[df.year_week >= '2022-40']

# Create auxiliary variables to run the negative binomial (NegBi) model on OTC data

In [46]:
# Replace missing OTC counts with zero before building the series.
# NOTE(review): this assumes a missing value means "nothing reported" rather than "unknown" — confirm.
df = df.assign(num_otc_ivas = df.num_otc_ivas.fillna(0))

In [47]:
# Build the per-municipality frames with the derived series columns.
# (What exactly the helper adds is defined in function_single_serie; the columns
# selected further down — num_otc_ivas_4, the lags, time_trend — are presumably created there.)
lst_dfs_cities = function_single_serie.lst_dfs_cities(
    df,
    city_code_col='co_ibge',
    epiweek_date_col='year_week',
    serie_col='num_otc_ivas',  # 'unidades_gripais' could be used here instead
)

# Trend: run the Kendall / negative binomial helper and derive the detrended series
# as the difference between the series and its fitted trend line.
final_kendall_negbi = function_single_serie.final_kendall_negbi(lst_dfs_cities, serie='num_otc_ivas_4')
final_kendall_negbi = final_kendall_negbi.assign(
    dtrend_otc_negbi=lambda d: d.num_otc_ivas_4 - d.trend_line_negbi_num_otc_ivas_4
)

# Split the combined result back into one frame per municipality
lst = [
    final_kendall_negbi[final_kendall_negbi.co_ibge == city]
    for city in final_kendall_negbi.co_ibge.unique()
]

lst_dfs_cities = lst

# Seasonality: Friedman test on the detrended series of each municipality.
# The frequency passed to the test is half the length of the series.
for i, city_df in enumerate(lst_dfs_cities):

    data = city_df['dtrend_otc_negbi'].to_numpy()

    p = len(data) // 2

    res_test = function_single_serie.friedman_test(data, freq=p)

    # The test returns scalars; assign broadcasts them to every row of the city
    lst_dfs_cities[i] = city_df.assign(
        p_value_otc_negbi_friedman=res_test['p_value'],
        test_stat_otc_negbi_friedman=res_test['test_stat'],
    )

# Format data: stack the cities, keep the modelling columns and shorten the lag names
lst_var = [
    'co_ibge', 'epiyear', 'epiweek',
    'co_uf', 'nm_uf', 'co_ibge7', 'nm_municipio', 'sigla_uf', 'year_week',
    'year_week_ts', 'num_otc_ivas',
    # variables used by the models from here down
    'num_otc_ivas_4', 'num_otc_ivas_lag_1', 'num_otc_ivas_lag_2',
    'num_otc_ivas_lag_3', 'num_otc_ivas_lag_4', 'time_trend',
    'p_values_negbi_num_otc_ivas_4', 'dtrend_otc_negbi',
    'p_value_otc_negbi_friedman',
]

lag_renames = {
    "num_otc_ivas_lag_1": "otc_4_lag_1",
    "num_otc_ivas_lag_2": "otc_4_lag_2",
    "num_otc_ivas_lag_3": "otc_4_lag_3",
    'num_otc_ivas_lag_4': 'otc_4_lag_4',
}

final = pd.concat(lst_dfs_cities)[lst_var].rename(columns=lag_renames)

In [48]:
final.columns

Index(['co_ibge', 'epiyear', 'epiweek', 'co_uf', 'nm_uf', 'co_ibge7',
       'nm_municipio', 'sigla_uf', 'year_week', 'year_week_ts', 'num_otc_ivas',
       'num_otc_ivas_4', 'otc_4_lag_1', 'otc_4_lag_2', 'otc_4_lag_3',
       'otc_4_lag_4', 'time_trend', 'p_values_negbi_num_otc_ivas_4',
       'dtrend_otc_negbi', 'p_value_otc_negbi_friedman'],
      dtype='object')

# Run regressions

### Cities without trend and seasonality

In [49]:
# Group 1: cities whose OTC series has neither a significant trend nor significant
# seasonality (both p-values at or above 0.05).

no_seasonality = final.p_value_otc_negbi_friedman >= 0.05
no_trend = final.p_values_negbi_num_otc_ivas_4 >= 0.05
df1 = final[no_seasonality & no_trend]

# 5570 is the hard-coded denominator used for the percentage
n_cities = df1.co_ibge.nunique()
print('number of cities without trend and sezonality', n_cities, 'percentage', round(n_cities*100/5570,1) )

number of cities without trend and sezonality 3818 percentage 68.5


In [50]:
# Lags of the dependent variable offered to the model.
# NOTE(review): range(1, 4) yields lags 1, 2 and 3 only; otc_4_lag_4 exists in the data
# but is not used — confirm whether lag 4 was intended.
lags_y = range(1, 4)
dependent_variable = 'num_otc_ivas_4'  # dependent variable


def lags_comb(variable, lags_variable):
    """Build cumulative right-hand-side fragments, adding one lag at a time.

    For variable 'v' and lags [1, 2] the result is ['v_1', 'v_1 + v_2'].
    An empty lag sequence gives an empty list.
    """
    lag_terms = []
    fragments = []
    for lag in lags_variable:
        lag_terms.append(f'{variable}_{lag}')
        fragments.append(' + '.join(lag_terms))
    return fragments


# Generate lagged combinations
lags_y_comb = lags_comb('otc_4_lag', lags_y)

# One candidate formula per cumulative lag set (lags only, no extra regressors)
formulas = [f'{dependent_variable} ~ {terms}' for terms in lags_y_comb]

## Keep the best model formulas and AIC
#
# For every city in group 1, fit one negative binomial GLM (dispersion fixed at
# alpha = 1) per candidate formula and keep the formula with the lowest AIC.
# A formula whose fit raises is reported and skipped instead of aborting the
# whole loop, matching how the other three city groups are processed. A city
# with no usable fit gets NaN for best_formula / best_aic.

lst = []

for code in df1.co_ibge.unique():

    data = df1[df1.co_ibge == code]

    # Placeholders for the best model; best_formula stays NaN if nothing can be fitted
    best_model = None
    best_aic = float('inf')
    best_formula = np.nan

    for formula in formulas:
        try:
            model = smf.glm(formula=formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()
            current_aic = model.aic

            # Keep the candidate with the lowest AIC seen so far
            if current_aic < best_aic:
                best_aic = current_aic
                best_model = model
                best_formula = formula

        except Exception as e:
            print(f"Skipping formula {formula} for {code}: {e}")
            continue  # Skip to the next formula

    data = data.assign(best_formula=best_formula, best_aic=best_aic if best_aic != float('inf') else np.nan)
    lst.append(data)

lst_dfs_cities1 = lst


#### Save predicted values, upper bound and warning columns
#
# The best formula of each city is known at this point, so refit it and derive:
#   - fitted values and deviance residuals,
#   - the overall std of residuals / fitted values,
#   - a rolling (5-observation) std of the fitted values, used as the band width,
#   - control limits fitted +/- w * rolling std and the out-of-limits flags.

lst = []

for data in lst_dfs_cities1:

    chosen_formula = data.best_formula.iloc[0]

    if pd.isna(chosen_formula):
        # No valid model for this city: fill every derived column with NaN
        data = data.assign(
                fitted_values_otc_only=np.nan,
                residuals_otc_only=np.nan,
                sigma_otc_only=np.nan,
                sigma__otc_only=np.nan,
                sigma_t_otc_only=np.nan
            )
    else:
        model = smf.glm(formula=chosen_formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()

        # Extract fitted values and residuals
        fitted_values = model.fittedvalues
        residuals = model.resid_deviance

        data = data.assign(
                fitted_values_otc_only=fitted_values,
                residuals_otc_only=residuals,
                sigma_otc_only=np.std(residuals),
                sigma__otc_only=np.std(fitted_values),
                sigma_t_otc_only=fitted_values.rolling(window=5).std()
            )

    if data['fitted_values_otc_only'].isna().all():
        # Nothing to compare against: limits and flags are NaN as well
        data = data.assign(
                UCL_otc_only=np.nan,
                LCL_otc_only=np.nan,
                out_of_limits_above_otc_only=np.nan,
                out_of_limits_below_otc_only=np.nan
            )
    else:
        w = 1  # width of the band in units of the rolling std

        UCL_otc_only = data.fitted_values_otc_only + w * data.sigma_t_otc_only
        LCL_otc_only = data.fitted_values_otc_only - w * data.sigma_t_otc_only

        # Rows where the limit is NaN (start of the rolling window) compare False and are flagged 0
        data = data.assign(
                UCL_otc_only=UCL_otc_only,
                LCL_otc_only=LCL_otc_only,
                out_of_limits_above_otc_only=(data['num_otc_ivas_4'] >= UCL_otc_only).astype(int),
                out_of_limits_below_otc_only=(data['num_otc_ivas_4'] < LCL_otc_only).astype(int)
            )

    lst.append(data)

lst_dfs_cities1 = lst

### Cities with trend and without seasonality

In [51]:

# Group 2: cities with a significant trend (p < 0.05) but no significant
# seasonality (Friedman p >= 0.05).
no_seasonality = final.p_value_otc_negbi_friedman >= 0.05
has_trend = final.p_values_negbi_num_otc_ivas_4 < 0.05
df2 = final[no_seasonality & has_trend]

# 5570 is the hard-coded denominator used for the percentage
n_cities = df2.co_ibge.nunique()
print('number of cities with trend and without sezonality', n_cities, 'percentage', round(n_cities*100/5570,1) )

number of cities with trend and without sezonality 930 percentage 16.7


In [52]:
# Lags of the dependent variable offered to the model.
# NOTE(review): range(1, 4) yields lags 1, 2 and 3 only; otc_4_lag_4 exists in the data
# but is not used — confirm whether lag 4 was intended.
lags_y = range(1, 4)
dependent_variable = 'num_otc_ivas_4'  # dependent variable


def lags_comb(variable, lags_variable):
    """Build cumulative right-hand-side fragments, adding one lag at a time.

    For variable 'v' and lags [1, 2] the result is ['v_1', 'v_1 + v_2'].
    An empty lag sequence gives an empty list.
    """
    lag_terms = []
    fragments = []
    for lag in lags_variable:
        lag_terms.append(f'{variable}_{lag}')
        fragments.append(' + '.join(lag_terms))
    return fragments


# Generate lagged combinations
lags_y_comb = lags_comb('otc_4_lag', lags_y)

# One candidate formula per cumulative lag set, each with the linear time trend term
formulas = [f'{dependent_variable} ~ time_trend + {terms}' for terms in lags_y_comb]

## Keep the best model formulas and AIC
# For each city in group 2, fit every candidate formula and remember the one
# with the lowest AIC. Failed fits are reported and skipped.
lst = []

for code in df2.co_ibge.unique():
    print(f"Processing {code}...")

    # Independent copy of the city's rows, with time_trend rebuilt as 0, 1, 2, ...
    data = df2[df2.co_ibge == code].copy()
    data = data.assign(time_trend=np.arange(len(data)))

    # Defaults used when no candidate can be fitted
    best_model, best_aic, best_formula = None, float('inf'), np.nan

    for formula in formulas:
        try:
            model = smf.glm(formula=formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()
            current_aic = model.aic

            if current_aic < best_aic:
                best_aic, best_model, best_formula = current_aic, model, formula

        except Exception as e:
            print(f"Skipping formula {formula} for {code}: {e}")
            continue  # Skip to the next formula

    # An untouched best_aic (still infinite) is stored as NaN
    stored_aic = np.nan if best_aic == float('inf') else best_aic
    data = data.assign(best_formula=best_formula, best_aic=stored_aic)
    lst.append(data)

lst_dfs_cities2 = lst  # Store results

#### Save predicted values, upper bound, and warning columns
#
# Refit the selected formula of each city, store fitted values / deviance
# residuals and their spreads, then derive the control limits and the
# out-of-limits flags. Cities without a valid model get NaN in every derived column.

lst = []

for data in lst_dfs_cities2:

    chosen_formula = data.best_formula.iloc[0]

    if pd.isna(chosen_formula):
        # If no valid model, create NaN columns
        data = data.assign(
                fitted_values_otc_only=np.nan,
                residuals_otc_only=np.nan,
                sigma_otc_only=np.nan,
                sigma__otc_only=np.nan,
                sigma_t_otc_only=np.nan
            )
    else:
        # Fit the model and calculate residuals
        model = smf.glm(formula=chosen_formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()
        fitted_values = model.fittedvalues
        residuals = model.resid_deviance

        # Overall spreads plus a rolling (5-observation) std of the fitted values
        data = data.assign(
                fitted_values_otc_only=fitted_values,
                residuals_otc_only=residuals,
                sigma_otc_only=np.std(residuals),
                sigma__otc_only=np.std(fitted_values),
                sigma_t_otc_only=fitted_values.rolling(window=5).std()
            )

    ### Control limits and out-of-limits detection
    if data['fitted_values_otc_only'].isna().all():
        # If no valid fitted values, assign NaNs
        data = data.assign(
                UCL_otc_only=np.nan,
                LCL_otc_only=np.nan,
                out_of_limits_above_otc_only=np.nan,
                out_of_limits_below_otc_only=np.nan
            )
    else:
        w = 1  # Default weight

        upper_limit = data.fitted_values_otc_only + w * data.sigma_t_otc_only
        lower_limit = data.fitted_values_otc_only - w * data.sigma_t_otc_only

        data = data.assign(
                UCL_otc_only=upper_limit,
                LCL_otc_only=lower_limit,
                out_of_limits_above_otc_only=(data['num_otc_ivas_4'] >= upper_limit).astype(int),
                out_of_limits_below_otc_only=(data['num_otc_ivas_4'] < lower_limit).astype(int)
            )

    lst.append(data)

lst_dfs_cities2 = lst  # Store final results


Processing 170389...
Processing 171430...
Processing 210055...
Processing 210375...
Processing 210927...
Processing 220460...
Processing 220779...
Processing 220810...
Processing 221065...
Processing 221110...
Processing 230495...
Processing 240320...
Processing 240400...
Processing 250077...
Processing 250650...
Processing 251450...
Processing 261430...
Processing 270740...
Processing 280700...
Processing 290160...
Processing 290530...
Processing 291090...
Processing 291733...
Processing 311020...
Processing 315570...
Processing 316230...
Processing 316470...
Processing 351610...
Processing 352115...
Processing 352780...
Processing 352860...
Processing 410300...
Processing 410304...
Processing 412520...
Processing 412862...
Processing 420250...
Processing 430205...
Processing 431308...
Processing 431339...
Processing 432252...
Processing 432375...
Processing 510885...
Processing 520396...
Processing 110090...
Processing 120013...
Processing 120033...
Processing 150280...
Processing 17

### Cities without trend and with seasonality

In [53]:
# Group 3: cities with significant seasonality (Friedman p < 0.05) but no
# significant trend (p >= 0.05) in the OTC series.

has_seasonality = final.p_value_otc_negbi_friedman < 0.05
no_trend = final.p_values_negbi_num_otc_ivas_4 >= 0.05
df3 = final[has_seasonality & no_trend]

# 5570 is the hard-coded denominator used for the percentage
n_cities = df3.co_ibge.nunique()
print('number of cities without trend and with sezonality', n_cities, 'percentage', round(n_cities*100/5570,1) )

number of cities without trend and with sezonality 409 percentage 7.3


In [54]:
# One frame per municipality of group 3, in order of first appearance
lst_dfs_cities3 = []

for code in df3.co_ibge.unique():
    lst_dfs_cities3.append(df3[df3.co_ibge == code])

# Add the harmonic component for the series; the formulas built afterwards use a
# 'Reconstructed' column, which is presumably created by this helper — TODO confirm.
lst_dfs_cities3 = early_warning_detection_functions.harmonic(lst_dfs_cities3, 'num_otc_ivas_4')

In [55]:
# Lags of the dependent variable offered to the model.
# NOTE(review): range(1, 4) yields lags 1, 2 and 3 only; otc_4_lag_4 exists in the data
# but is not used — confirm whether lag 4 was intended.
lags_y = range(1, 4)
dependent_variable = 'num_otc_ivas_4'  # dependent variable


def lags_comb(variable, lags_variable):
    """Build cumulative right-hand-side fragments, adding one lag at a time.

    For variable 'v' and lags [1, 2] the result is ['v_1', 'v_1 + v_2'].
    An empty lag sequence gives an empty list.
    """
    lag_terms = []
    fragments = []
    for lag in lags_variable:
        lag_terms.append(f'{variable}_{lag}')
        fragments.append(' + '.join(lag_terms))
    return fragments


# Generate lagged combinations
lags_y_comb = lags_comb('otc_4_lag', lags_y)

# One candidate formula per cumulative lag set, each with the seasonal 'Reconstructed' term
formulas = [f'{dependent_variable} ~ Reconstructed + {terms}' for terms in lags_y_comb]

## Keep the best model formulas and AIC
# For each city in group 3, fit every candidate formula and remember the one
# with the lowest AIC. Failed fits are reported and skipped.
lst = []

for data in lst_dfs_cities3:
    # Take the municipality code from the frame being processed. The loop iterates
    # over frames, so there is no per-iteration `code` variable here; using a
    # leftover `code` from another cell would label the messages with the wrong city.
    city_code = data.co_ibge.iloc[0]
    print(f"Processing {city_code}...")

    # Initialize placeholders
    best_model = None
    best_aic = float('inf')
    best_formula = np.nan  # Set default as NaN

    # Try fitting models
    for formula in formulas:
        try:
            model = smf.glm(formula=formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()
            current_aic = model.aic

            if current_aic < best_aic:
                best_aic = current_aic
                best_model = model
                best_formula = formula

        except Exception as e:
            print(f"Skipping formula {formula} for {city_code}: {e}")
            continue  # Skip to the next formula

    # Assign best formula and AIC (or NaN if model failed)
    data = data.assign(best_formula=best_formula, best_aic=best_aic if best_aic != float('inf') else np.nan)
    lst.append(data)

lst_dfs_cities3 = lst  # Store results


#### Save predicted values, upper bound, and warning columns
#
# Refit the selected formula of each city, store fitted values / deviance
# residuals and their spreads, then derive the control limits and the
# out-of-limits flags. Cities without a valid model get NaN in every derived column.

lst = []

for data in lst_dfs_cities3:

    chosen_formula = data.best_formula.iloc[0]

    if pd.isna(chosen_formula):
        # If no valid model, create NaN columns
        data = data.assign(
                fitted_values_otc_only=np.nan,
                residuals_otc_only=np.nan,
                sigma_otc_only=np.nan,
                sigma__otc_only=np.nan,
                sigma_t_otc_only=np.nan
            )
    else:
        # Fit the model and calculate residuals
        model = smf.glm(formula=chosen_formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()
        fitted_values = model.fittedvalues
        residuals = model.resid_deviance

        # Overall spreads plus a rolling (5-observation) std of the fitted values
        data = data.assign(
                fitted_values_otc_only=fitted_values,
                residuals_otc_only=residuals,
                sigma_otc_only=np.std(residuals),
                sigma__otc_only=np.std(fitted_values),
                sigma_t_otc_only=fitted_values.rolling(window=5).std()
            )

    ### Control limits and out-of-limits detection
    if data['fitted_values_otc_only'].isna().all():
        # If no valid fitted values, assign NaNs
        data = data.assign(
                UCL_otc_only=np.nan,
                LCL_otc_only=np.nan,
                out_of_limits_above_otc_only=np.nan,
                out_of_limits_below_otc_only=np.nan
            )
    else:
        w = 1  # Default weight

        upper_limit = data.fitted_values_otc_only + w * data.sigma_t_otc_only
        lower_limit = data.fitted_values_otc_only - w * data.sigma_t_otc_only

        data = data.assign(
                UCL_otc_only=upper_limit,
                LCL_otc_only=lower_limit,
                out_of_limits_above_otc_only=(data['num_otc_ivas_4'] >= upper_limit).astype(int),
                out_of_limits_below_otc_only=(data['num_otc_ivas_4'] < lower_limit).astype(int)
            )

    lst.append(data)

lst_dfs_cities3 = lst  # Store final results


Processing 290880...
Processing 291710...
Processing 310205...
Processing 311190...
Processing 312695...
Processing 313868...
Processing 313900...
Processing 316260...
Processing 320495...
Processing 330260...
Processing 351670...
Processing 351690...
Processing 353960...
Processing 411310...
Processing 412840...
Processing 420070...
Processing 421310...
Processing 421470...
Processing 430805...
Processing 520400...
Processing 530010...
Processing 230740...
Processing 240450...
Processing 270720...
Processing 292350...
Processing 311640...
Processing 313925...
Processing 315737...
Processing 320390...
Processing 330370...
Processing 350010...
Processing 350460...
Processing 350910...
Processing 351110...
Processing 351980...
Processing 354860...
Processing 410800...
Processing 420125...
Processing 421165...
Processing 421575...
Processing 421870...
Processing 431342...
Processing 432280...
Processing 171190...
Processing 270100...
Processing 311820...
Processing 314015...
Processing 31

### Cities with trend and with seasonality

In [56]:
# Group 4: cities with both a significant trend and significant seasonality
# (both p-values below 0.05) in the OTC series.

has_seasonality = final.p_value_otc_negbi_friedman < 0.05
has_trend = final.p_values_negbi_num_otc_ivas_4 < 0.05
df4 = final[has_seasonality & has_trend]

# 5570 is the hard-coded denominator used for the percentage
n_cities = df4.co_ibge.nunique()
print('number of cities with trend and sezonality', n_cities, 'percentage', round(n_cities*100/5570,1) )

number of cities with trend and sezonality 375 percentage 6.7


In [57]:
# One frame per municipality of group 4, in order of first appearance
lst_dfs_cities4 = []

for code in df4.co_ibge.unique():
    lst_dfs_cities4.append(df4[df4.co_ibge == code])

# Add the harmonic component for the series; the formulas built afterwards use a
# 'Reconstructed' column, which is presumably created by this helper — TODO confirm.
lst_dfs_cities4 = early_warning_detection_functions.harmonic(lst_dfs_cities4, 'num_otc_ivas_4')

In [58]:
# Lags of the dependent variable offered to the model.
# NOTE(review): range(1, 4) yields lags 1, 2 and 3 only; otc_4_lag_4 exists in the data
# but is not used — confirm whether lag 4 was intended.
lags_y = range(1, 4)
dependent_variable = 'num_otc_ivas_4'  # dependent variable


def lags_comb(variable, lags_variable):
    """Build cumulative right-hand-side fragments, adding one lag at a time.

    For variable 'v' and lags [1, 2] the result is ['v_1', 'v_1 + v_2'].
    An empty lag sequence gives an empty list.
    """
    lag_terms = []
    fragments = []
    for lag in lags_variable:
        lag_terms.append(f'{variable}_{lag}')
        fragments.append(' + '.join(lag_terms))
    return fragments


# Generate lagged combinations
lags_y_comb = lags_comb('otc_4_lag', lags_y)

# One candidate formula per cumulative lag set, each with the trend and the
# seasonal 'Reconstructed' terms
formulas = [f'{dependent_variable} ~ time_trend + Reconstructed + {terms}' for terms in lags_y_comb]

In [59]:
## Keep the best model formulas and AIC
# For each city in group 4, fit every candidate formula and remember the one
# with the lowest AIC. Failed fits are reported and skipped.
lst = []

for data in lst_dfs_cities4:
    # Take the municipality code from the frame being processed. The loop iterates
    # over frames, so there is no per-iteration `code` variable here; using a
    # leftover `code` from another cell would label the messages with the wrong city.
    city_code = data.co_ibge.iloc[0]
    print(f"Processing {city_code}...")

    # Rebuild the trend regressor as 0, 1, 2, ... over the city's rows
    data = data.assign(time_trend=np.arange(len(data)))

    # Initialize placeholders
    best_model = None
    best_aic = float('inf')
    best_formula = np.nan  # Set default as NaN

    # Try fitting models
    for formula in formulas:
        try:
            model = smf.glm(formula=formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()
            current_aic = model.aic

            if current_aic < best_aic:
                best_aic = current_aic
                best_model = model
                best_formula = formula

        except Exception as e:
            print(f"Skipping formula {formula} for {city_code}: {e}")
            continue  # Skip to the next formula

    # Assign best formula and AIC (or NaN if model failed)
    data = data.assign(best_formula=best_formula, best_aic=best_aic if best_aic != float('inf') else np.nan)
    lst.append(data)

lst_dfs_cities4 = lst  # Store results


Processing 110030...
Processing 130002...
Processing 221020...
Processing 240615...
Processing 261250...
Processing 280650...
Processing 310050...
Processing 313180...
Processing 314200...
Processing 351510...
Processing 354010...
Processing 355230...
Processing 411155...
Processing 430637...
Processing 170290...
Processing 240410...
Processing 310740...
Processing 311830...
Processing 315040...
Processing 316580...
Processing 350340...
Processing 350840...
Processing 354070...
Processing 355010...
Processing 420075...
Processing 430790...
Processing 431171...
Processing 520450...
Processing 231130...
Processing 231260...
Processing 231340...
Processing 240640...
Processing 250580...
Processing 260690...
Processing 292285...
Processing 292850...
Processing 292870...
Processing 312060...
Processing 312090...
Processing 312240...
Processing 312480...
Processing 313050...
Processing 314795...
Processing 315160...
Processing 316690...
Processing 317120...
Processing 350470...
Processing 35

In [60]:
#### Save predicted values, upper bound, and warning columns
#
# Refit the selected formula of each city, store fitted values / deviance
# residuals and their spreads, then derive the control limits and the
# out-of-limits flags. Cities without a valid model get NaN in every derived column.

lst = []

for data in lst_dfs_cities4:

    chosen_formula = data.best_formula.iloc[0]

    if pd.isna(chosen_formula):
        # If no valid model, create NaN columns
        data = data.assign(
                fitted_values_otc_only=np.nan,
                residuals_otc_only=np.nan,
                sigma_otc_only=np.nan,
                sigma__otc_only=np.nan,
                sigma_t_otc_only=np.nan
            )
    else:
        # Fit the model and calculate residuals
        model = smf.glm(formula=chosen_formula, data=data, family=sm.families.NegativeBinomial(alpha=1)).fit()
        fitted_values = model.fittedvalues
        residuals = model.resid_deviance

        # Overall spreads plus a rolling (5-observation) std of the fitted values
        data = data.assign(
                fitted_values_otc_only=fitted_values,
                residuals_otc_only=residuals,
                sigma_otc_only=np.std(residuals),
                sigma__otc_only=np.std(fitted_values),
                sigma_t_otc_only=fitted_values.rolling(window=5).std()
            )

    ### Control limits and out-of-limits detection
    if data['fitted_values_otc_only'].isna().all():
        # If no valid fitted values, assign NaNs
        data = data.assign(
                UCL_otc_only=np.nan,
                LCL_otc_only=np.nan,
                out_of_limits_above_otc_only=np.nan,
                out_of_limits_below_otc_only=np.nan
            )
    else:
        w = 1  # Default weight

        upper_limit = data.fitted_values_otc_only + w * data.sigma_t_otc_only
        lower_limit = data.fitted_values_otc_only - w * data.sigma_t_otc_only

        data = data.assign(
                UCL_otc_only=upper_limit,
                LCL_otc_only=lower_limit,
                out_of_limits_above_otc_only=(data['num_otc_ivas_4'] >= upper_limit).astype(int),
                out_of_limits_below_otc_only=(data['num_otc_ivas_4'] < lower_limit).astype(int)
            )

    lst.append(data)

lst_dfs_cities4 = lst  # Store final results



# Process and save data

In [61]:
len(lst_dfs_cities1) + len(lst_dfs_cities2) + len(lst_dfs_cities3) + len(lst_dfs_cities4)

5532

In [62]:
# Collect the output columns of every municipality, group 1 first and group 4 last
lst1 = []

# Identification columns, the raw series, the upper control limit and the alarm flag
output_columns = ['nm_uf', 'co_ibge7', 'nm_municipio', 'co_ibge',
                  'year_week', 'year_week_ts',
                  'num_otc_ivas',
                  'UCL_otc_only',
                  'out_of_limits_above_otc_only']

for data in lst_dfs_cities1 + lst_dfs_cities2 + lst_dfs_cities3 + lst_dfs_cities4:

    data = data[output_columns]

    lst1.append(data)

In [63]:
len(lst1)

5532

In [80]:
# Stack the per-municipality output frames into a single table.
# NOTE: this rebinds `final`, which the earlier group-selection cells read as the
# model input; re-running those cells after this one requires rebuilding `final` first.
final = pd.concat(lst1)

In [81]:
# Give the upper control limit and the alarm flag their output column names
output_names = {
    'UCL_otc_only': 'lim_esp_NegBi_otc_ivas',
    'out_of_limits_above_otc_only': 'sinal_NegBi_otc_ivas',
}
final = final.rename(columns=output_names)

In [82]:
final.columns

Index(['nm_uf', 'co_ibge7', 'nm_municipio', 'co_ibge', 'year_week',
       'year_week_ts', 'num_otc_ivas', 'lim_esp_NegBi_otc_ivas',
       'sinal_NegBi_otc_ivas'],
      dtype='object')

In [84]:
final

'2025-17'