In [1]:
# Import packages

import pandas as pd
import pandas.testing as tm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import statsmodels.discrete.discrete_model as dm

from patsy import dmatrices
import statsmodels.graphics.tsaplots as tsa


from scipy.fft import fft, ifft, fftfreq

import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import acf

import itertools
from itertools import combinations, chain

from scipy.stats import pearsonr

import re

import functions

import early_warning_detection_functions

from datetime import datetime

import pymannkendall as mk

import math

import trend_timeseries

from scipy.stats import friedmanchisquare

from sklearn.metrics import r2_score

from pmdarima.preprocessing import FourierFeaturizer
from pmdarima.datasets import load_wineind
from sklearn.linear_model import LinearRegression


# Read data

In [2]:
# Input tables: the AIH table (warnings, aih_4 series, trend/seasonality test results)
# and the OTC/PHC table (atend_ivas / num_otc_ivas series and their lags).
# NOTE(review): both paths are absolute and user-specific, so the notebook only runs
# unchanged on the author's machine.
aih_path = '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Results/data_manuscript_warning_aih_imed_for_regre.parquet'
otc_phc_path = '/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Results/data_manuscript_otc_phc_imed.parquet'

df_aih = pd.read_parquet(aih_path)
df = pd.read_parquet(otc_phc_path)



In [3]:
# Columns of the OTC/PHC table carried into the regression data: the merge keys,
# calendar fields, the raw and `_4` series, and the lag-1..3 columns of both series.
otc_phc_columns = [
    'co_imed', 'year_week', 'year_week_ts', 'epidemi_cal_start', 'epidemi_cal_end',
    'atend_ivas', 'num_otc_ivas', 'atend_ivas_4', 'num_otc_ivas_4',
    'phc_4_lag_1', 'phc_4_lag_2', 'phc_4_lag_3',
    'otc_4_lag_1', 'otc_4_lag_2', 'otc_4_lag_3',
]
df2 = df[otc_phc_columns]

In [4]:
# Left join on region and week: every AIH row is kept, and the OTC/PHC columns
# are attached where a matching (co_imed, year_week) row exists.
data_aih = pd.merge(df_aih, df2, how='left', on=['co_imed', 'year_week'])

In [5]:
# Expose the unlagged series under the same `<prefix>_4_lag_<k>` naming as the lagged
# columns, so that lag 0 can be written in a formula like any other lag.
lag0_columns = {
    'phc_4_lag_0': data_aih['atend_ivas_4'],
    'otc_4_lag_0': data_aih['num_otc_ivas_4'],
    'aih_4_lag_0': data_aih['aih_4'],
}
data_aih = data_aih.assign(**lag0_columns)

# Run regressions - AIH and APS

### Cities without trend and seasonality

In [6]:
# Regions whose AIH series shows neither seasonality (Friedman p-value >= 0.05)
# nor trend (p_values_negbi_aih_4 >= 0.05).
no_seasonality = data_aih['p_value_aih_negbi_friedman'] >= 0.05
no_trend = data_aih['p_values_negbi_aih_4'] >= 0.05
df1 = data_aih[no_seasonality & no_trend]

# 510 is the hard-coded denominator for the share of regions in this group.
n_regions = df1['co_imed'].nunique()
print('number of imediate without trend and sezonality', n_regions, 'percentage', round(n_regions*100/510,1) )

number of imediate without trend and sezonality 438 percentage 85.9


In [7]:
# Lag ranges for the candidate regressors (range(a, b) stops at b - 1).
lags_y = range(1, 4)  # autoregressive lags 1..3 of the dependent series
lags_x = range(0, 4)  # lags 0..3 of the PHC series
dependent_variable = 'aih_4'


def lags_comb(variable, lags_variable):
    """Return nested lag terms for `variable` as formula-ready strings.

    Element i joins the first i + 1 names `<variable>_<lag>` with ' + ', e.g.
    ['v_1', 'v_1 + v_2', ...]. An empty `lags_variable` yields an empty list.
    """
    names = []
    nested_terms = []
    for lag in lags_variable:
        names.append(f'{variable}_{lag}')
        nested_terms.append(' + '.join(names))
    return nested_terms


# Nested term sets for the AIH autoregressive part and for the PHC predictor.
lags_y_comb = lags_comb('aih_4_lag', lags_y)
lags_x_comb = lags_comb('phc_4_lag', lags_x)

# One formula per (PHC terms, AIH terms) pair, with the PHC set varying slowest;
# inside a formula the AIH lags are written first.
formulas = [
    f'{dependent_variable} ~ ' + ' + '.join(part for part in (y_terms, x_terms) if part)
    for x_terms in lags_x_comb
    for y_terms in lags_y_comb
]

In [8]:
## For every region, keep the candidate formula with the lowest AIC

lst_dfs_cities1 = []

for code in df1['co_imed'].unique():
    region_data = df1.loc[df1['co_imed'] == code]
    # The last 30 rows are excluded from fitting; later cells forecast them.
    training_rows = region_data[:-30]

    best_aic, best_formula = float('inf'), None
    for candidate in formulas:
        # Negative-binomial GLM with the dispersion parameter fixed at 1.
        fit = smf.glm(
            formula=candidate,
            data=training_rows,
            family=sm.families.NegativeBinomial(alpha=1),
        ).fit()
        aic = fit.aic
        # Strict comparison: on ties the earlier candidate is kept.
        if aic < best_aic:
            best_aic, best_formula = aic, candidate

    # Store the winner on every row of the region's frame.
    lst_dfs_cities1.append(
        region_data.assign(best_formula=best_formula, best_aic=best_aic)
    )


In [9]:
# Refit each region's selected formula, attach fitted values / residuals / forecasts
# to its frame, and collect one row of model diagnostics per region.
lst = []
model_results = []  # One dict of model statistics per region

for data in lst_dfs_cities1:
    data = data.copy()  # Ensure no changes happen in place
    
    # Fit the selected negative-binomial GLM (alpha fixed at 1) on all rows but the last 30
    alpha = 1
    model = smf.glm(
        formula=data.best_formula.iloc[0], 
        data=data[:-30], 
        family=sm.families.NegativeBinomial(alpha=alpha)
    ).fit()
    
 
    # Create the output columns, filled with None until values are written below
    data["forecast_aih_4"] = None 
    data['fitted_values_aih_aps'] = None
    data['residuals_aih_aps'] = None
    
    # Write fitted values and deviance residuals into the training rows.
    # NOTE(review): this relies on DataFrame row-slice assignment (`data[:-30] = ...`);
    # an explicit .iloc/.loc assignment would make the intent clearer.
    data[:-30] = data[:-30].assign(
        fitted_values_aih_aps=model.fittedvalues,
        residuals_aih_aps=model.resid_deviance
           )
    
    # Predict the held-out last 30 rows from their own regressor values
    data[-30:] = data[-30:].assign(forecast_aih_4 = model.predict(data[-30:]))
    # Re-stacking the two parts with ignore_index=True renumbers the index from 0
    data = pd.concat([data[:-30], data[-30:]], ignore_index=True)

    # Extract key model statistics
    pseudo_r2 = model.pseudo_rsquared()
    p_values = model.pvalues
    conf_int = model.conf_int()
    
    # Spearman rank correlation between observed and forecast values on the last 30 rows
    res = stats.spearmanr(data[-30:]["aih_4"].to_numpy(), data[-30:]["forecast_aih_4"].to_numpy())

    # Save model statistics in a dictionary; p-values are keyed by model term
    model_results.append({
        "co_imed": data.co_imed.iloc[0],
        "pseudo_R2": pseudo_r2,
        "p_values": p_values.to_dict(),
        "conf_int": conf_int.values.tolist(),
        "log_likelihood": model.llf,
        "deviance": model.deviance,
        "pearson_chi2": model.pearson_chi2,
        'corre_forecasted': res.correlation,
        'p_value_corr_forecast': res.pvalue
    })

    lst.append(data)  # Append modified data to the list

# Update lst_dfs_cities1 with new DataFrames containing fitted values, residuals & forecasts
lst_dfs_cities1 = lst

# Convert model results into a DataFrame (one row per region)
df_model_results = pd.DataFrame(model_results)


In [10]:
# Spread the per-term p-value dicts into one column per model term.
df_pvalues = df_model_results["p_values"].apply(pd.Series)

# Put the p-value columns next to the remaining statistics; the dict column is dropped.
df_model_results1 = pd.concat(
    [df_model_results.drop("p_values", axis="columns"), df_pvalues],
    axis="columns",
)


In [11]:
# Distribution across regions of the Spearman correlation between observed and forecast
# aih_4 on the last 30 rows.
df_model_results1.corre_forecasted.describe()

count    438.000000
mean       0.898024
std        0.119186
min       -0.522505
25%        0.884775
50%        0.926183
75%        0.951764
max        0.991546
Name: corre_forecasted, dtype: float64

In [12]:
# Distribution across regions of the fitted models' pseudo R-squared.
df_model_results1.pseudo_R2.describe()

count    438.000000
mean       0.171757
std        0.037117
min        0.062237
25%        0.146757
50%        0.166820
75%        0.191329
max        0.351884
Name: pseudo_R2, dtype: float64

In [13]:
# Number of regions where the PHC lag-0 or lag-1 term has a p-value <= 0.05.
significant_phc = (df_model_results1['phc_4_lag_0'] <= 0.05) | (df_model_results1['phc_4_lag_1'] <= 0.05)
len(df_model_results1.loc[significant_phc])

29

In [14]:
# Share of regions in this group where the PHC lag-0 or lag-1 term has a p-value <= 0.05.
significant_phc = (df_model_results1['phc_4_lag_0'] <= 0.05) | (df_model_results1['phc_4_lag_1'] <= 0.05)
len(df_model_results1.loc[significant_phc]) / len(df_model_results1)

0.06621004566210045

### Cities with trend and without seasonality

In [15]:
# Regions whose AIH series shows a trend (p_values_negbi_aih_4 < 0.05) but no
# seasonality (Friedman p-value >= 0.05).
# NOTE(review): this rebinds `df2`, which earlier held the OTC/PHC column subset
# used in the merge; re-running the merge cell after this one would misbehave.
no_seasonality = data_aih['p_value_aih_negbi_friedman'] >= 0.05
has_trend = data_aih['p_values_negbi_aih_4'] < 0.05
df2 = data_aih[no_seasonality & has_trend]

# 510 is the hard-coded denominator for the share of regions in this group.
n_regions = df2['co_imed'].nunique()
print('number of imediate with trend and without sezonality', n_regions, 'percentage', round(n_regions*100/510,1) )

number of imediate with trend and without sezonality 31 percentage 6.1


In [16]:
# Lag ranges for the candidate regressors (range(a, b) stops at b - 1).
lags_y = range(1, 4)  # autoregressive lags 1..3 of the dependent series
lags_x = range(0, 4)  # lags 0..3 of the PHC series
dependent_variable = 'aih_4'


def lags_comb(variable, lags_variable):
    """Return nested lag terms for `variable` as formula-ready strings.

    Element i joins the first i + 1 names `<variable>_<lag>` with ' + ', e.g.
    ['v_1', 'v_1 + v_2', ...]. An empty `lags_variable` yields an empty list.
    """
    names = []
    nested_terms = []
    for lag in lags_variable:
        names.append(f'{variable}_{lag}')
        nested_terms.append(' + '.join(names))
    return nested_terms


# Nested term sets for the AIH autoregressive part and for the PHC predictor.
lags_y_comb = lags_comb('aih_4_lag', lags_y)
lags_x_comb = lags_comb('phc_4_lag', lags_x)

# Same grid of candidates as for the no-trend group, each with a leading
# `time_trend` regressor; the PHC set varies slowest.
formulas = [
    f'{dependent_variable} ~ time_trend + ' + ' + '.join(part for part in (y_terms, x_terms) if part)
    for x_terms in lags_x_comb
    for y_terms in lags_y_comb
]


In [18]:
## For every region, keep the candidate formula with the lowest AIC

lst_dfs_cities2 = []

for code in df2['co_imed'].unique():
    region_data = df2.loc[df2['co_imed'] == code]
    # The last 30 rows are excluded from fitting; later cells forecast them.
    training_rows = region_data[:-30]

    best_aic, best_formula = float('inf'), None
    for candidate in formulas:
        # Negative-binomial GLM with the dispersion parameter fixed at 1.
        fit = smf.glm(
            formula=candidate,
            data=training_rows,
            family=sm.families.NegativeBinomial(alpha=1),
        ).fit()
        aic = fit.aic
        # Strict comparison: on ties the earlier candidate is kept.
        if aic < best_aic:
            best_aic, best_formula = aic, candidate

    # Store the winner on every row of the region's frame.
    lst_dfs_cities2.append(
        region_data.assign(best_formula=best_formula, best_aic=best_aic)
    )


In [19]:
# Refit each region's selected formula, attach fitted values / residuals / forecasts
# to its frame, and collect one row of model diagnostics per region.
lst = []
model_results = []  # One dict of model statistics per region

for data in lst_dfs_cities2:
    data = data.copy()  # Ensure no changes happen in place
    
    # Fit the selected negative-binomial GLM (alpha fixed at 1) on all rows but the last 30
    alpha = 1
    model = smf.glm(
        formula=data.best_formula.iloc[0], 
        data=data[:-30], 
        family=sm.families.NegativeBinomial(alpha=alpha)
    ).fit()
    
 
    # Create the output columns, filled with None until values are written below
    data["forecast_aih_4"] = None 
    data['fitted_values_aih_aps'] = None
    data['residuals_aih_aps'] = None
    
    # Write fitted values and deviance residuals into the training rows.
    # NOTE(review): this relies on DataFrame row-slice assignment (`data[:-30] = ...`);
    # an explicit .iloc/.loc assignment would make the intent clearer.
    data[:-30] = data[:-30].assign(
        fitted_values_aih_aps=model.fittedvalues,
        residuals_aih_aps=model.resid_deviance
           )
    
    # Predict the held-out last 30 rows from their own regressor values
    data[-30:] = data[-30:].assign(forecast_aih_4 = model.predict(data[-30:]))
    # Re-stacking the two parts with ignore_index=True renumbers the index from 0
    data = pd.concat([data[:-30], data[-30:]], ignore_index=True)

    # Extract key model statistics
    pseudo_r2 = model.pseudo_rsquared()
    p_values = model.pvalues
    conf_int = model.conf_int()
    
    # Spearman rank correlation between observed and forecast values on the last 30 rows
    res = stats.spearmanr(data[-30:]["aih_4"].to_numpy(), data[-30:]["forecast_aih_4"].to_numpy())

    # Save model statistics in a dictionary; p-values are keyed by model term
    model_results.append({
        "co_imed": data.co_imed.iloc[0],
        "pseudo_R2": pseudo_r2,
        "p_values": p_values.to_dict(),
        "conf_int": conf_int.values.tolist(),
        "log_likelihood": model.llf,
        "deviance": model.deviance,
        "pearson_chi2": model.pearson_chi2,
        'corre_forecasted': res.correlation,
        'p_value_corr_forecast': res.pvalue
    })

    lst.append(data)  # Append modified data to the list

# Update lst_dfs_cities2 with new DataFrames containing fitted values, residuals & forecasts
lst_dfs_cities2 = lst

# Convert model results into a DataFrame (one row per region)
df_model_results = pd.DataFrame(model_results)


In [20]:
# Spread the per-term p-value dicts into one column per model term.
df_pvalues = df_model_results["p_values"].apply(pd.Series)

# Put the p-value columns next to the remaining statistics; the dict column is dropped.
df_model_results2 = pd.concat(
    [df_model_results.drop("p_values", axis="columns"), df_pvalues],
    axis="columns",
)


In [21]:
# Distribution across regions of the Spearman correlation between observed and forecast
# aih_4 on the last 30 rows.
df_model_results2.corre_forecasted.describe()

count    31.000000
mean      0.856747
std       0.167419
min       0.316902
25%       0.858796
50%       0.915740
75%       0.940555
max       0.980192
Name: corre_forecasted, dtype: float64

In [22]:
# Distribution across regions of the fitted models' pseudo R-squared.
df_model_results2.pseudo_R2.describe()

count    31.000000
mean      0.195104
std       0.055092
min       0.129772
25%       0.165635
50%       0.182437
75%       0.201881
max       0.413385
Name: pseudo_R2, dtype: float64

In [23]:
# Number of regions where the PHC lag-0 or lag-1 term has a p-value <= 0.05.
significant_phc = (df_model_results2['phc_4_lag_0'] <= 0.05) | (df_model_results2['phc_4_lag_1'] <= 0.05)
len(df_model_results2.loc[significant_phc])

4

In [24]:
# Share of regions in this group where the PHC lag-0 or lag-1 term has a p-value <= 0.05.
significant_phc = (df_model_results2['phc_4_lag_0'] <= 0.05) | (df_model_results2['phc_4_lag_1'] <= 0.05)
len(df_model_results2.loc[significant_phc]) / len(df_model_results2)

0.12903225806451613

### Cities without trend and with seasonality

In [25]:
# Regions whose AIH series shows seasonality (Friedman p-value < 0.05) but no
# trend (p_values_negbi_aih_4 >= 0.05).
has_seasonality = data_aih['p_value_aih_negbi_friedman'] < 0.05
no_trend = data_aih['p_values_negbi_aih_4'] >= 0.05
df3 = data_aih[has_seasonality & no_trend]

# 510 is the hard-coded denominator for the share of regions in this group.
n_regions = df3['co_imed'].nunique()
print('number of imed without trend and with sezonality', n_regions, 'percentage', round(n_regions*100/510,1) )

number of imed without trend and with sezonality 38 percentage 7.5


In [26]:
# One frame per region, in order of first appearance of its code.
region_frames = []
for code in df3['co_imed'].unique():
    region_frames.append(df3.loc[df3['co_imed'] == code])

# harmonic() is a project helper applied to the 'aih_4' series of every frame.
# The formulas built for this group reference a `Reconstructed` column, presumably
# added by this call — TODO confirm against early_warning_detection_functions.
lst_dfs_cities3 = early_warning_detection_functions.harmonic(region_frames, 'aih_4')

In [28]:
# Lag ranges for the candidate regressors (range(a, b) stops at b - 1).
lags_y = range(1, 4)  # autoregressive lags 1..3 of the dependent series
lags_x = range(0, 4)  # lags 0..3 of the PHC series
dependent_variable = 'aih_4'


def lags_comb(variable, lags_variable):
    """Return nested lag terms for `variable` as formula-ready strings.

    Element i joins the first i + 1 names `<variable>_<lag>` with ' + ', e.g.
    ['v_1', 'v_1 + v_2', ...]. An empty `lags_variable` yields an empty list.
    """
    names = []
    nested_terms = []
    for lag in lags_variable:
        names.append(f'{variable}_{lag}')
        nested_terms.append(' + '.join(names))
    return nested_terms


# Nested term sets for the AIH autoregressive part and for the PHC predictor.
lags_y_comb = lags_comb('aih_4_lag', lags_y)
lags_x_comb = lags_comb('phc_4_lag', lags_x)

# Same grid of candidates as for the no-seasonality group, each with a leading
# `Reconstructed` regressor; the PHC set varies slowest.
formulas = [
    f'{dependent_variable} ~ Reconstructed + ' + ' + '.join(part for part in (y_terms, x_terms) if part)
    for x_terms in lags_x_comb
    for y_terms in lags_y_comb
]

In [32]:
## For every region, keep the candidate formula with the lowest AIC

selected_frames = []

for region_data in lst_dfs_cities3:
    # The last 30 rows are excluded from fitting; later cells forecast them.
    training_rows = region_data[:-30]

    best_aic, best_formula = float('inf'), None
    for candidate in formulas:
        # Negative-binomial GLM with the dispersion parameter fixed at 1.
        fit = smf.glm(
            formula=candidate,
            data=training_rows,
            family=sm.families.NegativeBinomial(alpha=1),
        ).fit()
        aic = fit.aic
        # Strict comparison: on ties the earlier candidate is kept.
        if aic < best_aic:
            best_aic, best_formula = aic, candidate

    # Store the winner on every row of the region's frame.
    selected_frames.append(
        region_data.assign(best_formula=best_formula, best_aic=best_aic)
    )

lst_dfs_cities3 = selected_frames


In [33]:
# Refit each region's selected formula, attach fitted values / residuals / forecasts
# to its frame, and collect one row of model diagnostics per region.
lst = []
model_results = []  # One dict of model statistics per region

for data in lst_dfs_cities3:
    data = data.copy()  # Ensure no changes happen in place
    
    # Fit the selected negative-binomial GLM (alpha fixed at 1) on all rows but the last 30
    alpha = 1
    model = smf.glm(
        formula=data.best_formula.iloc[0], 
        data=data[:-30], 
        family=sm.families.NegativeBinomial(alpha=alpha)
    ).fit()
    
 
    # Create the output columns, filled with None until values are written below
    data["forecast_aih_4"] = None 
    data['fitted_values_aih_aps'] = None
    data['residuals_aih_aps'] = None
    
    # Write fitted values and deviance residuals into the training rows.
    # NOTE(review): this relies on DataFrame row-slice assignment (`data[:-30] = ...`);
    # an explicit .iloc/.loc assignment would make the intent clearer.
    data[:-30] = data[:-30].assign(
        fitted_values_aih_aps=model.fittedvalues,
        residuals_aih_aps=model.resid_deviance
           )
    
    # Predict the held-out last 30 rows from their own regressor values
    data[-30:] = data[-30:].assign(forecast_aih_4 = model.predict(data[-30:]))
    # Re-stacking the two parts with ignore_index=True renumbers the index from 0
    data = pd.concat([data[:-30], data[-30:]], ignore_index=True)

    # Extract key model statistics
    pseudo_r2 = model.pseudo_rsquared()
    p_values = model.pvalues
    conf_int = model.conf_int()
    
    # Spearman rank correlation between observed and forecast values on the last 30 rows
    res = stats.spearmanr(data[-30:]["aih_4"].to_numpy(), data[-30:]["forecast_aih_4"].to_numpy())

    # Save model statistics in a dictionary; p-values are keyed by model term
    model_results.append({
        "co_imed": data.co_imed.iloc[0],
        "pseudo_R2": pseudo_r2,
        "p_values": p_values.to_dict(),
        "conf_int": conf_int.values.tolist(),
        "log_likelihood": model.llf,
        "deviance": model.deviance,
        "pearson_chi2": model.pearson_chi2,
        'corre_forecasted': res.correlation,
        'p_value_corr_forecast': res.pvalue
    })

    lst.append(data)  # Append modified data to the list

# Update lst_dfs_cities3 with new DataFrames containing fitted values, residuals & forecasts
lst_dfs_cities3 = lst

# Convert model results into a DataFrame (one row per region)
df_model_results = pd.DataFrame(model_results)


In [34]:
# Spread the per-term p-value dicts into one column per model term.
df_pvalues = df_model_results["p_values"].apply(pd.Series)

# Put the p-value columns next to the remaining statistics; the dict column is dropped.
df_model_results3 = pd.concat(
    [df_model_results.drop("p_values", axis="columns"), df_pvalues],
    axis="columns",
)


In [35]:
# Distribution across regions of the Spearman correlation between observed and forecast
# aih_4 on the last 30 rows.
df_model_results3.corre_forecasted.describe()

count    38.000000
mean      0.834544
std       0.119078
min       0.481416
25%       0.789983
50%       0.883360
75%       0.927381
max       0.970634
Name: corre_forecasted, dtype: float64

In [36]:
# Distribution across regions of the fitted models' pseudo R-squared.
df_model_results3.pseudo_R2.describe()

count    38.000000
mean      0.202300
std       0.048317
min       0.129779
25%       0.168146
50%       0.190291
75%       0.225445
max       0.335128
Name: pseudo_R2, dtype: float64

In [37]:
# Number of regions where the PHC lag-0 or lag-1 term has a p-value <= 0.05.
significant_phc = (df_model_results3['phc_4_lag_0'] <= 0.05) | (df_model_results3['phc_4_lag_1'] <= 0.05)
len(df_model_results3.loc[significant_phc])

9

In [38]:
# Share of regions in this group where the PHC lag-0 or lag-1 term has a p-value <= 0.05.
significant_phc = (df_model_results3['phc_4_lag_0'] <= 0.05) | (df_model_results3['phc_4_lag_1'] <= 0.05)
len(df_model_results3.loc[significant_phc]) / len(df_model_results3)

0.23684210526315788

### Cities with trend and with seasonality

In [39]:
# Regions whose AIH series shows both seasonality (Friedman p-value < 0.05) and
# trend (p_values_negbi_aih_4 < 0.05).
has_seasonality = data_aih['p_value_aih_negbi_friedman'] < 0.05
has_trend = data_aih['p_values_negbi_aih_4'] < 0.05
df4 = data_aih[has_seasonality & has_trend]

# 510 is the hard-coded denominator for the share of regions in this group.
n_regions = df4['co_imed'].nunique()
print('number of imed with trend and sezonality', n_regions, 'percentage', round(n_regions*100/510,1) )

number of imed with trend and sezonality 3 percentage 0.6


In [40]:
# One frame per region, in order of first appearance of its code.
region_frames = []
for code in df4['co_imed'].unique():
    region_frames.append(df4.loc[df4['co_imed'] == code])

# harmonic() is a project helper applied to the 'aih_4' series of every frame.
# The formulas built for this group reference a `Reconstructed` column, presumably
# added by this call — TODO confirm against early_warning_detection_functions.
lst_dfs_cities4 = early_warning_detection_functions.harmonic(region_frames, 'aih_4')

In [41]:
# Lag ranges for the candidate regressors (range(a, b) stops at b - 1).
lags_y = range(1, 4)  # autoregressive lags 1..3 of the dependent series
lags_x = range(0, 4)  # lags 0..3 of the PHC series
dependent_variable = 'aih_4'


def lags_comb(variable, lags_variable):
    """Return nested lag terms for `variable` as formula-ready strings.

    Element i joins the first i + 1 names `<variable>_<lag>` with ' + ', e.g.
    ['v_1', 'v_1 + v_2', ...]. An empty `lags_variable` yields an empty list.
    """
    names = []
    nested_terms = []
    for lag in lags_variable:
        names.append(f'{variable}_{lag}')
        nested_terms.append(' + '.join(names))
    return nested_terms


# Nested term sets for the AIH autoregressive part and for the PHC predictor.
lags_y_comb = lags_comb('aih_4_lag', lags_y)
lags_x_comb = lags_comb('phc_4_lag', lags_x)

# Same grid of candidates as for the other groups, each with leading `time_trend`
# and `Reconstructed` regressors; the PHC set varies slowest.
formulas = [
    f'{dependent_variable} ~ time_trend + Reconstructed + ' + ' + '.join(part for part in (y_terms, x_terms) if part)
    for x_terms in lags_x_comb
    for y_terms in lags_y_comb
]

In [43]:
## Keep the best model formulas and AIC
lst = []

for data in lst_dfs_cities4:
    # Identify the region from the frame itself. The previous message used a stale
    # `code` variable left over from an earlier loop, so it named the wrong region.
    region_code = data.co_imed.iloc[0]
    print(f"Processing {region_code}...")

    # Rebuild time_trend as 0..n-1 over the region's rows.
    # NOTE(review): this overwrites any existing time_trend column; the trend-only
    # group uses the column as loaded — confirm the two are meant to differ.
    data = data.assign(time_trend=np.arange(len(data)))

    # Placeholders; best_formula stays NaN if no candidate can be fitted
    best_model = None
    best_aic = float('inf')
    best_formula = np.nan

    # Try every candidate and keep the one with the lowest AIC
    for formula in formulas:
        try:
            # Fit on the training rows only. The last 30 rows are the forecast
            # window used for evaluation in the next cell, so including them here
            # (as the previous version did) leaked them into model selection and
            # was inconsistent with the other three groups.
            model = smf.glm(formula=formula, data=data[:-30], family=sm.families.NegativeBinomial(alpha=1)).fit()
            current_aic = model.aic

            if current_aic < best_aic:
                best_aic = current_aic
                best_model = model
                best_formula = formula

        except Exception as e:
            # Best effort: report the failing candidate and move on to the next one
            print(f"Skipping formula {formula} for {region_code}: {e}")
            continue

    # Assign best formula and AIC (NaN for both if every candidate failed)
    data = data.assign(best_formula=best_formula, best_aic=best_aic if best_aic != float('inf') else np.nan)
    lst.append(data)

lst_dfs_cities4 = lst  # Store results

Processing 170009...
Processing 250011...
Processing 310054...


In [44]:
# Refit each region's selected formula, attach fitted values / residuals / forecasts
# to its frame, and collect one row of model diagnostics per region.
lst = []
model_results = []  # One dict of model statistics per region

for data in lst_dfs_cities4:
    data = data.copy()  # Ensure no changes happen in place
    
    # Fit the selected negative-binomial GLM (alpha fixed at 1) on all rows but the last 30.
    # NOTE(review): best_formula is NaN when the selection step could not fit any
    # candidate; this call would then fail — confirm that case cannot occur.
    alpha = 1
    model = smf.glm(
        formula=data.best_formula.iloc[0], 
        data=data[:-30], 
        family=sm.families.NegativeBinomial(alpha=alpha)
    ).fit()
    
 
    # Create the output columns, filled with None until values are written below
    data["forecast_aih_4"] = None 
    data['fitted_values_aih_aps'] = None
    data['residuals_aih_aps'] = None
    
    # Write fitted values and deviance residuals into the training rows.
    # NOTE(review): this relies on DataFrame row-slice assignment (`data[:-30] = ...`);
    # an explicit .iloc/.loc assignment would make the intent clearer.
    data[:-30] = data[:-30].assign(
        fitted_values_aih_aps=model.fittedvalues,
        residuals_aih_aps=model.resid_deviance
           )
    
    # Predict the held-out last 30 rows from their own regressor values
    data[-30:] = data[-30:].assign(forecast_aih_4 = model.predict(data[-30:]))
    # Re-stacking the two parts with ignore_index=True renumbers the index from 0
    data = pd.concat([data[:-30], data[-30:]], ignore_index=True)

    # Extract key model statistics
    pseudo_r2 = model.pseudo_rsquared()
    p_values = model.pvalues
    conf_int = model.conf_int()
    
    # Spearman rank correlation between observed and forecast values on the last 30 rows
    res = stats.spearmanr(data[-30:]["aih_4"].to_numpy(), data[-30:]["forecast_aih_4"].to_numpy())

    # Save model statistics in a dictionary; p-values are keyed by model term
    model_results.append({
        "co_imed": data.co_imed.iloc[0],
        "pseudo_R2": pseudo_r2,
        "p_values": p_values.to_dict(),
        "conf_int": conf_int.values.tolist(),
        "log_likelihood": model.llf,
        "deviance": model.deviance,
        "pearson_chi2": model.pearson_chi2,
        'corre_forecasted': res.correlation,
        'p_value_corr_forecast': res.pvalue
    })

    lst.append(data)  # Append modified data to the list

# Update lst_dfs_cities4 with new DataFrames containing fitted values, residuals & forecasts
lst_dfs_cities4 = lst

# Convert model results into a DataFrame (one row per region)
df_model_results = pd.DataFrame(model_results)


In [45]:
# Spread the per-term p-value dicts into one column per model term.
df_pvalues = df_model_results["p_values"].apply(pd.Series)

# Put the p-value columns next to the remaining statistics; the dict column is dropped.
df_model_results4 = pd.concat(
    [df_model_results.drop("p_values", axis="columns"), df_pvalues],
    axis="columns",
)


In [46]:
# Distribution across regions of the Spearman correlation between observed and forecast
# aih_4 on the last 30 rows.
df_model_results4.corre_forecasted.describe()

count    3.000000
mean     0.922295
std      0.006529
min      0.915572
25%      0.919136
50%      0.922701
75%      0.925656
max      0.928612
Name: corre_forecasted, dtype: float64

In [47]:
# Distribution across regions of the fitted models' pseudo R-squared.
df_model_results4.pseudo_R2.describe()

count    3.000000
mean     0.202850
std      0.042649
min      0.154394
25%      0.186929
50%      0.219463
75%      0.227078
max      0.234693
Name: pseudo_R2, dtype: float64

In [49]:
# Number of regions in this group whose PHC lag-0 term has a p-value <= 0.05.
significant_phc = df_model_results4['phc_4_lag_0'] <= 0.05
len(df_model_results4.loc[significant_phc])

0

In [50]:
# Show the full diagnostics table for this group; the columns after
# p_value_corr_forecast hold the p-value of each model term.
df_model_results4

Unnamed: 0,co_imed,pseudo_R2,conf_int,log_likelihood,deviance,pearson_chi2,corre_forecasted,p_value_corr_forecast,Intercept,time_trend,Reconstructed,aih_4_lag_1,phc_4_lag_0
0,170009,0.219463,"[[-1.074425704742235, 1.0387555001696311], [-0...",-150.574984,6.221183,4.498652,0.928612,1.407848e-13,0.973608,0.293805,0.64975,0.118323,0.092807
1,250011,0.234693,"[[-0.12941588835031725, 1.5784392648797514], [...",-193.601887,8.594457,4.087071,0.915572,1.358358e-12,0.096328,0.345057,0.977644,0.493401,0.122984
2,310054,0.154394,"[[0.40164719425602535, 2.457195696654435], [-0...",-196.074136,10.704596,6.91705,0.922701,4.131869e-13,0.006413,0.583266,0.319958,0.163316,0.102974


# Process and save data

In [51]:
# Sanity check: total number of regional frames across the four groups. It should equal
# the 510 used as the denominator in the percentage printouts.
len(lst_dfs_cities1) + len(lst_dfs_cities2) + len(lst_dfs_cities3) + len(lst_dfs_cities4)

510

In [52]:
# Inspect the columns of one processed regional frame (first region of group 1).
lst_dfs_cities1[0].columns

       'aih_4_lag_3', 'time_trend', 'coef_negbi_aih_4', 'std_err_negbi_aih_4',
       'z_negbi_aih_4', 'p_values_negbi_aih_4', 'IC_low_negbi_aih_4',
       'IC_high_negbi_aih_4', 'trend_line_negbi_aih_4', 'dtrend_aih_negbi',
       'p_value_aih_negbi_friedman', 'test_stat_aih_negbi_friedman',
       'year_week_ts', 'epidemi_cal_start', 'epidemi_cal_end', 'atend_ivas',
       'num_otc_ivas', 'atend_ivas_4', 'num_otc_ivas_4', 'phc_4_lag_1',
       'phc_4_lag_2', 'phc_4_lag_3', 'otc_4_lag_1', 'otc_4_lag_2',
       'otc_4_lag_3', 'phc_4_lag_0', 'otc_4_lag_0', 'aih_4_lag_0',
       'best_formula', 'best_aic', 'forecast_aih_4', 'fitted_values_aih_aps',
       'residuals_aih_aps'],
      dtype='object')

In [53]:
# Columns kept in the exported data set, in output order. The lag columns and time_trend
# are left out.
# NOTE(review): `lst` was used above for lists of DataFrames; here it is rebound to a list
# of column names, which the next cell reads.
lst = ['co_imed', 'year_week', 'warning_aih', 'n',
       'warning_aih_without_isolated', 'warning_aih_corect_with_consec',
       'warning_final_aih', 'aih_4', 'coef_negbi_aih_4', 'std_err_negbi_aih_4',
       'z_negbi_aih_4', 'p_values_negbi_aih_4', 'IC_low_negbi_aih_4',
       'IC_high_negbi_aih_4', 'trend_line_negbi_aih_4', 'dtrend_aih_negbi',
       'p_value_aih_negbi_friedman', 'test_stat_aih_negbi_friedman',
       'year_week_ts', 'epidemi_cal_start', 'epidemi_cal_end', 'atend_ivas',
       'num_otc_ivas', 'atend_ivas_4', 'num_otc_ivas_4', 
       'best_formula', 'best_aic', 'forecast_aih_4', 'fitted_values_aih_aps',
       'residuals_aih_aps']

In [54]:
# Restrict every regional frame to the export columns in `lst`, keeping the
# groups in order 1 -> 4 and the regions in their order within each group.
lst1 = [
    frame[lst]
    for group in (lst_dfs_cities1, lst_dfs_cities2, lst_dfs_cities3, lst_dfs_cities4)
    for frame in group
]

In [55]:
# Stack all regional frames into one table. ignore_index is not passed, so each
# frame's own index values are kept and repeat across regions.
final = pd.concat(lst1)

In [58]:
# Confirm the column set of the table about to be written.
final.columns

       'z_negbi_aih_4', 'p_values_negbi_aih_4', 'IC_low_negbi_aih_4',
       'IC_high_negbi_aih_4', 'trend_line_negbi_aih_4', 'dtrend_aih_negbi',
       'p_value_aih_negbi_friedman', 'test_stat_aih_negbi_friedman',
       'year_week_ts', 'epidemi_cal_start', 'epidemi_cal_end', 'atend_ivas',
       'num_otc_ivas', 'atend_ivas_4', 'num_otc_ivas_4', 'best_formula',
       'best_aic', 'forecast_aih_4', 'fitted_values_aih_aps',
       'residuals_aih_aps'],
      dtype='object')

In [59]:
# Write the per-region fitted values and forecasts to the project's Results folder.
# NOTE(review): absolute, user-specific path — only valid on the author's machine.
final.to_parquet('/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Results/data_manuscript_predict_aih_with_aps_imed.parquet')

In [66]:
# Stack the diagnostics of the four groups. The term p-value columns differ between groups
# (e.g. time_trend, Reconstructed), so terms absent from a group's models are presumably
# filled with NaN by concat — verify if downstream code relies on them.
final_model_results = pd.concat([df_model_results1, df_model_results2, df_model_results3, df_model_results4])

In [67]:
# Write the per-region model diagnostics to the project's Results folder.
# NOTE(review): absolute, user-specific path — only valid on the author's machine.
final_model_results.to_parquet('/Users/julianeoliveira/Documents/github/Bivariate_Anomaly_Detection_Primary_Health_Care_Drug_Selling_ILI_surveillance/Results/data_manuscript_model_results_predict_aih_with_aps_imed.parquet')
