# VRI anomaly detection

This notebook has the aim to study how to detect anomalies in the VRI computed by our models displayed here: https://labs.mosquitoalert.com/MosquitoAlertES/
Data gathered from https://github.com/Mosquito-Alert/MosquitoAlertES/

## Requirements

In [1]:
import pandas as pd
from prophet import Prophet
from prophet.plot import seasonality_plot_df
import os
import pandas as pd
import json
from datetime import datetime
import re
import geopandas as gpd
from tqdm import tqdm
from scipy.signal import savgol_filter

Importing plotly failed. Interactive plots will not work.


## Dataset

In [2]:
VRI_DATA_PATH = './MosquitoAlertES/data/'

# List all files in the directory
files = os.listdir(VRI_DATA_PATH)

# Initialize an empty list to hold the data
data = []
today = datetime.today().date()
# Loop through the files
for file in files:
    # Check if the file matches the desired format
    if file.startswith('muni_preds_') and file.endswith('.json'):
        try:
            # Extract the date from the filename
            date_str = re.search(r'(\d{4}-\d{2}-\d{2})', file)

            # Skip the file if no date is found
            if date_str is None:
                continue
            
            date = datetime.strptime(date_str.group(1), '%Y-%m-%d').date()

            if date > today:
                continue

            # Read the JSON file
            with open(os.path.join(VRI_DATA_PATH, file), 'r') as f:
                file_data = json.load(f)

            # Add the date and select the desired columns
            for entry in file_data:
                # Select only the required fields and add the date
                data.append({
                    'date': date,
                    'NATCODE': entry.get('NATCODE'),
                    'NAMEUNIT': entry.get('NAMEUNIT'),
                    'ma_prob_mean': entry.get('ma_prob_mean')
                })

        except Exception as e:
            print(f"Error processing file {file}: {e}")

# Create a DataFrame from the list of data
df = pd.DataFrame(data)
del data

In [3]:
df

Unnamed: 0,date,NATCODE,NAMEUNIT,ma_prob_mean
0,2023-09-22,34063939084,Solórzano,0.0000
1,2023-09-22,34074949157,Pino del Oro,0.0000
2,2023-09-22,34063939085,Suances,0.0000
3,2023-09-22,34074949158,El Piñero,0.0000
4,2023-09-22,34063939086,Los Tojos,0.0000
...,...,...,...,...
19785981,2024-06-23,34205454001,Isla del Perejil,0.3106
19785982,2024-06-23,34205454002,Peñón de Vélez de la Gomera,0.3793
19785983,2024-06-23,34205454003,Islas Chafarinas,0.3793
19785984,2024-06-23,34205454004,Islas Alhucemas,0.3793


In [4]:
# Rename columns for Prophet
df.rename(columns={"date": "ds"}, inplace=True)
df['ds'] = pd.to_datetime(df['ds'])  # Ensure 'ds' is datetime

df.sort_values(by=['NAMEUNIT', 'ds'], inplace=True, ignore_index=True)

# Apply Savitzky-Golay filter with safeguards
def smooth_group(group):
    if len(group) >= 7:  # Check if the group is large enough for the window_length
        group['y'] = savgol_filter(group['ma_prob_mean'], window_length=7, polyorder=2)
    else:
        group['y'] = group['ma_prob_mean']  # If too small, fallback to original values
    return group

df = df.groupby('NATCODE', group_keys=False).apply(smooth_group)

df

  df = df.groupby('NATCODE', group_keys=False).apply(smooth_group)


Unnamed: 0,ds,NATCODE,NAMEUNIT,ma_prob_mean,y
0,2018-01-01,34123232003,A Arnoia,0.0,0.0
1,2018-01-02,34123232003,A Arnoia,0.0,0.0
2,2018-01-03,34123232003,A Arnoia,0.0,0.0
3,2018-01-04,34123232003,A Arnoia,0.0,0.0
4,2018-01-05,34123232003,A Arnoia,0.0,0.0
...,...,...,...,...,...
19785981,2024-11-10,34070909398,Úrbel del Castillo,0.0,0.0
19785982,2024-11-11,34070909398,Úrbel del Castillo,0.0,0.0
19785983,2024-11-12,34070909398,Úrbel del Castillo,0.0,0.0
19785984,2024-11-13,34070909398,Úrbel del Castillo,0.0,0.0


In [5]:
import logging
logger = logging.getLogger('cmdstanpy')
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# Function to train a model and detect anomalies for each city
def detect_anomalies_for_city(city_data):
    group_name, city_df = city_data
    if (city_df['y'] == 0).all():  # Skip if all original items are 0
        return None, None

    first_non_zero = city_df[city_df["y"] != 0].iloc[0]
    holidays_df = city_df[(city_df['y']==0) & (city_df['ds'] < first_non_zero['ds'])]['ds'].reset_index()
    holidays_df['holiday'] = 'no-prediction-yet'
    
    # Step 3: Initialize Prophet with logistic growth
    model = Prophet(growth='logistic', yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False, holidays=holidays_df[['ds','holiday']])
    city_df.loc[:,'cap'] = 1
    city_df.loc[:,'floor'] = 0
    model.fit(city_df)
    
    # Make predictions for historical data (no future periods)
    future = model.make_future_dataframe(periods=0)
    future['cap'] = 1  # Ensure the future data has the cap
    future['floor'] = 0  # Ensure the future data has the floor
    forecast = model.predict(future)

    forecast['fact'] = city_df['y'].reset_index(drop = True)

    forecast['anomaly'] = 0
    forecast.loc[forecast['fact'] > forecast['yhat_upper'], 'anomaly'] = 1
    forecast.loc[forecast['fact'] < forecast['yhat_lower'], 'anomaly'] = -1

     #anomaly importances
    forecast['importance'] = 0.0
    forecast.loc[forecast['anomaly'] ==1, 'importance'] = \
        (forecast['fact'] - forecast['yhat_upper'])/forecast['fact']
    forecast.loc[forecast['anomaly'] ==-1, 'importance'] = \
        (forecast['yhat_lower'] - forecast['fact'])/forecast['fact']

    # Merge forecast with the original data
    city_df_forecast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance']]
    result_df = city_df[['NATCODE', 'ds']].merge(city_df_forecast, on='ds', how='left')

    # Seasonality component
    df_w = seasonality_plot_df(m=model, ds=pd.date_range(start='2017-01-01', periods=365))
    seas_df = model.predict_seasonal_components(df_w)
    yearly_df = seas_df['yearly'].reset_index()
    yearly_df.loc[:,'NATCODE'] = city_df.iloc[0]['NATCODE']

    return result_df, yearly_df

In [6]:
from concurrent.futures import ProcessPoolExecutor
import os
import math

# Apply the anomaly detection for each city in parallel
with ProcessPoolExecutor(max_workers=math.floor(max(os.cpu_count() * 0.8, 1))) as executor:
    results = list(
        tqdm(
            executor.map(
                detect_anomalies_for_city,
                df.groupby('NATCODE')
            ), 
            total=len(
                df['NAMEUNIT'].unique()
            )
        )
    )


Optimization terminated abnormally. Falling back to Newton.
Optimization terminated abnormally. Falling back to Newton.
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
Optimization terminated abnormally. Falling back to Newton.
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t - m_t)))
  return cap / (1 + np.exp(-k_t * (t -

In [7]:
# Combine the results for all cities
result_df = df.merge(
    pd.concat([arr[0] for arr in results if arr is not None]),
    on=['NATCODE', 'ds'], 
    how='left'
)
# Setting a 0 for the prediction value that hasn't been predicted because was all 0.
result_df[['yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance']] = result_df[['yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance']].fillna(0)
yearly_seasonality_df = pd.concat([arr[1] for arr in results if arr is not None])

In [8]:
result_df.to_csv('spain_activty_anomaly.csv', index=False)
yearly_seasonality_df.to_csv('spain_seasonality.csv', index=False)

## Part 2

In [9]:
# result_df = pd.read_csv('./spain_activty_anomaly.csv')
# yearly_seasonality_df = pd.read_csv('./spain_seasonality.csv')

In [10]:
result_df

Unnamed: 0,ds,NATCODE,NAMEUNIT,ma_prob_mean,y,yhat,yhat_lower,yhat_upper,trend,anomaly,importance
0,2018-01-01,34123232003,A Arnoia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-01-02,34123232003,A Arnoia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-01-03,34123232003,A Arnoia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2018-01-04,34123232003,A Arnoia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-01-05,34123232003,A Arnoia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
19785981,2024-11-10,34070909398,Úrbel del Castillo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19785982,2024-11-11,34070909398,Úrbel del Castillo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19785983,2024-11-12,34070909398,Úrbel del Castillo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19785984,2024-11-13,34070909398,Úrbel del Castillo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
current_status_df = result_df.sort_values(
    by=['NATCODE', 'ds']
).groupby('NATCODE').apply(lambda x: x.iloc[-1])[['y', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance', 'ds']]

  ).groupby('NATCODE').apply(lambda x: x.iloc[-1])[['y', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance', 'ds']]


In [12]:
current_status_df.reset_index(inplace=True)

In [13]:
current_status_df.rename(columns={'ds': 'last_update'}, inplace=True)
current_status_df['NATCODE'] = current_status_df['NATCODE'].astype(int)

### Load shapefiles & save geopackage

In [14]:
municipalities_gdf = gpd.read_file('lineas_limite/SHP_ETRS89/recintos_municipales_inspire_peninbal_etrs89')
municipalities_gdf['NAMEUNIT'] = municipalities_gdf['NAMEUNIT'].str.split('/').str[0]
ccaa_gdf = gpd.read_file('lineas_limite/SHP_ETRS89/recintos_autonomicas_inspire_peninbal_etrs89')
ccaa_gdf['NAMEUNIT'] = ccaa_gdf['NAMEUNIT'].str.split('/').str[0]

In [15]:
gdf = municipalities_gdf[['NATCODE', 'NAMEUNIT', 'CODNUT2', 'geometry']].merge(
    ccaa_gdf[['NAMEUNIT', 'CODNUT2']].rename(columns={'NAMEUNIT': 'NAMEUNIT_NUT2'}),
    on='CODNUT2',
    how='inner'
)
gdf['NATCODE'] = gdf['NATCODE'].astype(int)

In [16]:
current_gdf = gdf.merge(current_status_df, on='NATCODE')
current_gdf.set_index('NATCODE', inplace=True)

In [17]:
historic_gdf = gpd.GeoDataFrame(result_df.drop(columns=['NAMEUNIT']), geometry=None)

In [18]:
gpk_path = 'output.gpkg'
# Save the GeoPandas DataFrame (geometries)
current_gdf.to_file(gpk_path, layer='geometries', driver="GPKG")
historic_gdf.to_file(gpk_path, layer='histories', driver="GPKG")

In [19]:
gpd.GeoDataFrame(yearly_seasonality_df, geometry=None).to_file(gpk_path, layer='seasonality', driver="GPKG")