# VRI anomaly detection bites

This notebook has the aim to study how to detect anomalies in the VRI computed by our models displayed here: https://labs.mosquitoalert.com/MosquitoAlertES/

Data gathered from models bites.

## Requirements

In [1]:
import pandas as pd
import geopandas as gpd
from prophet import Prophet
from prophet.plot import seasonality_plot_df
import os
from tqdm import tqdm

Importing plotly failed. Interactive plots will not work.


## Directories and Files

In [None]:
# * Base directory. # TODO: Change this to the cluster directory
DATA_DIR = os.path.join(os.getcwd(), 'data')

# * Input
INPUT_DIR = os.path.join(DATA_DIR, 'input')
# Bites data
BITES_DATA_DIR = os.path.join(INPUT_DIR, 'bites')
# GEO data
GEO_DATA_DIR = os.path.join(INPUT_DIR, 'geo')

# * Output
OUTPUT_DIR = os.path.join(DATA_DIR, 'output')
# Anomaly and seasonality output
ANOMALY_OUTPUT_DIR = os.path.join(OUTPUT_DIR, 'spain_activty_anomaly_bites.csv')
SEASONALITY_OUTPUT_DIR = os.path.join(OUTPUT_DIR, 'spain_seasonality_bites.csv')
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Dataset

In [None]:
# Collect all CSV file paths
files = [
    os.path.join(root, file)
    for root, _, files in os.walk(BITES_DATA_DIR)
    for file in files if file.endswith(".csv")
]

# Initialize an empty list to hold the data
dfs = []
# Loop through the files
for file in files:
    try:
        date = file.split("bites_")[1].split(".")[0]
        df_day = pd.read_csv(file)
        df_day["date"] = date
        dfs.append(df_day)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Create a DataFrame from the list of data
df = pd.concat(dfs, ignore_index=True)
del dfs

In [3]:
df

Unnamed: 0,laucode,est,date
0,4001,0.669,2023-08-31
1,4002,0.662,2023-08-31
2,4003,0.716,2023-08-31
3,4004,0.686,2023-08-31
4,4005,0.715,2023-08-31
...,...,...,...
15819370,26181,0.329,2024-10-20
15819371,26183,0.315,2024-10-20
15819372,53056,0.342,2024-10-20
15819373,51001,0.676,2024-10-20


In [4]:
# Rename columns for Prophet
df['ds'] = pd.to_datetime(df["date"])
df.rename(columns={"est": "y"}, inplace=True)

df.sort_values(by=['laucode', 'ds'], inplace=True, ignore_index=True)

# Keep only values for laucode, ds, y
df = df[['ds', 'laucode', 'y']]

df

Unnamed: 0,ds,laucode,y
0,2020-01-01,1001,0.153
1,2020-01-02,1001,0.189
2,2020-01-03,1001,0.189
3,2020-01-04,1001,0.189
4,2020-01-05,1001,0.189
...,...,...,...
15819370,2025-04-26,53083,0.189
15819371,2025-04-27,53083,0.189
15819372,2025-04-28,53083,0.189
15819373,2025-04-29,53083,0.189


In [None]:
import logging
logger = logging.getLogger('cmdstanpy')
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# Function to train a model and detect anomalies for each city
def detect_anomalies_for_city(city_data):
    group_name, city_df = city_data
    if (city_df['y'].isna()).all() or (city_df['y'] == 0).all():  # Skip if all original items are zero or NaN
        return None, None

    # The following code of false holidays is optional with the new data
    first_non_zero = city_df[city_df["y"] != 0].iloc[0]
    holidays_df = city_df[(city_df['y']==0) & (city_df['ds'] < first_non_zero['ds'])]['ds'].reset_index()
    holidays_df['holiday'] = 'no-prediction-yet'

    # Step 3: Initialize Prophet with logistic growth
    model = Prophet(growth='logistic', yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False, holidays=holidays_df[['ds','holiday']])
    city_df.loc[:,'cap'] = 1
    city_df.loc[:,'floor'] = 0
    model.fit(city_df)

    # Make predictions for historical data (no future periods).
    # This means that we are not predicting future values, but rather using the model to predict the historical data.
    future = model.make_future_dataframe(periods=0)
    future['cap'] = 1  # Ensure the future data has the cap
    future['floor'] = 0  # Ensure the future data has the floor
    forecast = model.predict(future)

    forecast['fact'] = city_df['y'].reset_index(drop = True)

    forecast['anomaly'] = 0
    forecast.loc[forecast['fact'] > forecast['yhat_upper'], 'anomaly'] = 1
    forecast.loc[forecast['fact'] < forecast['yhat_lower'], 'anomaly'] = -1

     #anomaly importances
    forecast['importance'] = 0.0
    forecast.loc[forecast['anomaly'] ==1, 'importance'] = \
        (forecast['fact'] - forecast['yhat_upper'])/forecast['fact']
    forecast.loc[forecast['anomaly'] ==-1, 'importance'] = \
        (forecast['yhat_lower'] - forecast['fact'])/forecast['fact']

    # Merge forecast with the original data
    city_df_forecast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance']]
    result_df = city_df[['laucode', 'ds']].merge(city_df_forecast, on='ds', how='left')

    # Seasonality component
    df_w = seasonality_plot_df(m=model, ds=pd.date_range(start='2017-01-01', periods=365))
    seas_df = model.predict_seasonal_components(df_w)
    yearly_df = seas_df['yearly'].reset_index()
    yearly_df.loc[:,'laucode'] = city_df.iloc[0]['laucode']

    return result_df, yearly_df

In [6]:
from concurrent.futures import ProcessPoolExecutor
import os
import math

# Apply the anomaly detection for each city in parallel
with ProcessPoolExecutor(max_workers=math.floor(max(os.cpu_count() * 0.8, 1))) as executor:
    results = list(
        tqdm(
            executor.map(
                detect_anomalies_for_city,
                df.groupby('laucode')
            ),
            total=len(
                df['laucode'].unique()
            )
        )
    )

100%|██████████| 8125/8125 [24:24<00:00,  5.55it/s]  


In [7]:
# Combine the results for all cities
result_df = df.merge(
    pd.concat([arr[0] for arr in results if arr is not None]),
    on=['laucode', 'ds'],
    how='left'
)
# Setting a 0 for the prediction value that hasn't been predicted because was all 0.
result_df[['yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance']] = result_df[['yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance']].fillna(0)
yearly_seasonality_df = pd.concat([arr[1] for arr in results if arr is not None])

In [None]:
result_df.to_csv(ANOMALY_OUTPUT_DIR, index=False)
yearly_seasonality_df.to_csv(SEASONALITY_OUTPUT_DIR, index=False)

## Part 2

In [13]:
result_df = pd.read_csv(ANOMALY_OUTPUT_DIR)
yearly_seasonality_df = pd.read_csv(SEASONALITY_OUTPUT_DIR)

In [14]:
result_df

Unnamed: 0,ds,laucode,y,yhat,yhat_lower,yhat_upper,trend,anomaly,importance
0,2020-01-01,1001,0.153,0.175428,0.121867,0.227070,0.292400,0.0,0.0
1,2020-01-02,1001,0.189,0.174842,0.124983,0.231949,0.292400,0.0,0.0
2,2020-01-03,1001,0.189,0.174227,0.122566,0.226680,0.292401,0.0,0.0
3,2020-01-04,1001,0.189,0.173565,0.122012,0.228036,0.292401,0.0,0.0
4,2020-01-05,1001,0.189,0.172842,0.123762,0.223997,0.292402,0.0,0.0
...,...,...,...,...,...,...,...,...,...
15819370,2025-04-26,53083,0.189,0.213411,0.156533,0.270624,0.311072,0.0,0.0
15819371,2025-04-27,53083,0.189,0.214290,0.155988,0.270703,0.310992,0.0,0.0
15819372,2025-04-28,53083,0.189,0.215089,0.155401,0.270857,0.310912,0.0,0.0
15819373,2025-04-29,53083,0.189,0.215835,0.158873,0.272770,0.310832,0.0,0.0


In [None]:
current_status_df = result_df.sort_values(
    by=['laucode', 'ds']
).groupby('laucode').apply(lambda x: x.iloc[-1])[['y', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance', 'ds']]

  ).groupby('gid_4').apply(lambda x: x.iloc[-1])[['y', 'yhat', 'yhat_lower', 'yhat_upper', 'trend', 'anomaly', 'importance', 'ds']]


In [11]:
current_status_df.reset_index(inplace=True)

In [None]:
current_status_df.rename(columns={'ds': 'last_update'}, inplace=True)
current_status_df['laucode'] = current_status_df['laucode'].astype(str)

### Load shapefiles & save geopackage

In [None]:
# peninsula_gdf = gpd.read_file('lineas_limite/SHP_ETRS89/recintos_municipales_inspire_peninbal_etrs89')
# peninsula_gdf = peninsula_gdf.to_crs(epsg=4326)

# canarias_gdf = gpd.read_file('lineas_limite/SHP_REGCAN95/recintos_municipales_inspire_canarias_regcan95')
# canarias_gdf = canarias_gdf.to_crs(epsg=4326)

# spain_gdf = gpd.GeoDataFrame(pd.concat([peninsula_gdf, canarias_gdf], ignore_index=True))
# spain_gdf['NAMEUNIT'] = spain_gdf['NAMEUNIT'].str.split('/').str[0]

europe_gdf = gpd.read_file('/home/gsanz/anomaly_detection/basemap/LAU_RG_01M_2023_4326.shp') # TODO: Change this
europe_gdf = europe_gdf.to_crs(epsg=4326)


In [None]:
europe_gdf

In [14]:
peninsula_ccaa_gdf = gpd.read_file('lineas_limite/SHP_ETRS89/recintos_autonomicas_inspire_peninbal_etrs89')
peninsula_ccaa_gdf = peninsula_ccaa_gdf.to_crs(epsg=4326)

canarias_ccaa_gdf = gpd.read_file('lineas_limite/SHP_REGCAN95/recintos_autonomicas_inspire_canarias_regcan95')
canarias_ccaa_gdf = canarias_ccaa_gdf.to_crs(epsg=4326)

spain_ccaa_gdf = gpd.GeoDataFrame(pd.concat([peninsula_ccaa_gdf, canarias_ccaa_gdf], ignore_index=True))
spain_ccaa_gdf['NAMEUNIT'] = spain_ccaa_gdf['NAMEUNIT'].str.split('/').str[0]

In [None]:
gadm4_gdf = gpd.read_file('/home/gsanz/anomaly_detection/basemap/gadm41_ESP.gpkg', layer='ADM_ADM_4')

In [16]:
gadm4_gdf['geometry'] = gadm4_gdf.representative_point()

In [None]:
# municipalities_gdf = gpd.sjoin(spain_gdf, gadm4_gdf, how="left")[[
#     'GID_4', 'NATCODE', 'NAMEUNIT', 'CODNUT2', 'geometry'
# ]]
municipalities_gdf = gpd.sjoin(europe_gdf, gadm4_gdf, how="left")[[
    'GISCO_ID', 'LAU_ID', 'LAU_NAME', 'geometry'
]]

In [None]:
# gdf = municipalities_gdf[['GID_4', 'NATCODE', 'NAMEUNIT', 'CODNUT2', 'geometry']].merge(
#     spain_ccaa_gdf[['NAMEUNIT', 'CODNUT2']].rename(columns={'NAMEUNIT': 'NAMEUNIT_NUT2'}),
#     on='CODNUT2',
#     how='inner'
# )
gdf = municipalities_gdf[['GISCO_ID', 'LAU_ID', 'LAU_NAME', 'geometry']].merge(
    europe_gdf[['LAUCODE', 'CODNUT2']].rename(columns={'NAMEUNIT': 'NAMEUNIT_NUT2'}),
    on='CODNUT2',
    how='inner'
)
gdf['NATCODE'] = gdf['NATCODE'].astype(int)

In [19]:
gdf

Unnamed: 0,GID_4,NATCODE,NAMEUNIT,CODNUT2,geometry,NAMEUNIT_NUT2
0,ESP.17.1.5.3_1,34033333022,Degaña,ES12,"MULTIPOLYGON (((-6.6574 42.96745, -6.64737 42....",Principado de Asturias
1,ESP.17.1.3.4_1,34033333023,El Franco,ES12,"MULTIPOLYGON (((-6.87709 43.56358, -6.87705 43...",Principado de Asturias
2,ESP.17.1.4.2_1,34033333024,Gijón,ES12,"MULTIPOLYGON (((-5.81929 43.50727, -5.8184 43....",Principado de Asturias
3,ESP.17.1.1.6_1,34033333025,Gozón,ES12,"MULTIPOLYGON (((-5.91545 43.60853, -5.91537 43...",Principado de Asturias
4,ESP.17.1.8.4_1,34033333026,Grado,ES12,"POLYGON ((-6.20021 43.18357, -6.20121 43.1856,...",Principado de Asturias
...,...,...,...,...,...,...
8335,ESP.14.2.1.3_1,34053838003,Alajeró,ES70,"MULTIPOLYGON (((-17.22374 28.0254, -17.22373 2...",Canarias
8336,ESP.14.2.1.4_1,34053838004,Arafo,ES70,"POLYGON ((-16.48414 28.33504, -16.48377 28.336...",Canarias
8337,ESP.14.2.1.5_1,34053838005,Arico,ES70,"MULTIPOLYGON (((-16.47283 28.10376, -16.47287 ...",Canarias
8338,ESP.14.2.1.6_1,34053838006,Arona,ES70,"MULTIPOLYGON (((-16.6964 28.00131, -16.69639 2...",Canarias


In [20]:
current_gdf = gdf.merge(current_status_df.rename(columns={'gid_4': 'GID_4'}), on='GID_4')
current_gdf.set_index('NATCODE', inplace=True)
current_gdf.drop(columns=['GID_4'], inplace=True)

In [21]:
historic_gdf = gpd.GeoDataFrame(
    result_df.merge(
        gdf[['NATCODE', 'GID_4']].rename(columns={'GID_4': 'gid_4'}),
        on='gid_4',
        how='inner'
    ).drop(columns=['gid_4']),
    geometry=None
)

In [22]:
gpk_path = 'output_bites.gpkg'
# Save the GeoPandas DataFrame (geometries)
current_gdf.to_file(gpk_path, layer='geometries', driver="GPKG")
historic_gdf.to_file(gpk_path, layer='histories', driver="GPKG")

In [23]:
gpd.GeoDataFrame(yearly_seasonality_df.merge(
        gdf[['NATCODE', 'GID_4']].rename(columns={'GID_4': 'gid_4'}),
        on='gid_4',
        how='inner'
    ).drop(columns=['gid_4']), geometry=None).to_file(gpk_path, layer='seasonality', driver="GPKG")