
# Anomaly Detection in Colombian Water Level Time Series

This notebook demonstrates a complete pipeline for detecting anomalies in water level data from rivers or reservoirs in Colombia. It includes:
- Synthetic data generation with seasonal patterns and anomalies
- Preprocessing with imputation
- Seasonal decomposition using STL
- Anomaly detection using Z-score, Isolation Forest, and an autoencoder
- Visual comparisons

## Installation
Run the following command in your environment to install the required packages:
```bash
pip install pandas numpy matplotlib scipy statsmodels scikit-learn tensorflow
```


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from statsmodels.tsa.seasonal import STL
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import set_random_seed


In [None]:

# Generate synthetic water level data
np.random.seed(42)
date_range = pd.date_range(start='2020-01-01', end='2022-12-31', freq='D')

# Annual sinusoid around a baseline of 5, plus Gaussian noise (sigma = 0.3).
water_level = 5 + 2 * np.sin(2 * np.pi * date_range.dayofyear.to_numpy() / 365)
water_level = water_level + np.random.normal(0, 0.3, len(date_range))

# Inject a block of missing readings and two point anomalies
# (one high spike, one low dip) at known positions of the full daily grid.
water_level[100:110] = np.nan
water_level[400] = 10
water_level[800] = 1

# Simulate an irregular record by removing a few days entirely.
# The SAME positions are removed from both the dates and the values so that
# every remaining reading stays attached to its own timestamp; truncating the
# value array from the end instead would shift all later readings (and the
# injected anomalies) onto the wrong dates.
missing_days = [50, 51, 52, 300, 301]
irregular_dates = date_range.delete(missing_days)
df = pd.DataFrame({
    'timestamp': irregular_dates,
    'water_level': np.delete(water_level, missing_days),
})


In [None]:

# Preprocessing and imputation
# Move 'timestamp' into the index without mutating in place. The guard keeps
# this cell re-runnable: on a second execution 'timestamp' is already the
# index, and an unconditional set_index would raise KeyError.
if 'timestamp' in df.columns:
    df = df.assign(timestamp=pd.to_datetime(df['timestamp'])).set_index('timestamp')

# Reindex onto a strict daily grid so days absent from the record show up as
# NaN rows that the imputers below can fill.
df = df.resample('D').asfreq()

df['linear_interp'] = df['water_level'].interpolate(method='linear')
# Series.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
df['ffill'] = df['water_level'].ffill()

# Rows with no observed value (either NaN readings or days missing entirely).
missing_mask = df['water_level'].isna()

plt.figure(figsize=(12, 4))
plt.plot(df.index, df['linear_interp'], label='Linear Interpolation')
plt.plot(df.index, df['ffill'], label='Forward Fill', alpha=0.7)
# Missing days are marked on a fixed horizontal line at y = 5.
plt.scatter(df.index[missing_mask], [5] * int(missing_mask.sum()), color='red', label='Missing')
plt.title('Imputation Comparison')
plt.xlabel('Date')
plt.ylabel('Water Level')
plt.legend()
plt.tight_layout()
plt.show()

# Use linear interpolation for further analysis
df['imputed'] = df['linear_interp']


In [None]:

# STL decomposition
# Split the imputed series into trend, seasonal and residual parts using a
# yearly (365-day) period, and store each part as a column of df.
stl = STL(df['imputed'], period=365)
res = stl.fit()
for component in ('trend', 'seasonal', 'resid'):
    df[component] = getattr(res, component)

# One panel per series: (column, panel title, extra line styling).
panels = [
    ('imputed', 'Observed', {}),
    ('trend', 'Trend', {'color': 'orange'}),
    ('seasonal', 'Seasonal', {'color': 'green'}),
    ('resid', 'Residual', {'color': 'red'}),
]

fig, axs = plt.subplots(4, 1, figsize=(12, 8), sharex=True)
for ax, (column, title, line_style) in zip(axs, panels):
    ax.plot(df.index, df[column], **line_style)
    ax.set_title(title)
plt.tight_layout()
plt.show()


In [None]:

# Anomaly detection on the STL residual
# Tunable thresholds for the three detectors.
Z_THRESHOLD = 3        # flag |z| above this value
CONTAMINATION = 0.01   # expected anomaly fraction for Isolation Forest
AE_PERCENTILE = 99     # flag reconstruction errors above this percentile

# All three detectors share the same NaN-free residual (NaN replaced by 0).
df['resid_filled'] = df['resid'].fillna(0)

# Z-score
df['z_score'] = np.abs(stats.zscore(df['resid_filled']))
df['anomaly_z'] = df['z_score'] > Z_THRESHOLD

# Isolation Forest
iso = IsolationForest(contamination=CONTAMINATION, random_state=42)
df['anomaly_iforest'] = iso.fit_predict(df[['resid_filled']]) == -1

# Autoencoder
# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling (and therefore the flagged points) are repeatable across runs.
set_random_seed(42)
scaler = MinMaxScaler()
resid_scaled = scaler.fit_transform(df[['resid_filled']])
# NOTE(review): the input is 1-dimensional while every hidden layer is wider
# (16/8/16), so the network has no compressing bottleneck.
model = Sequential([
    Dense(16, activation='relu', input_shape=(1,)),
    Dense(8, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')
# No validation data is passed to fit(), so early stopping must watch the
# training loss; the default monitor ('val_loss') is never available here and
# the callback would silently never trigger.
early_stop = EarlyStopping(monitor='loss', patience=5)
model.fit(resid_scaled, resid_scaled, epochs=50, batch_size=32, verbose=0, callbacks=[early_stop])
reconstructions = model.predict(resid_scaled, verbose=0)
# Per-sample reconstruction error; the top (100 - AE_PERCENTILE)% are flagged.
mse = np.mean(np.power(resid_scaled - reconstructions, 2), axis=1)
thresh = np.percentile(mse, AE_PERCENTILE)
df['anomaly_autoencoder'] = mse > thresh


In [None]:

# Plot anomaly detection results
# One panel per detector: the residual series with that detector's flagged
# points overlaid. Each entry is (flag column, marker colour, panel title).
detectors = [
    ('anomaly_z', 'red', 'Z-score Anomalies'),
    ('anomaly_iforest', 'purple', 'Isolation Forest Anomalies'),
    ('anomaly_autoencoder', 'orange', 'Autoencoder Anomalies'),
]

fig, axs = plt.subplots(3, 1, figsize=(12, 10), sharex=True)
for ax, (flag_column, marker_color, title) in zip(axs, detectors):
    flagged = df[flag_column]
    ax.plot(df.index, df['resid'])
    ax.scatter(df.index[flagged], df['resid'][flagged], color=marker_color)
    ax.set_title(title)
plt.tight_layout()
plt.show()
