In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest

In [3]:
import dotenv
import os
import psycopg2

dotenv.load_dotenv(dotenv_path='database.env')

# --- Configuration ---
DB_NAME = os.environ['DB_NAME']
DB_USER = os.environ['DB_USER']
DB_PASS = os.environ['DB_PASS']
HOST = os.environ['HOST']
PORT = os.environ['PORT']

# 2. Database Connection and Setup
conn = psycopg2.connect(
    dbname=DB_NAME, user=DB_USER, password=DB_PASS, host=HOST, port=PORT
)
conn.autocommit = True
cur = conn.cursor()

cur.execute('SELECT "facilityId","co2e","fuelHint","createdAt","ch4","n2o","ef","co2" FROM stationary_combustion_activity')

df = pd.DataFrame(cur.fetchall(), columns = ["facilityId","co2e","fuelHint","createdAt","ch4","n2o","ef","co2"])
print(df)

                                facilityId          co2e  \
0     68f4851d-4959-49b6-96a1-63d80c816ed3      0.000000   
1     bf2278d8-0d33-4e28-a55e-ad3c1537af7e      0.000054   
2     4f4bf05a-0a52-48db-b66d-3b3da6b23619      0.000054   
3     4f4bf05a-0a52-48db-b66d-3b3da6b23619      0.004033   
4     4f4bf05a-0a52-48db-b66d-3b3da6b23619      0.010790   
...                                    ...           ...   
6160  beae4852-c882-4706-8dd9-f8ce4dc11989      0.222116   
6161  d0f9256f-17f4-4b7e-abc7-2eeeba8d59c0      0.053958   
6162  d0f9256f-17f4-4b7e-abc7-2eeeba8d59c0   9049.547300   
6163  e0bd2a9c-69d4-403e-beed-3b9a6b08b382  12163.778891   
6164  e0bd2a9c-69d4-403e-beed-3b9a6b08b382   6588.125943   

                           fuelHint                         createdAt  \
0                            Bamboo  2025-01-05 21:53:26.836000-08:00   
1                       Natural Gas  2025-01-20 00:33:24.881000-08:00   
2                       Natural Gas  2025-01-20 02:59:01.888

In [14]:
# Convert 'createdAt' to datetime objects
df['createdAt'] = pd.to_datetime(df['createdAt'], utc=True)

# --- 2. Feature Engineering ---

# Extract time-based features
df['hour'] = df['createdAt'].dt.hour
df['day_of_week'] = df['createdAt'].dt.dayofweek # Monday=0, Sunday=6
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)
df['month'] = df['createdAt'].dt.month

# For cyclical seasonality, convert month to sine and cosine features
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

# Define the features to be used in the model
NUMERICAL_FEATURES = ['co2e', 'ch4', 'n2o', 'ef', 'co2', 'month_sin', 'month_cos']
TIME_FEATURES = ['hour', 'day_of_week', 'is_weekend']
CATEGORICAL_FEATURES = ['facilityId', 'fuelHint']

FEATURES = NUMERICAL_FEATURES + TIME_FEATURES + CATEGORICAL_FEATURES
X = df[FEATURES]

# --- 3. Preprocessing Pipeline ---

# Create transformers for different feature types
numerical_transformer = StandardScaler() # Scaling is crucial for Isolation Forest

# One-Hot Encoding for categorical features
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL_FEATURES + TIME_FEATURES),
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='drop' # Drop other columns not specified
)



In [16]:
# The Isolation Forest model is typically trained on all data since we don't have labeled anomalies.
# 'contamination' is the expected proportion of anomalies in the dataset (e.g., 1%).
# Setting a reasonable contamination value helps the model set its internal threshold.

# Define the model within a pipeline for clean preprocessing and fitting
anomaly_detector = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', IsolationForest(
        n_estimators=100,
        contamination=0.01, # Set based on domain knowledge (e.g., 1% of data is anomalous)
        random_state=42,
        n_jobs=-1 # Use all available cores
    ))
])

# --- 5. Training and Prediction ---

print("Training Isolation Forest model...")
anomaly_detector.fit(X)

# Get the anomaly scores (lower score means more anomalous)
# Note: Isolation Forest outputs a score where higher is 'normal'.
# We often use .decision_function() for the raw score.
df['anomaly_score'] = anomaly_detector.decision_function(X)

# Predict the anomaly classification (-1 for anomaly, 1 for normal)
df['anomaly_label'] = anomaly_detector.predict(X)

print("Training complete.")

Training Isolation Forest model...
Training complete.


In [20]:
print("\n--- Results Summary ---")
print(f"Total Anomalies Detected (-1): {df['anomaly_label'].value_counts().get(-1, 0)}")
print(f"Total Normal Observations (1): {df['anomaly_label'].value_counts().get(1, 0)}")

# --- 6. Inspection of Anomalies ---

anomalies = df[df['anomaly_label'] == -1].sort_values(by='anomaly_score')

# Show the top 5 most anomalous observations (lowest scores)
print("\nTop 5 Most Anomalous Records:")
print(anomalies[['createdAt', 'facilityId', 'fuelHint', 'co2e', 'anomaly_score']])

# Example of how to filter based on a score threshold if you prefer a continuous approach
# score_threshold = df['anomaly_score'].quantile(0.01) # Set threshold at the 1st percentile
# high_risk = df[df['anomaly_score'] < score_threshold]


--- Results Summary ---
Total Anomalies Detected (-1): 60
Total Normal Observations (1): 6105

Top 5 Most Anomalous Records:
                            createdAt                            facilityId  \
1732 2025-03-20 03:56:15.627000+00:00  beae4852-c882-4706-8dd9-f8ce4dc11989   
3787 2025-03-20 03:56:15.627000+00:00  beae4852-c882-4706-8dd9-f8ce4dc11989   
5842 2025-03-20 03:56:15.627000+00:00  beae4852-c882-4706-8dd9-f8ce4dc11989   
2048 2025-07-10 09:22:48.624000+00:00  ffcd9d3d-c381-4e6b-9f20-393bc616132a   
4103 2025-07-10 09:22:48.624000+00:00  ffcd9d3d-c381-4e6b-9f20-393bc616132a   
6158 2025-07-10 09:22:48.624000+00:00  ffcd9d3d-c381-4e6b-9f20-393bc616132a   
4104 2025-07-15 08:37:32.928000+00:00  beae4852-c882-4706-8dd9-f8ce4dc11989   
2049 2025-07-15 08:37:32.928000+00:00  beae4852-c882-4706-8dd9-f8ce4dc11989   
6159 2025-07-15 08:37:32.928000+00:00  beae4852-c882-4706-8dd9-f8ce4dc11989   
2052 2025-09-02 06:41:02.578000+00:00  d0f9256f-17f4-4b7e-abc7-2eeeba8d59c0   
6162 

We can clearly observe that the top 5 listed anomalies have clearly much higher values than `co2e` values we would consider "normal". I can confidently say that this method of detecting anomalies has potential.