In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): called without a category or module, this suppresses every
# warning for the rest of the kernel session, not only known-noisy ones.
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Read the three preprocessed inputs: the fire records, the static cell
# attributes and the dynamic (dated) attributes.
fire_data, cell_static_data, cell_dynamic_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input_data/processed/fire_data.csv',
        'input_data/processed/cell_static.csv',
        'input_data/processed/cell_dynamic.csv',
    )
)

## Preprocess Data

In [3]:
# Restrict both dated frames to the war-time window, 2022-02-24 through
# 2024-09-30 (both bounds inclusive). Dates are parsed first so the
# comparison is made on datetimes rather than on strings.
fire_data['ACQ_DATE'] = pd.to_datetime(fire_data['ACQ_DATE'])
fire_data = fire_data[fire_data['ACQ_DATE'].between('2022-02-24', '2024-09-30')]

cell_dynamic_data['ACQ_DATE'] = pd.to_datetime(cell_dynamic_data['ACQ_DATE'])
cell_dynamic_data = cell_dynamic_data[cell_dynamic_data['ACQ_DATE'].between('2022-02-24', '2024-09-30')]

In [4]:
# Snapshot the per-fire id, cell, date and coordinates now: several of these
# columns are dropped from fire_data below, and the snapshot is what the
# labels are mapped back onto at the end.
fire_data_copy = fire_data.loc[:, ['FIRE_ID', 'GRID_CELL_50KM', 'ACQ_DATE', 'LONGITUDE', 'LATITUDE']].copy()

In [5]:
# Reduce the fire data to the 50 km grid level: drop the per-fire and 1 km
# columns, collapse the rows that became identical, and renumber the index.
# The steps are chained instead of using inplace=True because fire_data is a
# filtered slice at this point; mutating such a slice in place raises
# SettingWithCopyWarning (currently hidden by the global warnings filter).
fire_data = (
    fire_data
    .drop(columns=['FIRE_ID', 'LATITUDE', 'LONGITUDE', 'GRID_CELL', 'OBLAST_ID',
                   'LATITUDE_1KM', 'LONGITUDE_1KM', 'GRID_CELL_1KM', 'OBLAST_ID_1KM',
                   'FIRE_COUNT_CELL_1KM'])
    .drop_duplicates()
    .reset_index(drop=True)
)
fire_data.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM
0,2022-02-24,55,50.0,35.5,50.0_35.5,UA63,1
1,2022-02-24,55,50.0,34.5,50.0_34.5,UA53,4
2,2022-02-24,55,48.5,34.0,48.5_34.0,UA12,4
3,2022-02-24,55,49.0,35.0,49.0_35.0,UA12,1
4,2022-02-24,55,50.0,36.5,50.0_36.5,UA63,2


In [6]:
def merge_static_data(data, static_data, resolution='50KM'):
    """Attach the static cell attributes of one grid resolution to `data`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the key columns ``GRID_CELL_<resolution>``,
        ``OBLAST_ID_<resolution>``, ``LATITUDE_<resolution>`` and
        ``LONGITUDE_<resolution>``.
    static_data : pd.DataFrame
        Static attributes. Only columns whose name ends with `resolution`
        are used; the caller's frame is left untouched.
    resolution : str, default '50KM'
        Suffix that selects both the static columns and the merge keys.

    Returns
    -------
    pd.DataFrame
        `data` left-joined with the de-duplicated static columns.
    """
    # Keep only the columns that belong to the requested resolution
    resolution_columns = [col for col in static_data.columns if col.endswith(resolution)]
    # Chained rather than in-place, so the column selection is never mutated
    static_subset = static_data[resolution_columns].drop_duplicates()
    # Derive the keys from `resolution`; fixed 50KM keys would make every
    # other resolution fail even though the parameter suggests it is supported
    merge_keys = [
        f'{prefix}_{resolution}'
        for prefix in ('GRID_CELL', 'OBLAST_ID', 'LATITUDE', 'LONGITUDE')
    ]
    return pd.merge(data, static_subset, how='left', on=merge_keys)

In [7]:
# Enrich every fire record with the static attributes of its 50 km cell
fire_data_processed = merge_static_data(data=fire_data, static_data=cell_static_data, resolution='50KM')
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM
0,2022-02-24,55,50.0,35.5,50.0_35.5,UA63,1,32.066911,0.181404,0.031389,0.724611,0.050246,0.0
1,2022-02-24,55,50.0,34.5,50.0_34.5,UA53,4,24.949751,0.226965,0.056746,0.662375,0.049254,0.0
2,2022-02-24,55,48.5,34.0,48.5_34.0,UA12,4,32.991457,0.08641,0.059752,0.807914,0.029564,0.0
3,2022-02-24,55,49.0,35.0,49.0_35.0,UA12,1,20.103453,0.028348,0.06305,0.867225,0.031525,0.0
4,2022-02-24,55,50.0,36.5,50.0_36.5,UA63,2,235.527153,0.227252,0.10252,0.492499,0.165559,0.0


In [8]:
def generate_fire_time_series(data, start_date, end_date, resolution='50KM'):
    """Expand sparse fire records into one gap-free daily series per grid cell.

    For every cell in ``GRID_CELL_<resolution>`` the result holds exactly one
    row per calendar day from `start_date` to `end_date` (inclusive). Days
    without a record get a fire count of 0, ``DAY_OF_YEAR`` is recomputed from
    the date, and every remaining column is treated as static and copied from
    the cell's first row. Records dated outside the window are not carried
    over, and rows whose cell id is missing are skipped.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``ACQ_DATE``, ``DAY_OF_YEAR``, ``GRID_CELL_<resolution>``
        and ``FIRE_COUNT_CELL_<resolution>``. It is not modified.
    start_date, end_date : str or datetime-like
        Bounds of the daily calendar, passed to ``pd.date_range``.
    resolution : str, default '50KM'
        Suffix of the cell and fire-count columns.

    Returns
    -------
    pd.DataFrame
        Per-cell frames concatenated in order of first appearance, with
        ``ACQ_DATE`` as the first column. Each cell block keeps its own
        0..n-1 index, so index labels repeat across cells. An empty input
        yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If a cell has more than one row for the same date (the dates cannot
        be reindexed when they contain duplicates).
    """
    cell_col = f'GRID_CELL_{resolution}'
    count_col = f'FIRE_COUNT_CELL_{resolution}'
    # Output layout: ACQ_DATE first, all other columns in their input order
    ordered_columns = ['ACQ_DATE'] + [col for col in data.columns if col != 'ACQ_DATE']
    # The calendar is identical for every cell, so build it once
    all_days = pd.date_range(start=start_date, end=end_date, freq='D')

    cell_frames = []
    # A single groupby pass replaces one full boolean scan of `data` per cell;
    # sort=False keeps the cells in order of first appearance
    for _, cell_rows in data.groupby(cell_col, sort=False):
        # Everything except date, day-of-year and count is constant per cell
        static_values = cell_rows.iloc[0].drop(['ACQ_DATE', 'DAY_OF_YEAR', count_col])
        # Only the fire count is laid onto the calendar; zero-filling whole
        # rows would also push 0 into text columns before they are overwritten
        daily_counts = pd.Series(
            cell_rows[count_col].to_numpy(),
            index=pd.DatetimeIndex(pd.to_datetime(cell_rows['ACQ_DATE'])),
        ).reindex(all_days, fill_value=0)
        cell_frame = pd.DataFrame({
            'ACQ_DATE': all_days,
            'DAY_OF_YEAR': all_days.dayofyear,
            count_col: daily_counts.to_numpy(),
        })
        # Broadcast the static attributes over every day of the series
        for col in static_values.index:
            cell_frame[col] = static_values[col]
        cell_frames.append(cell_frame[ordered_columns])

    # pd.concat refuses an empty list, so return an empty, correctly shaped frame
    if not cell_frames:
        return data.iloc[0:0][ordered_columns].copy()
    return pd.concat(cell_frames)

In [9]:
# Expand every cell to a full daily series over the war-time window
# (2022-02-24 to 2024-09-30); days without a record get a fire count of 0
fire_data_processed = generate_fire_time_series(
    fire_data_processed,
    start_date='2022-02-24',
    end_date='2024-09-30',
    resolution='50KM',
)
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM
0,2022-02-24,55,50.0,35.5,50.0_35.5,UA63,1,32.066911,0.181404,0.031389,0.724611,0.050246,0.0
1,2022-02-25,56,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0
2,2022-02-26,57,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0
3,2022-02-27,58,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0
4,2022-02-28,59,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0


In [10]:
def merge_dynamic_data(data, dynamic_data, resolution='50KM'):
    """Attach the dated, oblast-level attributes to `data`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with the columns ``OBLAST_ID_<resolution>`` and ``ACQ_DATE``.
    dynamic_data : pd.DataFrame
        Frame keyed by ``OBLAST_ID`` and ``ACQ_DATE``; all of its other
        columns are appended to `data`.
    resolution : str, default '50KM'
        Suffix of the oblast column in `data`, in line with the other
        helpers of this notebook. The default keeps the previous behavior.

    Returns
    -------
    pd.DataFrame
        `data` left-joined with `dynamic_data`, without the redundant
        ``OBLAST_ID`` join key.
    """
    merged_data = pd.merge(
        data,
        dynamic_data,
        how='left',
        left_on=[f'OBLAST_ID_{resolution}', 'ACQ_DATE'],
        right_on=['OBLAST_ID', 'ACQ_DATE'],
    )
    # The right-hand key is redundant next to OBLAST_ID_<resolution>
    return merged_data.drop(columns=['OBLAST_ID'])

In [11]:
# Add the dynamic attributes, matched on oblast and date, to the daily series
fire_data_processed = merge_dynamic_data(data=fire_data_processed, dynamic_data=cell_dynamic_data)
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM,TEMPERATURE_2M_MAX (°C),TEMPERATURE_2M_MIN (°C),TEMPERATURE_2M_MEAN (°C),RAIN_SUM (MM),SNOWFALL_SUM (CM),WIND_DIRECTION_10M_DOMINANT (°),CLOUD_COVER (%)
0,2022-02-24,55,50.0,35.5,50.0_35.5,UA63,1,32.066911,0.181404,0.031389,0.724611,0.050246,0.0,5.9,-1.4,1.8,0.0,0.0,54.0,84.333333
1,2022-02-25,56,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0,1.2,0.1,0.5,1.7,4.06,63.0,99.791667
2,2022-02-26,57,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0,5.1,0.1,1.7,1.4,0.63,3.0,91.5
3,2022-02-27,58,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0,4.6,-1.6,1.1,0.0,0.0,5.0,80.666667
4,2022-02-28,59,50.0,35.5,50.0_35.5,UA63,0,32.066911,0.181404,0.031389,0.724611,0.050246,0.0,2.1,-4.3,-0.5,0.0,0.0,9.0,64.166667


## Create Test Data

In [12]:
# Keep the date and the cell of every row; they are not model features but
# the decay step needs them later
acq_date = fire_data_processed['ACQ_DATE']
grid_cell = fire_data_processed['GRID_CELL_50KM']

# Target: the daily fire count per 50 km cell
target = fire_data_processed['FIRE_COUNT_CELL_50KM']
# Features: every column except the identifiers and the target itself
features = fire_data_processed.drop(
    ['ACQ_DATE', 'GRID_CELL_50KM', 'OBLAST_ID_50KM', 'FIRE_COUNT_CELL_50KM'],
    axis=1,
)

# Plain arrays, the format the sklearn pipeline steps are called with
X_test, y_test = features.values, target.values

## Load the Pipeline

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

def calculate_significance_score(value, threshold):
    """Return the signed, relative distance of `value` from `threshold`.

    Below the threshold the gap is divided by the threshold; at or above it
    the gap is divided by the value itself. For a positive threshold this
    gives a negative score below it, 0 exactly at it, and a score that
    approaches 1 from below as the value grows.
    """
    gap = value - threshold
    denominator = threshold if value < threshold else value
    return gap / denominator

class ThresholdStep(BaseEstimator, TransformerMixin):
    """Flag observations that exceed their prediction by more than a threshold.

    The step has nothing to learn: `fit` returns the instance unchanged, and
    `transform` needs the true values in addition to the predictions.
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Do nothing and return `self`."""
        return self

    def transform(self, X, y=None):
        """Label and score the excess of the true values over the predictions.

        Parameters
        ----------
        X : array-like
            Predicted values.
        y : array-like
            True values; required despite the default of None.

        Returns
        -------
        np.ndarray
            Two rows: the 0/1 abnormal labels and the significance scores.
            Both rows share one array, so the labels take the scores' dtype.

        Raises
        ------
        ValueError
            If `y` is not given.
        """
        if y is None:
            raise ValueError("True values (y) are required for the threshold step.")

        # Excess of the observation over the prediction; shortfalls count as 0
        error = y - X
        shortfall = error < 0
        error[shortfall] = 0
        # An observation is abnormal when its excess is strictly above the threshold
        labels = (error > self.threshold).astype(int)
        # Score every excess relative to the threshold
        scores = pd.Series(error).apply(calculate_significance_score, threshold=self.threshold)
        significance_score = np.array(scores)

        return np.array([labels, significance_score])

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class RecalculateConfidenceScores(BaseEstimator, TransformerMixin):
    """Let a positive score linger on the following days of the same grid cell.

    An event with a positive score (a "war-related fire" in this notebook)
    becomes the reference event of its cell. Later non-positive events of that
    cell receive the reference score multiplied by a sigmoid decay of the
    elapsed days, provided that product is positive and larger than their own
    score.
    """

    def __init__(self, decay_rate, midpoint, cutoff):
        self.decay_rate = decay_rate
        self.midpoint = midpoint
        self.cutoff = cutoff

    def sigmoid_decay(self, time_diff):
        """Return the decay factor for `time_diff` days; 0 beyond the cut-off."""
        return 0 if time_diff > self.cutoff else 1 / (1 + np.exp(self.decay_rate * (time_diff - self.midpoint)))

    def fit(self, X, y=None):
        """Do nothing and return `self`."""
        return self

    def transform(self, X, dates=None, grid_cells=None):
        """Recompute scores and labels with the decayed influence applied.

        Parameters
        ----------
        X : sequence
            ``X[0]`` holds the labels and ``X[1]`` the scores of the previous step.
        dates, grid_cells : iterable
            Date and grid cell of every event, in the same order as the scores;
            both are required despite the default of None. The dates must
            support subtraction into a timedelta.

        Returns
        -------
        list
            ``[labels, recalculated scores, original labels, original scores]``,
            where the new labels are 1 wherever the recalculated score is positive.

        Raises
        ------
        ValueError
            If `dates` or `grid_cells` is missing.
        """
        if dates is None or grid_cells is None:
            raise ValueError("Dates and grid cells are required for the recalculation of confidence scores.")

        y_pred, y_scores = X[0], X[1]
        events = list(zip(y_scores, dates, grid_cells))
        # Visit the events chronologically; sorted() is stable, so events with
        # the same date keep their original relative order
        chronological = sorted(range(len(events)), key=lambda position: events[position][1])

        # Results are written back at the original positions
        recalculated_confidences = [None] * len(y_scores)
        # Most recent positive-score event seen so far, per grid cell
        reference_events = {}

        for position in chronological:
            score, event_date, cell = events[position]

            if score > 0:
                # A positive score restarts the decay for this cell and is kept as is
                reference_events[cell] = {
                    'ACQ_DATE': event_date,
                    'SIGNIFICANCE_SCORE': score
                }
                recalculated_confidences[position] = score
                continue

            # Without a usable reference the original score is preserved
            updated_score = score
            if cell in reference_events:
                reference = reference_events[cell]
                # Elapsed time since the reference event, expressed in days
                elapsed_days = (event_date - reference['ACQ_DATE']) / np.timedelta64(1, 'D')
                decayed_influence = self.sigmoid_decay(elapsed_days) * reference['SIGNIFICANCE_SCORE']
                # The influence only counts if it is positive and beats the own score
                if decayed_influence > score and decayed_influence > 0:
                    updated_score = decayed_influence
            recalculated_confidences[position] = updated_score

        recalculated_confidences = np.array(recalculated_confidences)
        labels = np.where(recalculated_confidences > 0, 1, 0)
        return [labels, recalculated_confidences, y_pred, y_scores]

In [15]:
# Load the fitted pipeline from disk and print its steps.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load saved_models/pipeline.pkl when the file comes from a trusted source.
# NOTE(review): the printed pipeline contains ThresholdStep and
# RecalculateConfidenceScores steps, so unpickling presumably requires both
# classes to be defined in this session first -- confirm before reordering cells.
import pickle

with open('saved_models/pipeline.pkl', 'rb') as file:
    pipeline = pickle.load(file)

print(pipeline)

Pipeline(steps=[('scaler', StandardScaler()),
                ('pca', PCA(n_components=0.95, random_state=42)),
                ('regressor', RandomForestQuantileRegressor(random_state=42)),
                ('threshold', ThresholdStep(threshold=14.450000000000031)),
                ('decay',
                 RecalculateConfidenceScores(cutoff=10, decay_rate=1.0,
                                             midpoint=5))])


## Predict Labels and Scores

In [16]:
# Predict abnormal fire counts. The steps are invoked one by one because the
# threshold step also needs the true counts and the decay step also needs the
# dates and grid cells of the rows.
steps = pipeline.named_steps
X_scaled = steps['scaler'].transform(X_test)
X_reduced = steps['pca'].transform(X_scaled)
# Regressor output for the 0.95 quantile
y_quantile = steps['regressor'].predict(X_reduced, quantiles=[0.95])
thresholded = steps['threshold'].transform(y_quantile, y_test)
y_pred_decay, y_scores_decay, y_pred, y_scores = steps['decay'].transform(thresholded, acq_date, grid_cell)

In [17]:
# Sum the fire counts of the test data per label (0 = normal, 1 = abnormal),
# once for the raw threshold labels and once for the labels after the decay step
normal_fires, abnormal_fires = (y_test[y_pred == label].sum() for label in (0, 1))
normal_fires_decay, abnormal_fires_decay = (y_test[y_pred_decay == label].sum() for label in (0, 1))

print(f'Normal Fires: {normal_fires}, Abnormal Fires: {abnormal_fires}')
print(f'Percentage of Normal Fires: {normal_fires / (normal_fires + abnormal_fires) * 100:.2f}%')
print(f'Percentage of Abnormal Fires: {abnormal_fires / (normal_fires + abnormal_fires) * 100:.2f}%')
print()
print(f'Normal Fires (Decay): {normal_fires_decay}, Abnormal Fires (Decay): {abnormal_fires_decay}')
print(f'Percentage of Normal Fires (Decay): {normal_fires_decay / (normal_fires_decay + abnormal_fires_decay) * 100:.2f}%')
print(f'Percentage of Abnormal Fires (Decay): {abnormal_fires_decay / (normal_fires_decay + abnormal_fires_decay) * 100:.2f}%')

Normal Fires: 249140, Abnormal Fires: 163363
Percentage of Normal Fires: 60.40%
Percentage of Abnormal Fires: 39.60%

Normal Fires (Decay): 178652, Abnormal Fires (Decay): 233851
Percentage of Normal Fires (Decay): 43.31%
Percentage of Abnormal Fires (Decay): 56.69%


In [18]:
# Split the significance scores by label (0 = normal, 1 = abnormal), before
# and after the decay step, and report the mean of each group
normal_scores, abnormal_scores = (y_scores[y_pred == label] for label in (0, 1))
normal_scores_decay, abnormal_scores_decay = (y_scores_decay[y_pred_decay == label] for label in (0, 1))

print(f'Mean Significance Score for Normal Fires: {normal_scores.mean():.2f}')
print(f'Mean Significance Score for Abnormal Fires: {abnormal_scores.mean():.2f}')
print()
print(f'Mean Significance Score for Normal Fires (Decay): {normal_scores_decay.mean():.2f}')
print(f'Mean Significance Score for Abnormal Fires (Decay): {abnormal_scores_decay.mean():.2f}')

Mean Significance Score for Normal Fires: -0.99
Mean Significance Score for Abnormal Fires: 0.49

Mean Significance Score for Normal Fires (Decay): -0.99
Mean Significance Score for Abnormal Fires (Decay): 0.28


## Save the Results

In [19]:
# Attach the labels and scores to the daily per-cell frame. The threshold step
# returns its labels and scores in one shared array, which turns the labels
# into floats (0.0 / 1.0); they are cast back to int here so ABNORMAL_LABEL
# has the same type as ABNORMAL_LABEL_DECAY.
fire_data_processed['ABNORMAL_LABEL'] = np.asarray(y_pred).astype(int)
fire_data_processed['SIGNIFICANCE_SCORE'] = np.asarray(y_scores)
fire_data_processed['ABNORMAL_LABEL_DECAY'] = np.asarray(y_pred_decay)
fire_data_processed['SIGNIFICANCE_SCORE_DECAY'] = np.asarray(y_scores_decay)

label_columns = ['ABNORMAL_LABEL', 'SIGNIFICANCE_SCORE', 'ABNORMAL_LABEL_DECAY', 'SIGNIFICANCE_SCORE_DECAY']

# Map the per-cell, per-day results back to the individual fires. A fire
# without a matching (cell, date) row would get NaN in all four columns.
# NOTE: this overwrites fire_data_copy and removes GRID_CELL_50KM from it, so
# the cell cannot be re-run without first re-creating fire_data_copy.
fire_data_copy = fire_data_copy.merge(
    fire_data_processed[['GRID_CELL_50KM', 'ACQ_DATE'] + label_columns],
    on=['GRID_CELL_50KM', 'ACQ_DATE'],
    how='left',
)

# Keep only the fire id, date, coordinates and the label/score columns
fire_data_copy = fire_data_copy[['FIRE_ID', 'ACQ_DATE', 'LONGITUDE', 'LATITUDE'] + label_columns]

# Display the labelled fires
fire_data_copy.head()

Unnamed: 0,FIRE_ID,ACQ_DATE,LONGITUDE,LATITUDE,ABNORMAL_LABEL,SIGNIFICANCE_SCORE,ABNORMAL_LABEL_DECAY,SIGNIFICANCE_SCORE_DECAY
0,1694947,2022-02-24,35.63002,50.06045,0.0,-1.0,0,-1.0
1,1694949,2022-02-24,34.27544,49.99856,0.0,-1.0,0,-1.0
2,1694945,2022-02-24,34.19779,48.6766,0.0,-1.0,0,-1.0
3,1694944,2022-02-24,34.19842,48.67255,0.0,-1.0,0,-1.0
4,1694948,2022-02-24,34.28119,49.99904,0.0,-1.0,0,-1.0


In [20]:
# Persist the per-fire labels and scores; the row index is not written
fire_data_copy.to_csv(path_or_buf='output_data/abnormal_fires_test.csv', index=False)