In [85]:
import os
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

## Load Data

In [86]:
# Read the three pre-processed CSV inputs from disk
fire_data, cell_static_data, cell_dynamic_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input_data/processed/fire_data.csv',
        'input_data/processed/cell_static.csv',
        'input_data/processed/cell_dynamic.csv',
    )
)

## Preprocess Data

In [87]:
# Parse the acquisition dates, then restrict both tables to the pre-war window
# (2015-01-01 through 2022-02-23, both ends inclusive)
fire_data['ACQ_DATE'] = pd.to_datetime(fire_data['ACQ_DATE'])
cell_dynamic_data['ACQ_DATE'] = pd.to_datetime(cell_dynamic_data['ACQ_DATE'])
fire_data = fire_data[fire_data['ACQ_DATE'].between('2015-01-01', '2022-02-23')]
cell_dynamic_data = cell_dynamic_data[cell_dynamic_data['ACQ_DATE'].between('2015-01-01', '2022-02-23')]

In [88]:
# Keep an independent snapshot of the per-fire identifier, cell, date and
# coordinate columns; it is used again when the results are saved
fire_data_copy = fire_data.loc[:, ['FIRE_ID', 'GRID_CELL_50KM', 'ACQ_DATE', 'LONGITUDE', 'LATITUDE']].copy()

In [5]:
# Remove the columns that are not needed here, collapse rows that are
# identical afterwards, and renumber the index from zero
fire_data = (
    fire_data
    .drop(columns=['FIRE_ID', 'LATITUDE', 'LONGITUDE', 'GRID_CELL', 'OBLAST_ID',
                   'LATITUDE_1KM', 'LONGITUDE_1KM', 'GRID_CELL_1KM', 'OBLAST_ID_1KM',
                   'FIRE_COUNT_CELL_1KM'])
    .drop_duplicates()
    .reset_index(drop=True)
)
fire_data.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM
0,2015-01-01,1,47.0,37.5,47.0_37.5,UA14,4
1,2015-01-02,2,50.5,28.5,50.5_28.5,UA18,1
2,2015-01-03,3,48.0,33.5,48.0_33.5,UA12,1
3,2015-01-03,3,48.5,35.0,48.5_35.0,UA12,1
4,2015-01-04,4,49.0,24.5,49.0_24.5,UA26,1


In [6]:
def merge_static_data(data, static_data, resolution='50KM'):
    """Left-join the static cell attributes of one grid resolution onto ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the key columns ``GRID_CELL_<resolution>``,
        ``OBLAST_ID_<resolution>``, ``LATITUDE_<resolution>`` and
        ``LONGITUDE_<resolution>``.
    static_data : pandas.DataFrame
        Static attributes. Only columns whose name ends with ``resolution``
        are used; the caller's frame is not modified.
    resolution : str, default '50KM'
        Column-name suffix that selects the grid resolution.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the matching static columns appended. Rows without a
        match keep NaN in those columns (left join).
    """
    # Keep only the columns of the requested resolution and collapse repeated
    # rows; building a new frame avoids mutating a column slice in place
    resolution_columns = [col for col in static_data.columns if col.endswith(resolution)]
    static_subset = static_data[resolution_columns].drop_duplicates()
    # Derive the join keys from `resolution` so non-default resolutions work too
    merge_keys = [f'{prefix}_{resolution}'
                  for prefix in ('GRID_CELL', 'OBLAST_ID', 'LATITUDE', 'LONGITUDE')]
    return pd.merge(data, static_subset, how='left', on=merge_keys)

In [7]:
# Attach the static cell attributes (default 50KM resolution) to the fire records
fire_data_processed = merge_static_data(data=fire_data, static_data=cell_static_data)
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM
0,2015-01-01,1,47.0,37.5,47.0_37.5,UA14,4,235.527153,0.015863,0.041639,0.333374,0.096907,0.0
1,2015-01-02,2,50.5,28.5,50.5_28.5,UA18,1,135.458196,0.313123,0.114159,0.477092,0.084996,0.0
2,2015-01-03,3,48.0,33.5,48.0_33.5,UA12,1,235.527153,0.034811,0.148977,0.644517,0.138799,0.0
3,2015-01-03,3,48.5,35.0,48.5_35.0,UA12,1,235.527153,0.073142,0.086862,0.548732,0.222136,0.0
4,2015-01-04,4,49.0,24.5,49.0_24.5,UA26,1,235.527153,0.388293,0.058082,0.397541,0.139557,0.0


In [8]:
def generate_fire_time_series(data, start_date, end_date, resolution='50KM'):
    """Expand the per-cell fire records into one row per cell and day.

    For every distinct value of ``GRID_CELL_<resolution>`` the rows are
    re-indexed onto the daily range ``start_date``..``end_date`` (both
    inclusive). Days without a record get ``FIRE_COUNT_CELL_<resolution>`` = 0,
    ``DAY_OF_YEAR`` is recomputed from the date, and every other column is
    filled with the value found in the cell's first row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``ACQ_DATE``, ``DAY_OF_YEAR``, ``GRID_CELL_<resolution>``
        and ``FIRE_COUNT_CELL_<resolution>``. Not modified.
    start_date, end_date : str or datetime-like
        Bounds passed to ``pandas.date_range``.
    resolution : str, default '50KM'
        Column-name suffix that selects the grid resolution.

    Returns
    -------
    pandas.DataFrame
        The per-cell series stacked in order of each cell's first appearance,
        with ``ACQ_DATE`` as the first column. The index restarts at 0 for
        every cell. An empty frame with the same columns is returned when
        ``data`` has no rows.

    Raises
    ------
    ValueError
        Raised by ``reindex`` if a cell has more than one row for the same date.
    """
    cell_col = 'GRID_CELL_{}'.format(resolution)
    count_col = 'FIRE_COUNT_CELL_{}'.format(resolution)
    # The calendar is the same for every cell, so build it once
    full_range = pd.date_range(start=start_date, end=end_date, freq='D')
    column_order = ['ACQ_DATE'] + [col for col in data.columns if col != 'ACQ_DATE']

    frames = []
    # sort=False keeps the cells in order of first appearance
    for _, rows in data.groupby(cell_col, sort=False):
        # Values that are constant per cell: everything except date and count
        static_values = rows.iloc[0].drop(['ACQ_DATE', 'DAY_OF_YEAR', count_col])
        # Index by date and fill the gaps; fill_value=0 gives missing days a
        # fire count of zero (the other columns are overwritten below)
        series = rows.set_index('ACQ_DATE')
        series.index = pd.to_datetime(series.index)
        series = series.reindex(full_range, fill_value=0)
        # Recompute the calendar columns from the completed date index
        series['DAY_OF_YEAR'] = series.index.dayofyear
        series.insert(0, 'ACQ_DATE', series.index)
        series = series.reset_index(drop=True)
        # Restore the per-cell constants on the filled-in rows
        for col in static_values.index:
            series[col] = static_values[col]
        frames.append(series)

    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame
        return data.iloc[:0].reindex(columns=column_order)
    return pd.concat(frames)

In [9]:
# Expand every cell to one row per day over the pre-war window
# (2015-01-01 to 2022-02-23)
fire_data_processed = generate_fire_time_series(
    data=fire_data_processed,
    start_date='2015-01-01',
    end_date='2022-02-23',
    resolution='50KM',
)
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM
0,2015-01-01,1,47.0,37.5,47.0_37.5,UA14,4,235.527153,0.015863,0.041639,0.333374,0.096907,0.0
1,2015-01-02,2,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0
2,2015-01-03,3,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0
3,2015-01-04,4,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0
4,2015-01-05,5,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0


In [10]:
def merge_dynamic_data(data, dynamic_data, resolution='50KM'):
    """Left-join the per-oblast, per-day dynamic attributes onto ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``OBLAST_ID_<resolution>`` and ``ACQ_DATE``.
    dynamic_data : pandas.DataFrame
        Frame with the columns ``OBLAST_ID`` and ``ACQ_DATE`` plus the
        attributes to attach.
    resolution : str, default '50KM'
        Column-name suffix that selects which oblast column of ``data`` is
        used as the join key.

    Returns
    -------
    pandas.DataFrame
        ``data`` with the dynamic columns appended. Rows without a match keep
        NaN in those columns (left join). The redundant ``OBLAST_ID`` key from
        ``dynamic_data`` is removed.
    """
    merged_data = pd.merge(
        data,
        dynamic_data,
        how='left',
        left_on=[f'OBLAST_ID_{resolution}', 'ACQ_DATE'],
        right_on=['OBLAST_ID', 'ACQ_DATE'],
    )
    # The right-hand key only repeats OBLAST_ID_<resolution>, so it is dropped
    return merged_data.drop(columns=['OBLAST_ID'])

In [11]:
# Attach the dynamic attributes, matched on oblast and date, to the daily series
fire_data_processed = merge_dynamic_data(data=fire_data_processed, dynamic_data=cell_dynamic_data)
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM,TEMPERATURE_2M_MAX (°C),TEMPERATURE_2M_MIN (°C),TEMPERATURE_2M_MEAN (°C),RAIN_SUM (MM),SNOWFALL_SUM (CM),WIND_DIRECTION_10M_DOMINANT (°),CLOUD_COVER (%)
0,2015-01-01,1,47.0,37.5,47.0_37.5,UA14,4,235.527153,0.015863,0.041639,0.333374,0.096907,0.0,-8.8,-18.0,-13.4,0.0,0.0,243.0,71.833333
1,2015-01-02,2,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0,-0.3,-8.4,-3.4,0.0,0.28,246.0,98.958333
2,2015-01-03,3,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0,2.0,-1.5,0.3,0.0,1.89,248.0,88.291667
3,2015-01-04,4,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0,0.6,-2.0,-0.3,0.0,0.63,239.0,89.958333
4,2015-01-05,5,47.0,37.5,47.0_37.5,UA14,0,235.527153,0.015863,0.041639,0.333374,0.096907,0.0,-1.4,-4.0,-2.6,0.0,0.21,241.0,95.458333


## Create Training and Validation/Calibration Data

In [16]:
# Chronological split: rows dated before 2021-02-23 form the training set,
# rows dated 2021-02-23 or later form the test set
is_train_row = fire_data_processed['ACQ_DATE'] < '2021-02-23'
is_test_row = fire_data_processed['ACQ_DATE'] >= '2021-02-23'
# Identifier columns and the target itself are excluded from the features
non_feature_columns = ['ACQ_DATE', 'GRID_CELL_50KM', 'OBLAST_ID_50KM', 'FIRE_COUNT_CELL_50KM']

X_train = fire_data_processed.loc[is_train_row].drop(columns=non_feature_columns)
y_train = fire_data_processed.loc[is_train_row, 'FIRE_COUNT_CELL_50KM']

X_test = fire_data_processed.loc[is_test_row].drop(columns=non_feature_columns)
y_test = fire_data_processed.loc[is_test_row, 'FIRE_COUNT_CELL_50KM']

## Train the Pipeline

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from quantile_forest import RandomForestQuantileRegressor
from sklearn.pipeline import Pipeline

# Assemble and train the model in one go: standardise the features, reduce them
# with PCA (n_components=0.95), then fit a quantile regression forest.
# Pipeline.fit returns the fitted pipeline itself.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95, random_state=42)),
    ('regressor', RandomForestQuantileRegressor(n_estimators=100, random_state=42)),
]).fit(X_train, y_train)

# Show the configured steps
print(pipeline)

Pipeline(steps=[('scaler', StandardScaler()),
                ('pca', PCA(n_components=0.95, random_state=42)),
                ('regressor', RandomForestQuantileRegressor(random_state=42))])


## Compute the Error Threshold

In [20]:
# Pass the test features through the fitted scaler and PCA step by step, then
# ask the forest for its 0.95-quantile prediction
X_test_scaled = pipeline.named_steps['scaler'].transform(X_test)
X_test_reduced = pipeline.named_steps['pca'].transform(X_test_scaled)
y_pred = pipeline.named_steps['regressor'].predict(X_test_reduced, quantiles=[0.95])

In [21]:
# One-sided validation error: the amount by which the observed count exceeds
# the predicted quantile; observations below the prediction count as zero
error = (y_test - y_pred).clip(lower=0)
# Mean of the clipped errors (reported under the label "Mean Absolute Error")
mae = error.mean()
print('Mean Absolute Error:', mae)
# Mean of the squared clipped errors
mse = error.pow(2).mean()
print('Mean Squared Error:', mse)

Mean Absolute Error: 0.059046011225139604
Mean Squared Error: 0.8299520562145515


In [22]:
# Give the test targets and the errors the same fresh 0..n-1 index.
# NOTE(review): the lookups below appear to rely on this, because the positions
# returned by argsort are then also valid index labels - confirm before changing.
y_test.reset_index(drop=True, inplace=True)
error = error.reset_index(drop=True)

# Order the rows from smallest to largest error and accumulate the observed
# fire counts in that order
sorted_indices = np.argsort(error)
sorted_y_val = y_test[sorted_indices]
cumulative_fires = np.cumsum(sorted_y_val)

# Find the first row (in error order) at which the running total reaches 95%
# of all fires in the test set; the error of that row becomes the threshold
total_fires = np.sum(y_test)
threshold_index = np.searchsorted(cumulative_fires, 0.95 * total_fires)
threshold = error[sorted_indices[threshold_index]]

print(f'Threshold for 95% of all fires: {threshold}')

Threshold for 95% of all fires: 14.450000000000031


In [23]:
from sklearn.base import BaseEstimator, TransformerMixin

class ThresholdStep(BaseEstimator, TransformerMixin):
    """Flag observations whose one-sided prediction error exceeds a threshold.

    The step learns nothing. ``transform`` needs the observed values as ``y``
    in addition to the predictions, so the notebook calls it directly with
    both arguments.

    Parameters
    ----------
    threshold : float
        Error level above which an observation is labelled abnormal.
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Do nothing and return the step itself."""
        return self

    def transform(self, X, y=None):
        """Return 1 where ``y - X`` (floored at zero) is above the threshold, else 0.

        ``X`` holds the predictions and ``y`` the observed values.
        Raises ``ValueError`` when ``y`` is not supplied.
        """
        if y is None:
            raise ValueError("True values (y) are required for the threshold step.")

        # Amount by which the observation exceeds the prediction; shortfalls
        # are treated as zero error
        excess = y - X
        excess[excess < 0] = 0
        # Strictly greater than the threshold counts as abnormal
        exceeds_threshold = excess > self.threshold
        return exceeds_threshold.astype(int)

In [24]:
# Add the threshold step to the pipeline. If the cell is re-run, replace the
# existing 'threshold' step instead of appending a second step with the same
# name, so the pipeline never ends up with duplicate step names.
threshold_step = ('threshold', ThresholdStep(threshold=threshold))
existing_step_names = [step_name for step_name, _ in pipeline.steps]
if 'threshold' in existing_step_names:
    pipeline.steps[existing_step_names.index('threshold')] = threshold_step
else:
    pipeline.steps.append(threshold_step)
# Print the updated pipeline steps
print(pipeline)

Pipeline(steps=[('scaler', StandardScaler()),
                ('pca', PCA(n_components=0.95, random_state=42)),
                ('regressor', RandomForestQuantileRegressor(random_state=42)),
                ('threshold', ThresholdStep(threshold=14.450000000000031))])


In [25]:
import pickle

# Create the target folder if it is missing so the save also works on a fresh
# checkout
os.makedirs('saved_models', exist_ok=True)
# Save the updated pipeline.
# NOTE: loading this file later requires the ThresholdStep class to be defined
# in the loading session, and pickle files must only be loaded from trusted
# sources.
with open('saved_models/pipeline.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [26]:
# Repeat the 0.95-quantile prediction for the test features, then turn it into
# 0/1 abnormal flags with the threshold step, which also needs the true counts
test_features_scaled = pipeline.named_steps['scaler'].transform(X_test)
test_features_reduced = pipeline.named_steps['pca'].transform(test_features_scaled)
quantile_prediction = pipeline.named_steps['regressor'].predict(test_features_reduced, quantiles=[0.95])
y_pred = pipeline.named_steps['threshold'].transform(quantile_prediction, y_test)

In [27]:
# Sum the observed fire counts separately for rows flagged normal (0) and
# rows flagged abnormal (1) in the validation data
flagged_normal = y_pred == 0
flagged_abnormal = y_pred == 1
normal_fires = y_test[flagged_normal].sum()
abnormal_fires = y_test[flagged_abnormal].sum()

print(f'Normal Fires: {normal_fires}, Abnormal Fires: {abnormal_fires}')
print(f'Percentage of Normal Fires: {normal_fires / (normal_fires + abnormal_fires) * 100:.2f}%')
print(f'Percentage of Abnormal Fires: {abnormal_fires / (normal_fires + abnormal_fires) * 100:.2f}%')

Normal Fires: 57962, Abnormal Fires: 3035
Percentage of Normal Fires: 95.02%
Percentage of Abnormal Fires: 4.98%


## Save the Results

In [89]:
# Restrict both frames to the validation window, 2021-02-23 to 2022-02-23
# (both ends inclusive)
processed_in_window = fire_data_processed['ACQ_DATE'].between('2021-02-23', '2022-02-23')
fires_in_window = fire_data_copy['ACQ_DATE'].between('2021-02-23', '2022-02-23')
fire_data_copy = fire_data_copy[fires_in_window]

# Store the abnormal flag next to each row of the daily series; the values are
# assigned by position, so y_pred must have one entry per remaining row
fire_data_processed = fire_data_processed[processed_in_window].assign(ABNORMAL_LABEL=y_pred.values)

# Look up the flag of every individual fire through its grid cell and
# acquisition date, then keep only the columns needed for the export
cell_day_labels = fire_data_processed[['GRID_CELL_50KM', 'ACQ_DATE', 'ABNORMAL_LABEL']]
fire_data_copy = pd.merge(
    fire_data_copy,
    cell_day_labels,
    on=['GRID_CELL_50KM', 'ACQ_DATE'],
    how='left',
)[['FIRE_ID', 'ACQ_DATE', 'LONGITUDE', 'LATITUDE', 'ABNORMAL_LABEL']]

# Preview the labelled fires
fire_data_copy.head()

Unnamed: 0,FIRE_ID,ACQ_DATE,LONGITUDE,LATITUDE,ABNORMAL_LABEL
0,606587,2021-02-23,37.8656,47.6242,0
1,1576778,2021-02-23,37.603394,47.094631,0
2,1576773,2021-02-23,37.592018,47.092762,0
3,1576768,2021-02-23,37.558506,47.159988,0
4,1576764,2021-02-23,37.601517,47.094345,0


In [90]:
# Create the output folder if it is missing, then write the labelled
# validation fires to CSV without the DataFrame index
os.makedirs('output_data', exist_ok=True)
fire_data_copy.to_csv('output_data/abnormal_fires_validation.csv', index=False)