In [1]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): with no category or module filter this suppresses every warning
# raised for the rest of the notebook, not just a specific known-noisy one.
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# Read the three preprocessed CSV inputs into DataFrames
fire_data, cell_static_data, cell_dynamic_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input_data/processed/fire_data.csv',
        'input_data/processed/cell_static.csv',
        'input_data/processed/cell_dynamic.csv',
    )
)

## Preprocess Data

In [3]:
# Parse ACQ_DATE into datetimes in both frames that carry it
for frame in (fire_data, cell_dynamic_data):
    frame['ACQ_DATE'] = pd.to_datetime(frame['ACQ_DATE'])

In [4]:
# Keep a (FIRE_ID, ACQ_DATE) table, taken while fire_data still has its FIRE_ID column
fire_ids = fire_data.loc[:, ['FIRE_ID', 'ACQ_DATE']].copy()
fire_ids.head()

Unnamed: 0,FIRE_ID,ACQ_DATE
0,921181,2015-01-01
1,921184,2015-01-01
2,921182,2015-01-01
3,921183,2015-01-01
4,921185,2015-01-02


In [5]:
# Remove FIRE_ID plus the unsuffixed and _1KM location/count columns, collapse
# rows that are now identical, and renumber the index from zero.
columns_to_drop = [
    'FIRE_ID', 'LATITUDE', 'LONGITUDE', 'GRID_CELL', 'OBLAST_ID',
    'LATITUDE_1KM', 'LONGITUDE_1KM', 'GRID_CELL_1KM', 'OBLAST_ID_1KM',
    'FIRE_COUNT_CELL_1KM',
]
fire_data = (
    fire_data
    .drop(columns=columns_to_drop)
    .drop_duplicates()
    .reset_index(drop=True)
)
fire_data.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM
0,2015-01-01,1,47.0,37.5,47.0_37.5,UA14,4
1,2015-01-02,2,50.5,28.5,50.5_28.5,UA18,1
2,2015-01-03,3,48.0,33.5,48.0_33.5,UA12,1
3,2015-01-03,3,48.5,35.0,48.5_35.0,UA12,1
4,2015-01-04,4,49.0,24.5,49.0_24.5,UA26,1


In [6]:
def merge_static_data(data, static_data, resolution='50KM'):
    """Left-join the static cell attributes of one resolution onto `data`.

    Only the columns of `static_data` whose name ends in ``_<resolution>`` are
    used. Duplicate rows among those columns are removed before the merge.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to enrich. Must contain the four key columns
        ``GRID_CELL_<resolution>``, ``OBLAST_ID_<resolution>``,
        ``LATITUDE_<resolution>`` and ``LONGITUDE_<resolution>``.
    static_data : pandas.DataFrame
        Static attributes; must contain the same four key columns.
        It is not modified.
    resolution : str, default '50KM'
        Resolution suffix (without the leading underscore) selecting both the
        static columns and the merge keys.

    Returns
    -------
    pandas.DataFrame
        `data` with the selected static columns appended (left join).
    """
    suffix = f'_{resolution}'
    # Select the columns for this resolution and de-duplicate on a new frame;
    # the previous in-place drop on a column slice acted on a derived object.
    resolution_columns = [col for col in static_data.columns if col.endswith(suffix)]
    static_subset = static_data[resolution_columns].drop_duplicates()
    # Build the keys from `resolution` so values other than '50KM' work too
    merge_keys = [f'{base}{suffix}' for base in ('GRID_CELL', 'OBLAST_ID', 'LATITUDE', 'LONGITUDE')]
    return pd.merge(data, static_subset, how='left', on=merge_keys)

In [7]:
# Attach the static cell attributes (default 50KM resolution) to the fire rows
fire_data_processed = merge_static_data(data=fire_data, static_data=cell_static_data)
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM
0,2015-01-01,1,47.0,37.5,47.0_37.5,UA14,4,235.527153,0.015863,0.041639,0.333374,0.096907,0.0
1,2015-01-02,2,50.5,28.5,50.5_28.5,UA18,1,135.458196,0.313123,0.114159,0.477092,0.084996,0.0
2,2015-01-03,3,48.0,33.5,48.0_33.5,UA12,1,235.527153,0.034811,0.148977,0.644517,0.138799,0.0
3,2015-01-03,3,48.5,35.0,48.5_35.0,UA12,1,235.527153,0.073142,0.086862,0.548732,0.222136,0.0
4,2015-01-04,4,49.0,24.5,49.0_24.5,UA26,1,235.527153,0.388293,0.058082,0.397541,0.139557,0.0


In [8]:
def merge_dynamic_data(data, dynamic_data):
    """Left-join `dynamic_data` onto `data` by oblast id and acquisition date.

    `data` is matched on (`OBLAST_ID_50KM`, `ACQ_DATE`) against
    (`OBLAST_ID`, `ACQ_DATE`) in `dynamic_data`. The redundant `OBLAST_ID`
    key column brought in from the right side is removed from the result.
    """
    left_keys = ['OBLAST_ID_50KM', 'ACQ_DATE']
    right_keys = ['OBLAST_ID', 'ACQ_DATE']
    joined = pd.merge(data, dynamic_data, how='left', left_on=left_keys, right_on=right_keys)
    return joined.drop(columns=['OBLAST_ID'])

In [9]:
# Attach the per-oblast, per-date dynamic columns to the fire rows
fire_data_processed = merge_dynamic_data(data=fire_data_processed, dynamic_data=cell_dynamic_data)
fire_data_processed.head()

Unnamed: 0,ACQ_DATE,DAY_OF_YEAR,LATITUDE_50KM,LONGITUDE_50KM,GRID_CELL_50KM,OBLAST_ID_50KM,FIRE_COUNT_CELL_50KM,POP_DENSITY_50KM,LAND_USE_CLASS_1.0_50KM,LAND_USE_CLASS_2.0_50KM,LAND_USE_CLASS_3.0_50KM,LAND_USE_CLASS_4.0_50KM,LAND_USE_CLASS_0_50KM,TEMPERATURE_2M_MAX (°C),TEMPERATURE_2M_MIN (°C),TEMPERATURE_2M_MEAN (°C),RAIN_SUM (MM),SNOWFALL_SUM (CM),WIND_DIRECTION_10M_DOMINANT (°),CLOUD_COVER (%)
0,2015-01-01,1,47.0,37.5,47.0_37.5,UA14,4,235.527153,0.015863,0.041639,0.333374,0.096907,0.0,-8.8,-18.0,-13.4,0.0,0.0,243.0,71.833333
1,2015-01-02,2,50.5,28.5,50.5_28.5,UA18,1,135.458196,0.313123,0.114159,0.477092,0.084996,0.0,2.7,-2.0,0.4,2.4,0.07,267.0,93.125
2,2015-01-03,3,48.0,33.5,48.0_33.5,UA12,1,235.527153,0.034811,0.148977,0.644517,0.138799,0.0,2.4,-0.9,0.8,0.3,1.68,257.0,79.416667
3,2015-01-03,3,48.5,35.0,48.5_35.0,UA12,1,235.527153,0.073142,0.086862,0.548732,0.222136,0.0,2.4,-0.9,0.8,0.3,1.68,257.0,79.416667
4,2015-01-04,4,49.0,24.5,49.0_24.5,UA26,1,235.527153,0.388293,0.058082,0.397541,0.139557,0.0,1.7,-1.6,0.3,0.1,0.84,284.0,74.541667


## Create Training and Validation/Calibration Data

In [10]:
# Date-based split: rows before 2021-02-23 are the training set, rows from
# 2022-02-24 onwards are the test set (described originally as the last 365 days).
# NOTE(review): rows dated 2021-02-23 through 2022-02-23 fall in neither split;
# confirm the gap is intended.
is_train_row = fire_data_processed['ACQ_DATE'] < '2021-02-23'
is_test_row = fire_data_processed['ACQ_DATE'] >= '2022-02-24'

# Date, identifier and fire-count columns are excluded from the feature matrices
non_feature_columns = ['ACQ_DATE', 'GRID_CELL_50KM', 'OBLAST_ID_50KM', 'FIRE_COUNT_CELL_50KM']
X_train = fire_data_processed[is_train_row].drop(columns=non_feature_columns)
X_test = fire_data_processed[is_test_row].drop(columns=non_feature_columns)

grid_cell_train = fire_data_processed.loc[is_train_row, 'GRID_CELL_50KM']
grid_cell_test = fire_data_processed.loc[is_test_row, 'GRID_CELL_50KM']

# NOTE(review): fire_ids was captured before duplicate rows were dropped from
# fire_data, so fire_ids_* and X_* have different row counts and are not row-aligned.
fire_ids_train = fire_ids[fire_ids['ACQ_DATE'] < '2021-02-23']
fire_ids_test = fire_ids[fire_ids['ACQ_DATE'] >= '2022-02-24']

X_train.shape, fire_ids_train.shape, grid_cell_train.shape, X_test.shape, fire_ids_test.shape, grid_cell_test.shape

((127932, 16), (836439, 2), (127932,), (50470, 16), (412503, 2), (50470,))

In [11]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

class OneNearestNeighborModel:
    """Match each test row to its nearest training row within the same grid cell.

    A single `StandardScaler` is fitted on all training features, and one
    1-nearest-neighbour index is fitted per grid cell on the scaled training
    rows of that cell. `transform` returns `(test FIRE_ID, train FIRE_ID)`
    pairs.

    The three arguments of `fit` (and of `transform`) are treated as
    row-aligned by position: row ``i`` of the features, of the fire-id frame
    and of the grid-cell labels must describe the same sample. Pandas index
    labels are ignored, so frames produced by boolean filtering (whose index
    does not start at zero) are handled correctly.
    """

    def __init__(self):
        self.scaler = None
        # grid cell -> fitted NearestNeighbors model for that cell
        self.nearest_neighbors = {}
        # fire-id frame passed to fit(); FIRE_ID is looked up by row position
        self.fire_id_train = None
        # grid cell -> positions (rows of the training data) its model was fitted on,
        # needed to translate a neighbour index local to a cell into a training row
        self.train_positions = {}

    @staticmethod
    def _positions_by_cell(grid_cells):
        """Return {grid cell: array of row positions}, in first-seen order.

        Missing (NaN/None) cell labels are left out.
        """
        # Rebuild on the raw values so the result is independent of index labels
        cells = pd.Series(np.asarray(grid_cells))
        groups = cells.groupby(cells, sort=False).indices
        return {cell: groups[cell] for cell in cells.dropna().unique()}

    def fit(self, X_train, fire_id_train, grid_cell_train):
        """Fit the scaler on all rows and one nearest-neighbour model per grid cell.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Training features.
        fire_id_train : pandas.DataFrame
            Frame with a `FIRE_ID` column, row-aligned with `X_train`.
        grid_cell_train : array-like of length n_samples
            Grid-cell label of every training row.

        Returns
        -------
        OneNearestNeighborModel
            The fitted instance.
        """
        # Train the scaler on the entire training data
        self.scaler = StandardScaler()
        X_train_scaled = np.asarray(self.scaler.fit_transform(X_train))
        self.fire_id_train = fire_id_train

        # Start from a clean state so refitting does not keep stale cells
        self.nearest_neighbors = {}
        self.train_positions = {}

        for grid_cell, positions in self._positions_by_cell(grid_cell_train).items():
            # Select by position: the scaled matrix is a plain array, so index
            # labels of the input frame must not be used to address it.
            nn = NearestNeighbors(n_neighbors=1)
            nn.fit(X_train_scaled[positions])
            self.nearest_neighbors[grid_cell] = nn
            self.train_positions[grid_cell] = positions

        return self

    def transform(self, X_test, fire_id_test, grid_cell_test):
        """Pair every test row with its nearest training row of the same grid cell.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Test features, with the same columns as the training features.
        fire_id_test : pandas.DataFrame
            Frame with a `FIRE_ID` column, row-aligned with `X_test`.
        grid_cell_test : array-like of length n_samples
            Grid-cell label of every test row.

        Returns
        -------
        list of tuple
            `(test FIRE_ID, train FIRE_ID)` pairs. Test rows whose grid cell
            has no fitted model are skipped.
        """
        X_test_scaled = np.asarray(self.scaler.transform(X_test))
        test_ids = fire_id_test['FIRE_ID'].to_numpy()
        train_ids = self.fire_id_train['FIRE_ID'].to_numpy()

        fire_id_pairs = []
        for grid_cell, test_positions in self._positions_by_cell(grid_cell_test).items():
            nn = self.nearest_neighbors.get(grid_cell)
            # Cells never seen during fit have no model to query
            if nn is None:
                continue

            _, neighbor_idx = nn.kneighbors(X_test_scaled[test_positions])
            # kneighbors returns indices into the subset for this cell;
            # translate them back to rows of the full training data.
            matched_train_positions = self.train_positions[grid_cell][neighbor_idx.ravel()]

            fire_id_pairs.extend(zip(
                test_ids[test_positions].tolist(),
                train_ids[matched_train_positions].tolist(),
            ))

        return fire_id_pairs

In [12]:
# fit() returns the model itself, so construction and fitting can be chained
onn_model = OneNearestNeighborModel().fit(X_train, fire_ids_train, grid_cell_train)
fire_id_pairs = onn_model.transform(X_test, fire_ids_test, grid_cell_test)

In [13]:
import pickle

# Serialize the fitted model to disk with pickle
with open('saved_models/one_nearest_neighbor_model.pkl', 'wb') as model_file:
    pickle.dump(onn_model, model_file)

In [14]:
# Write the matched (test, train) fire id pairs to a CSV file without the index
pair_columns = ['FIRE_ID_TEST', 'FIRE_ID_TRAIN']
fire_id_pairs_df = pd.DataFrame(fire_id_pairs, columns=pair_columns)
fire_id_pairs_df.to_csv('output_data/one_nearest_neighbor_fire_id_pairs.csv', index=False)