In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the aggregated data set from disk and report its (rows, columns) shape
aggregated_csv = 'input_data/processed/aggregated_data_001.csv'
X = pd.read_csv(aggregated_csv)
print(X.shape)

(1308050, 19)


In [3]:
# Keep an independent copy of the fire IDs plus the temporal and spatial columns,
# so their original values remain available for plotting later on.
temporal_spatial_features = [
    'FIRE_ID',
    'ACQ_DATE',
    'DAY_OF_YEAR',
    'LATITUDE',
    'LONGITUDE',
]
X_ts = X.loc[:, temporal_spatial_features].copy()
X_ts.head()

Unnamed: 0,FIRE_ID,ACQ_DATE,DAY_OF_YEAR,LATITUDE,LONGITUDE
0,921182,2015-01-01,1,47.09,37.61
1,921184,2015-01-01,1,47.09,37.61
2,921183,2015-01-01,1,47.09,37.61
3,921181,2015-01-01,1,47.15,37.53
4,921185,2015-01-02,2,50.51,28.74


In [4]:
# Columns that stay in X; everything else is removed below
columns_to_keep = [
    # date and location
    'ACQ_DATE',
    'LATITUDE',
    'LONGITUDE',
    'DAY_OF_YEAR',
    # fire counts per cell
    'FIRE_COUNT_CELL',
    'FIRE_COUNT_CELL_AVG_7D',
    'FIRE_COUNT_CELL_AVG_30D',
    # fire counts per oblast
    'FIRE_COUNT_OBLAST',
    'FIRE_COUNT_OBLAST_AVG_7D',
    'FIRE_COUNT_OBLAST_AVG_30D',
    # population density and land use
    'POP_DENSITY_CELL_AVG',
    'LAND_USE_CLASS_1',
    'LAND_USE_CLASS_2',
    'LAND_USE_CLASS_3',
    'LAND_USE_CLASS_4',
]

# Collect every column of X that is not on the keep-list, then drop them in one call
columns_to_drop = [col for col in X.columns if col not in columns_to_keep]
X = X.drop(columns=columns_to_drop)
X.shape

(1308050, 15)

In [5]:
# Split the data at 2022-02-24 (the first day of the UKR war):
#   *_pre  -> rows with ACQ_DATE strictly before that day
#   *_post -> rows with ACQ_DATE on or after that day
# NOTE(review): ACQ_DATE is compared against a string literal; this presumably relies on the
# column holding ISO-formatted (YYYY-MM-DD) strings, which sort chronologically - confirm.
WAR_START_DATE = '2022-02-24'

# Build each boolean mask once and reuse it, so X and X_ts are always split on the same rows
is_pre_war = X['ACQ_DATE'] < WAR_START_DATE
is_post_war = X['ACQ_DATE'] >= WAR_START_DATE

# Data before the war.
# drop() returns a new frame here instead of mutating a filtered slice of X in place.
X_pre = X.loc[is_pre_war].drop(columns=['ACQ_DATE'])
X_ts_pre = X_ts.loc[is_pre_war]

# Data from the first day of the war onwards
X_post = X.loc[is_post_war].drop(columns=['ACQ_DATE'])
X_ts_post = X_ts.loc[is_post_war]

X_pre.shape, X_post.shape

((896367, 14), (411683, 14))

In [6]:
# Load the pickled scaler and PCA objects from disk.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
import pickle

with open('saved_models/scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

with open('saved_models/pca.pkl', 'rb') as pca_file:
    pca = pickle.load(pca_file)

# Scale the pre-war data and wrap the result in a frame that keeps the column names
X_pre_scaled = pd.DataFrame(scaler.transform(X_pre), columns=X_pre.columns)

# Project the scaled data with the PCA object (the resulting frame gets a fresh default index)
X_pre_pca = pd.DataFrame(pca.transform(X_pre_scaled))
X_pre_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.374959,1.340125,0.023832,-0.700548,-0.665593,-1.824658,-2.500547,-0.6334,1.460238,-0.052238
1,-1.3515,1.37514,0.043223,-0.706143,-0.668156,-1.82808,-2.506011,-0.634814,1.459762,-0.066945
2,-1.327659,1.410653,0.062874,-0.711829,-0.670755,-1.831545,-2.511538,-0.636244,1.45928,-0.082305
3,-2.47347,-0.054122,-0.696804,-0.409604,-0.564782,-1.70268,-2.282596,-0.554315,1.478698,0.025888
4,-1.607116,-1.038983,1.043747,1.884137,-0.669353,-1.97605,-0.879053,0.187224,-0.264534,0.031801


In [7]:
# Read the stored pre-war cluster labels and keep only the label column as a NumPy array
cluster_labels_df = pd.read_csv('output_data/cluster_labels_pre_war.csv')
y_labels = cluster_labels_df['CLUSTER_LABEL'].to_numpy(copy=True)
y_labels.shape

(896367,)

In [10]:
# Train a classifier that predicts the pre-war cluster labels from the PCA features
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

# Hold out 10% of the pre-war samples for testing, stratified by cluster label
X_pre_train, X_pre_test, y_pre_train, y_pre_test = train_test_split(
    X_pre_pca, y_labels, test_size=0.1, stratify=y_labels, random_state=42
)

# Train a Random Forest classifier.
# The forest is seeded as well as the split, so a re-run reproduces the same model and scores.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_pre_train, y_pre_train)

# Class probabilities for the held-out samples (also used by the entropy analysis further down)
y_pre_proba = rf.predict_proba(X_pre_test)

# The columns of predict_proba follow the order of rf.classes_, so the argmax is a column
# position; map it back through rf.classes_ to obtain the actual cluster label.
y_pre_pred = rf.classes_[np.argmax(y_pre_proba, axis=1)]

# Calculate the balanced accuracy on the held-out samples
balanced_accuracy = balanced_accuracy_score(y_pre_test, y_pre_pred)
balanced_accuracy

np.float64(0.9940241284487643)

In [11]:
# Persist the trained random forest as a pickle file
with open('saved_models/rf_clf.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [12]:
# Run the post-war data through the same scaler and PCA objects as the pre-war data
X_post_scaled = pd.DataFrame(scaler.transform(X_post), columns=X_post.columns)
X_post_pca = pd.DataFrame(pca.transform(X_post_scaled))

X_post_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.654625,3.114376,0.279042,1.10923,2.461454,-0.839216,-2.075397,-1.732895,-0.421888,0.886627
1,-1.476247,0.115787,-1.019781,1.682075,2.764903,-0.439422,-1.751559,-1.525811,-0.811942,0.035184
2,-1.414617,1.088409,-1.525867,1.403854,2.458175,-0.874336,-1.546762,-1.736021,0.620285,0.342124
3,-0.327953,2.191145,-0.664911,1.224201,2.436058,-0.883948,-1.783913,-1.747083,0.205398,-0.859834
4,0.876728,3.25842,0.332842,1.056786,2.443894,-0.856334,-2.089707,-1.744422,-0.41896,0.731695


In [13]:
# Predict the cluster labels for the X_post data
y_post_proba = rf.predict_proba(X_post_pca)

# The argmax over predict_proba is a column position; its columns are ordered like rf.classes_,
# so look the position up there to get the actual cluster label.
y_post_pred = rf.classes_[np.argmax(y_post_proba, axis=1)]
y_post_pred.shape, y_post_proba.shape

((411683,), (411683, 10))

In [14]:
from scipy.stats import entropy

# Entropy of every predicted class-probability row, for the pre-war predictions
# (y_pre_proba) and for the post-war predictions (y_post_proba)
entropy_pre = entropy(y_pre_proba, axis=1)
entropy_post = entropy(y_post_proba, axis=1)

# Percentiles of the pre-war entropy that are tried as out-of-distribution cut-offs
perc_thresholds = [75, 80, 90, 95, 98, 99]

for perc in perc_thresholds:
    # Cut-off for this round: the perc-th percentile of the pre-war entropy
    threshold_entropy = np.percentile(entropy_pre, perc)

    # Flag post-war rows whose entropy lies above the cut-off, labelled with the X_post index
    ood = pd.Series(entropy_post > threshold_entropy, index=X_post.index)

    # Index labels of the flagged rows
    ood_indices = ood.loc[ood].index.tolist()

    # Report the absolute and relative number of flagged rows
    print(f"{perc}-th Percentile: Number of out-of-distribution points based on entropy: {ood.sum()}")
    print(f"{perc}-th Percentile: Relative amount of out-of-distribution points based on entropy: {ood.mean()}\n")

    # Write one file per percentile with the columns FIRE_ID and OOD;
    # FIRE_ID is aligned to the flags through the shared row index
    ood_df = ood.to_frame(name='OOD')
    ood_df.insert(0, 'FIRE_ID', X_ts_post['FIRE_ID'])
    ood_df.to_csv('output_data/ood_indices_{}.csv'.format(perc), index=False)

75-th Percentile: Number of out-of-distribution points based on entropy: 189797
75-th Percentile: Relative amount of out-of-distribution points based on entropy: 0.4610270523679627

80-th Percentile: Number of out-of-distribution points based on entropy: 147175
80-th Percentile: Relative amount of out-of-distribution points based on entropy: 0.35749593740815144

90-th Percentile: Number of out-of-distribution points based on entropy: 117698
90-th Percentile: Relative amount of out-of-distribution points based on entropy: 0.2858947296827899

95-th Percentile: Number of out-of-distribution points based on entropy: 65518
95-th Percentile: Relative amount of out-of-distribution points based on entropy: 0.15914672211385944

98-th Percentile: Number of out-of-distribution points based on entropy: 33011
98-th Percentile: Relative amount of out-of-distribution points based on entropy: 0.08018548251931705

99-th Percentile: Number of out-of-distribution points based on entropy: 22852
99-th Perc