In [25]:
import pandas as pd
import numpy as np

import warnings
# NOTE(review): with no category or module argument this installs an "ignore" filter
# for every warning, so library deprecation and pandas copy/assignment warnings will
# not be shown by any later cell. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [26]:
# Read the aggregated fire data from disk and report its (rows, columns) dimensions
aggregated_csv = 'input_data/processed/aggregated_data_001.csv'
X = pd.read_csv(aggregated_csv)
print(X.shape)

(1308050, 37)


In [27]:
# Display the column labels of the loaded frame
X.columns

Index(['FIRE_ID', 'LATITUDE', 'LONGITUDE', 'ACQ_DATE', 'ACQ_TIME', 'OBLAST_ID',
       'GRID_CELL', 'DAY_OF_YEAR', 'FIRE_COUNT_CELL',
       'GRID_CELL_NEIGHBOR_100km', 'FIRE_COUNT_CELL_NEIGHBOR_100km',
       'GRID_CELL_NEIGHBOR_10km', 'FIRE_COUNT_CELL_NEIGHBOR_10km',
       'FIRE_COUNT_OBLAST', 'FIRE_COUNT_CELL_AVG_7D',
       'FIRE_COUNT_CELL_NEIGHBOR_100km_AVG_7D',
       'FIRE_COUNT_CELL_NEIGHBOR_10km_AVG_7D', 'FIRE_COUNT_OBLAST_AVG_7D',
       'FIRE_COUNT_CELL_AVG_30D', 'FIRE_COUNT_CELL_NEIGHBOR_100km_AVG_30D',
       'FIRE_COUNT_CELL_NEIGHBOR_10km_AVG_30D', 'FIRE_COUNT_OBLAST_AVG_30D',
       'POP_DENSITY_CELL_AVG', 'POP_DENSITY_CELL_NEIGHBOR_100km_AVG',
       'POP_DENSITY_CELL_NEIGHBOR_10km_AVG', 'LAND_USE_CLASS_1',
       'LAND_USE_CLASS_2', 'LAND_USE_CLASS_3', 'LAND_USE_CLASS_4',
       'LAND_USE_CLASS_1_NEIGHBOR_100km_AVG',
       'LAND_USE_CLASS_2_NEIGHBOR_100km_AVG',
       'LAND_USE_CLASS_3_NEIGHBOR_100km_AVG',
       'LAND_USE_CLASS_4_NEIGHBOR_100km_AVG',
       'LAND_U

In [28]:
# The data will be plotted later, so keep an untouched copy of the fire IDs together
# with the temporal and spatial columns before X is modified further
temporal_spatial_features = ['FIRE_ID', 'ACQ_DATE', 'DAY_OF_YEAR', 'LATITUDE', 'LONGITUDE']
X_ts = X.loc[:, temporal_spatial_features].copy()
X_ts.head()

Unnamed: 0,FIRE_ID,ACQ_DATE,DAY_OF_YEAR,LATITUDE,LONGITUDE
0,921182,2015-01-01,1,47.09,37.61
1,921184,2015-01-01,1,47.09,37.61
2,921183,2015-01-01,1,47.09,37.61
3,921181,2015-01-01,1,47.15,37.53
4,921185,2015-01-02,2,50.51,28.74


In [29]:
# Columns retained for the rest of the analysis; everything else is removed from X
columns_to_keep = [
    # date (used for the temporal split), position and day of year
    'ACQ_DATE',
    'LATITUDE',
    'LONGITUDE',
    'DAY_OF_YEAR',
    # fire counts for the cell and its neighborhoods
    'FIRE_COUNT_CELL',
    'FIRE_COUNT_CELL_NEIGHBOR_100km',
    'FIRE_COUNT_CELL_NEIGHBOR_10km',
    # 7-day averages of the fire counts
    'FIRE_COUNT_CELL_AVG_7D',
    'FIRE_COUNT_CELL_NEIGHBOR_100km_AVG_7D',
    'FIRE_COUNT_CELL_NEIGHBOR_10km_AVG_7D',
    # 30-day averages of the fire counts
    'FIRE_COUNT_CELL_AVG_30D',
    'FIRE_COUNT_CELL_NEIGHBOR_100km_AVG_30D',
    'FIRE_COUNT_CELL_NEIGHBOR_10km_AVG_30D',
    # population density averages
    'POP_DENSITY_CELL_AVG',
    'POP_DENSITY_CELL_NEIGHBOR_100km_AVG',
    'POP_DENSITY_CELL_NEIGHBOR_10km_AVG',
    # land-use class averages over the 100km neighborhood
    'LAND_USE_CLASS_1_NEIGHBOR_100km_AVG',
    'LAND_USE_CLASS_2_NEIGHBOR_100km_AVG',
    'LAND_USE_CLASS_3_NEIGHBOR_100km_AVG',
    'LAND_USE_CLASS_4_NEIGHBOR_100km_AVG',
    # land-use class averages over the 10km neighborhood
    'LAND_USE_CLASS_1_NEIGHBOR_10km_AVG',
    'LAND_USE_CLASS_2_NEIGHBOR_10km_AVG',
    'LAND_USE_CLASS_3_NEIGHBOR_10km_AVG',
    'LAND_USE_CLASS_4_NEIGHBOR_10km_AVG',
]

# Every column of X that is not on the keep-list gets dropped; the remaining
# columns stay in their existing order
unwanted_columns = [col for col in X.columns if col not in columns_to_keep]
X = X.drop(columns=unwanted_columns)
X.shape

(1308050, 24)

In [30]:
# Select the rows recorded before 2022-02-24 (the first day of the war in Ukraine).
# Only the pre-war subset is built in this cell.
# NOTE: the CSV was read without date parsing, so this is a string comparison; it is
# chronologically correct only while ACQ_DATE holds ISO 'YYYY-MM-DD' text.
WAR_START_DATE = '2022-02-24'

# Build the boolean mask once so the feature frame and the ID/time/space frame are
# filtered with exactly the same rows
pre_war_mask = X['ACQ_DATE'] < WAR_START_DATE

# Data before the war
# drop() without inplace returns a new frame, so the filtered selection of X is never
# mutated in place (the in-place variant triggers pandas' SettingWithCopyWarning)
X_pre = X.loc[pre_war_mask].drop(columns=['ACQ_DATE'])
X_ts_pre = X_ts.loc[pre_war_mask]

X_pre.shape

(896367, 23)

In [31]:
# Standardise the pre-war features so they can be used for clustering
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Learn the scaling parameters from X_pre and apply them in a single call
scaled_values = scaler.fit_transform(X_pre)

# Wrap the scaled array in a DataFrame that carries the original column names
X_pre_scaled = pd.DataFrame(scaled_values, columns=X_pre.columns)
X_pre_scaled.head()

Unnamed: 0,LATITUDE,LONGITUDE,DAY_OF_YEAR,FIRE_COUNT_CELL,FIRE_COUNT_CELL_NEIGHBOR_100km,FIRE_COUNT_CELL_NEIGHBOR_10km,FIRE_COUNT_CELL_AVG_7D,FIRE_COUNT_CELL_NEIGHBOR_100km_AVG_7D,FIRE_COUNT_CELL_NEIGHBOR_10km_AVG_7D,FIRE_COUNT_CELL_AVG_30D,...,POP_DENSITY_CELL_NEIGHBOR_100km_AVG,POP_DENSITY_CELL_NEIGHBOR_10km_AVG,LAND_USE_CLASS_1_NEIGHBOR_100km_AVG,LAND_USE_CLASS_2_NEIGHBOR_100km_AVG,LAND_USE_CLASS_3_NEIGHBOR_100km_AVG,LAND_USE_CLASS_4_NEIGHBOR_100km_AVG,LAND_USE_CLASS_1_NEIGHBOR_10km_AVG,LAND_USE_CLASS_2_NEIGHBOR_10km_AVG,LAND_USE_CLASS_3_NEIGHBOR_10km_AVG,LAND_USE_CLASS_4_NEIGHBOR_10km_AVG
0,-1.08727,1.168188,-2.434418,0.516762,-0.602185,-0.368227,-0.161225,-0.575252,-0.198817,0.899824,...,-0.758024,-0.517793,-0.461636,0.783503,-0.200191,1.339367,-0.580791,-0.675893,-1.191845,-0.535165
1,-1.08727,1.168188,-2.434418,0.516762,-0.602185,-0.368227,-0.068254,-0.579963,-0.228057,0.886693,...,-0.758024,-0.517793,-0.461636,0.783503,-0.200191,1.339367,-0.580791,-0.675893,-1.191845,-0.535165
2,-1.08727,1.168188,-2.434418,0.516762,-0.602185,-0.368227,0.024716,-0.584675,-0.257296,0.874757,...,-0.758024,-0.517793,-0.461636,0.783503,-0.200191,1.339367,-0.580791,-0.675893,-1.191845,-0.535165
3,-1.048173,1.149366,-2.434418,-0.673332,-0.602185,-0.502507,-0.719045,-0.589386,-0.432732,-0.754625,...,-1.633881,-0.216626,-0.461636,0.783503,-0.200191,1.339367,-0.580791,-0.675893,-1.191845,-0.535165
4,1.141215,-0.918662,-2.421335,-0.673332,-0.635089,-0.502507,-0.719045,-0.631001,-0.510704,-0.754625,...,0.741732,-0.665485,2.382236,0.94898,-2.044882,-1.054034,0.08906,0.064172,0.622077,-0.110112


In [32]:
# Reduce the dimensionality of the scaled features with PCA
from sklearn.decomposition import PCA

# A float in (0, 1) asks PCA to keep as many components as are needed to explain
# that fraction of the variance
explained_variance_target = 0.99
pca = PCA(n_components=explained_variance_target)

# Fit on the scaled data and project it; the result is stored as a DataFrame whose
# columns are the integer component indices
X_pre_pca = pd.DataFrame(pca.fit_transform(X_pre_scaled))
X_pre_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-0.656558,-0.678215,-1.859664,1.424718,-0.976618,0.001501,0.23902,-1.736837,-1.9543,0.586959,-0.151034,-1.620529,0.436219,-0.560386,0.081379,-0.068835,-0.487891
1,-0.64769,-0.691298,-1.874096,1.403695,-1.004557,0.026853,0.250652,-1.744076,-1.957556,0.586591,-0.152558,-1.618189,0.437219,-0.560003,0.080844,-0.085219,-0.488549
2,-0.638521,-0.704654,-1.888708,1.382388,-1.032845,0.052453,0.262381,-1.751393,-1.960853,0.586224,-0.154096,-1.615831,0.438225,-0.559636,0.080287,-0.102218,-0.489211
3,-1.718048,0.343951,-1.168152,2.178414,-0.270357,-0.779285,0.381045,-1.864667,-1.561887,1.269583,-0.269189,-1.284565,0.595722,-0.199417,0.102152,0.011054,-0.472778
4,-0.122715,3.902382,-0.42193,-0.922053,0.407163,0.044758,-0.466379,-0.265096,-1.893926,-0.058719,0.08462,-0.989201,-1.558197,0.743487,-0.420663,0.060998,-0.695242


In [35]:
# Cluster the PCA-projected pre-war data with an algorithm that takes the number of
# clusters as an input
from sklearn.cluster import KMeans

N_CLUSTERS = 15
# k-means starts from randomly chosen centroids; without a fixed seed the cluster
# numbering and sizes change from run to run, so labels saved to disk would not be
# reproducible
RANDOM_STATE = 42

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
kmeans.fit(X_pre_pca)

# Get the labels of the clusters (one label per row of X_pre_pca)
y_labels = kmeans.labels_

# Get the size of each cluster, largest first
cluster_sizes = pd.Series(y_labels).value_counts()
cluster_sizes

0     176501
5     118542
10     93117
2      88181
9      75523
7      65649
6      65023
4      51920
14     44474
3      33645
11     31763
12     26726
1      14251
13      5720
8       5332
Name: count, dtype: int64

In [38]:
# Pair every pre-war FIRE_ID with its cluster label and write the table to disk.
# y_labels is a plain array, so it is attached by position and must have one entry
# per row of X_ts_pre.
cluster_labels = X_ts_pre[['FIRE_ID']].assign(CLUSTER_LABEL=y_labels)
cluster_labels.to_csv('output_data/cluster_labels_pre_war.csv', index=False)

In [47]:
# Persist the fitted scaler and PCA objects, one pickle file each
# NOTE(review): the fitted KMeans model is not written here — confirm whether it is
# saved elsewhere or intentionally left out.
import pickle

fitted_objects = {
    'saved_models/scaler.pkl': scaler,
    'saved_models/pca.pkl': pca,
}
for target_path, fitted in fitted_objects.items():
    with open(target_path, 'wb') as handle:
        pickle.dump(fitted, handle)