## Install libraries

In [1]:
# gdown provides the Google Drive downloader used by the download cell below.
# NOTE(review): the version is not pinned; consider pinning for reproducibility.
%pip install -q gdown

Note: you may need to restart the kernel to use updated packages.


Also, install libraries listed in `requirements.txt` in the app root folder.

## Import libraries

In [1]:
from datetime import timedelta
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import box



## Download training data

%%bash
# Download the NASA fire archive and the training table from Google Drive into ./data.
# Fail fast: stop at the first failing command or unset variable.
set -euo pipefail

NASA_FIRE_ID="10wbXaFTG8RyolfGzvI8SFQ1XzHi4tTQr"
TRAIN_ID="11_Wjbxsdrgt-DFGJELumk51elHOLkgpQ"

mkdir -p data

# Pass a full URL instead of `--id`: the flag is deprecated in gdown 4.x and
# rejected by newer releases, while the URL form works across versions.
gdown "https://drive.google.com/uc?id=${NASA_FIRE_ID}" -O "data/NASA.zip"
# -o overwrites existing files without prompting, so the cell can be re-run.
unzip -o "data/NASA.zip" -d data

gdown "https://drive.google.com/uc?id=${TRAIN_ID}" -O "data/train.csv"

## Prepare training data

### NASA satellites data

In [2]:
# Load every shapefile found under data/ into a single GeoDataFrame.
nasa_fire_path = Path("data")
# rglob yields paths in arbitrary order; sorting makes the row order reproducible.
geo_files = sorted(nasa_fire_path.rglob("*.shp"))
if not geo_files:
    # Fail here with a clear message instead of leaving nasa_fire_data as None.
    raise FileNotFoundError(
        f"No .shp files found under '{nasa_fire_path}'; run the download cell first."
    )

# Read each file once and concatenate a single time, rather than re-copying the
# accumulated frame on every loop iteration.
nasa_fire_data = pd.concat(
    [gpd.read_file(filepath) for filepath in geo_files],
    ignore_index=True,
)

In [3]:
# Row/column count of the raw concatenated detections (before de-duplication).
print(nasa_fire_data.shape)

(379219, 16)


In [4]:
# Parse the acquisition date, then order rows by acquisition time so that the
# de-duplication in the next cell (keep="last") keeps the latest ACQ_TIME.
acq_dates = pd.to_datetime(nasa_fire_data["ACQ_DATE"], format="%Y-%m-%d")
nasa_fire_data["ACQ_DATE"] = acq_dates
nasa_fire_data = nasa_fire_data.sort_values(by="ACQ_TIME")

In [5]:
# A detection is considered a duplicate when location, date and instrument match;
# the last row (in the current sort order) of each duplicate group is kept.
dedup_keys = ["LATITUDE", "LONGITUDE", "ACQ_DATE", "INSTRUMENT"]
nasa_fire_data = nasa_fire_data.drop_duplicates(subset=dedup_keys, keep="last")
print(nasa_fire_data.shape)

(379213, 16)


In [6]:
# Sequential id in the current (sorted, de-duplicated) row order.
n_detections = len(nasa_fire_data)
nasa_fire_data["id"] = np.arange(n_detections)

# train_date is the day after acquisition; later cells compare it with train_data["dt"].
one_day = timedelta(days=1)
acquisition_date = nasa_fire_data["ACQ_DATE"]
nasa_fire_data["train_date"] = acquisition_date + one_day
nasa_fire_data["Month"] = acquisition_date.dt.month

In [7]:
# Column dtypes and non-null counts after the clean-up steps above.
nasa_fire_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 379213 entries, 352233 to 104285
Data columns (total 19 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   LATITUDE    379213 non-null  float64       
 1   LONGITUDE   379213 non-null  float64       
 2   BRIGHTNESS  379213 non-null  float64       
 3   SCAN        379213 non-null  float64       
 4   TRACK       379213 non-null  float64       
 5   ACQ_DATE    379213 non-null  datetime64[ns]
 6   ACQ_TIME    379213 non-null  object        
 7   SATELLITE   379213 non-null  object        
 8   INSTRUMENT  379213 non-null  object        
 9   CONFIDENCE  379213 non-null  int64         
 10  VERSION     379213 non-null  object        
 11  BRIGHT_T31  379213 non-null  float64       
 12  FRP         379213 non-null  float64       
 13  DAYNIGHT    379213 non-null  object        
 14  TYPE        379213 non-null  int64         
 15  geometry    379213 non-null  geometry 

### Russian MCHS data

In [8]:
# Training table fetched by the download cell (one row per grid cell and date).
train_data = pd.read_csv("data/train.csv")

In [9]:
# Preview the first rows of the training table.
train_data.head()

Unnamed: 0,dt,lon_min,lat_min,lon_max,lat_max,lon,lat,grid_index,type_id,type_name,is_land,infire_day_1,infire_day_2,infire_day_3,infire_day_4,infire_day_5,infire_day_6,infire_day_7,infire_day_8
0,2020-05-04,47.6,41.0,47.8,41.2,,,143,,,False,0,0,0,0,0,0,0,0
1,2021-02-24,47.2,41.2,47.4,41.4,,,891,,,False,0,0,0,0,0,0,0,0
2,2021-02-27,47.2,41.2,47.4,41.4,,,891,,,False,0,0,0,0,0,0,0,0
3,2021-04-01,47.4,41.2,47.6,41.4,,,892,,,True,0,0,0,0,0,0,0,0
4,2020-03-14,47.6,41.2,47.8,41.4,,,893,,,True,0,0,0,0,0,0,0,0


In [10]:
# Sequential row id, 0..n-1, in file order.
n_train_rows = len(train_data)
train_data["id"] = np.arange(n_train_rows)

In [11]:
# Parse dt into datetimes so it can be compared with nasa_fire_data["train_date"].
train_data["dt"] = pd.to_datetime(train_data["dt"], format="%Y-%m-%d")

In [12]:
# Build one rectangular polygon per row from its lon/lat bounds.
cell_bounds = zip(
    train_data["lon_min"],
    train_data["lat_min"],
    train_data["lon_max"],
    train_data["lat_max"],
)
train_data["geometry"] = [
    box(lon_min, lat_min, lon_max, lat_max)
    for lon_min, lat_min, lon_max, lat_max in cell_bounds
]

In [13]:
# Promote the table to a GeoDataFrame using the box polygons as the active geometry.
train_data = gpd.GeoDataFrame(train_data, geometry="geometry")

In [14]:
# Declare the box coordinates as lon/lat (EPSG:4326); this labels the data, it does not transform it.
# NOTE(review): re-running this cell after the to_crs cell below would relabel
# already-projected coordinates as EPSG:4326 — run these cells strictly in order.
train_data.crs = "epsg:4326"

In [15]:
# Reproject both layers to the same projected CRS; the buffers and distances
# computed in later cells are expressed in this CRS's units.
# NOTE(review): EPSG:3310 is NAD83 / California Albers, while the coordinates shown
# in this notebook lie roughly at lat 41-64, lon 39-170. Distances and buffers that
# far outside the projection's intended area are likely distorted — confirm the CRS.
train_data.to_crs(epsg=3310, inplace=True)
nasa_fire_data.to_crs(epsg=3310, inplace=True)

In [17]:
# Preview the reprojected detections (geometry is now in projected coordinates).
nasa_fire_data

Unnamed: 0,LATITUDE,LONGITUDE,BRIGHTNESS,SCAN,TRACK,ACQ_DATE,ACQ_TIME,SATELLITE,INSTRUMENT,CONFIDENCE,VERSION,BRIGHT_T31,FRP,DAYNIGHT,TYPE,geometry,id,train_date,Month
352233,46.6222,39.0148,302.4,1.6,1.2,2021-04-03,0000,Aqua,MODIS,49,6.03,274.1,19.2,N,0,POINT (7318589.118 9094526.753),0,2021-04-04,4
259790,48.9202,39.8694,302.6,1.5,1.2,2020-08-06,0000,Aqua,MODIS,50,6.03,289.3,10.7,N,0,POINT (7061961.880 9131558.334),1,2020-08-07,8
259789,48.9161,39.8892,334.8,1.5,1.2,2020-08-06,0000,Aqua,MODIS,100,6.03,290.2,66.8,N,0,POINT (7062233.376 9133084.572),2,2020-08-07,8
352235,45.9646,48.1378,319.0,4.8,2.0,2021-04-03,0000,Aqua,MODIS,98,6.03,272.8,215.0,N,0,POINT (7280335.931 9808589.347),3,2021-04-04,4
352234,45.9677,48.1306,322.7,4.8,2.0,2021-04-03,0000,Aqua,MODIS,100,6.03,272.9,251.6,N,0,POINT (7280115.787 9807967.912),4,2021-04-04,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104237,63.7139,166.6503,312.7,1.3,1.1,2020-06-21,2359,Terra,MODIS,43,6.03,300.0,10.0,D,0,POINT (-3907416.854 4312467.453),379208,2020-06-22,6
104238,63.4746,169.9347,336.0,1.1,1.1,2020-06-21,2359,Terra,MODIS,87,6.03,303.0,33.0,D,0,POINT (-3781613.780 4162971.632),379209,2020-06-22,6
104239,63.4770,169.9134,364.2,1.1,1.1,2020-06-21,2359,Terra,MODIS,100,6.03,305.3,104.5,D,0,POINT (-3782394.257 4163987.740),379210,2020-06-22,6
104241,63.7198,166.6596,315.3,1.3,1.1,2020-06-21,2359,Terra,MODIS,54,6.03,301.5,12.1,D,0,POINT (-3906639.346 4312478.506),379211,2020-06-22,6


In [16]:
# For every training row, collect the detections that hit the cell on the matching day.
# Result: training-row position -> array of positional indices into nasa_fire_data.
fire_points = nasa_fire_data["geometry"]
fire_train_dates = nasa_fire_data["train_date"]

intersection = {}
for row_pos, (cell_geom, cell_date) in enumerate(zip(train_data["geometry"], train_data["dt"])):
    # query() is called without a predicate, so these are spatial-index candidates.
    candidates = fire_points.sindex.query(cell_geom)
    same_day = (fire_train_dates.iloc[candidates] == cell_date).to_numpy()
    intersection[row_pos] = candidates[same_day]

In [18]:
# Same-day detections around each training cell grown by a 15000-unit buffer.
# Result: training-row position -> array of positional indices into nasa_fire_data.
fire_points = nasa_fire_data["geometry"]
fire_train_dates = nasa_fire_data["train_date"]

fifteen_km_radius = {}
for row_pos, (cell_geom, cell_date) in enumerate(zip(train_data["geometry"], train_data["dt"])):
    search_area = cell_geom.buffer(15000.0)
    candidates = fire_points.sindex.query(search_area)
    same_day = (fire_train_dates.iloc[candidates] == cell_date).to_numpy()
    fifteen_km_radius[row_pos] = candidates[same_day]

In [19]:
# Same-day detections around each training cell grown by a 10000-unit buffer.
# Result: training-row position -> array of positional indices into nasa_fire_data.
fire_points = nasa_fire_data["geometry"]
fire_train_dates = nasa_fire_data["train_date"]

ten_km_radius = {}
for row_pos, (cell_geom, cell_date) in enumerate(zip(train_data["geometry"], train_data["dt"])):
    search_area = cell_geom.buffer(10000.0)
    candidates = fire_points.sindex.query(search_area)
    same_day = (fire_train_dates.iloc[candidates] == cell_date).to_numpy()
    ten_km_radius[row_pos] = candidates[same_day]

In [20]:
# Same-day detections around each training cell grown by a 5000-unit buffer.
# Result: training-row position -> array of positional indices into nasa_fire_data.
five_km_radius = {}
for i, (el, date) in enumerate(zip(train_data["geometry"], train_data["dt"])):
    # Fix: the buffer used to be stored in an unused variable (`rel`) with a 15000
    # radius, so the query ran on the unbuffered cell and duplicated `intersection`.
    el = el.buffer(5000.0)
    res = nasa_fire_data["geometry"].sindex.query(el)
    # Keep only the candidates whose train_date equals this row's date.
    date_idxs = np.where(nasa_fire_data["train_date"].iloc[res] == date)[0]
    date_res = res[date_idxs]
    five_km_radius[i] = date_res

In [21]:
# Same-day detections around each training cell grown by a 2000-unit buffer.
# Result: training-row position -> array of positional indices into nasa_fire_data.
fire_points = nasa_fire_data["geometry"]
fire_train_dates = nasa_fire_data["train_date"]

two_km_radius = {}
for row_pos, (cell_geom, cell_date) in enumerate(zip(train_data["geometry"], train_data["dt"])):
    search_area = cell_geom.buffer(2000.0)
    candidates = fire_points.sindex.query(search_area)
    same_day = (fire_train_dates.iloc[candidates] == cell_date).to_numpy()
    two_km_radius[row_pos] = candidates[same_day]

In [38]:
# For every training row, find the nearest same-day detection.
#   closest_radius[i]       -> distance to the nearest detection, or None
#   closest_radius_index[i] -> array of positional (iloc) indices into the FULL
#                              nasa_fire_data frame, or None
closest_radius = {}
closest_radius_index = {}

# date -> (positions in nasa_fire_data, geometries at those positions).
# Built lazily so each date's subset and spatial index are created once,
# not once per training row.
same_day_subsets = {}

for i, (el, date) in enumerate(zip(train_data["geometry"], train_data["dt"])):
    if date not in same_day_subsets:
        positions = np.flatnonzero((nasa_fire_data["train_date"] == date).to_numpy())
        same_day_subsets[date] = (positions, nasa_fire_data["geometry"].iloc[positions])
    positions, same_day_points = same_day_subsets[date]

    if positions.shape[0] == 0:
        # No detections map to this date.
        closest_radius[i] = None
        closest_radius_index[i] = None
        continue

    res = same_day_points.sindex.nearest(el, return_distance=True)
    if res[0].shape[-1] > 0:
        closest_radius[i] = res[-1][0]
        # Fix: nearest() returns positions within the per-date subset; translate them
        # to positions in the full frame, because the lookup cell indexes
        # nasa_fire_data with .iloc.
        closest_radius_index[i] = positions[res[0][-1]]
    else:
        closest_radius[i] = None
        closest_radius_index[i] = None

In [39]:
# Convert the {row position: distance} dict into a Series indexed by row position.
# NOTE(review): this rebinds the name from dict to Series; the cell is not meant to be re-run on its own output.
closest_radius = pd.Series(closest_radius)


In [41]:
# Detection attributes that are copied or aggregated into training features below.
AGG_FEATURES = ["BRIGHTNESS", "BRIGHT_T31", "FRP", "CONFIDENCE"]

In [45]:
# One record per training row holding the AGG_FEATURES of its nearest detection
# (all None when the row has no nearest detection). The first stored index is used
# as a positional (.iloc) index into nasa_fire_data.
feature_table = nasa_fire_data[AGG_FEATURES]
closest_radius_values = [
    feature_table.iloc[nearest[0]].to_dict()
    if nearest is not None
    else {feature: None for feature in AGG_FEATURES}
    for nearest in closest_radius_index.values()
]

In [48]:
# Turn the records into a frame, namespace its columns with "closest.", and attach
# the nearest-detection distance (aligned on the row-position index).
closest_radius_values = pd.DataFrame(closest_radius_values).rename(
    columns=lambda col: f"closest.{col}"
)
closest_radius_values["closest.dist"] = closest_radius

In [50]:
# NOTE(review): hidden-state hazard. "merged.csv" is only written by later cells of
# this notebook, so this read fails on a fresh Restart & Run All. It also replaces the
# GeoDataFrame built above with a plain frame loaded from CSV, while later cells still
# call train_data["geometry"].iloc[k] expecting geometry objects — confirm intent.
train_data = pd.read_csv("merged.csv")

  train_data = pd.read_csv("merged.csv")


In [52]:
# Append the nearest-detection features column-wise (rows are aligned on the index).
# NOTE(review): not idempotent — re-running adds the same columns again.
train_data = pd.concat([train_data, closest_radius_values], axis=1)

In [53]:
# Persist the table with the nearest-detection features, without the index column.
train_data.to_csv("merged.csv", index=False)

In [None]:
%%time
intersection_distance = []
for k, v in intersection.items():
    if v.shape[0] > 0:
        dist = nasa_fire_data["geometry"].iloc[v].apply(lambda x: x.distance(train_data["geometry"].iloc[k])).mean()
    else:
        dist = None
    intersection_distance.append(dist)

In [None]:
%%time
dist2 = []
for k, v in two_km_radius.items():
    if v.shape[0] > 0:
        dist = nasa_fire_data["geometry"].iloc[v].apply(lambda x: x.distance(train_data["geometry"].iloc[k])).mean()
    else:
        dist = None
    dist2.append(dist)

In [None]:
%%time
dist5 = []
for k, v in five_km_radius.items():
    if v.shape[0] > 0:
        dist = nasa_fire_data["geometry"].iloc[v].apply(lambda x: x.distance(train_data["geometry"].iloc[k])).mean()
    else:
        dist = None
    dist5.append(dist)

In [None]:
%%time
dist10 = []
for k, v in ten_km_radius.items():
    if v.shape[0] > 0:
        dist = nasa_fire_data["geometry"].iloc[v].apply(lambda x: x.distance(train_data["geometry"].iloc[k])).mean()
    else:
        dist = None
    dist10.append(dist)

In [None]:
%%time
dist15 = []
for k, v in intersection.items():
    if v.shape[0] > 0:
        dist = nasa_fire_data["geometry"].iloc[v].apply(lambda x: x.distance(train_data["geometry"].iloc[k])).mean()
    else:
        dist = None
    dist15.append(dist)

In [None]:
# Mean distance from each training cell to its same-day `intersection` hits.
# NOTE(review): this repeats the intersection_distance computation and `closest_dist`
# is not referenced by any other visible cell — likely a leftover; confirm before removing.
closest_dist = []
for row_pos, hit_positions in intersection.items():
    if hit_positions.shape[0] == 0:
        closest_dist.append(None)
        continue
    cell_geom = train_data["geometry"].iloc[row_pos]
    hit_points = nasa_fire_data["geometry"].iloc[hit_positions]
    closest_dist.append(hit_points.apply(lambda point: point.distance(cell_geom)).mean())

In [34]:
# Collect the per-radius mean distances into one frame, one column per radius.
distance_columns = {}
distance_columns["0.dist"] = intersection_distance
distance_columns["2.dist"] = dist2
distance_columns["5.dist"] = dist5
distance_columns["10.dist"] = dist10
distance_columns["15.dist"] = dist15
distance_data = pd.DataFrame(distance_columns)

In [63]:
# Per training row: mean/std/max/min of AGG_FEATURES over the `intersection` hits,
# flattened into a one-row frame with columns named "0.<feature>.<stat>".
data = [
    pd.json_normalize(
        {0: nasa_fire_data.iloc[hit_positions][AGG_FEATURES].agg(["mean", "std", "max", "min"]).to_dict()}
    )
    for hit_positions in intersection.values()
]

In [64]:
# Stack the one-row frames; the resulting index is reset in a later cell.
data = pd.concat(data, axis=0)

In [48]:
# Per training row: mean/std/max/min of AGG_FEATURES over the `two_km_radius` hits,
# flattened into a one-row frame with columns named "2.<feature>.<stat>".
data2 = [
    pd.json_normalize(
        {2: nasa_fire_data.iloc[hit_positions][AGG_FEATURES].agg(["mean", "std", "max", "min"]).to_dict()}
    )
    for hit_positions in two_km_radius.values()
]

In [49]:
# Stack the one-row frames; the resulting index is reset in a later cell.
data2 = pd.concat(data2, axis=0)

In [28]:
# Per training row: mean/std/max/min of AGG_FEATURES over the `five_km_radius` hits,
# flattened into a one-row frame with columns named "5.<feature>.<stat>".
data3 = [
    pd.json_normalize(
        {5: nasa_fire_data.iloc[hit_positions][AGG_FEATURES].agg(["mean", "std", "max", "min"]).to_dict()}
    )
    for hit_positions in five_km_radius.values()
]

In [29]:
# Stack the one-row frames; the resulting index is reset in a later cell.
data3 = pd.concat(data3, axis=0)

In [30]:
# Per training row: mean/std/max/min of AGG_FEATURES over the `ten_km_radius` hits,
# flattened to columns named "10.<feature>.<stat>" and stacked into one frame.
ten_km_rows = [
    pd.json_normalize(
        {10: nasa_fire_data.iloc[hit_positions][AGG_FEATURES].agg(["mean", "std", "max", "min"]).to_dict()}
    )
    for hit_positions in ten_km_radius.values()
]
data4 = pd.concat(ten_km_rows, axis=0)

In [31]:
# Per training row: mean/std/max/min of AGG_FEATURES over the `fifteen_km_radius` hits,
# flattened to columns named "15.<feature>.<stat>" and stacked into one frame.
fifteen_km_rows = [
    pd.json_normalize(
        {15: nasa_fire_data.iloc[hit_positions][AGG_FEATURES].agg(["mean", "std", "max", "min"]).to_dict()}
    )
    for hit_positions in fifteen_km_radius.values()
]
data5 = pd.concat(fifteen_km_rows, axis=0)

In [65]:
# Give every per-radius frame a fresh 0..n-1 index so they align row-by-row
# when concatenated column-wise later.
for radius_frame in (data, data2, data3, data4, data5):
    radius_frame.reset_index(inplace=True, drop=True)

In [51]:
# Checkpoint each per-radius feature frame to data/<radius>.csv (index not written).
radius_files = {
    "0.csv": data,
    "2.csv": data2,
    "5.csv": data3,
    "10.csv": data4,
    "15.csv": data5,
}
for filename, radius_frame in radius_files.items():
    radius_frame.to_csv(f"data/{filename}", index=None)

In [52]:
# Drop the per-radius index maps; the aggregated frames above are all that is needed from here on.
del intersection, two_km_radius, five_km_radius, ten_km_radius, fifteen_km_radius

In [66]:
# Assemble the final table column-wise; all parts are aligned on the row index.
# NOTE(review): train_data already received closest_radius_values in an earlier
# concat cell, so the "closest.*" columns may appear twice here — confirm.
feature_blocks = [
    train_data,
    data,
    data2,
    data3,
    data4,
    data5,
    distance_data,
    closest_radius_values,
]
full_data = pd.concat(feature_blocks, axis=1, ignore_index=False)

In [68]:
# Sanity check: (rows, columns) of the assembled table.
full_data.shape

(488103, 101)

In [60]:
# pandas' sample standard deviation is NaN when a radius holds a single detection.
# For rows that do have detections in a radius (their max is present), replace those
# NaN std values with 0.
#
# Fixes:
#  - `columns` was never defined in this notebook (NameError on a fresh kernel).
#  - The previous assignment set every std to 0 for rows with detections, which
#    also wiped out the valid std values computed from several detections.
columns = full_data.columns
for radius in ("2", "5", "10", "15"):
    prefix = f"{radius}."
    has_detections = ~full_data[f"{radius}.BRIGHT_T31.max"].isnull()
    for col in columns:
        if col.startswith(prefix) and col.endswith(".std"):
            full_data.loc[has_detections & full_data[col].isnull(), col] = 0

In [61]:
# Show the rows that have at least one detection inside the cell itself (radius 0).
full_data.loc[~full_data["0.BRIGHT_T31.max"].isnull()]

Unnamed: 0,dt,lon_min,lat_min,lon_max,lat_max,lon,lat,grid_index,type_id,type_name,...,15.BRIGHT_T31.max,15.BRIGHT_T31.min,15.FRP.mean,15.FRP.std,15.FRP.max,15.FRP.min,15.CONFIDENCE.mean,15.CONFIDENCE.std,15.CONFIDENCE.max,15.CONFIDENCE.min


### Save training data

In [70]:
# Write the final training table without the index column.
# NOTE(review): this overwrites the "merged.csv" written and read by earlier cells.
full_data.to_csv("merged.csv", index=None)