In [2]:
%config Completer.use_jedi = False

In [1]:
import zipfile
from pathlib import Path
from datetime import timedelta

import dbf
import geopandas as gpd
import numpy as np
import pandas as pd
import pyproj
from shapely.geometry import Polygon, LineString
from shapely.ops import nearest_points

In [3]:
NASA_FIRE_LINK = "https://drive.google.com/file/d/10wbXaFTG8RyolfGzvI8SFQ1XzHi4tTQr/view?usp=sharing"
NASA_FIRE_ID = "10wbXaFTG8RyolfGzvI8SFQ1XzHi4tTQr"
TRAIN_FILE_LINK = "https://drive.google.com/file/d/11_Wjbxsdrgt-DFGJELumk51elHOLkgpQ/view?usp=sharing"
TRAIN_ID = "11_Wjbxsdrgt-DFGJELumk51elHOLkgpQ"

In [4]:
!gdown --id $NASA_FIRE_ID -O data/NASA.zip

Downloading...
From: https://drive.google.com/uc?id=10wbXaFTG8RyolfGzvI8SFQ1XzHi4tTQr
To: E:\fire\data\NASA.zip

  0%|          | 0.00/14.1M [00:00<?, ?B/s]
 15%|#4        | 2.10M/14.1M [00:00<00:00, 19.1MB/s]
 30%|##9       | 4.19M/14.1M [00:00<00:00, 17.9MB/s]
 48%|####8     | 6.82M/14.1M [00:00<00:00, 19.4MB/s]
 63%|######3   | 8.91M/14.1M [00:00<00:00, 18.7MB/s]
 82%|########1 | 11.5M/14.1M [00:00<00:00, 18.7MB/s]
100%|##########| 14.1M/14.1M [00:00<00:00, 22.4MB/s]


In [5]:
!gdown --id $TRAIN_ID -O data/train.csv

Downloading...
From: https://drive.google.com/uc?id=11_Wjbxsdrgt-DFGJELumk51elHOLkgpQ
To: E:\fire\data\train.csv

  0%|          | 0.00/34.4M [00:00<?, ?B/s]
  3%|3         | 1.05M/34.4M [00:00<00:03, 10.3MB/s]
  9%|9         | 3.15M/34.4M [00:00<00:02, 14.6MB/s]
 24%|##4       | 8.39M/34.4M [00:00<00:00, 30.4MB/s]
 34%|###3      | 11.5M/34.4M [00:00<00:01, 19.3MB/s]
 43%|####2     | 14.7M/34.4M [00:00<00:00, 21.2MB/s]
 50%|#####     | 17.3M/34.4M [00:00<00:00, 22.2MB/s]
 58%|#####7    | 19.9M/34.4M [00:00<00:00, 22.8MB/s]
 65%|######5   | 22.5M/34.4M [00:01<00:00, 23.0MB/s]
 73%|#######3  | 25.2M/34.4M [00:01<00:00, 17.0MB/s]
 81%|########  | 27.8M/34.4M [00:01<00:00, 18.2MB/s]
 97%|#########7| 33.6M/34.4M [00:01<00:00, 26.9MB/s]
100%|##########| 34.4M/34.4M [00:01<00:00, 22.3MB/s]


In [6]:
with zipfile.ZipFile("data/NASA.zip") as file:
    file.extractall(path="data")

In [3]:
fire_path = Path("data/DL_FIRE/")
geo_files = fire_path.rglob("*.shp")
nasa_fire = None
for file in geo_files:
    if nasa_fire is None:
        nasa_fire = gpd.read_file(file)
    else:
        nasa_fire = pd.concat([nasa_fire, gpd.read_file(file)], ignore_index=True)

In [4]:
nasa_fire.shape

(3121446, 16)

In [5]:
nasa_fire["ACQ_DATE"] = pd.to_datetime(nasa_fire["ACQ_DATE"], format="%Y-%m-%d")

In [6]:
train = pd.read_csv("data/train.csv")

In [7]:
train.head()

Unnamed: 0,dt,lon_min,lat_min,lon_max,lat_max,lon,lat,grid_index,type_id,type_name,is_land,infire_day_1,infire_day_2,infire_day_3,infire_day_4,infire_day_5,infire_day_6,infire_day_7,infire_day_8
0,2020-05-04,47.6,41.0,47.8,41.2,,,143,,,False,0,0,0,0,0,0,0,0
1,2021-02-24,47.2,41.2,47.4,41.4,,,891,,,False,0,0,0,0,0,0,0,0
2,2021-02-27,47.2,41.2,47.4,41.4,,,891,,,False,0,0,0,0,0,0,0,0
3,2021-04-01,47.4,41.2,47.6,41.4,,,892,,,True,0,0,0,0,0,0,0,0
4,2020-03-14,47.6,41.2,47.8,41.4,,,893,,,True,0,0,0,0,0,0,0,0


In [8]:
nasa_fire.head()

Unnamed: 0,LATITUDE,LONGITUDE,BRIGHTNESS,SCAN,TRACK,ACQ_DATE,ACQ_TIME,SATELLITE,INSTRUMENT,CONFIDENCE,VERSION,BRIGHT_T31,FRP,DAYNIGHT,geometry,TYPE
0,68.41773,83.62501,367.0,0.68,0.74,2020-01-01,118,1,VIIRS,h,2.0NRT,238.6,13.6,N,POINT (83.62501 68.41773),
1,68.61677,57.97244,351.2,0.4,0.6,2020-01-01,118,1,VIIRS,n,2.0NRT,263.0,7.1,N,POINT (57.97244 68.61677),
2,67.59256,83.24406,329.0,0.78,0.78,2020-01-01,118,1,VIIRS,n,2.0NRT,249.3,6.3,N,POINT (83.24406 67.59256),
3,67.59792,83.25304,338.8,0.78,0.78,2020-01-01,118,1,VIIRS,n,2.0NRT,248.2,6.2,N,POINT (83.25304 67.59792),
4,68.61971,57.96733,332.4,0.4,0.6,2020-01-01,118,1,VIIRS,n,2.0NRT,260.8,5.8,N,POINT (57.96733 68.61971),


In [9]:
nasa_fire.sort_values("ACQ_TIME", inplace=True)

In [10]:
nasa_fire.drop_duplicates(["LATITUDE", "LONGITUDE", "ACQ_DATE", "INSTRUMENT"], keep="last", inplace=True)

In [11]:
nasa_fire.shape

(3121199, 16)

In [12]:
nasa_fire["train_date"] = nasa_fire["ACQ_DATE"] + timedelta(days=1)

In [13]:
train["id"] = np.arange(train.shape[0])
nasa_fire["id"] = np.arange(nasa_fire.shape[0])

In [14]:
train["dt"] = pd.to_datetime(train["dt"], format="%Y-%m-%d")

In [15]:
train["polygon"] = train.apply(
    lambda x: 
    Polygon(
        np.column_stack((
            np.array([x["lon_min"], x["lon_min"], x["lon_max"], x["lon_max"]]),
            np.array([x["lat_min"], x["lat_max"], x["lat_min"], x["lat_max"]])
        ))
    ), 
    axis=1
)

In [16]:
nasa_fire["Month"] = nasa_fire["ACQ_DATE"].dt.month

In [18]:
nasa_ids = []
for date, poly in zip(train["dt"], train["polygon"]):
    for i, point in zip(
            nasa_fire.loc[nasa_fire["train_date"] == date, "id"], 
            nasa_fire.loc[nasa_fire["train_date"] == date, "geometry"]
    ):
        if poly.contains(point):
            nasa_ids.append(i)
            break
    else:
        nasa_ids.append(-1)

In [19]:
train["nasa_id"] = nasa_ids

In [20]:
train_cols = [
    'dt', 'lon_min', 'lat_min', 'lon_max', 'lat_max', 'lon', 'lat',
    'grid_index', 'type_id', 'type_name', 'is_land', 'infire_day_1',
    'infire_day_2', 'infire_day_3', 'infire_day_4', 'infire_day_5',
    'infire_day_6', 'infire_day_7', 'infire_day_8',
    'nasa_id'
]

In [21]:
nasa_fire_cols = [
    'LATITUDE', 'LONGITUDE', 'BRIGHTNESS', 'SCAN', 'TRACK',
    'ACQ_TIME', 'SATELLITE', 'CONFIDENCE',
    'BRIGHT_T31', 'FRP', 'DAYNIGHT', 'TYPE', 'id'
]

In [22]:
df = pd.merge(train[train_cols], nasa_fire[nasa_fire_cols], how="inner", left_on="nasa_id", right_on="id")

In [23]:
df.to_csv("merged.csv", index=None)

In [27]:
df[[col for col in df.columns if col.startswith("infire")]].mean()

infire_day_1    0.440180
infire_day_2    0.368089
infire_day_3    0.319024
infire_day_4    0.279107
infire_day_5    0.246597
infire_day_6    0.222478
infire_day_7    0.205364
infire_day_8    0.188830
dtype: float64