## Install libraries

In [None]:
%pip install -q gdown

Also install the libraries listed in `requirements.txt` in the app root folder (for example, by running `pip install -r requirements.txt` from that folder).

## Import libraries

In [None]:
from datetime import timedelta
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Polygon

## Download training data

In [None]:
%%bash
# Download the NASA fire archive and the training table from Google Drive
# into ./data and unpack the archive there.

# Stop at the first failing command (or unset variable). Without this the
# cell's exit status is that of the last command only, so a failed first
# download would go unnoticed.
set -euo pipefail

NASA_FIRE_ID="10wbXaFTG8RyolfGzvI8SFQ1XzHi4tTQr"
TRAIN_ID="11_Wjbxsdrgt-DFGJELumk51elHOLkgpQ"

mkdir -p data

# Pass a full Google Drive URL rather than `--id`: the flag is deprecated in
# newer gdown releases, while the URL form is accepted by old and new ones.
gdown "https://drive.google.com/uc?id=${NASA_FIRE_ID}" -O "data/NASA.zip"
# -o: overwrite already-extracted files without prompting, so re-running the
# cell does not depend on an interactive answer.
unzip -o "data/NASA.zip" -d data

gdown "https://drive.google.com/uc?id=${TRAIN_ID}" -O "data/train.csv"

## Prepare training data

### NASA satellite data

In [None]:
# Load every shapefile found (recursively) under data/ into one GeoDataFrame.
nasa_fire_path = Path("data")

# Sort the paths so the row order of the combined frame does not depend on
# the order in which the filesystem happens to list the files.
geo_files = sorted(nasa_fire_path.rglob("*.shp"))
if not geo_files:
    # Fail here with a clear message instead of leaving `nasa_fire_data`
    # as None and crashing in a later cell.
    raise FileNotFoundError(
        f"No *.shp files found under {nasa_fire_path.resolve()}; "
        "run the download cell first."
    )

# Read all files first and concatenate once: concatenating inside the loop
# re-copies the accumulated frame on every iteration.
nasa_fire_data = pd.concat(
    [gpd.read_file(filepath) for filepath in geo_files],
    ignore_index=True,
)

In [None]:
# Report (rows, columns) of the combined NASA frame before de-duplication.
n_rows, n_cols = nasa_fire_data.shape
print((n_rows, n_cols))

In [None]:
# Parse the acquisition date strings into datetimes, then order the rows by
# acquisition time. The row order matters for the `keep="last"`
# de-duplication that follows.
acq_dates = pd.to_datetime(nasa_fire_data["ACQ_DATE"], format="%Y-%m-%d")
nasa_fire_data = (
    nasa_fire_data
    .assign(ACQ_DATE=acq_dates)
    .sort_values(by="ACQ_TIME")
)

In [None]:
# Keep a single row per (location, date, instrument): the last one in the
# current row order.
dedup_keys = ["LATITUDE", "LONGITUDE", "ACQ_DATE", "INSTRUMENT"]
nasa_fire_data = nasa_fire_data.drop_duplicates(subset=dedup_keys, keep="last")
print(nasa_fire_data.shape)

In [None]:
# Add helper columns:
#   id         - running 0..n-1 number, used later as the merge key
#   train_date - acquisition date shifted forward by one day; this is the
#                value compared against the training table's `dt`
#   Month      - calendar month of the acquisition date
acq_date = nasa_fire_data["ACQ_DATE"]
nasa_fire_data = nasa_fire_data.assign(
    id=np.arange(len(nasa_fire_data)),
    train_date=acq_date + timedelta(days=1),
    Month=acq_date.dt.month,
)

In [None]:
# Preview the first five rows of the prepared NASA frame.
nasa_fire_data.head(n=5)

### Russian MCHS (Ministry of Emergency Situations) data

In [None]:
# Load the training table fetched by the download cell.
train_csv_path = "data/train.csv"
train_data = pd.read_csv(train_csv_path)

In [None]:
# Number the training rows 0..n-1.
train_data["id"] = np.arange(len(train_data))

In [None]:
# Convert the `dt` strings to datetimes so they can be compared with the
# NASA frame's `train_date` column.
parsed_dt = pd.to_datetime(train_data["dt"], format="%Y-%m-%d")
train_data["dt"] = parsed_dt

In [None]:
# Build the rectangular grid-cell polygon of every training row from its
# lon/lat bounds.
#
# The corners must be listed while walking around the perimeter:
#   (lon_min, lat_min) -> (lon_min, lat_max) -> (lon_max, lat_max) -> (lon_max, lat_min)
# Listing them in zig-zag order (min/min, min/max, max/min, max/max) yields a
# self-intersecting "bow-tie" ring: an invalid polygon that covers only part
# of the cell, so `contains` would miss fire points lying inside the cell.
train_data["polygon"] = train_data.apply(
    lambda x: Polygon(
        np.column_stack((
            np.array([x["lon_min"], x["lon_min"], x["lon_max"], x["lon_max"]]),
            np.array([x["lat_min"], x["lat_max"], x["lat_max"], x["lat_min"]])
        ))
    ),
    axis=1
)

In [None]:
# Preview the first five rows of the prepared training table.
train_data.head(n=5)

### Merge data

In [None]:
# For every training row, find the first NASA detection whose `train_date`
# equals the row's `dt` and whose point lies inside the row's polygon.
# Rows without such a detection get -1.

# Group the NASA points by date once. Filtering the whole NASA frame for
# each training row (twice) repeats the same scan over and over.
# Row order within a date is preserved, so "first match" is unchanged.
points_by_date = {
    date: list(zip(group["id"], group["geometry"]))
    for date, group in nasa_fire_data.groupby("train_date")
}

nasa_ids = []
for date, poly in zip(train_data["dt"], train_data["polygon"]):
    candidates = points_by_date.get(date, ())
    nasa_ids.append(
        next((i for i, point in candidates if poly.contains(point)), -1)
    )

In [None]:
# Attach the matched NASA row id (or -1 when nothing matched) to each
# training row.
train_data = train_data.assign(nasa_id=nasa_ids)

In [None]:
# Columns taken from the NASA frame into the merged training set.
# "id" is the merge key.
nasa_fire_cols = [
    "LATITUDE",
    "LONGITUDE",
    "BRIGHTNESS",
    "SCAN",
    "TRACK",
    "ACQ_TIME",
    "SATELLITE",
    "CONFIDENCE",
    "BRIGHT_T31",
    "FRP",
    "DAYNIGHT",
    "TYPE",
    "id",
]

In [None]:
# Columns taken from the training table into the merged training set.
# "nasa_id" is the merge key.
train_cols = [
    "dt",
    "lon_min", "lat_min", "lon_max", "lat_max",
    "lon", "lat",
    "grid_index",
    "type_id", "type_name",
    "is_land",
    "infire_day_1", "infire_day_2", "infire_day_3", "infire_day_4",
    "infire_day_5", "infire_day_6", "infire_day_7", "infire_day_8",
    "nasa_id",
]

In [None]:
# Join each training row to its matched NASA detection.
# The join is inner, so training rows whose `nasa_id` has no counterpart in
# the NASA `id` column (e.g. the -1 "no match" marker) are dropped.
train_part = train_data[train_cols]
nasa_part = nasa_fire_data[nasa_fire_cols]
df = train_part.merge(
    nasa_part,
    how="inner",
    left_on="nasa_id",
    right_on="id",
)

### Save training data

In [None]:
# Write the merged training set to disk without the DataFrame index.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being falsy.
df.to_csv("merged.csv", index=False)