## Import libraries

In [None]:
import logging
from typing import Iterator, Optional, Tuple

import numpy as np
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn import metrics

In [None]:
# https://github.com/roelbertens/time-series-nested-cv/blob/master/time_series_cross_validation/custom_time_series_split.py

class CustomTimeSeriesSplit:
    """Rolling-window train/test splitter for data indexed by date.

    Every fold trains on the ``train_set_size`` days immediately before a
    split point and tests on the ``test_set_size`` days starting at it.
    Split points are spaced ``test_set_size`` days apart and aligned so that
    the last test window ends on the last day present in the data.
    """

    def __init__(self,
                 train_set_size: int,
                 test_set_size: int
                 ):
        """
        :param train_set_size: data points (days) in each fold for the train set
        :param test_set_size: data points (days) in each fold for the test set
        """
        self.train_set_size = train_set_size
        self.test_set_size = test_set_size
        self._logger = logging.getLogger(__name__)

    def split(
        self,
        x: pd.DataFrame,
        y: Optional[pd.DataFrame] = None
    ) -> Iterator[Tuple[np.ndarray, np.ndarray]]:
        """Yield positional train/test indices for every fold.

        :param x: time series to use for prediction, shape (n_samples, n_features)
        :param y: time series to predict, shape (n_samples, n_features)
        :return: generator of (train_indices, test_indices) integer arrays,
            usable with ``.iloc``
        Note: the index of x (and y) must be datetime-like; plain
        ``datetime.date`` values are converted to timestamps internally.
        Folds with an empty train or test window are skipped with a warning.
        """
        if y is not None:
            assert x.index.equals(y.index), "x and y must share the same index"
        # Normalise once so comparisons with the Timestamp split points are
        # well defined even when the index holds datetime.date objects.
        index = pd.DatetimeIndex(x.index)
        train_span = pd.Timedelta(self.train_set_size, unit="D")
        test_span = pd.Timedelta(self.test_set_size, unit="D")
        for split_point in self.get_split_points(x):
            # Train window: [split_point - train_span, split_point)
            is_train = (index < split_point) & (index >= split_point - train_span)
            # Test window: [split_point, split_point + test_span)
            is_test = (index >= split_point) & (index < split_point + test_span)
            if not is_train.any() or not is_test.any():
                self._logger.warning(
                    "Found %d train and %d test observations "
                    "skipping fold for split point %s",
                    is_train.sum(), is_test.sum(), split_point
                )
                continue
            # Positions of the True entries, i.e. row numbers in x.
            yield np.flatnonzero(is_train), np.flatnonzero(is_test)

    def get_split_points(self, x: pd.DataFrame) -> pd.DatetimeIndex:
        """Get all possible split point dates.

        :param x: data whose index supplies the first and last date
        :return: split points, ``test_set_size`` days apart; empty when x has
            no rows or is too short for a single fold
        """
        index = pd.DatetimeIndex(x.index)
        if len(index) == 0:
            # min()/max() would be NaT and pd.date_range would raise.
            return pd.DatetimeIndex([])
        start = index.min() + pd.Timedelta(self.train_set_size, unit="D")
        end = index.max() - pd.Timedelta(self.test_set_size - 1, unit="D")
        self._logger.info("Generating split points from %s to %s", start, end)
        split_range = pd.date_range(start, end, freq="D")
        # Offset the first split so that stepping by test_set_size lands
        # exactly on the last candidate (index len - 1).
        first_split_point = (len(split_range) + self.test_set_size - 1) % self.test_set_size
        return split_range[first_split_point::self.test_set_size]


class ModelBuilder:
    """Train and evaluate a CatBoost (+ LightGBM) ensemble on time-indexed data.

    A string ``target`` selects binary mode ("classification"), where the
    CatBoost and LightGBM probabilities are averaged. Any other ``target``
    (e.g. a list of column names) selects "multiclassification" mode, where
    only CatBoost is trained with the MultiLogloss objective.
    """

    # LightGBM hyper-parameters shared by fold training and final training.
    _LGBM_PARAMS = {
        "objective": "binary",
        "feature_pre_filter": False,
        "lambda_l1": 5.246525412521277e-08,
        "lambda_l2": 3.963188589061798e-05,
        "num_leaves": 6,
        "feature_fraction": 0.7,
        "bagging_fraction": 1.0,
        "bagging_freq": 0,
        "min_child_samples": 20,
    }

    def __init__(self, df, target, feats, cat_feats):
        """
        :param df: data frame holding the feature and target columns; it is
            passed to CustomTimeSeriesSplit, so its index must be date-like
        :param target: target column name (str) or collection of target columns
        :param feats: list of feature column names
        :param cat_feats: list of categorical feature names passed to CatBoost
        """
        self.df = df
        self.target = target
        self.feats = feats
        self.cat_feats = cat_feats
        self.mode = "classification" if isinstance(target, str) else "multiclassification"

    def _make_catboost(self, iterations, early_stopping, early_stopping_rounds=None):
        """Build an unfitted CatBoostClassifier configured for the current mode."""
        binary = self.mode == "classification"
        params = dict(
            iterations=iterations,
            learning_rate=0.05,
            metric_period=500,
            loss_function="Logloss" if binary else "MultiLogloss",
            l2_leaf_reg=10,
            eval_metric="F1" if binary else "MultiLogloss",
            task_type="CPU",
            random_seed=1234,
            use_best_model=early_stopping,
        )
        if early_stopping_rounds is not None:
            params["early_stopping_rounds"] = early_stopping_rounds
        return CatBoostClassifier(**params)

    def _make_pool(self, X, y):
        """Wrap features and labels in a CatBoost Pool using this builder's columns."""
        return Pool(X, y, cat_features=self.cat_feats, feature_names=self.feats)

    def _train_lgbm(self, X_train, y_train, iterations, X_val=None, y_val=None):
        """Train the binary LightGBM booster; validation data enables early stopping."""
        train_set = lgb.Dataset(X_train, y_train, weight=None, free_raw_data=False)
        use_validation = X_val is not None
        valid_set = None
        if use_validation:
            valid_set = lgb.Dataset(X_val, y_val, weight=None, free_raw_data=False)
        # NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword
        # arguments of older lgb.train signatures - confirm the pinned
        # LightGBM version still accepts them.
        return lgb.train(
            dict(self._LGBM_PARAMS),  # copy so the shared defaults stay untouched
            train_set,
            num_boost_round=iterations,
            early_stopping_rounds=200 if use_validation else None,
            valid_sets=valid_set,
            feature_name=self.feats,
            verbose_eval=500,
        )

    @staticmethod
    def _roc_auc_or_zero(y_true, y_score, **kwargs):
        """Return ROC AUC, or 0 when sklearn raises ValueError for the input."""
        try:
            return metrics.roc_auc_score(y_true, y_score, **kwargs)
        except ValueError:
            return 0

    def train_folds(self, train_size=120, test_size=30, iterations=1000, early_stopping=False):
        """Run rolling time-series cross-validation and print per-fold and OOF metrics.

        :param train_size: days in each training window
        :param test_size: days in each test window
        :param iterations: boosting iterations for both models
        :param early_stopping: when True, the test window of each fold is also
            used as the evaluation set for early stopping / best-model selection
        :return: None; all results are printed
        """
        binary = self.mode == "classification"
        n_rows = self.df.shape[0]
        if binary:
            oof_preds = np.zeros(n_rows)
        else:
            oof_preds = np.zeros((n_rows, len(self.target)))

        # True for rows that fell into the test window of at least one fold.
        evaluated = np.zeros(n_rows, dtype=bool)
        splitter = CustomTimeSeriesSplit(train_set_size=train_size, test_set_size=test_size)
        for fold_, (train_index, test_index) in enumerate(splitter.split(self.df)):
            train_rows = self.df.iloc[train_index]
            val_rows = self.df.iloc[test_index]
            X_train, y_train = train_rows[self.feats], train_rows[self.target]
            X_val, y_val = val_rows[self.feats], val_rows[self.target]

            tr_start, tr_end = X_train.index.min(), X_train.index.max()
            ts_start, ts_end = X_val.index.min(), X_val.index.max()

            print()
            print()
            print(f"Fold {fold_} train ({tr_start}, {tr_end}) test ({ts_start}, {ts_end})")

            cat_model = self._make_catboost(iterations, early_stopping, early_stopping_rounds=100)
            D_train = self._make_pool(X_train, y_train)
            D_val = self._make_pool(X_val, y_val)

            print("Train catboost")
            cat_model.fit(
                D_train,
                eval_set=D_val if early_stopping else None,
                verbose=True,
                plot=False
            )

            if binary:
                print("Train lgbm")
                if early_stopping:
                    lgbm_model = self._train_lgbm(X_train, y_train, iterations, X_val, y_val)
                else:
                    lgbm_model = self._train_lgbm(X_train, y_train, iterations)

                # Ensemble: plain average of the two positive-class probabilities.
                preds = (cat_model.predict_proba(X_val)[:, 1] + lgbm_model.predict(X_val)) / 2
                labels = preds.round()
                print()
                print(f"Fold {fold_} F1 Score ", metrics.f1_score(y_val, labels))
                # ROC AUC is a ranking metric, so it gets the raw scores
                # rather than the thresholded labels.
                print(f"Fold {fold_} ROC AUC Score ", self._roc_auc_or_zero(y_val, preds))
                print(f"Fold {fold_} Confusion matrix")
                print(metrics.confusion_matrix(y_val, labels))
                oof_preds[test_index] = preds
            else:
                oof_preds[test_index] = cat_model.predict(X_val)
                fold_preds = oof_preds[test_index]
                print(f"Fold {fold_} F1 Score ", metrics.f1_score(y_val, fold_preds.round(), average="micro"))
                print(f"Fold {fold_} ROC AUC Score ", self._roc_auc_or_zero(y_val, fold_preds))

            evaluated[test_index] = True

        if not evaluated.any():
            print()
            print("No folds were evaluated; skipping overall OOF metrics")
            return

        y_true = self.df.iloc[evaluated][self.target]
        y_score = oof_preds[evaluated]
        oof_f1micro = metrics.f1_score(y_true, y_score.round(), average="micro")
        oof_roc_auc = self._roc_auc_or_zero(y_true, y_score, average="micro")

        print()
        print("Overall OOF F1 Micro ", oof_f1micro)
        print("Overall OOF Mean ROC AUC Score ", oof_roc_auc)

    def train_final_models(self, iterations=1000, early_stopping=False):
        """Fit CatBoost and LightGBM on every row of ``self.df`` (binary mode only).

        :param iterations: boosting iterations for both models
        :param early_stopping: forwarded to CatBoost as ``use_best_model``
        :return: (cat_model, lgbm_model)
        :raises NotImplementedError: in "multiclassification" mode
        """
        if self.mode != "classification":
            raise NotImplementedError

        X_train, y_train = self.df[self.feats], self.df[self.target]

        # NOTE(review): no eval_set is supplied here, so early_stopping=True
        # (use_best_model=True) presumably cannot work - confirm against CatBoost.
        cat_model = self._make_catboost(iterations, early_stopping)

        print("Train catboost")
        cat_model.fit(
            self._make_pool(X_train, y_train),
            eval_set=None,
            verbose=True,
            plot=False
        )

        print("Train lgbm")
        lgbm_model = self._train_lgbm(X_train, y_train, iterations)

        return cat_model, lgbm_model

## Prepare training data

In [None]:
# Load merged.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv("merged.csv")

In [None]:
# Inspect the distinct raw SATELLITE codes before they are integer-encoded.
df["SATELLITE"].unique()

In [None]:
# Integer-encode the single-letter categorical columns. Series.map turns any
# value that is missing from the mapping into NaN.
# NOTE(review): CONFIDENCE maps "n" to 3, leaving code 2 unused - confirm intended.
df["CONFIDENCE"] = df["CONFIDENCE"].map({"l":0, "h":1, "n":3})
df["SATELLITE"] = df["SATELLITE"].map({"1":0, "N":1})
df["DAYNIGHT"] = df["DAYNIGHT"].map({"D":0, "N":1})
# Truncate to whole days but keep a datetime64 index: CustomTimeSeriesSplit
# compares the index against pandas Timestamps, which is not reliable for
# plain datetime.date objects (what `.dt.date` would produce).
df["dt"] = pd.to_datetime(df["dt"]).dt.normalize()
df = df.set_index("dt")

In [None]:
# Per-day fire indicator columns, infire_day_1 .. infire_day_8.
targets = [f"infire_day_{day}" for day in range(1, 9)]

# Model input columns.
feats = [
    "LATITUDE",
    "LONGITUDE",
    "BRIGHTNESS",
    "SCAN",
    "TRACK",
    "ACQ_TIME",
    "SATELLITE",
    "DAYNIGHT",
    "CONFIDENCE",
    "BRIGHT_T31",
    "FRP",
]

# No column is handed to CatBoost as categorical.
#cat_feats = ["grid_index", "DAYNIGHT","SATELLITE"]
cat_feats = list()

In [None]:
# Collapse the eight per-day columns into one binary label: 1 when their
# row-wise sum is positive, otherwise 0.
targets = [f"infire_day_{day}" for day in range(1, 9)]
daily_total = df[targets].sum(axis=1)
df["target"] = daily_total.gt(0).astype(np.uint8)

In [None]:
# Class balance: fraction of rows per value of the binary target.
df["target"].value_counts(normalize=True)

In [None]:
### synthetic data
# Build extra negative rows from the existing target == 0 rows. For every
# copy, each feature from feats[3:] onward is blanked (NaN) with probability
# DROPOUT_PROBA and then shuffled across rows independently of the other
# columns; the first three features are left untouched.
DROPOUT_PROBA = 0.7
UPSAMPLE_RATE = 6
SYNTHETIC_SEED = 1234  # fixed seed so the synthetic rows are reproducible

rng = np.random.default_rng(SYNTHETIC_SEED)
df_syn_base = df[df["target"]==0][feats]

# Collect the copies and concatenate once instead of growing a frame in the loop.
syn_parts = []
for _ in range(UPSAMPLE_RATE):
    df_syn = df_syn_base.copy()
    for f in feats[3:]:
        # Vectorised dropout: keep a value only where the draw exceeds DROPOUT_PROBA.
        keep = rng.random(len(df_syn)) > DROPOUT_PROBA
        df_syn[f] = rng.permutation(df_syn[f].where(keep).to_numpy())
    syn_parts.append(df_syn)

df_syn_final = pd.concat(syn_parts, axis=0)
df_syn_final["target"] = 0

In [None]:
# Stack the original rows (features plus label) on top of the synthetic rows.
original_rows = df[feats + ["target"]]
df_combined = pd.concat([original_rows, df_syn_final], axis=0)

In [None]:
# Class balance of the combined frame: fraction of rows per target value.
df_combined["target"].value_counts(normalize=True)

## Train with a single label (will there be a fire within the 8-day period?)

In [None]:
# A string target puts ModelBuilder into its binary "classification" mode.
fire_model = ModelBuilder(df_combined, "target", feats, cat_feats)

In [None]:
# Rolling time-series validation: 120-day train windows, 30-day test windows,
# 1000 boosting iterations, no early stopping. Metrics are printed per fold.
fire_model.train_folds(train_size=120, test_size=30, iterations=1000, early_stopping=False)

In [None]:
# Fit the final CatBoost and LightGBM models on all rows of df_combined,
# using the method's default arguments.
cat_model, lgbm_model = fire_model.train_final_models()

### Save models

In [None]:
# Persist both models using paths relative to the working directory.
# CatBoost: written in "cbm" format to a file named "catboost" (no extension).
cat_model.save_model("catboost", format="cbm")

# LightGBM: written to light_gbm.txt.
lgbm_model.save_model("light_gbm.txt")