## Import libraries

In [1]:
import logging
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn import metrics

In [2]:
# https://github.com/roelbertens/time-series-nested-cv/blob/master/time_series_cross_validation/custom_time_series_split.py

class CustomTimeSeriesSplit:
    def __init__(self,
                 train_set_size: int,
                 test_set_size: int
                 ):
        """
        :param train_set_size: data points (days) in each fold for the train set
        :param test_set_size: data points (days) in each fold for the test set
        """
        self.train_set_size = train_set_size
        self.test_set_size = test_set_size
        self._logger = logging.getLogger(__name__)

    def split(
        self,
        x: np.ndarray,
        y: Optional[np.ndarray] = None
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return train/test split indices.
        :param x: time series to use for prediction, shape (n_samples, n_features)
        :param y: time series to predict, shape (n_samples, n_features)
        :return: (train_indices, test_indices)
        Note: index of both x and y should be of type datetime.
        """
        if y is not None:
            assert x.index.equals(y.index)
        split_points = self.get_split_points(x)
        for split_point in split_points:
            is_train = (x.index < split_point) & (x.index >= split_point -
                                                  pd.Timedelta(self.train_set_size, unit="D"))
            is_test = (x.index >= split_point) & (x.index < split_point +
                                                  pd.Timedelta(self.test_set_size, unit="D"))
            if not is_train.any() or not is_test.any():
                self._logger.warning(
                    "Found %d train and %d test observations "
                    "skipping fold for split point %s",
                    is_train.sum(), is_test.sum(), split_point
                )
                continue
            dummy_ix = pd.Series(range(0, len(x)), index=x.index)
            ix_train = dummy_ix.loc[is_train].values
            ix_test = dummy_ix.loc[is_test].values
            if ix_train is None or ix_test is None:
                self._logger.warning(
                    "Found no data for train or test period, "
                    "skipping fold for split date %s",
                    split_point
                )
                continue
            yield ix_train, ix_test

    def get_split_points(self, x: np.array) -> pd.DatetimeIndex:
        """Get all possible split point dates"""
        start = x.index.min() + pd.Timedelta(self.train_set_size, unit="D")
        end = x.index.max() - pd.Timedelta(self.test_set_size - 1, unit="D")
        self._logger.info(f"Generating split points from {start} to {end}")
        split_range = pd.date_range(start, end, freq="D")
        first_split_point =  (len(split_range) + self.test_set_size - 1) % self.test_set_size
        return split_range[first_split_point::self.test_set_size]


class ModelBuilder:
    def __init__(self, df, target, feats, cat_feats):
        self.df = df
        self.target = target
        self.feats = feats
        self.cat_feats = cat_feats
        self.mode = "classification" if type(target) == str else "multiclassification"

    def train_folds(self, train_size=120, test_size=30, iterations=1000, early_stopping=False):
        if self.mode == "classification":
            oof_preds = np.zeros(self.df.shape[0])
        else:
            oof_preds = np.zeros((self.df.shape[0], len(targets)))

        folds_mask = np.zeros(oof_preds.shape[0])
        for fold_, (train_index, test_index) in enumerate(CustomTimeSeriesSplit(train_set_size=train_size, test_set_size=test_size).split(self.df)):
            X_train, y_train = self.df.iloc[train_index, :][self.feats], self.df.iloc[train_index, :][self.target]
            X_val, y_val = self.df.iloc[test_index, :][self.feats], self.df.iloc[test_index, :][self.target]

            weeks_train = X_train.reset_index()["dt"]
            weeks_test = X_val.reset_index()["dt"]

            tr_start_week = weeks_train.min()
            tr_end_week = weeks_train.max()
            ts_start_week = weeks_test.min()
            ts_end_week = weeks_test.max()

            print()
            print()
            print(f"Fold {fold_} train ({tr_start_week}, {tr_end_week}) test ({ts_start_week}, {ts_end_week})")

            cat_model = CatBoostClassifier(
                iterations=iterations,
                learning_rate=0.05,
                metric_period=500,
                loss_function="Logloss" if self.mode=="classification" else "MultiLogloss",
                l2_leaf_reg=10,
                eval_metric="F1" if self.mode=="classification" else "MultiLogloss", 
                task_type="CPU",
                early_stopping_rounds=100,
                random_seed=1234,
                use_best_model=early_stopping
                )

            D_train = Pool(X_train, y_train, cat_features=cat_feats, feature_names=feats)
            D_val = Pool(X_val, y_val, cat_features=cat_feats, feature_names=feats)
            
            print("Train catboost")
            cat_model.fit(
                D_train, 
                eval_set=D_val if early_stopping else None,
                verbose=True,
                plot=False
            )
            
            if self.mode == "classification":
                D_train_lgb = lgb.Dataset(X_train, y_train, weight=None, free_raw_data=False)
                D_val_lgb = lgb.Dataset(X_val, y_val, weight=None, free_raw_data=False)
                
                print("Train lgbm")
                lgbm_model = lgb.train(
                    {
                        "objective": "binary",
                        "feature_pre_filter": False,
                        "lambda_l1": 5.246525412521277e-08,
                        "lambda_l2": 3.963188589061798e-05,
                        "num_leaves": 6,
                        "feature_fraction": 0.7,
                        "bagging_fraction": 1.0,
                        "bagging_freq": 0,
                        "min_child_samples": 20,
                    },
                    D_train_lgb,
                    num_boost_round=iterations,
                    early_stopping_rounds=200 if early_stopping else None,
                    valid_sets=D_val_lgb if early_stopping else None,
                    feature_name=feats,
                    verbose_eval=500,
                )
                preds = (cat_model.predict_proba(X_val)[:, 1] + lgbm_model.predict(X_val)) / 2
                print()
                print(f"Fold {fold_} F1 Score ", metrics.f1_score(y_val, preds.round()))
                print(f"Fold {fold_} ROC AUC Score ", metrics.roc_auc_score(y_val, preds.round()))
                print(f"Fold {fold_} Confusion matrix")
                print(metrics.confusion_matrix(y_val, preds.round()))
                oof_preds[test_index] = preds
            else:
                oof_preds[test_index] = cat_model.predict(X_val)
                print(f"Fold {fold_} F1 Score ", metrics.f1_score(y_val, oof_preds[test_index].round(), average="micro"))
                try:
                    print(f"Fold {fold_} ROC AUC Score ", metrics.roc_auc_score(y_val, oof_preds[test_index]))
                except ValueError:
                    print(f"Fold {fold_} ROC AUC Score ", 0)
                    
            folds_mask[test_index] = 1
        
        if self.mode == "classification":
            oof_f1micro = metrics.f1_score(self.df.iloc[folds_mask == 1, :][self.target], oof_preds[folds_mask == 1].round(), average="micro")
            oof_f1micro = metrics.roc_auc_score(self.df.iloc[folds_mask == 1, :][self.target], oof_preds[folds_mask == 1], average="micro")
        else:
            oof_f1micro = metrics.f1_score(self.df.iloc[folds_mask == 1, :][self.target], oof_preds[folds_mask == 1].round(), average="micro")
            oof_f1micro = metrics.roc_auc_score(self.df.iloc[folds_mask == 1, :][self.target], oof_preds[folds_mask == 1], average="micro")
        
        print()
        print("Overall OOF F1 Micro ", oof_f1micro)
        print("Overall OOF Mean ROC AUC Score ", oof_f1micro)
        
    def train_final_models(self, iterations=1000, early_stopping=False):
        if self.mode == "classification":
            X_train, y_train = self.df.iloc[:, :][self.feats], self.df.iloc[:, :][self.target]

            cat_model = CatBoostClassifier(
                iterations=iterations,
                learning_rate=0.05,
                metric_period=500,
                loss_function="Logloss",
                l2_leaf_reg=10,
                eval_metric="F1", 
                task_type="CPU",
                random_seed=1234,
                use_best_model=early_stopping
                )

            D_train = Pool(X_train, y_train, cat_features=cat_feats, feature_names=feats)

            print("Train catboost")
            cat_model.fit(
                D_train, 
                eval_set=None,
                verbose=True,
                plot=False
            )

            D_train_lgb = lgb.Dataset(X_train, y_train, weight=None, free_raw_data=False)

            print("Train lgbm")
            lgbm_model = lgb.train(
                {
                    "objective": "binary",
                    "feature_pre_filter": False,
                    "lambda_l1": 5.246525412521277e-08,
                    "lambda_l2": 3.963188589061798e-05,
                    "num_leaves": 6,
                    "feature_fraction": 0.7,
                    "bagging_fraction": 1.0,
                    "bagging_freq": 0,
                    "min_child_samples": 20,
                },
                D_train_lgb,
                num_boost_round=iterations,
                valid_sets=None,
                feature_name=feats,
                verbose_eval=500,
            )

            return cat_model, lgbm_model

        elif self.mode == "multiclassification":
            raise NotImplementedError 

## Prepare training data

In [3]:
df = pd.read_csv("merged.csv")

In [4]:
df["SATELLITE"].unique()

array(['Terra', 'Aqua'], dtype=object)

In [5]:
df["CONFIDENCE"] = df["CONFIDENCE"].map({"l":0, "h":1, "n":3})
df["SATELLITE"] = df["SATELLITE"].map({"1":0, "N":1})
df["DAYNIGHT"] = df["DAYNIGHT"].map({"D":0, "N":1})
df["dt"] = pd.to_datetime(df["dt"]).dt.date
df = df.set_index("dt")

In [6]:
targets = ["infire_day_1","infire_day_2","infire_day_3","infire_day_4","infire_day_5","infire_day_6","infire_day_7","infire_day_8"]
feats = ["BRIGHTNESS","SCAN","TRACK","ACQ_TIME","SATELLITE","DAYNIGHT","CONFIDENCE","BRIGHT_T31","FRP"]
#cat_feats = ["grid_index", "DAYNIGHT","SATELLITE"]
cat_feats = []

In [7]:
targets = ["infire_day_1","infire_day_2","infire_day_3","infire_day_4","infire_day_5","infire_day_6","infire_day_7","infire_day_8"]
df["target"] = (df[targets].sum(axis=1)>0).astype(np.uint8)

In [8]:
df["target"].value_counts(normalize=True)

1    0.852313
0    0.147687
Name: target, dtype: float64

In [9]:
### syntetic data
DROPOUT_PROBA = 0.7
UPSAMPLE_RATE = 6

df_syn_base = df[df["target"]==0][feats]
df_syn_final = pd.DataFrame()

for i in range(UPSAMPLE_RATE):
    df_syn = df_syn_base.copy()
    for f in feats[3:]:
        df_syn[f] = df_syn[f].apply(lambda x: x if np.random.random()>DROPOUT_PROBA else None).sample(frac=1.0).values
    df_syn_final = pd.concat([df_syn_final, df_syn], axis=0)

df_syn_final["target"] = 0

In [10]:
df_combined = pd.concat([
    df[feats+["target"]],
    df_syn_final], axis=0)

In [11]:
df_combined["target"].value_counts(normalize=True)

0    0.548113
1    0.451887
Name: target, dtype: float64

## Train with single lable (will we see fire during a period of 8 days)

In [12]:
fire_model = ModelBuilder(df_combined, "target", feats, cat_feats)

In [13]:
fire_model.train_folds(train_size=120, test_size=30, iterations=1000, early_stopping=False)

  result = libops.scalar_compare(x.ravel(), y, op)




Fold 0 train (2020-02-20, 2020-06-18) test (2020-06-19, 2020-07-18)
Train catboost
0:	learn: 0.9088065	total: 58.8ms	remaining: 58.7s
500:	learn: 0.9158019	total: 1.32s	remaining: 1.32s
999:	learn: 0.9267353	total: 2.57s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 6978, number of negative: 9289
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 16267, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.428967 -> initscore=-0.286069
[LightGBM] [Info] Start training from score -0.286069





Fold 0 F1 Score  0.8901392860437228
Fold 0 ROC AUC Score  0.9044177170846646
Fold 0 Confusion matrix
[[5731  926]
 [ 265 4825]]


Fold 1 train (2020-03-21, 2020-07-18) test (2020-07-19, 2020-08-17)
Train catboost
0:	learn: 0.9029634	total: 3.1ms	remaining: 3.09s


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9144307	total: 1.48s	remaining: 1.47s
999:	learn: 0.9197341	total: 3.01s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 11360, number of negative: 15253
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 26613, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.426859 -> initscore=-0.294678
[LightGBM] [Info] Start training from score -0.294678





Fold 1 F1 Score  0.9317784256559768
Fold 1 ROC AUC Score  0.9204232912237109
Fold 1 Confusion matrix
[[3111  529]
 [  56 3995]]


Fold 2 train (2020-04-20, 2020-08-17) test (2020-08-18, 2020-09-16)
Train catboost
0:	learn: 0.9087567	total: 2.96ms	remaining: 2.96s


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9170544	total: 1.39s	remaining: 1.38s
999:	learn: 0.9235379	total: 2.99s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 10957, number of negative: 14063
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 25020, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.437930 -> initscore=-0.249569
[LightGBM] [Info] Start training from score -0.249569





Fold 2 F1 Score  0.9369768586903005
Fold 2 ROC AUC Score  0.9254382693024992
Fold 2 Confusion matrix
[[1464  244]
 [  12 1903]]


Fold 3 train (2020-05-20, 2020-09-16) test (2020-09-17, 2020-10-16)
Train catboost
0:	learn: 0.9216980	total: 7.37ms	remaining: 7.36s


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9272288	total: 1.44s	remaining: 1.43s
999:	learn: 0.9318023	total: 2.94s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 11338, number of negative: 12467
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 23805, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.476286 -> initscore=-0.094925
[LightGBM] [Info] Start training from score -0.094925





Fold 3 F1 Score  0.9019396551724137
Fold 3 ROC AUC Score  0.9247331691297209
Fold 3 Confusion matrix
[[2078  358]
 [   6 1674]]


Fold 4 train (2020-06-19, 2020-10-16) test (2020-10-17, 2020-11-15)
Train catboost
0:	learn: 0.9165638	total: 3.03ms	remaining: 3.03s


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9257521	total: 1.72s	remaining: 1.72s
999:	learn: 0.9304316	total: 3.31s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 12736, number of negative: 14441
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 27177, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468632 -> initscore=-0.125639
[LightGBM] [Info] Start training from score -0.125639





Fold 4 F1 Score  0.86
Fold 4 ROC AUC Score  0.9205051065611278
Fold 4 Confusion matrix
[[403  66]
 [  4 215]]


Fold 5 train (2020-07-19, 2020-11-15) test (2020-11-16, 2020-12-14)
Train catboost
0:	learn: 0.9175131	total: 2.39ms	remaining: 2.39s


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9336425	total: 1.12s	remaining: 1.11s
999:	learn: 0.9429429	total: 2.26s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 7865, number of negative: 8253
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 16118, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.487964 -> initscore=-0.048154
[LightGBM] [Info] Start training from score -0.048154





Fold 5 F1 Score  0.7333333333333333
Fold 5 ROC AUC Score  0.8007326007326008
Fold 5 Confusion matrix
[[79 12]
 [12 33]]


Fold 6 train (2020-08-18, 2020-12-14) test (2020-12-19, 2021-01-05)
Train catboost
0:	learn: 0.9095508	total: 6.63ms	remaining: 6.62s


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9318788	total: 792ms	remaining: 788ms
999:	learn: 0.9510265	total: 1.56s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 3859, number of negative: 4704
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 8563, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.450660 -> initscore=-0.198005
[LightGBM] [Info] Start training from score -0.198005





Fold 6 F1 Score  0.9
Fold 6 ROC AUC Score  0.9142857142857143
Fold 6 Confusion matrix
[[13  1]
 [ 1  9]]


Fold 7 train (2020-09-17, 2021-01-05) test (2021-02-04, 2021-02-12)
Train catboost
0:	learn: 0.8896552	total: 1.39ms	remaining: 1.39s


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9413760	total: 589ms	remaining: 587ms
999:	learn: 0.9646302	total: 1.2s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 1954, number of negative: 3010
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1075
[LightGBM] [Info] Number of data points in the train set: 4964, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393634 -> initscore=-0.432062
[LightGBM] [Info] Start training from score -0.432062





Fold 7 F1 Score  0.6666666666666666
Fold 7 ROC AUC Score  0.8928571428571428
Fold 7 Confusion matrix
[[11  3]
 [ 0  3]]


Fold 8 train (2020-10-17, 2021-02-12) test (2021-02-25, 2021-03-15)
Train catboost
0:	learn: 0.8325509	total: 654us	remaining: 653ms


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9752650	total: 258ms	remaining: 257ms
999:	learn: 0.9981982	total: 513ms	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 277, number of negative: 588




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 680
[LightGBM] [Info] Number of data points in the train set: 865, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.320231 -> initscore=-0.752709
[LightGBM] [Info] Start training from score -0.752709

Fold 8 F1 Score  0.8228571428571428
Fold 8 ROC AUC Score  0.8536427275084262
Fold 8 Confusion matrix
[[117  16]
 [ 15  72]]


Fold 9 train (2020-11-16, 2021-03-15) test (2021-03-16, 2021-04-14)
Train catboost
0:	learn: 0.8662420	total: 563us	remaining: 563ms


  result = libops.scalar_compare(x.ravel(), y, op)


500:	learn: 0.9931507	total: 207ms	remaining: 206ms
999:	learn: 1.0000000	total: 412ms	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 145, number of negative: 252
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 397, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.365239 -> initscore=-0.552695




[LightGBM] [Info] Start training from score -0.552695

Fold 9 F1 Score  0.7746031746031746
Fold 9 ROC AUC Score  0.7998847677793457
Fold 9 Confusion matrix
[[1202  156]
 [ 341  854]]

Overall OOF F1 Micro  0.9307062909109078
Overall OOF Mean ROC AUC Score  0.9307062909109078


In [14]:
cat_model, lgbm_model = fire_model.train_final_models()

Train catboost
0:	learn: 0.9153127	total: 4.56ms	remaining: 4.56s
500:	learn: 0.9181814	total: 2.76s	remaining: 2.75s
999:	learn: 0.9208589	total: 4.98s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 21278, number of negative: 25809
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 47087, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.451887 -> initscore=-0.193050
[LightGBM] [Info] Start training from score -0.193050




### Save models

In [15]:
cat_model.save_model("catboost", format="cbm")

lgbm_model.save_model("light_gbm.txt")

<lightgbm.basic.Booster at 0x7f6b55115d00>