In [1]:
import pandas as pd
import numpy as np
import logging
import lightgbm as lgb
from sklearn import metrics
from catboost import CatBoost, CatBoostClassifier, Pool

In [2]:
#https://github.com/roelbertens/time-series-nested-cv/blob/master/time_series_cross_validation/custom_time_series_split.py
class CustomTimeSeriesSplit:

    def __init__(self,
                 train_set_size: int,
                 test_set_size: int
                 ):
        """
        :param train_set_size: data points (days) in each fold for the train set
        :param test_set_size: data points (days) in each fold for the test set
        """
        self.train_set_size = train_set_size
        self.test_set_size = test_set_size
        self._logger = logging.getLogger(__name__)

    def split(self,
              x: np.array,
              y: np.array = None) -> (np.array, np.array):
        """Return train/test split indices.
        :param x: time series to use for prediction, shape (n_samples, n_features)
        :param y: time series to predict, shape (n_samples, n_features)
        :return: (train_indices, test_indices)
        Note: index of both x and y should be of type datetime.
        """
        if y is not None:
            assert x.index.equals(y.index)
        split_points = self.get_split_points(x)
        for split_point in split_points:
            is_train = (x.index < split_point) & (x.index >= split_point -
                                                  pd.Timedelta(self.train_set_size, unit='D'))
            is_test = (x.index >= split_point) & (x.index < split_point +
                                                  pd.Timedelta(self.test_set_size, unit='D'))
            if not is_train.any() or not is_test.any():
                self._logger.warning('Found %d train and %d test observations '
                                     'skipping fold for split point %s',
                                     is_train.sum(), is_test.sum(), split_point)
                continue
            dummy_ix = pd.Series(range(0, len(x)), index=x.index)
            ix_train = dummy_ix.loc[is_train].values
            ix_test = dummy_ix.loc[is_test].values
            if ix_train is None or ix_test is None:
                self._logger.warning('Found no data for train or test period, '
                                     'skipping fold for split date %s',
                                     split_point)
                continue
            yield ix_train, ix_test

    def get_split_points(self, x: np.array) -> pd.DatetimeIndex:
        """Get all possible split point dates"""
        start = x.index.min() + pd.Timedelta(self.train_set_size, unit='D')
        end = x.index.max() - pd.Timedelta(self.test_set_size - 1, unit='D')
        self._logger.info(f'Generating split points from {start} to {end}')
        split_range = pd.date_range(start, end, freq='D')
        first_split_point =  (len(split_range) + self.test_set_size - 1) % self.test_set_size
        return split_range[first_split_point::self.test_set_size]
    
    
class ModelBuilder:
    def __init__(self, df, target, feats, cat_feats):
        self.df = df
        self.target = target
        self.feats = feats
        self.cat_feats = cat_feats
        self.mode = 'classification' if type(target)==str else 'multiclassification'
            
    def train_folds(self, train_size=120, test_size=30, iterations=1000, early_stopping=False):
        if self.mode == 'classification':
            oof_preds = np.zeros(self.df.shape[0])
        else:
            oof_preds = np.zeros((self.df.shape[0], len(targets)))
            
        folds_mask = np.zeros(oof_preds.shape[0])
        for fold_, (train_index, test_index) in enumerate(CustomTimeSeriesSplit(train_set_size=train_size, test_set_size=test_size).split(self.df)):
            X_train, y_train = self.df.iloc[train_index,:][self.feats], self.df.iloc[train_index,:][self.target]
            X_val, y_val = self.df.iloc[test_index,:][self.feats], self.df.iloc[test_index,:][self.target]

            weeks_train = X_train.reset_index()['dt']
            weeks_test = X_val.reset_index()['dt']

            tr_start_week = weeks_train.min()
            tr_end_week = weeks_train.max()
            ts_start_week = weeks_test.min()
            ts_end_week = weeks_test.max()
            
            print()
            print()
            print(f'Fold {fold_} train ({tr_start_week}, {tr_end_week}) test ({ts_start_week}, {ts_end_week})')
            
            
            cat_model = CatBoostClassifier(
                iterations=iterations,
                learning_rate=0.05,
                metric_period=500,
                loss_function='Logloss' if self.mode=='classification' else 'MultiLogloss',
                l2_leaf_reg=10,
                eval_metric='F1' if self.mode=='classification' else 'MultiLogloss', 
                task_type='CPU',
                early_stopping_rounds=100,
                random_seed=1234,
                use_best_model=early_stopping
                )
            
            D_train = Pool(X_train, y_train, cat_features=cat_feats, feature_names=feats)
            D_val = Pool(X_val, y_val, cat_features=cat_feats, feature_names=feats)
            
            print('Train catboost')
            cat_model.fit(
                D_train, 
                eval_set=D_val if early_stopping else None,
                verbose=True,
                plot=False
            )
            
            if self.mode == 'classification':
                D_train_lgb = lgb.Dataset(X_train, y_train, weight=None, free_raw_data=False)
                D_val_lgb = lgb.Dataset(X_val, y_val, weight=None, free_raw_data=False)
                
                print('Train lgbm')
                lgbm_model = lgb.train(
                    {
                    'objective': 'binary',
                    'feature_pre_filter': False,
                    'lambda_l1': 5.246525412521277e-08,
                    'lambda_l2': 3.963188589061798e-05,
                    'num_leaves': 6,
                    'feature_fraction': 0.7,
                    'bagging_fraction': 1.0,
                    'bagging_freq': 0,
                    'min_child_samples': 20,
                    },
                   D_train_lgb,
                   num_boost_round=iterations,
                   early_stopping_rounds=200 if early_stopping else None,
                   valid_sets=D_val_lgb if early_stopping else None,
                   feature_name=feats,
                   verbose_eval=500
                  )
                preds = (0.5*cat_model.predict_proba(X_val)[:,1] + 0.5*lgbm_model.predict(X_val))
                print()
                print(f'Fold {fold_} F1 Score ', metrics.f1_score(y_val, preds.round()))
                print(f'Fold {fold_} ROC AUC Score ', metrics.roc_auc_score(y_val, preds.round()))
                print(f'Fold {fold_} Confusion matrix')
                print(metrics.confusion_matrix(y_val, preds.round()))
                oof_preds[test_index] = preds
            else:
                oof_preds[test_index] = cat_model.predict(X_val)
                print(f'Fold {fold_} F1 Score ', metrics.f1_score(y_val, oof_preds[test_index].round(), average='micro'))
                try:
                    print(f'Fold {fold_} ROC AUC Score ', metrics.roc_auc_score(y_val, oof_preds[test_index]))
                except ValueError:
                    print(f'Fold {fold_} ROC AUC Score ', 0)
                    
            folds_mask[test_index] = 1
        
        if self.mode == 'classification':
            oof_f1micro = metrics.f1_score(self.df.iloc[folds_mask==1,:][self.target], oof_preds[folds_mask==1].round(), average='micro')
            oof_f1micro = metrics.roc_auc_score(self.df.iloc[folds_mask==1,:][self.target], oof_preds[folds_mask==1], average='micro')
        else:
            oof_f1micro = metrics.f1_score(self.df.iloc[folds_mask==1,:][self.target], oof_preds[folds_mask==1].round(), average='micro')
            oof_f1micro = metrics.roc_auc_score(self.df.iloc[folds_mask==1,:][self.target], oof_preds[folds_mask==1], average='micro')
        
        print()
        print('Overall OOF F1 Micro ', oof_f1micro)
        print('Overall OOF Mean ROC AUC Score ', oof_f1micro)
        
    def train_final_models(self, iterations=1000, early_stopping=False):
        if self.mode == 'classification':
            X_train, y_train = self.df.iloc[:,:][self.feats], self.df.iloc[:,:][self.target]
            
      
            
            cat_model = CatBoostClassifier(
                iterations=iterations,
                learning_rate=0.05,
                metric_period=500,
                loss_function='Logloss',
                l2_leaf_reg=10,
                eval_metric='F1', 
                task_type='CPU',
                random_seed=1234,
                use_best_model=early_stopping
                )
            
            D_train = Pool(X_train, y_train, cat_features=cat_feats, feature_names=feats)
            
            print('Train catboost')
            cat_model.fit(
                D_train, 
                eval_set=None,
                verbose=True,
                plot=False
            )
            
            D_train_lgb = lgb.Dataset(X_train, y_train, weight=None, free_raw_data=False)

            print('Train lgbm')
            lgbm_model = lgb.train(
                {
                'objective': 'binary',
                'feature_pre_filter': False,
                'lambda_l1': 5.246525412521277e-08,
                'lambda_l2': 3.963188589061798e-05,
                'num_leaves': 6,
                'feature_fraction': 0.7,
                'bagging_fraction': 1.0,
                'bagging_freq': 0,
                'min_child_samples': 20,
                },
               D_train_lgb,
               num_boost_round=iterations,
               valid_sets=None,
               feature_name=feats,
               verbose_eval=500
              )
            
            return cat_model, lgbm_model
            
        elif self.mode == 'multiclassification':
            raise NotImplementedError 

In [3]:
df = pd.read_csv('merged.csv')

In [4]:
df['CONFIDENCE'] = df['CONFIDENCE'].map({'l':0, 'h':1, 'n':3})
df['SATELLITE'] = df['SATELLITE'].map({'1':0, 'N':1})
df['DAYNIGHT'] = df['DAYNIGHT'].map({'D':0, 'N':1})
df['dt'] = pd.to_datetime(df['dt']).dt.date
df = df.set_index('dt')

In [5]:
targets = ['infire_day_1','infire_day_2','infire_day_3','infire_day_4','infire_day_5','infire_day_6','infire_day_7','infire_day_8']
feats = ['grid_index','LATITUDE','LONGITUDE','BRIGHTNESS','SCAN','TRACK','ACQ_TIME','SATELLITE','DAYNIGHT','CONFIDENCE','BRIGHT_T31','FRP','TYPE']
#cat_feats = ['grid_index', 'DAYNIGHT','SATELLITE']
cat_feats = []

In [6]:
targets = ['infire_day_1','infire_day_2','infire_day_3','infire_day_4','infire_day_5','infire_day_6','infire_day_7','infire_day_8']
df['target'] = (df[targets].sum(axis=1)>0).astype(np.uint8)

In [7]:
df['target'].value_counts(normalize=True)

1    0.842362
0    0.157638
Name: target, dtype: float64

In [8]:
### syntetic data
DROPOUT_PROBA = 0.7
UPSAMPLE_RATE = 6

df_syn_base = df[df['target']==0][feats]
df_syn_final = pd.DataFrame()

for i in range(UPSAMPLE_RATE):
    df_syn = df_syn_base.copy()
    for f in feats[3:]:
        df_syn[f] = df_syn[f].apply(lambda x: x if np.random.random()>DROPOUT_PROBA else None).sample(frac=1.0).values
    df_syn_final = pd.concat([df_syn_final, df_syn], axis=0)

df_syn_final['target'] = 0

In [9]:
df_combined = pd.concat([
    df[feats+['target']],
    df_syn_final], axis=0)

In [10]:
df_combined['target'].value_counts(normalize=True)

0    0.567092
1    0.432908
Name: target, dtype: float64

### Single lable (will we see fire during a period of 8 days)

In [11]:
fire_model = ModelBuilder(df_combined, 'target', feats, cat_feats)

In [12]:
fire_model.train_folds(train_size=120, test_size=30, iterations=1000, early_stopping=False)



Fold 0 train (2020-01-26, 2020-05-24) test (2020-05-25, 2020-06-23)
Train catboost
0:	learn: 0.9092012	total: 190ms	remaining: 3m 10s
500:	learn: 0.9192056	total: 5.71s	remaining: 5.69s
999:	learn: 0.9256910	total: 11.3s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 14305, number of negative: 18620
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1858
[LightGBM] [Info] Number of data points in the train set: 32925, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.434472 -> initscore=-0.263627
[LightGBM] [Info] Start training from score -0.263627





Fold 0 F1 Score  0.7801026450848795
Fold 0 ROC AUC Score  0.8035772103887693
Fold 0 Confusion matrix
[[1338  153]
 [ 404  988]]


Fold 1 train (2020-02-25, 2020-06-23) test (2020-06-24, 2020-07-23)
Train catboost
0:	learn: 0.9135035	total: 23.2ms	remaining: 23.1s
500:	learn: 0.9202317	total: 6.07s	remaining: 6.04s
999:	learn: 0.9258262	total: 12s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 15654, number of negative: 19992
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1867
[LightGBM] [Info] Number of data points in the train set: 35646, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.439152 -> initscore=-0.244606
[LightGBM] [Info] Start training from score -0.244606





Fold 1 F1 Score  0.9053108275010291
Fold 1 ROC AUC Score  0.92160137954899
Fold 1 Confusion matrix
[[7863 1244]
 [ 136 6597]]


Fold 2 train (2020-03-26, 2020-07-23) test (2020-07-24, 2020-08-22)
Train catboost
0:	learn: 0.9086546	total: 41.9ms	remaining: 41.9s
500:	learn: 0.9140407	total: 6.37s	remaining: 6.34s
999:	learn: 0.9196146	total: 12.7s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 19192, number of negative: 26124
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1871
[LightGBM] [Info] Number of data points in the train set: 45316, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.423515 -> initscore=-0.308361
[LightGBM] [Info] Start training from score -0.308361





Fold 2 F1 Score  0.9297418271920711
Fold 2 ROC AUC Score  0.9266535318194321
Fold 2 Confusion matrix
[[5162  837]
 [  42 5816]]


Fold 3 train (2020-04-25, 2020-08-22) test (2020-08-23, 2020-09-21)
Train catboost
0:	learn: 0.9099302	total: 28.5ms	remaining: 28.5s
500:	learn: 0.9184891	total: 5.81s	remaining: 5.79s
999:	learn: 0.9266217	total: 11.6s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 15254, number of negative: 20412
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1866
[LightGBM] [Info] Number of data points in the train set: 35666, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.427690 -> initscore=-0.291281
[LightGBM] [Info] Start training from score -0.291281





Fold 3 F1 Score  0.9221148379761228
Fold 3 ROC AUC Score  0.9201391593592079
Fold 3 Confusion matrix
[[3007  444]
 [ 104 3244]]


Fold 4 train (2020-05-25, 2020-09-21) test (2020-09-22, 2020-10-21)
Train catboost
0:	learn: 0.9205340	total: 29.6ms	remaining: 29.6s
500:	learn: 0.9281676	total: 6.18s	remaining: 6.15s
999:	learn: 0.9337143	total: 12.3s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 17331, number of negative: 20048
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 37379, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.463656 -> initscore=-0.145633
[LightGBM] [Info] Start training from score -0.145633





Fold 4 F1 Score  0.8568281938325992
Fold 4 ROC AUC Score  0.9210158342772584
Fold 4 Confusion matrix
[[4589  731]
 [  49 2334]]


Fold 5 train (2020-06-24, 2020-10-21) test (2020-10-22, 2020-11-20)
Train catboost
0:	learn: 0.9114570	total: 17.9ms	remaining: 17.8s
500:	learn: 0.9190994	total: 6.35s	remaining: 6.32s
999:	learn: 0.9258276	total: 12.7s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 18322, number of negative: 23877
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1862
[LightGBM] [Info] Number of data points in the train set: 42199, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.434181 -> initscore=-0.264813
[LightGBM] [Info] Start training from score -0.264813





Fold 5 F1 Score  0.8176229508196721
Fold 5 ROC AUC Score  0.8852268414621574
Fold 5 Confusion matrix
[[895 134]
 [ 44 399]]


Fold 6 train (2020-07-24, 2020-11-20) test (2020-11-21, 2020-12-20)
Train catboost
0:	learn: 0.9109824	total: 28.6ms	remaining: 28.6s
500:	learn: 0.9220326	total: 5.85s	remaining: 5.83s
999:	learn: 0.9337218	total: 11.9s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 12032, number of negative: 15799
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1858
[LightGBM] [Info] Number of data points in the train set: 27831, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432324 -> initscore=-0.272377
[LightGBM] [Info] Start training from score -0.272377





Fold 6 F1 Score  0.822857142857143
Fold 6 ROC AUC Score  0.9034930586654724
Fold 6 Confusion matrix
[[354  52]
 [ 10 144]]


Fold 7 train (2020-08-23, 2020-12-20) test (2020-12-21, 2021-01-19)
Train catboost
0:	learn: 0.8928851	total: 27.8ms	remaining: 27.8s
500:	learn: 0.9117011	total: 4.45s	remaining: 4.43s
999:	learn: 0.9322221	total: 9.03s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 6328, number of negative: 10206
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1848
[LightGBM] [Info] Number of data points in the train set: 16534, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382727 -> initscore=-0.477992
[LightGBM] [Info] Start training from score -0.477992





Fold 7 F1 Score  0.786206896551724
Fold 7 ROC AUC Score  0.8601557430240103
Fold 7 Confusion matrix
[[140  21]
 [ 10  57]]


Fold 8 train (2020-09-22, 2021-01-19) test (2021-01-20, 2021-02-18)
Train catboost
0:	learn: 0.8508634	total: 17.6ms	remaining: 17.5s
500:	learn: 0.9096079	total: 2.36s	remaining: 2.35s
999:	learn: 0.9456353	total: 4.76s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 3047, number of negative: 6916
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1845
[LightGBM] [Info] Number of data points in the train set: 9963, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.305832 -> initscore=-0.819680
[LightGBM] [Info] Start training from score -0.819680





Fold 8 F1 Score  0.7956989247311828
Fold 8 ROC AUC Score  0.912951912951913
Fold 8 Confusion matrix
[[224  35]
 [  3  74]]


Fold 9 train (2020-10-22, 2021-02-18) test (2021-02-19, 2021-03-20)
Train catboost
0:	learn: 0.8358209	total: 3.39ms	remaining: 3.39s
500:	learn: 0.9686275	total: 1.38s	remaining: 1.37s
999:	learn: 0.9966375	total: 2.72s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 741, number of negative: 1855
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1779
[LightGBM] [Info] Number of data points in the train set: 2596, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.285439 -> initscore=-0.917639
[LightGBM] [Info] Start training from score -0.917639





Fold 9 F1 Score  0.8262411347517731
Fold 9 ROC AUC Score  0.8662516615941134
Fold 9 Confusion matrix
[[443  54]
 [ 44 233]]


Fold 10 train (2020-11-21, 2021-03-20) test (2021-03-21, 2021-04-19)
Train catboost
0:	learn: 0.8524590	total: 37.9ms	remaining: 37.9s
500:	learn: 0.9803922	total: 1.2s	remaining: 1.19s
999:	learn: 0.9991312	total: 2.34s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 575, number of negative: 1323
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1720
[LightGBM] [Info] Number of data points in the train set: 1898, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.302950 -> initscore=-0.833287
[LightGBM] [Info] Start training from score -0.833287





Fold 10 F1 Score  0.8071157305616631
Fold 10 ROC AUC Score  0.8277269166913754
Fold 10 Confusion matrix
[[2782  270]
 [ 695 2019]]

Overall OOF F1 Micro  0.9363852537115387
Overall OOF Mean ROC AUC Score  0.9363852537115387


In [13]:
cat_model, lgbm_model = fire_model.train_final_models()

Train catboost
0:	learn: 0.9116969	total: 30.5ms	remaining: 30.5s
500:	learn: 0.9157650	total: 7.92s	remaining: 7.88s
999:	learn: 0.9179507	total: 15.9s	remaining: 0us
Train lgbm
[LightGBM] [Info] Number of positive: 37753, number of negative: 49455
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1872
[LightGBM] [Info] Number of data points in the train set: 87208, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432908 -> initscore=-0.269998
[LightGBM] [Info] Start training from score -0.269998




In [14]:
cat_model.save_model('cat_fire', format="cbm")

#save just last
lgbm_model.save_model('lgbm_model.txt', 
                      #num_iteration=bst.best_iteration
                     )

<lightgbm.basic.Booster at 0x1e1a17bd310>