# Data was generated based on the Starter Notebook created by John Whitaker (JW)
https://colab.research.google.com/drive/1DPizsNT7GUK776TRDmk5rZVMsB1kJY5H

In [2]:
import tifffile as tiff
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import glob
import datetime
from tqdm.notebook import tqdm

import sklearn

from sklearn.utils.class_weight import compute_class_weight

from sklearn.inspection import permutation_importance
from scipy.stats.mstats import gmean

# Models
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
import catboost
from catboost import CatBoostClassifier

import eli5
from eli5.sklearn import PermutationImportance

from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.metrics import log_loss

import os

# Package versions

In [3]:
catboost.__version__ #0.22

'1.0.0'

In [4]:
sklearn.__version__ #0.22.2

'0.23.2'

In [5]:
eli5.__version__ #0.10.1

'0.11.0'

In [6]:
dates_raw = [
datetime.datetime(2019, 6, 6, 0, 0),
datetime.datetime(2019, 7, 1, 0, 0),
datetime.datetime(2019, 7, 6, 0, 0),
datetime.datetime(2019, 7, 11, 0, 0),
datetime.datetime(2019, 7, 21, 0, 0),
datetime.datetime(2019, 8, 5, 0, 0),
datetime.datetime(2019, 8, 15, 0, 0),
datetime.datetime(2019, 8, 25, 0, 0),
datetime.datetime(2019, 9, 9, 0, 0),
datetime.datetime(2019, 9, 19, 0, 0),
datetime.datetime(2019, 9, 24, 0, 0),
datetime.datetime(2019, 10, 4, 0, 0),
datetime.datetime(2019, 11, 3, 0, 0)
]

dates = []

for i in range(13):
    dt = "".join(str(dates_raw[i].date()).split("-"))
    dates.append(dt)

In [7]:
bands = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12']

In [8]:
# Including Cloud Layer
bands_all = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'CLD']

# DATA GENERATION - based on JW starter code
> NB: Directory of competition data (multi spectral satellite images) should be named "data", and all relevant images should be included in their respective tile folder and sub folders

In [9]:
def load_file(fp):
    """Takes a PosixPath object or string filepath
    and returns np array"""
    
    return tiff.imread(fp.__str__())

In [15]:
row_locs = []
col_locs = []
field_ids = []
labels = []
tiles = []

for tile in range(4):
    fids = f'../input/crop-classification/ref_african_crops_kenya_02_labels/ref_african_crops_kenya_02_labels/ref_african_crops_kenya_02_tile_0{tile}_label/field_ids.tif'
    labs = f'../input/crop-classification/ref_african_crops_kenya_02_labels/ref_african_crops_kenya_02_labels/ref_african_crops_kenya_02_tile_0{tile}_label/labels.tif'
    fid_arr = load_file(fids)
    lab_arr = load_file(labs)
    for row in range(len(fid_arr)):
        
        for col in range(len(fid_arr[0])):
            if fid_arr[row][col] != 0:
                row_locs.append(row)
                col_locs.append(col)
                field_ids.append(fid_arr[row][col])
                labels.append(lab_arr[row][col])
                tiles.append(tile)

In [16]:
df_generated = pd.DataFrame({
    'fid':field_ids,
    'label':labels,
    'row_loc': row_locs,
    'col_loc':col_locs,
    'tile':tiles
})

In [17]:
df_generated.head()

Unnamed: 0,fid,label,row_loc,col_loc,tile
0,2928,4,214,1278,0
1,2928,4,214,1279,0
2,2928,4,214,1280,0
3,2928,4,214,1281,0
4,2928,4,214,1282,0


In [18]:
df_generated.shape

(67557, 5)

In [22]:
col_names = []
col_values = []

for tile in range(4): # 1) For each tile
    print('Tile: ', tile)
    for d in dates: # 2) For each date
        print(str(d))
#         d = ''.join(str(d.date()).split('-')) # Nice date string
        t = '0' + str(tile)
        for b in bands_all: # 3) For each band
            col_name = d + '_' + b

            if tile == 0:
                # If the column doesn't exist, create it and populate with 0s
                df_generated[col_name] = 0

            # Load im
            im = load_file(f"../input/crop-classification/ref_african_crops_kenya_02_source/ref_african_crops_kenya_02_source/ref_african_crops_kenya_02_tile_{t}_{d}/{b}.tif")

            # Going four levels deep. Each second on the outside is four weeks in this loop
            # If we die here, there's no waking up.....
            vals = []
            for row, col in df_generated.loc[df_generated.tile == tile][['row_loc', 'col_loc']].values: # 4) For each location of a pixel in a field
                vals.append(im[row][col])
            df_generated.loc[df_generated.tile == tile, col_name] = vals

Tile:  0
20190606
20190701
20190706
20190711
20190721
20190805
20190815
20190825
20190909
20190919
20190924
20191004
20191103
Tile:  1
20190606
20190701
20190706
20190711
20190721
20190805
20190815
20190825
20190909
20190919
20190924
20191004
20191103
Tile:  2
20190606
20190701
20190706
20190711
20190721
20190805
20190815
20190825
20190909
20190919
20190924
20191004
20191103
Tile:  3
20190606
20190701
20190706
20190711
20190721
20190805
20190815
20190825
20190909
20190919
20190924
20191004
20191103


In [23]:
df_generated.head()

Unnamed: 0,fid,label,row_loc,col_loc,tile,20190606_B01,20190606_B02,20190606_B03,20190606_B04,20190606_B05,...,20191103_B04,20191103_B05,20191103_B06,20191103_B07,20191103_B08,20191103_B8A,20191103_B09,20191103_B11,20191103_B12,20191103_CLD
0,2928,4,214,1278,0,0.1995,0.2024,0.1782,0.1898,0.2462,...,0.0669,0.1239,0.266,0.3214,0.3204,0.3515,0.361,0.2362,0.1347,0.0
1,2928,4,214,1279,0,0.1995,0.1942,0.1848,0.1846,0.2462,...,0.0603,0.1239,0.266,0.3214,0.3296,0.3515,0.361,0.2362,0.1347,0.0
2,2928,4,214,1280,0,0.1995,0.1868,0.193,0.186,0.2558,...,0.0631,0.1259,0.2579,0.3131,0.3459,0.3307,0.361,0.2304,0.1471,0.0
3,2928,4,214,1281,0,0.1995,0.1962,0.1978,0.1954,0.2558,...,0.1214,0.1259,0.2579,0.3131,0.2861,0.3307,0.361,0.2304,0.1471,0.0
4,2928,4,214,1282,0,0.1995,0.1976,0.1914,0.1956,0.2571,...,0.1278,0.1385,0.2726,0.3239,0.3286,0.3446,0.361,0.2323,0.1355,0.0


In [24]:
df_generated.shape

(67557, 174)

In [27]:
df_generated.to_csv("bands_ungrouped.csv", index = False)

# Import data
> NB: The above "Data Generation" code is preliminary. The data generated isn't used directly, but rather exported to .csv format, and imported below for further usage

In [28]:
sample_submission = pd.read_csv("../input/field-id/SampleSubmission.csv")

In [29]:
df_ungrouped = pd.read_csv("./bands_ungrouped.csv")

In [30]:
df_ungrouped.head()

Unnamed: 0,fid,label,row_loc,col_loc,tile,20190606_B01,20190606_B02,20190606_B03,20190606_B04,20190606_B05,...,20191103_B04,20191103_B05,20191103_B06,20191103_B07,20191103_B08,20191103_B8A,20191103_B09,20191103_B11,20191103_B12,20191103_CLD
0,2928,4,214,1278,0,0.1995,0.2024,0.1782,0.1898,0.2462,...,0.0669,0.1239,0.266,0.3214,0.3204,0.3515,0.361,0.2362,0.1347,0.0
1,2928,4,214,1279,0,0.1995,0.1942,0.1848,0.1846,0.2462,...,0.0603,0.1239,0.266,0.3214,0.3296,0.3515,0.361,0.2362,0.1347,0.0
2,2928,4,214,1280,0,0.1995,0.1868,0.193,0.186,0.2558,...,0.0631,0.1259,0.2579,0.3131,0.3459,0.3307,0.361,0.2304,0.1471,0.0
3,2928,4,214,1281,0,0.1995,0.1962,0.1978,0.1954,0.2558,...,0.1214,0.1259,0.2579,0.3131,0.2861,0.3307,0.361,0.2304,0.1471,0.0
4,2928,4,214,1282,0,0.1995,0.1976,0.1914,0.1956,0.2571,...,0.1278,0.1385,0.2726,0.3239,0.3286,0.3446,0.361,0.2323,0.1355,0.0


In [31]:
df_ungrouped.shape

(67557, 174)

# DATA PREPARATION/PRE-PROCESSING

In [32]:
# Spatial features to be merged with dataset (by Field ID) later on
row_size = df_ungrouped.groupby("fid")["row_loc"].nunique()
column_size = df_ungrouped.groupby("fid")["col_loc"].nunique()
num_pixels = df_ungrouped.groupby("fid")["label"].count()

In [33]:
# Grouped Data
df_grouped = df_ungrouped.groupby("fid", as_index = False).mean()

### Two separate modelling was done on different set of features, and later combined through ensembling
> ### 1st modelling 
Features used:
1. Pixel values of each of the 12 bands, INCLUDING the cloud probabilities (for the 13 timestamps)
2. Vegetation/Spectral indices like NDVI, GNDVI, AVI etc, and relevant statistics related to the indices like mean, max etc.
3. Spatial features - row_size, column_size (both indicating height, and width), area of field, and number of pixels covered by a field in the area computed  

Relevant code variables are usually suffixed with "_all"  

> ### 2nd modelling  (Pixel related)
Features used:
1. Pixel values of each of the 12 bands, EXCLUDING the cloud probabilities (for the 13 timestamps)
2. Statistics related to pixel values of each band like mean, max etc.  

Relevant code variables are usually suffixed with "_pixels"

In [34]:
# Dataframe for 1st modelling
df_all = df_grouped.copy()

In [35]:
# Dataframe for 2nd modelling
df_pixels = df_grouped.copy()

In [36]:
# Drop non-allowed features. Field ID is dropped later on as it's still needed for further processing
df_all = df_all.drop(columns = ["row_loc", "col_loc", "tile"])
df_pixels = df_pixels.drop(columns = ["row_loc", "col_loc", "tile"])

In [37]:
cloud_columns = ['20190606_CLD', '20190701_CLD', '20190706_CLD', '20190711_CLD', '20190721_CLD', '20190805_CLD', '20190815_CLD', '20190825_CLD', '20190909_CLD', '20190919_CLD', '20190924_CLD', '20191004_CLD', '20191103_CLD']

In [38]:
# Drop Cloud probabilities, and Field ID's in 2nd Dataframe.
# Field ID in 1st Dataframe will be dropped later
df_pixels.drop(columns = cloud_columns + ["fid"], inplace = True)

In [39]:
df_all.head()

Unnamed: 0,fid,label,20190606_B01,20190606_B02,20190606_B03,20190606_B04,20190606_B05,20190606_B06,20190606_B07,20190606_B08,...,20191103_B04,20191103_B05,20191103_B06,20191103_B07,20191103_B08,20191103_B8A,20191103_B09,20191103_B11,20191103_B12,20191103_CLD
0,1,1.0,0.0986,0.0782,0.108,0.0986,0.1581,0.3094,0.3668,0.3554,...,0.1028,0.1472,0.2637,0.3058,0.2994,0.32,0.3297,0.2527,0.1535,0.0
1,2,2.0,0.037855,0.058818,0.086627,0.060945,0.119536,0.275473,0.325918,0.316755,...,0.096155,0.171527,0.323836,0.372173,0.379673,0.410027,0.408255,0.323173,0.2072,0.0
2,3,0.0,0.2607,0.264511,0.232378,0.196911,0.229033,0.301122,0.333367,0.349678,...,0.446889,0.488256,0.522944,0.526311,0.625122,0.521933,0.6964,0.546033,0.435767,100.0
3,4,2.0,0.027792,0.033031,0.060354,0.047331,0.103085,0.260508,0.320369,0.297408,...,0.073946,0.158177,0.341762,0.392846,0.394592,0.419885,0.4028,0.310362,0.200377,0.0
4,5,5.0,0.0146,0.0225,0.0354,0.0302,0.0634,0.1544,0.1853,0.1868,...,0.0641,0.1344,0.2862,0.3445,0.3339,0.3571,0.3526,0.2139,0.1261,0.0


In [40]:
df_pixels.head()

Unnamed: 0,label,20190606_B01,20190606_B02,20190606_B03,20190606_B04,20190606_B05,20190606_B06,20190606_B07,20190606_B08,20190606_B8A,...,20191103_B03,20191103_B04,20191103_B05,20191103_B06,20191103_B07,20191103_B08,20191103_B8A,20191103_B09,20191103_B11,20191103_B12
0,1.0,0.0986,0.0782,0.108,0.0986,0.1581,0.3094,0.3668,0.3554,0.391,...,0.1042,0.1028,0.1472,0.2637,0.3058,0.2994,0.32,0.3297,0.2527,0.1535
1,2.0,0.037855,0.058818,0.086627,0.060945,0.119536,0.275473,0.325918,0.316755,0.348482,...,0.109055,0.096155,0.171527,0.323836,0.372173,0.379673,0.410027,0.408255,0.323173,0.2072
2,0.0,0.2607,0.264511,0.232378,0.196911,0.229033,0.301122,0.333367,0.349678,0.340689,...,0.510644,0.446889,0.488256,0.522944,0.526311,0.625122,0.521933,0.6964,0.546033,0.435767
3,2.0,0.027792,0.033031,0.060354,0.047331,0.103085,0.260508,0.320369,0.297408,0.338,...,0.093308,0.073946,0.158177,0.341762,0.392846,0.394592,0.419885,0.4028,0.310362,0.200377
4,5.0,0.0146,0.0225,0.0354,0.0302,0.0634,0.1544,0.1853,0.1868,0.1876,...,0.0883,0.0641,0.1344,0.2862,0.3445,0.3339,0.3571,0.3526,0.2139,0.1261


### Vegetation/Spectral indices

1. Normalized Difference Vegetation Index (NDVI)
2. Green Normalized Difference Vegetation Index (GNDVI)
3. Enhanced Vegetation Index (EVI)
4. Enhanced Vegetation Index 2 (EVI2)
5. Advanced Vegetation Index (AVI)
6. Bare Soil Index (BSI)
7. Shadow Index (SI)
8. Normalized Difference Water Index (NDWI)
9. Normalized Difference Moisture Index (NDMI)
10. Normalized Pigment Chlorophyll Ratio Index (NPCRI)

In [41]:
spectral_indices = ["NDVI", "GNDVI", "EVI", "EVI2", "AVI", "BSI", "SI", "NDWI", "NDMI", "NPCRI"]

In [43]:
for i in range(13):
#     Band Pixel values per timestamp
    b1 = df_all.filter(like = "B01").values[:,i]
    b2 = df_all.filter(like = "B02").values[:,i]
    b3 = df_all.filter(like = "B03").values[:,i]
    b4 = df_all.filter(like = "B04").values[:,i]
    b5 = df_all.filter(like = "B05").values[:,i]
    b6 = df_all.filter(like = "B06").values[:,i]
    b7 = df_all.filter(like = "B07").values[:,i]
    b8 = df_all.filter(like = "B08").values[:,i]
    b8a = df_all.filter(like = "B8A").values[:,i]
    b9 = df_all.filter(like = "B09").values[:,i]    
    b11 = df_all.filter(like = "B11").values[:,i]
    b12 = df_all.filter(like = "B12").values[:,i]
    
#     Computation of indices
    ndvi = (b8 - b4) / (b8 + b4)
    gndvi = (b8 - b3) / (b8 + b3)
    evi = 2.5 * (b8 - b4) / ((b8 + 6.0 * b4 - 7.5 * b2) + 1.0)    
    evi2 = 2.4 * (b8 - b4) / (b8 + b4 + 1.0)
    avi = (b8 * (1 - b4) * (b8 - b4))
    bsi = ((b11 + b4) - (b8 + b2)) / ((b11 + b4) + (b8 + b2))
    si = ((1 - b2) * (1 - b3) * (1 - b4))
    ndwi = (b3 - b8) / (b3 + b8)
    ndmi = (b8 - b11) / (b8 + b11)
    npcri = (b4 - b2) / (b4 + b2) 
    
#     Add indices as features to 1st dataframe per timestamp
    df_all[f'NDVI_{dates[i]}'] = ndvi 
    df_all[f'GNDVI_{dates[i]}'] = gndvi
    df_all[f'EVI_{dates[i]}'] = evi
    df_all[f'EVI2_{dates[i]}'] = evi2
    df_all[f'AVI_{dates[i]}'] = avi
    df_all[f'BSI_{dates[i]}'] = bsi
    df_all[f'SI_{dates[i]}'] = si    
    df_all[f'NDWI_{dates[i]}'] = ndwi
    df_all[f'NDMI_{dates[i]}'] = ndmi
    df_all[f'NPCRI_{dates[i]}'] = npcri

In [44]:
# Add spectral indices statistics related to 1st dataframe
for i in spectral_indices:
    df_all[f'{i}_min'] = df_all.filter(regex = f'^{i}').min(axis = 1)
    df_all[f'{i}_max'] = df_all.filter(regex = f'^{i}').max(axis = 1)
    df_all[f'{i}_avg'] = df_all.filter(regex = f'^{i}').mean(axis = 1)
    df_all[f'{i}_std'] = df_all.filter(regex = f'^{i}').std(axis = 1)    

In [45]:
# Add band pixel values statistics related to 2nd dataframe
for i in bands:
    df_pixels[f'{i}_std'] = df_pixels.filter(like = f'_{i}').std(axis = 1)
    df_pixels[f'{i}_max'] = df_pixels.filter(like = f'_{i}').max(axis = 1)
    df_pixels[f'{i}_min'] = df_pixels.filter(like = f'_{i}').min(axis = 1)
    df_pixels[f'{i}_avg'] = df_pixels.filter(like = f'_{i}').mean(axis = 1)

In [46]:
# NB: "new_df" represents 1st dataframe
new_df = df_all.copy()

In [47]:
new_df.head()

Unnamed: 0,fid,label,20190606_B01,20190606_B02,20190606_B03,20190606_B04,20190606_B05,20190606_B06,20190606_B07,20190606_B08,...,NDWI_avg,NDWI_std,NDMI_min,NDMI_max,NDMI_avg,NDMI_std,NPCRI_min,NPCRI_max,NPCRI_avg,NPCRI_std
0,1,1.0,0.0986,0.0782,0.108,0.0986,0.1581,0.3094,0.3668,0.3554,...,-0.483442,0.050483,-0.104512,0.133472,0.011174,0.073139,0.115385,0.295775,0.221467,0.055749
1,2,2.0,0.037855,0.058818,0.086627,0.060945,0.119536,0.275473,0.325918,0.316755,...,-0.56786,0.046722,0.027348,0.183566,0.105168,0.055493,0.012265,0.375071,0.165172,0.105581
2,3,0.0,0.2607,0.264511,0.232378,0.196911,0.229033,0.301122,0.333367,0.349678,...,-0.506783,0.231016,-0.003991,0.288075,0.160101,0.085492,-0.146504,0.47364,0.128886,0.182138
3,4,2.0,0.027792,0.033031,0.060354,0.047331,0.103085,0.260508,0.320369,0.297408,...,-0.589717,0.090632,-0.024303,0.1929,0.103119,0.064809,-0.057309,0.271984,0.171804,0.098295
4,5,5.0,0.0146,0.0225,0.0354,0.0302,0.0634,0.1544,0.1853,0.1868,...,-0.517158,0.140478,-0.098094,0.220915,0.067419,0.114413,-0.287516,0.375354,0.180413,0.211939


## Spatial statistics - used in first dataframe

In [48]:
new_df["row_size"] = new_df.fid.map(row_size)
new_df["col_size"] = new_df.fid.map(column_size)
new_df["area"] = new_df.apply(lambda row: row.row_size * row.col_size, axis = 1)
# number of pixels covered by a field in the area computed
new_df["num_pixels"] = new_df.fid.map(num_pixels)

In [49]:
# Drop non-allowed Field ID in 1st dataframe
new_df = new_df.drop(columns = ["fid"])

## Train, and Test set generation

In [50]:
# 1st dataframe
df_all_train = new_df[new_df.label != 0].copy()
df_all_test = new_df[new_df.label == 0].copy()

df_all_train = df_all_train.reset_index(drop = True)
df_all_test = df_all_test.reset_index(drop = True)

In [51]:
# 2nd dataframe
df_pixels_train = df_pixels[df_pixels.label != 0].copy()
df_pixels_test = df_pixels[df_pixels.label == 0].copy()

df_pixels_train = df_pixels_train.reset_index(drop = True)
df_pixels_test = df_pixels_test.reset_index(drop = True)

In [52]:
(df_all_train.shape, df_all_test.shape), (df_pixels_train.shape, df_pixels_test.shape)

(((3286, 344), (1402, 344)), ((3286, 205), (1402, 205)))

In [53]:
# Drop crop labels (0) from test sets
df_all_test.drop("label", inplace = True, axis = 1)
df_pixels_test.drop("label", inplace = True, axis = 1)

## Train X, and y generation (Features, and Target/Label)

In [54]:
# 1st dataframe
train_X_all = df_all_train.drop("label", axis = 1)
train_y_all = df_all_train.label

In [55]:
# 2nd dataframe
train_X_pixels = df_pixels_train.drop("label", axis = 1)
train_y_pixels = df_pixels_train.label

In [56]:
train_X_all.shape, df_all_test.shape

((3286, 343), (1402, 343))

In [57]:
train_X_pixels.shape, df_pixels_test.shape

((3286, 204), (1402, 204))

## Class/Label weights

In [58]:
# Class weights for cross validation
# Cross validation weights are based on 80% of train set
X_trn, X_val, y_trn, y_val = train_test_split(train_X_all, train_y_all, test_size = 0.2, stratify = train_y_all, random_state = 5, shuffle = True)
label_weights1 = compute_class_weight("balanced", np.unique(y_trn), y_trn)

1509    4.0
2633    2.0
1922    5.0
3051    2.0
       ... 
126     3.0
1741    1.0
1576    1.0
1685    3.0
2246    2.0
Name: label, Length: 2628, dtype: float64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


In [96]:
label_weights1

array([0.32115361, 0.56625727, 4.81318681, 0.96263736, 2.72049689,
       2.93303571, 6.05529954])

In [59]:
# Class weights for full training
# Cross validation weights are based on full train set
label_weights2 = compute_class_weight("balanced", np.unique(train_y_all), train_y_all)

1       2.0
2       2.0
3       5.0
4       2.0
       ... 
3281    1.0
3282    1.0
3283    1.0
3284    1.0
3285    1.0
Name: label, Length: 3286, dtype: float64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


## Permutation Importance for Feature Selection
> NB: Feature selection/dropping was only carried out on 1st dataframe

In [61]:
# cb_pi --> catboost_permutation_importance
cb_pi = CatBoostClassifier(n_estimators = 1400, learning_rate = 0.03, random_state = 11, task_type = "GPU")
cb_pi.fit(X_trn, y_trn)

0:	learn: 1.9110878	total: 27.6ms	remaining: 38.7s
1:	learn: 1.8784228	total: 40.6ms	remaining: 28.4s
2:	learn: 1.8497762	total: 53.7ms	remaining: 25s
3:	learn: 1.8228234	total: 66.7ms	remaining: 23.3s
4:	learn: 1.7973510	total: 79.6ms	remaining: 22.2s
5:	learn: 1.7730135	total: 92.1ms	remaining: 21.4s
6:	learn: 1.7504013	total: 105ms	remaining: 20.9s
7:	learn: 1.7282849	total: 118ms	remaining: 20.6s
8:	learn: 1.7073423	total: 132ms	remaining: 20.3s
9:	learn: 1.6875333	total: 145ms	remaining: 20.1s
10:	learn: 1.6692770	total: 158ms	remaining: 19.9s
11:	learn: 1.6521113	total: 184ms	remaining: 21.3s
12:	learn: 1.6359427	total: 197ms	remaining: 21s
13:	learn: 1.6208648	total: 209ms	remaining: 20.7s
14:	learn: 1.6064375	total: 222ms	remaining: 20.5s
15:	learn: 1.5917727	total: 237ms	remaining: 20.5s
16:	learn: 1.5786595	total: 249ms	remaining: 20.3s
17:	learn: 1.5648812	total: 261ms	remaining: 20.1s
18:	learn: 1.5523881	total: 274ms	remaining: 19.9s
19:	learn: 1.5402665	total: 286ms	remai

<catboost.core.CatBoostClassifier at 0x7f033bfc8c50>

In [62]:
pi = PermutationImportance(cb_pi, random_state = 90, n_iter = 5)
pi.fit(X_val, y_val)

PermutationImportance(estimator=<catboost.core.CatBoostClassifier object at 0x7f033bfc8c50>,
                      random_state=90)

In [63]:
eli5.show_weights(pi, feature_names = train_X_all.columns.tolist(), top = None)

Weight,Feature
0.0188  ± 0.0099,20191004_B05
0.0137  ± 0.0051,NPCRI_20190701
0.0097  ± 0.0097,20191004_B11
0.0088  ± 0.0035,20190919_B05
0.0085  ± 0.0073,area
0.0079  ± 0.0056,NPCRI_20190805
0.0079  ± 0.0070,20190919_B09
0.0076  ± 0.0100,num_pixels
0.0070  ± 0.0053,20190711_B08
0.0067  ± 0.0036,EVI_20190711


In [64]:
pi_results = eli5.formatters.as_dataframe.explain_weights_df(pi, feature_names = train_X_all.columns.tolist())

In [65]:
# feature importance weigth threshold is 0
low_importance = pi_results[pi_results.weight <= 0].feature.values

In [66]:
low_importance

array(['20190825_CLD', '20190919_CLD', '20190701_CLD', '20190924_CLD',
       '20191004_CLD', '20190909_CLD', '20190706_CLD', '20190815_CLD',
       '20190711_CLD', 'NDMI_20190721', '20190909_B12', 'NDVI_20190924',
       '20190909_B04', 'EVI2_20190606', 'BSI_20190815', 'NDMI_20190711',
       '20190815_B07', '20190825_B07', 'SI_20190805', '20190706_B11',
       '20190606_B05', 'NDWI_avg', 'NDWI_min', 'EVI2_avg', '20191004_B8A',
       'AVI_20190606', 'EVI_min', '20190711_B09', 'BSI_20190909',
       '20190721_B04', 'SI_avg', '20190924_B8A', 'SI_20190919',
       '20190606_B03', '20190721_B11', 'NDVI_20190706', '20190711_B12',
       'BSI_20190711', 'EVI2_std', 'NDMI_20190815', 'AVI_20190919',
       '20190721_CLD', '20191004_B07', 'SI_20190606', 'GNDVI_avg',
       'NDVI_20190919', '20190706_B01', 'EVI2_20190805', '20190924_B12',
       '20191004_B03', 'BSI_20190706', '20190711_B06', 'EVI_20191004',
       'NDWI_20190711', '20191103_B06', 'SI_20190711', 'NDWI_20190706',
       'SI_201

## Drop below features based on Permutation Importance (PI) performed above
> NB: Features returned by the PI above may not be the same as the features I decided to eventually drop due to subjective reasons

In [67]:
features_to_drop = [
    '20190815_CLD', '20191004_CLD', '20190919_CLD', '20190701_CLD',
    '20190924_CLD', '20190706_CLD', 'SI_avg', '20190606_B07',
    'BSI_max', 'BSI_20191103', '20190805_B04', '20191004_B03',
    'SI_std', 'NPCRI_20190606', '20190919_B03', '20190805_B08',
    'NPCRI_20190909', 'EVI_min', '20190721_B07', '20190711_B07',
    '20190721_B05', 'AVI_20191103', 'SI_20190924', '20190805_B09',
    '20190706_B09', 'EVI_20191004', 'EVI_20190711', '20191004_B08',
    'NDVI_20191004', '20190825_B8A', '20190919_B01', '20190805_B06',
    'GNDVI_20190706', 'SI_20190805', 'EVI2_20190805', '20190706_B11',
    'col_size', '20190701_B04', 'GNDVI_20191004', 'GNDVI_20190924',
    'NPCRI_max', 'EVI_20190701', 'BSI_20190711', '20190805_B12',
    'SI_20191103', '20190919_B04', 'NDMI_20190924', 'BSI_avg',
    'NPCRI_20190706', '20190919_B8A', '20190909_B05', '20190706_B06',
    'GNDVI_20190606', '20190924_B8A', 'NDWI_20190701', 'AVI_avg',
    'GNDVI_20190919', 'NDVI_20190606', 'NDVI_20191103', 'SI_20190919',
    'GNDVI_20190721', 'NPCRI_20190721', '20190919_B09', '20190606_B01',
    '20190825_B05', '20190909_B09', '20191004_B01', '20190805_B03',
    'GNDVI_20191103', 'NPCRI_20190919', 'EVI_20190805', '20190706_B07',
    '20190909_CLD', '20190825_B12', '20190606_B06', '20190909_B07',
    'BSI_20190825', '20190805_B8A', '20190924_B09', '20190701_B07',
    'EVI2_20190721', 'NPCRI_20190805'
]

In [68]:
train_X_all = train_X_all.drop(columns = features_to_drop)
df_all_test = df_all_test.drop(columns = features_to_drop)

## MODELLING

cb = Catboost model - Dataframe 1  
cb2 = Catboost (with class weights) -  Dataframe 1  

cb_pixels = Catboost -  Dataframe 2  
cb2_pixels = Catboost (with class weights) -  Dataframe 2 

lda = LinearDiscriminantAnalysis model  
bc = BaggingClassifier (Bagged Ensemble) with lda as it's base estimator  

**NB: Catboost is trained with GPU enabled**

In [69]:
cb = CatBoostClassifier(n_estimators = 1500, learning_rate=0.03, depth = 6,
                        random_state = 11, bagging_temperature = 1, task_type = "GPU")
# Use "class_weights = label_weights1" for cross validation
cb2 = CatBoostClassifier(n_estimators = 1100, learning_rate=0.03, depth = 6,
                         random_state = 11, bagging_temperature = 1, task_type = "GPU", class_weights = label_weights2)

cb_pixels = CatBoostClassifier(n_estimators = 1500, learning_rate=0.03, depth = 6, 
                               random_state = 11, bagging_temperature = 1, task_type = "GPU")
# Use "class_weights = label_weights1" for cross validation
cb2_pixels = CatBoostClassifier(n_estimators = 1100, learning_rate=0.03, depth = 6,
                                random_state = 11, bagging_temperature = 1, task_type = "GPU", class_weights = label_weights2)

lda = LinearDiscriminantAnalysis()
bc = BaggingClassifier(base_estimator = lda, n_estimators = 30, random_state = 0)

## Cross Validation (CV)
Cross validation is also used to determine weights to be applied to the average (weighted averga) of models ensembled later on.

## CV with 1st DataFrame

In [70]:
# Catboost
cv_all_1 = cross_val_predict(cb, train_X_all, train_y_all, cv = 5, method = "predict_proba", verbose = 5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 1.9110678	total: 18.1ms	remaining: 27.2s
1:	learn: 1.8787899	total: 29.4ms	remaining: 22s
2:	learn: 1.8488405	total: 40.6ms	remaining: 20.3s
3:	learn: 1.8196351	total: 52ms	remaining: 19.5s
4:	learn: 1.7947294	total: 63.5ms	remaining: 19s
5:	learn: 1.7706957	total: 76.5ms	remaining: 19.1s
6:	learn: 1.7483403	total: 88.2ms	remaining: 18.8s
7:	learn: 1.7261777	total: 99.9ms	remaining: 18.6s
8:	learn: 1.7058557	total: 111ms	remaining: 18.5s
9:	learn: 1.6863150	total: 122ms	remaining: 18.3s
10:	learn: 1.6676867	total: 134ms	remaining: 18.1s
11:	learn: 1.6509848	total: 145ms	remaining: 18s
12:	learn: 1.6332242	total: 161ms	remaining: 18.4s
13:	learn: 1.6174608	total: 171ms	remaining: 18.2s
14:	learn: 1.6030251	total: 182ms	remaining: 18s
15:	learn: 1.5890957	total: 193ms	remaining: 17.9s
16:	learn: 1.5750957	total: 203ms	remaining: 17.7s
17:	learn: 1.5624110	total: 214ms	remaining: 17.6s
18:	learn: 1.5503463	total: 228ms	remaining: 17.8s
19:	learn: 1.5381253	total: 239ms	remaining

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.3s remaining:    0.0s


0:	learn: 1.9098797	total: 13.1ms	remaining: 19.6s
1:	learn: 1.8763417	total: 23.5ms	remaining: 17.6s
2:	learn: 1.8442351	total: 33.4ms	remaining: 16.7s
3:	learn: 1.8157800	total: 43.8ms	remaining: 16.4s
4:	learn: 1.7885800	total: 54.2ms	remaining: 16.2s
5:	learn: 1.7646943	total: 64.6ms	remaining: 16.1s
6:	learn: 1.7416704	total: 74.9ms	remaining: 16s
7:	learn: 1.7191891	total: 85.4ms	remaining: 15.9s
8:	learn: 1.6981581	total: 95.6ms	remaining: 15.8s
9:	learn: 1.6785804	total: 106ms	remaining: 15.8s
10:	learn: 1.6599648	total: 116ms	remaining: 15.7s
11:	learn: 1.6424578	total: 126ms	remaining: 15.6s
12:	learn: 1.6256084	total: 136ms	remaining: 15.6s
13:	learn: 1.6102639	total: 147ms	remaining: 15.6s
14:	learn: 1.5947371	total: 157ms	remaining: 15.5s
15:	learn: 1.5798430	total: 167ms	remaining: 15.5s
16:	learn: 1.5662573	total: 177ms	remaining: 15.5s
17:	learn: 1.5528709	total: 188ms	remaining: 15.5s
18:	learn: 1.5406796	total: 198ms	remaining: 15.4s
19:	learn: 1.5290413	total: 208ms	

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   35.6s remaining:    0.0s


0:	learn: 1.9119060	total: 13.3ms	remaining: 20s
1:	learn: 1.8791707	total: 24.3ms	remaining: 18.2s
2:	learn: 1.8476564	total: 34.9ms	remaining: 17.4s
3:	learn: 1.8184288	total: 45.6ms	remaining: 17.1s
4:	learn: 1.7912790	total: 56.4ms	remaining: 16.9s
5:	learn: 1.7666478	total: 67.1ms	remaining: 16.7s
6:	learn: 1.7426849	total: 77.7ms	remaining: 16.6s
7:	learn: 1.7202028	total: 88.1ms	remaining: 16.4s
8:	learn: 1.6994143	total: 99.3ms	remaining: 16.5s
9:	learn: 1.6795928	total: 110ms	remaining: 16.4s
10:	learn: 1.6611213	total: 121ms	remaining: 16.3s
11:	learn: 1.6441687	total: 131ms	remaining: 16.3s
12:	learn: 1.6290101	total: 142ms	remaining: 16.2s
13:	learn: 1.6132075	total: 153ms	remaining: 16.2s
14:	learn: 1.5981486	total: 164ms	remaining: 16.2s
15:	learn: 1.5841788	total: 175ms	remaining: 16.2s
16:	learn: 1.5701936	total: 186ms	remaining: 16.2s
17:	learn: 1.5567506	total: 197ms	remaining: 16.2s
18:	learn: 1.5440005	total: 207ms	remaining: 16.2s
19:	learn: 1.5321476	total: 222ms	

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   54.0s remaining:    0.0s


0:	learn: 1.9099705	total: 13.3ms	remaining: 20s
1:	learn: 1.8776398	total: 24ms	remaining: 18s
2:	learn: 1.8461661	total: 34.7ms	remaining: 17.3s
3:	learn: 1.8175849	total: 45.4ms	remaining: 17s
4:	learn: 1.7912205	total: 55.8ms	remaining: 16.7s
5:	learn: 1.7666166	total: 66ms	remaining: 16.4s
6:	learn: 1.7432729	total: 76.4ms	remaining: 16.3s
7:	learn: 1.7211134	total: 86.6ms	remaining: 16.2s
8:	learn: 1.7004035	total: 97.1ms	remaining: 16.1s
9:	learn: 1.6807148	total: 107ms	remaining: 16s
10:	learn: 1.6614824	total: 117ms	remaining: 15.9s
11:	learn: 1.6443661	total: 128ms	remaining: 15.8s
12:	learn: 1.6280931	total: 138ms	remaining: 15.8s
13:	learn: 1.6120079	total: 148ms	remaining: 15.7s
14:	learn: 1.5974326	total: 158ms	remaining: 15.7s
15:	learn: 1.5829722	total: 169ms	remaining: 15.6s
16:	learn: 1.5692392	total: 179ms	remaining: 15.6s
17:	learn: 1.5570025	total: 190ms	remaining: 15.6s
18:	learn: 1.5437343	total: 200ms	remaining: 15.6s
19:	learn: 1.5315884	total: 212ms	remaining:

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.2min remaining:    0.0s


0:	learn: 1.9114926	total: 13.6ms	remaining: 20.3s
1:	learn: 1.8792320	total: 24.6ms	remaining: 18.4s
2:	learn: 1.8512360	total: 35.2ms	remaining: 17.6s
3:	learn: 1.8226200	total: 45.7ms	remaining: 17.1s
4:	learn: 1.7968527	total: 56.4ms	remaining: 16.9s
5:	learn: 1.7731749	total: 67.1ms	remaining: 16.7s
6:	learn: 1.7504677	total: 77.9ms	remaining: 16.6s
7:	learn: 1.7296452	total: 88.5ms	remaining: 16.5s
8:	learn: 1.7093200	total: 99.3ms	remaining: 16.4s
9:	learn: 1.6899943	total: 110ms	remaining: 16.4s
10:	learn: 1.6711059	total: 121ms	remaining: 16.3s
11:	learn: 1.6536992	total: 131ms	remaining: 16.3s
12:	learn: 1.6368822	total: 142ms	remaining: 16.2s
13:	learn: 1.6219152	total: 153ms	remaining: 16.2s
14:	learn: 1.6074429	total: 163ms	remaining: 16.2s
15:	learn: 1.5927980	total: 174ms	remaining: 16.2s
16:	learn: 1.5790540	total: 186ms	remaining: 16.2s
17:	learn: 1.5652334	total: 197ms	remaining: 16.2s
18:	learn: 1.5521527	total: 207ms	remaining: 16.2s
19:	learn: 1.5405090	total: 221m

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.5min finished


In [71]:
# Catboost with weights
cv_all_2 = cross_val_predict(cb2, train_X_all, train_y_all, cv = 5, method = "predict_proba", verbose = 5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 1.9347403	total: 13.5ms	remaining: 14.8s
1:	learn: 1.9252350	total: 27.1ms	remaining: 14.9s
2:	learn: 1.9139310	total: 38.3ms	remaining: 14s
3:	learn: 1.9035424	total: 49.4ms	remaining: 13.5s
4:	learn: 1.8943864	total: 60.2ms	remaining: 13.2s
5:	learn: 1.8840554	total: 71.6ms	remaining: 13.1s
6:	learn: 1.8730451	total: 82.6ms	remaining: 12.9s
7:	learn: 1.8635521	total: 93.8ms	remaining: 12.8s
8:	learn: 1.8530283	total: 105ms	remaining: 12.8s
9:	learn: 1.8428295	total: 117ms	remaining: 12.8s
10:	learn: 1.8325367	total: 129ms	remaining: 12.7s
11:	learn: 1.8241809	total: 140ms	remaining: 12.7s
12:	learn: 1.8192266	total: 150ms	remaining: 12.6s
13:	learn: 1.8094728	total: 162ms	remaining: 12.6s
14:	learn: 1.8010631	total: 174ms	remaining: 12.6s
15:	learn: 1.7906107	total: 186ms	remaining: 12.6s
16:	learn: 1.7829987	total: 197ms	remaining: 12.6s
17:	learn: 1.7740969	total: 209ms	remaining: 12.5s
18:	learn: 1.7643567	total: 223ms	remaining: 12.7s
19:	learn: 1.7570845	total: 240ms	r

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.6s remaining:    0.0s


0:	learn: 1.9343637	total: 13.5ms	remaining: 14.8s
1:	learn: 1.9222751	total: 24.3ms	remaining: 13.3s
2:	learn: 1.9106005	total: 35ms	remaining: 12.8s
3:	learn: 1.8996006	total: 46.6ms	remaining: 12.8s
4:	learn: 1.8889961	total: 57.4ms	remaining: 12.6s
5:	learn: 1.8783449	total: 68.2ms	remaining: 12.4s
6:	learn: 1.8681715	total: 79.1ms	remaining: 12.3s
7:	learn: 1.8591754	total: 89.8ms	remaining: 12.3s
8:	learn: 1.8500515	total: 100ms	remaining: 12.2s
9:	learn: 1.8399609	total: 111ms	remaining: 12.1s
10:	learn: 1.8303326	total: 122ms	remaining: 12.1s
11:	learn: 1.8204316	total: 134ms	remaining: 12.1s
12:	learn: 1.8120984	total: 145ms	remaining: 12.1s
13:	learn: 1.8035604	total: 156ms	remaining: 12.1s
14:	learn: 1.7936875	total: 167ms	remaining: 12.1s
15:	learn: 1.7850720	total: 179ms	remaining: 12.1s
16:	learn: 1.7770677	total: 190ms	remaining: 12.1s
17:	learn: 1.7681908	total: 201ms	remaining: 12.1s
18:	learn: 1.7596367	total: 215ms	remaining: 12.2s
19:	learn: 1.7501884	total: 226ms	r

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   26.7s remaining:    0.0s


0:	learn: 1.9344753	total: 13.3ms	remaining: 14.6s
1:	learn: 1.9226643	total: 23.6ms	remaining: 12.9s
2:	learn: 1.9101211	total: 33.9ms	remaining: 12.4s
3:	learn: 1.8985569	total: 44.3ms	remaining: 12.1s
4:	learn: 1.8892435	total: 54.7ms	remaining: 12s
5:	learn: 1.8765672	total: 65.3ms	remaining: 11.9s
6:	learn: 1.8663890	total: 76.1ms	remaining: 11.9s
7:	learn: 1.8545607	total: 86.4ms	remaining: 11.8s
8:	learn: 1.8449580	total: 96.8ms	remaining: 11.7s
9:	learn: 1.8337009	total: 107ms	remaining: 11.7s
10:	learn: 1.8249222	total: 117ms	remaining: 11.6s
11:	learn: 1.8168479	total: 128ms	remaining: 11.6s
12:	learn: 1.8097428	total: 138ms	remaining: 11.5s
13:	learn: 1.7998331	total: 149ms	remaining: 11.5s
14:	learn: 1.7908618	total: 159ms	remaining: 11.5s
15:	learn: 1.7824572	total: 170ms	remaining: 11.5s
16:	learn: 1.7737381	total: 180ms	remaining: 11.5s
17:	learn: 1.7633586	total: 191ms	remaining: 11.5s
18:	learn: 1.7542665	total: 201ms	remaining: 11.4s
19:	learn: 1.7447084	total: 215ms	

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   39.7s remaining:    0.0s


0:	learn: 1.9363659	total: 27.8ms	remaining: 30.5s
1:	learn: 1.9236608	total: 48ms	remaining: 26.3s
2:	learn: 1.9120169	total: 68.3ms	remaining: 25s
3:	learn: 1.9013816	total: 88.8ms	remaining: 24.3s
4:	learn: 1.8890563	total: 109ms	remaining: 23.9s
5:	learn: 1.8776684	total: 128ms	remaining: 23.3s
6:	learn: 1.8680693	total: 147ms	remaining: 22.9s
7:	learn: 1.8577103	total: 166ms	remaining: 22.7s
8:	learn: 1.8457375	total: 186ms	remaining: 22.5s
9:	learn: 1.8357890	total: 206ms	remaining: 22.5s
10:	learn: 1.8260081	total: 233ms	remaining: 23s
11:	learn: 1.8170782	total: 256ms	remaining: 23.2s
12:	learn: 1.8063107	total: 277ms	remaining: 23.1s
13:	learn: 1.8002531	total: 287ms	remaining: 22.3s
14:	learn: 1.7900782	total: 298ms	remaining: 21.6s
15:	learn: 1.7816684	total: 309ms	remaining: 21s
16:	learn: 1.7729477	total: 320ms	remaining: 20.4s
17:	learn: 1.7636622	total: 331ms	remaining: 19.9s
18:	learn: 1.7552039	total: 342ms	remaining: 19.5s
19:	learn: 1.7472946	total: 354ms	remaining: 

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   53.5s remaining:    0.0s


0:	learn: 1.9351291	total: 12.9ms	remaining: 14.2s
1:	learn: 1.9225531	total: 23.3ms	remaining: 12.8s
2:	learn: 1.9118040	total: 33.7ms	remaining: 12.3s
3:	learn: 1.8994612	total: 44.3ms	remaining: 12.1s
4:	learn: 1.8847700	total: 54.9ms	remaining: 12s
5:	learn: 1.8744351	total: 65.5ms	remaining: 11.9s
6:	learn: 1.8639513	total: 76ms	remaining: 11.9s
7:	learn: 1.8531100	total: 86.2ms	remaining: 11.8s
8:	learn: 1.8408523	total: 96.7ms	remaining: 11.7s
9:	learn: 1.8334456	total: 107ms	remaining: 11.7s
10:	learn: 1.8255549	total: 117ms	remaining: 11.6s
11:	learn: 1.8173807	total: 127ms	remaining: 11.6s
12:	learn: 1.8086015	total: 138ms	remaining: 11.5s
13:	learn: 1.8031125	total: 147ms	remaining: 11.4s
14:	learn: 1.7930087	total: 158ms	remaining: 11.4s
15:	learn: 1.7841703	total: 168ms	remaining: 11.4s
16:	learn: 1.7763944	total: 178ms	remaining: 11.4s
17:	learn: 1.7683278	total: 189ms	remaining: 11.3s
18:	learn: 1.7581498	total: 199ms	remaining: 11.3s
19:	learn: 1.7488221	total: 210ms	re

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min finished


In [72]:
# Bagged LDA
cv_all_3 = cross_val_predict(bc, train_X_all, train_y_all, cv = 5, method = "predict_proba", verbose = 5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   20.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   26.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   33.2s finished


In [73]:
# Weighted Average of above 3 (in two steps)
cv_all_1_2 = (0.72 * cv_all_1) + ((1 - 0.72) * cv_all_2)
cv_all_1_2_3 = (0.7 * cv_all_1_2) + ((1 - 0.7) * cv_all_3)

## CV with 2nd DataFrame

In [74]:
# Catboost
cv_pixels_1 = cross_val_predict(cb_pixels, train_X_pixels, train_y_pixels, cv = 5, method = "predict_proba", verbose = 5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 1.9126179	total: 12.5ms	remaining: 18.7s
1:	learn: 1.8841315	total: 22.6ms	remaining: 16.9s
2:	learn: 1.8563868	total: 35.9ms	remaining: 17.9s
3:	learn: 1.8326721	total: 45.8ms	remaining: 17.1s
4:	learn: 1.8086857	total: 55.6ms	remaining: 16.6s
5:	learn: 1.7848376	total: 65.6ms	remaining: 16.3s
6:	learn: 1.7606991	total: 75.2ms	remaining: 16s
7:	learn: 1.7394733	total: 85ms	remaining: 15.9s
8:	learn: 1.7197921	total: 95ms	remaining: 15.7s
9:	learn: 1.7004079	total: 134ms	remaining: 20s
10:	learn: 1.6818593	total: 154ms	remaining: 20.9s
11:	learn: 1.6642673	total: 182ms	remaining: 22.5s
12:	learn: 1.6481111	total: 226ms	remaining: 25.9s
13:	learn: 1.6328333	total: 251ms	remaining: 26.6s
14:	learn: 1.6184143	total: 271ms	remaining: 26.8s
15:	learn: 1.6037371	total: 294ms	remaining: 27.3s
16:	learn: 1.5897726	total: 327ms	remaining: 28.6s
17:	learn: 1.5762677	total: 343ms	remaining: 28.3s
18:	learn: 1.5630836	total: 361ms	remaining: 28.1s
19:	learn: 1.5508927	total: 389ms	remain

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.8s remaining:    0.0s


0:	learn: 1.9125551	total: 12.7ms	remaining: 19.1s
1:	learn: 1.8819947	total: 23.1ms	remaining: 17.3s
2:	learn: 1.8534802	total: 33.5ms	remaining: 16.7s
3:	learn: 1.8259572	total: 43.5ms	remaining: 16.3s
4:	learn: 1.8000980	total: 53.5ms	remaining: 16s
5:	learn: 1.7752227	total: 63.6ms	remaining: 15.8s
6:	learn: 1.7526275	total: 73.8ms	remaining: 15.7s
7:	learn: 1.7308605	total: 84ms	remaining: 15.7s
8:	learn: 1.7105911	total: 93.9ms	remaining: 15.6s
9:	learn: 1.6895915	total: 104ms	remaining: 15.5s
10:	learn: 1.6724489	total: 114ms	remaining: 15.4s
11:	learn: 1.6561766	total: 124ms	remaining: 15.3s
12:	learn: 1.6407123	total: 134ms	remaining: 15.3s
13:	learn: 1.6245990	total: 144ms	remaining: 15.3s
14:	learn: 1.6096501	total: 154ms	remaining: 15.2s
15:	learn: 1.5953930	total: 164ms	remaining: 15.2s
16:	learn: 1.5821048	total: 174ms	remaining: 15.1s
17:	learn: 1.5686879	total: 183ms	remaining: 15.1s
18:	learn: 1.5555890	total: 193ms	remaining: 15.1s
19:	learn: 1.5437676	total: 203ms	re

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   33.0s remaining:    0.0s


0:	learn: 1.9103396	total: 24ms	remaining: 35.9s
1:	learn: 1.8791915	total: 39.4ms	remaining: 29.5s
2:	learn: 1.8499968	total: 59.8ms	remaining: 29.8s
3:	learn: 1.8240475	total: 76.8ms	remaining: 28.7s
4:	learn: 1.7991920	total: 93.6ms	remaining: 28s
5:	learn: 1.7758445	total: 111ms	remaining: 27.6s
6:	learn: 1.7529321	total: 128ms	remaining: 27.2s
7:	learn: 1.7315926	total: 145ms	remaining: 27s
8:	learn: 1.7114581	total: 162ms	remaining: 26.8s
9:	learn: 1.6927615	total: 180ms	remaining: 26.9s
10:	learn: 1.6753013	total: 198ms	remaining: 26.7s
11:	learn: 1.6577293	total: 207ms	remaining: 25.7s
12:	learn: 1.6411709	total: 217ms	remaining: 24.8s
13:	learn: 1.6253828	total: 230ms	remaining: 24.4s
14:	learn: 1.6108398	total: 239ms	remaining: 23.7s
15:	learn: 1.5961340	total: 249ms	remaining: 23.1s
16:	learn: 1.5833213	total: 258ms	remaining: 22.5s
17:	learn: 1.5707813	total: 268ms	remaining: 22.1s
18:	learn: 1.5578129	total: 278ms	remaining: 21.7s
19:	learn: 1.5458201	total: 288ms	remainin

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   50.3s remaining:    0.0s


0:	learn: 1.9086817	total: 12.9ms	remaining: 19.4s
1:	learn: 1.8788223	total: 23.5ms	remaining: 17.6s
2:	learn: 1.8490910	total: 33.7ms	remaining: 16.8s
3:	learn: 1.8217681	total: 44.2ms	remaining: 16.5s
4:	learn: 1.7974925	total: 54.4ms	remaining: 16.3s
5:	learn: 1.7732644	total: 65ms	remaining: 16.2s
6:	learn: 1.7496144	total: 75.3ms	remaining: 16.1s
7:	learn: 1.7280130	total: 85.7ms	remaining: 16s
8:	learn: 1.7075377	total: 96ms	remaining: 15.9s
9:	learn: 1.6891693	total: 107ms	remaining: 15.9s
10:	learn: 1.6711871	total: 117ms	remaining: 15.8s
11:	learn: 1.6536078	total: 127ms	remaining: 15.7s
12:	learn: 1.6374329	total: 137ms	remaining: 15.7s
13:	learn: 1.6222741	total: 147ms	remaining: 15.6s
14:	learn: 1.6072805	total: 156ms	remaining: 15.5s
15:	learn: 1.5925554	total: 166ms	remaining: 15.4s
16:	learn: 1.5790334	total: 176ms	remaining: 15.3s
17:	learn: 1.5654492	total: 186ms	remaining: 15.3s
18:	learn: 1.5529495	total: 196ms	remaining: 15.3s
19:	learn: 1.5410559	total: 206ms	rema

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.1min remaining:    0.0s


0:	learn: 1.9102521	total: 12.2ms	remaining: 18.4s
1:	learn: 1.8802487	total: 22ms	remaining: 16.5s
2:	learn: 1.8505989	total: 31.7ms	remaining: 15.8s
3:	learn: 1.8242657	total: 41.8ms	remaining: 15.6s
4:	learn: 1.7987286	total: 51.8ms	remaining: 15.5s
5:	learn: 1.7751698	total: 61.5ms	remaining: 15.3s
6:	learn: 1.7529873	total: 71.4ms	remaining: 15.2s
7:	learn: 1.7313389	total: 83.3ms	remaining: 15.5s
8:	learn: 1.7121778	total: 93.2ms	remaining: 15.4s
9:	learn: 1.6935128	total: 103ms	remaining: 15.3s
10:	learn: 1.6765524	total: 113ms	remaining: 15.3s
11:	learn: 1.6593949	total: 123ms	remaining: 15.2s
12:	learn: 1.6434733	total: 132ms	remaining: 15.1s
13:	learn: 1.6271756	total: 142ms	remaining: 15.1s
14:	learn: 1.6124158	total: 152ms	remaining: 15s
15:	learn: 1.5974172	total: 161ms	remaining: 15s
16:	learn: 1.5836532	total: 171ms	remaining: 14.9s
17:	learn: 1.5716135	total: 181ms	remaining: 14.9s
18:	learn: 1.5590993	total: 191ms	remaining: 14.9s
19:	learn: 1.5469781	total: 201ms	rema

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.4min finished


In [75]:
# Catboost with weights
cv_pixels_2 = cross_val_predict(cb2_pixels, train_X_pixels, train_y_pixels, cv = 5, method = "predict_proba", verbose = 5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 1.9349539	total: 11.8ms	remaining: 13s
1:	learn: 1.9264988	total: 21.4ms	remaining: 11.8s
2:	learn: 1.9161221	total: 30.7ms	remaining: 11.2s
3:	learn: 1.9069436	total: 40ms	remaining: 11s
4:	learn: 1.8970771	total: 49.3ms	remaining: 10.8s
5:	learn: 1.8865303	total: 58.7ms	remaining: 10.7s
6:	learn: 1.8762032	total: 68.1ms	remaining: 10.6s
7:	learn: 1.8676664	total: 77.5ms	remaining: 10.6s
8:	learn: 1.8570773	total: 87.2ms	remaining: 10.6s
9:	learn: 1.8468940	total: 97ms	remaining: 10.6s
10:	learn: 1.8372926	total: 107ms	remaining: 10.6s
11:	learn: 1.8288096	total: 117ms	remaining: 10.6s
12:	learn: 1.8220187	total: 127ms	remaining: 10.6s
13:	learn: 1.8129928	total: 137ms	remaining: 10.6s
14:	learn: 1.8062943	total: 146ms	remaining: 10.6s
15:	learn: 1.7969162	total: 156ms	remaining: 10.6s
16:	learn: 1.7879532	total: 167ms	remaining: 10.6s
17:	learn: 1.7802343	total: 176ms	remaining: 10.6s
18:	learn: 1.7706305	total: 186ms	remaining: 10.6s
19:	learn: 1.7634805	total: 196ms	remai

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.1s remaining:    0.0s


0:	learn: 1.9344391	total: 12.5ms	remaining: 13.7s
1:	learn: 1.9237928	total: 22.9ms	remaining: 12.6s
2:	learn: 1.9141263	total: 33ms	remaining: 12s
3:	learn: 1.9049306	total: 43.1ms	remaining: 11.8s
4:	learn: 1.8925092	total: 53.6ms	remaining: 11.7s
5:	learn: 1.8847695	total: 63.5ms	remaining: 11.6s
6:	learn: 1.8749682	total: 73.6ms	remaining: 11.5s
7:	learn: 1.8640405	total: 83.6ms	remaining: 11.4s
8:	learn: 1.8550331	total: 93.7ms	remaining: 11.4s
9:	learn: 1.8460227	total: 104ms	remaining: 11.3s
10:	learn: 1.8344147	total: 114ms	remaining: 11.3s
11:	learn: 1.8255309	total: 123ms	remaining: 11.2s
12:	learn: 1.8177931	total: 133ms	remaining: 11.1s
13:	learn: 1.8096229	total: 143ms	remaining: 11.1s
14:	learn: 1.7992640	total: 153ms	remaining: 11.1s
15:	learn: 1.7906642	total: 163ms	remaining: 11s
16:	learn: 1.7824573	total: 173ms	remaining: 11s
17:	learn: 1.7721649	total: 183ms	remaining: 11s
18:	learn: 1.7639877	total: 192ms	remaining: 10.9s
19:	learn: 1.7556946	total: 202ms	remainin

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   25.0s remaining:    0.0s


0:	learn: 1.9335485	total: 12.5ms	remaining: 13.7s
1:	learn: 1.9232742	total: 22.2ms	remaining: 12.2s
2:	learn: 1.9139385	total: 32ms	remaining: 11.7s
3:	learn: 1.9025329	total: 45.4ms	remaining: 12.4s
4:	learn: 1.8925534	total: 55.3ms	remaining: 12.1s
5:	learn: 1.8789592	total: 65.2ms	remaining: 11.9s
6:	learn: 1.8680931	total: 74.9ms	remaining: 11.7s
7:	learn: 1.8556306	total: 84.8ms	remaining: 11.6s
8:	learn: 1.8447377	total: 94.8ms	remaining: 11.5s
9:	learn: 1.8340363	total: 105ms	remaining: 11.4s
10:	learn: 1.8248888	total: 115ms	remaining: 11.3s
11:	learn: 1.8161121	total: 124ms	remaining: 11.3s
12:	learn: 1.8072671	total: 134ms	remaining: 11.2s
13:	learn: 1.7987266	total: 144ms	remaining: 11.1s
14:	learn: 1.7913162	total: 153ms	remaining: 11.1s
15:	learn: 1.7825731	total: 163ms	remaining: 11.1s
16:	learn: 1.7759152	total: 173ms	remaining: 11s
17:	learn: 1.7662568	total: 183ms	remaining: 11s
18:	learn: 1.7566217	total: 193ms	remaining: 11s
19:	learn: 1.7485374	total: 202ms	remain

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   37.4s remaining:    0.0s


0:	learn: 1.9344640	total: 17.4ms	remaining: 19.2s
1:	learn: 1.9250658	total: 64.8ms	remaining: 35.6s
2:	learn: 1.9143052	total: 109ms	remaining: 39.8s
3:	learn: 1.9043199	total: 119ms	remaining: 32.6s
4:	learn: 1.8948181	total: 129ms	remaining: 28.3s
5:	learn: 1.8832419	total: 139ms	remaining: 25.4s
6:	learn: 1.8750300	total: 150ms	remaining: 23.5s
7:	learn: 1.8635079	total: 165ms	remaining: 22.5s
8:	learn: 1.8534234	total: 175ms	remaining: 21.2s
9:	learn: 1.8417795	total: 185ms	remaining: 20.2s
10:	learn: 1.8321739	total: 199ms	remaining: 19.7s
11:	learn: 1.8224274	total: 209ms	remaining: 19s
12:	learn: 1.8148828	total: 222ms	remaining: 18.5s
13:	learn: 1.8040094	total: 235ms	remaining: 18.3s
14:	learn: 1.7946055	total: 246ms	remaining: 17.8s
15:	learn: 1.7843333	total: 256ms	remaining: 17.3s
16:	learn: 1.7753644	total: 268ms	remaining: 17.1s
17:	learn: 1.7680669	total: 298ms	remaining: 17.9s
18:	learn: 1.7574687	total: 314ms	remaining: 17.8s
19:	learn: 1.7487568	total: 323ms	remaini

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   49.5s remaining:    0.0s


0:	learn: 1.9349627	total: 12.1ms	remaining: 13.3s
1:	learn: 1.9237637	total: 21.8ms	remaining: 12s
2:	learn: 1.9138985	total: 31.9ms	remaining: 11.7s
3:	learn: 1.9004397	total: 42.2ms	remaining: 11.6s
4:	learn: 1.8906176	total: 52ms	remaining: 11.4s
5:	learn: 1.8805103	total: 61.8ms	remaining: 11.3s
6:	learn: 1.8693986	total: 71.4ms	remaining: 11.1s
7:	learn: 1.8556556	total: 81ms	remaining: 11.1s
8:	learn: 1.8444425	total: 90.7ms	remaining: 11s
9:	learn: 1.8351644	total: 100ms	remaining: 10.9s
10:	learn: 1.8278706	total: 110ms	remaining: 10.9s
11:	learn: 1.8197556	total: 119ms	remaining: 10.8s
12:	learn: 1.8116358	total: 129ms	remaining: 10.8s
13:	learn: 1.8042841	total: 139ms	remaining: 10.8s
14:	learn: 1.7960792	total: 149ms	remaining: 10.7s
15:	learn: 1.7863291	total: 158ms	remaining: 10.7s
16:	learn: 1.7778485	total: 168ms	remaining: 10.7s
17:	learn: 1.7696866	total: 178ms	remaining: 10.7s
18:	learn: 1.7595069	total: 187ms	remaining: 10.7s
19:	learn: 1.7510847	total: 197ms	remain

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.0min finished


In [76]:
# Bagged LDA
cv_pixels_3 = cross_val_predict(bc, train_X_pixels, train_y_pixels, cv = 5, method = "predict_proba", verbose = 5)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   15.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   20.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   25.6s finished


In [77]:
# Weighted Average of above 3 (in two steps)
cv_pixels_1_2 = (0.76 * cv_pixels_1) + ((1 - 0.76) * cv_pixels_2)
cv_pixels_1_2_3 = (0.67 * cv_pixels_1_2 ) + ((1 - 0.67) * cv_pixels_3)

In [78]:
# Code to determine weights
scores = []

for w in range(0, 100):
    w = w / 100.
#     two cross-validation results of choice should be imputed to determine appropriate weights
    scores.append(log_loss(train_y_all, (w * cv_all_1_2_3) + ((1 - w) * cv_pixels_1_2_3)))

best_score = min(scores)
weight = scores.index(best_score) / 100.

print(weight, best_score)

0.68 1.2931788151755188


# TRAINING

## Modelling Architecture - Explanation

The two different datasets each have the same model pipeline.  

For each of the datasets:  
### Level 1
Training is done using both a CatboostClassifier (without class_weights), and another CatboostClassifier (with class_weights), denoted by "cb" and "cb2" respectively.  
The weighted average of the predictions from both classifiers are obtained, using:   
1. 72% of cb,  and 28% of cb2 for dataset 1;
2. 76% of cb, and 24% of cb2 for dataset 2  

### Level 2
The model used for training here is a bagged ensemble (using sklearn's BaggingClassifier), with the LinearDiscriminantAnalysis alogrithm as the base estimator.  
The predicitons from the bagged classifier is then averaged (weighted) with the individual results from level 1, using:   
1. 70% of level 1, and 30% of bagged classifier for dataset 1;  
2. 67% of level 1, and 33% of bagged classifier for dataset 2  

### Level 3
This level simply finds the weighted average of the final predictions from both datasets.  
It takes 68% of the final predictions from dataset 1, and 32% from dataset 2.  

## 1st part of model - using dataset 1

In [79]:
cb.fit(train_X_all, train_y_all)

0:	learn: 1.9093373	total: 14.1ms	remaining: 21.1s
1:	learn: 1.8762225	total: 24.9ms	remaining: 18.7s
2:	learn: 1.8451582	total: 35.9ms	remaining: 17.9s
3:	learn: 1.8164707	total: 46.9ms	remaining: 17.6s
4:	learn: 1.7904840	total: 57.8ms	remaining: 17.3s
5:	learn: 1.7660992	total: 68.8ms	remaining: 17.1s
6:	learn: 1.7420454	total: 79.4ms	remaining: 16.9s
7:	learn: 1.7199368	total: 90.5ms	remaining: 16.9s
8:	learn: 1.6994216	total: 101ms	remaining: 16.8s
9:	learn: 1.6803201	total: 112ms	remaining: 16.8s
10:	learn: 1.6621074	total: 123ms	remaining: 16.7s
11:	learn: 1.6452167	total: 135ms	remaining: 16.7s
12:	learn: 1.6286684	total: 146ms	remaining: 16.7s
13:	learn: 1.6133994	total: 157ms	remaining: 16.6s
14:	learn: 1.5988499	total: 168ms	remaining: 16.6s
15:	learn: 1.5847305	total: 179ms	remaining: 16.6s
16:	learn: 1.5710452	total: 190ms	remaining: 16.6s
17:	learn: 1.5580468	total: 201ms	remaining: 16.6s
18:	learn: 1.5458611	total: 215ms	remaining: 16.7s
19:	learn: 1.5340302	total: 226ms

<catboost.core.CatBoostClassifier at 0x7f033c838c90>

In [80]:
cb2.fit(train_X_all, train_y_all)

0:	learn: 1.9339088	total: 12.7ms	remaining: 14s
1:	learn: 1.9218413	total: 23.1ms	remaining: 12.7s
2:	learn: 1.9120079	total: 33.4ms	remaining: 12.2s
3:	learn: 1.9005917	total: 43.7ms	remaining: 12s
4:	learn: 1.8887884	total: 54ms	remaining: 11.8s
5:	learn: 1.8787644	total: 64.3ms	remaining: 11.7s
6:	learn: 1.8703732	total: 74.5ms	remaining: 11.6s
7:	learn: 1.8595573	total: 84.8ms	remaining: 11.6s
8:	learn: 1.8476506	total: 95.3ms	remaining: 11.6s
9:	learn: 1.8381315	total: 106ms	remaining: 11.5s
10:	learn: 1.8304627	total: 116ms	remaining: 11.5s
11:	learn: 1.8207375	total: 126ms	remaining: 11.5s
12:	learn: 1.8117169	total: 137ms	remaining: 11.4s
13:	learn: 1.8034019	total: 147ms	remaining: 11.4s
14:	learn: 1.7959047	total: 157ms	remaining: 11.4s
15:	learn: 1.7875073	total: 168ms	remaining: 11.4s
16:	learn: 1.7790533	total: 178ms	remaining: 11.3s
17:	learn: 1.7713499	total: 188ms	remaining: 11.3s
18:	learn: 1.7630153	total: 199ms	remaining: 11.3s
19:	learn: 1.7543240	total: 209ms	rema

<catboost.core.CatBoostClassifier at 0x7f033c87bc90>

In [81]:
bc.fit(train_X_all, train_y_all)

BaggingClassifier(base_estimator=LinearDiscriminantAnalysis(), n_estimators=30,
                  random_state=0)

### Predictions

In [82]:
test_preds_all_1 = cb.predict_proba(df_all_test)
test_preds_all_2 = cb2.predict_proba(df_all_test)
test_preds_all_3 = bc.predict_proba(df_all_test)

### Weighted Average

In [83]:
# Level 1
test_preds_all = (0.72 * test_preds_all_1) + ((1 - 0.72) * test_preds_all_2)
# Level 2
test_preds_all = (0.7 * test_preds_all) + ((1 - 0.7) * test_preds_all_3)

## 2nd part of model - using dataset 2

In [84]:
cb_pixels.fit(train_X_pixels, train_y_pixels)

0:	learn: 1.9098635	total: 12.1ms	remaining: 18.2s
1:	learn: 1.8789418	total: 21.9ms	remaining: 16.4s
2:	learn: 1.8492110	total: 31.6ms	remaining: 15.7s
3:	learn: 1.8217861	total: 41.5ms	remaining: 15.5s
4:	learn: 1.7972799	total: 51.5ms	remaining: 15.4s
5:	learn: 1.7733504	total: 61.1ms	remaining: 15.2s
6:	learn: 1.7499850	total: 70.8ms	remaining: 15.1s
7:	learn: 1.7283682	total: 80.7ms	remaining: 15.1s
8:	learn: 1.7092533	total: 90.6ms	remaining: 15s
9:	learn: 1.6914216	total: 100ms	remaining: 15s
10:	learn: 1.6740840	total: 110ms	remaining: 15s
11:	learn: 1.6574064	total: 120ms	remaining: 14.9s
12:	learn: 1.6416409	total: 130ms	remaining: 14.9s
13:	learn: 1.6256987	total: 140ms	remaining: 14.8s
14:	learn: 1.6114167	total: 150ms	remaining: 14.8s
15:	learn: 1.5971394	total: 161ms	remaining: 14.9s
16:	learn: 1.5841288	total: 171ms	remaining: 14.9s
17:	learn: 1.5713674	total: 182ms	remaining: 15s
18:	learn: 1.5590369	total: 192ms	remaining: 15s
19:	learn: 1.5475884	total: 202ms	remainin

<catboost.core.CatBoostClassifier at 0x7f033c838150>

In [85]:
cb2_pixels.fit(train_X_pixels, train_y_pixels)

0:	learn: 1.9368506	total: 12.8ms	remaining: 14.1s
1:	learn: 1.9287998	total: 22.8ms	remaining: 12.5s
2:	learn: 1.9208295	total: 33.2ms	remaining: 12.1s
3:	learn: 1.9108251	total: 43.6ms	remaining: 11.9s
4:	learn: 1.9002652	total: 54.1ms	remaining: 11.9s
5:	learn: 1.8883274	total: 64.2ms	remaining: 11.7s
6:	learn: 1.8785392	total: 74.3ms	remaining: 11.6s
7:	learn: 1.8679718	total: 84.5ms	remaining: 11.5s
8:	learn: 1.8586234	total: 94.8ms	remaining: 11.5s
9:	learn: 1.8479121	total: 105ms	remaining: 11.4s
10:	learn: 1.8407642	total: 115ms	remaining: 11.4s
11:	learn: 1.8338716	total: 125ms	remaining: 11.3s
12:	learn: 1.8249744	total: 135ms	remaining: 11.3s
13:	learn: 1.8156151	total: 145ms	remaining: 11.3s
14:	learn: 1.8066760	total: 155ms	remaining: 11.2s
15:	learn: 1.7994607	total: 165ms	remaining: 11.2s
16:	learn: 1.7912849	total: 175ms	remaining: 11.2s
17:	learn: 1.7826168	total: 185ms	remaining: 11.1s
18:	learn: 1.7750195	total: 195ms	remaining: 11.1s
19:	learn: 1.7665420	total: 206m

<catboost.core.CatBoostClassifier at 0x7f033c838a90>

In [86]:
bc.fit(train_X_pixels, train_y_pixels)

BaggingClassifier(base_estimator=LinearDiscriminantAnalysis(), n_estimators=30,
                  random_state=0)

### Predictions

In [88]:
test_preds_pixels_1 = cb_pixels.predict_proba(df_pixels_test)
test_preds_pixels_2 = cb2_pixels.predict_proba(df_pixels_test)
test_preds_pixels_3 = bc.predict_proba(df_pixels_test)

### Weighted Average

In [89]:
# Level 1
test_preds_pixels = (0.76 * test_preds_pixels_1) + ((1 - 0.76) * test_preds_pixels_2)
# Level 2
test_preds_pixels = (0.67 * test_preds_pixels) + ((1 - 0.67) * test_preds_pixels_3)

## Final Predictions - Weighted Average of 2 parts

In [90]:
# Level 3
test_preds = (0.68 * test_preds_all) + ((1 - 0.68) * test_preds_pixels)

### Convert to DataFrame

In [91]:
test_preds = pd.DataFrame(test_preds)

### Merge predictions with Sample Submission format

In [92]:
sample_submission.Crop_ID_1 = test_preds[0]
sample_submission.Crop_ID_2 = test_preds[1]
sample_submission.Crop_ID_3 = test_preds[2]
sample_submission.Crop_ID_4 = test_preds[3]
sample_submission.Crop_ID_5 = test_preds[4]
sample_submission.Crop_ID_6 = test_preds[5]
sample_submission.Crop_ID_7 = test_preds[6]

In [93]:
sample_submission.head()

Unnamed: 0,Field_ID,Crop_ID_1,Crop_ID_2,Crop_ID_3,Crop_ID_4,Crop_ID_5,Crop_ID_6,Crop_ID_7
0,3,0.048144,0.63308,0.112392,0.034141,0.060961,0.071527,0.039755
1,6,0.033953,0.923566,0.002183,0.015207,0.016838,0.006105,0.002148
2,11,0.068728,0.592727,0.011139,0.221482,0.062336,0.020743,0.022846
3,13,0.266771,0.635671,0.011173,0.027469,0.02194,0.029894,0.007081
4,14,0.041726,0.831046,0.038373,0.025451,0.017899,0.004614,0.04089


### Export 

In [94]:
sample_submission.to_csv("SampleSub.csv", index = False)