In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import sys
sys.path.insert(0, "C:\\Users\\KonuTech\\DataSpellProjects\\kaggle-tabular-playground-series-dec-2021\\scripts")
import json
import numpy as np
import pandas as pd
import math

In [3]:
from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

In [4]:
! python scripts\unzip.py inputs\tabular-playground-series-dec-2021.zip

In [5]:
CURRENT_WORKING_DIRECTORY = os.getcwd()

In [6]:
INPUTS = CURRENT_WORKING_DIRECTORY + "\\INPUTS"
SCRIPTS = CURRENT_WORKING_DIRECTORY + "\\SCRIPTS"

In [7]:
CONFIG_FILE = "config.json"
with open(CURRENT_WORKING_DIRECTORY + "\\" + CONFIG_FILE, encoding='utf-8') as f:
    CONFIG = json.load(f)

In [8]:
TRAIN_PATH = CONFIG["INPUTS"]["TRAIN_PATH"][0]
TEST_PATH = CONFIG["INPUTS"]["TEST_PATH"][0]
TARGET = CONFIG["INPUTS"]["TARGET"]
INDEX_COL = CONFIG["INPUTS"]["INDEX_COLUMNS"]
SEP = CONFIG["INPUTS"]["SEPARATOR"]
DECIMAL = CONFIG["INPUTS"]["DECIMAL"]
ENCODING = CONFIG["INPUTS"]["ENCODING"]
DATE_COLUMNS = CONFIG["INPUTS"]["DATE_COLUMNS"]
FLOAT_PRECISION = CONFIG["INPUTS"]["FLOAT_PRECISION"]
DTYPE = CONFIG["INPUTS"]["DTYPE"]
COLUMNS_WITH_NAN_VALUES = CONFIG["INPUTS"]["COLUMNS_WITH_NAN_VALUES"]

In [9]:
TARGET

'Cover_Type'

# TRAIN OVERVIEW

In [10]:
train = pd.read_csv(
    INPUTS + "\\" + TRAIN_PATH,
    index_col=INDEX_COL,
    sep=SEP,
    encoding=ENCODING,
    infer_datetime_format=True,
    engine="c",
    low_memory=False,
    # dtype=DTYPE
)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000000 entries, 0 to 3999999
Data columns (total 55 columns):
 #   Column                              Dtype
---  ------                              -----
 0   Elevation                           int64
 1   Aspect                              int64
 2   Slope                               int64
 3   Horizontal_Distance_To_Hydrology    int64
 4   Vertical_Distance_To_Hydrology      int64
 5   Horizontal_Distance_To_Roadways     int64
 6   Hillshade_9am                       int64
 7   Hillshade_Noon                      int64
 8   Hillshade_3pm                       int64
 9   Horizontal_Distance_To_Fire_Points  int64
 10  Wilderness_Area1                    int64
 11  Wilderness_Area2                    int64
 12  Wilderness_Area3                    int64
 13  Wilderness_Area4                    int64
 14  Soil_Type1                          int64
 15  Soil_Type2                          int64
 16  Soil_Type3                          

In [12]:
train[TARGET].value_counts()

2    2262087
1    1468136
3     195712
7      62261
6      11426
4        377
5          1
Name: Cover_Type, dtype: int64

In [13]:
TRAIN_NUMERIC_COLUMNS = train.select_dtypes(include=["float64", "int64"]).columns
TRAIN_NUMERIC_COLUMNS = TRAIN_NUMERIC_COLUMNS.drop(TARGET)
TRAIN_NUMERIC_COLUMNS

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

In [14]:
TRAIN_OBJECT_COLUMNS = train.select_dtypes(include=["object"]).columns
TRAIN_OBJECT_COLUMNS

Index([], dtype='object')

# TEST OVERVIEW

In [15]:
test = pd.read_csv(
    INPUTS + "\\" + TEST_PATH,
    index_col=INDEX_COL,
    sep=SEP,
    encoding=ENCODING,
    infer_datetime_format=True,
    engine="c",
    low_memory=False,
    # dtype=DTYPE
)

In [16]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 4000000 to 4999999
Data columns (total 54 columns):
 #   Column                              Non-Null Count    Dtype
---  ------                              --------------    -----
 0   Elevation                           1000000 non-null  int64
 1   Aspect                              1000000 non-null  int64
 2   Slope                               1000000 non-null  int64
 3   Horizontal_Distance_To_Hydrology    1000000 non-null  int64
 4   Vertical_Distance_To_Hydrology      1000000 non-null  int64
 5   Horizontal_Distance_To_Roadways     1000000 non-null  int64
 6   Hillshade_9am                       1000000 non-null  int64
 7   Hillshade_Noon                      1000000 non-null  int64
 8   Hillshade_3pm                       1000000 non-null  int64
 9   Horizontal_Distance_To_Fire_Points  1000000 non-null  int64
 10  Wilderness_Area1                    1000000 non-null  int64
 11  Wilderness_Area2               

In [17]:
TEST_NUMERIC_COLUMNS = test.select_dtypes(include=["float64", "int64"]).columns
TEST_NUMERIC_COLUMNS

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10',
       'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14',
       'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18',
       'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30',
       'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34',
       'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38',
       'Soil_Type39', 'Soil_Type40

In [18]:
TEST_OBJECT_COLUMNS = test.select_dtypes(include=["object"]).columns
TEST_OBJECT_COLUMNS

Index([], dtype='object')

# FEATURES AND TARGET

In [19]:
features = [col for col in train.columns if col not in [TARGET]]

In [20]:
X_train = train[features]

In [21]:
X_train.sample(2)

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2349063,2844,116,36,409,21,1135,185,223,70,1070,...,0,0,0,0,0,0,0,1,0,0
3365312,3198,104,21,86,70,259,233,250,91,1298,...,0,0,0,0,0,0,0,0,0,0


In [22]:
y_train = train[TARGET]

In [23]:
y_train.value_counts()

2    2262087
1    1468136
3     195712
7      62261
6      11426
4        377
5          1
Name: Cover_Type, dtype: int64

In [24]:
X_test = test[features]

In [25]:
X_test.sample(2)

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4720318,3295,319,2,123,24,4646,221,250,120,513,...,0,0,0,0,0,0,0,0,0,0
4156504,2753,-2,4,317,14,1224,199,214,102,841,...,0,0,0,0,0,0,1,0,0,0


In [26]:
print(" X_train shape: ", X_train.shape, "\n", "y_train shape: ", y_train.shape, "\n", "X_test  shape: ", X_test.shape, "\n", "y_test  shape: ", None)

 X_train shape:  (4000000, 54) 
 y_train shape:  (4000000,) 
 X_test  shape:  (1000000, 54) 
 y_test  shape:  None


### Categorical Features

In [27]:
categorical_features = TEST_OBJECT_COLUMNS

In [28]:
categorical_features_index = []
for column in categorical_features:
    categorical_features_index.append(X_train.columns.get_loc(column))

In [29]:
categorical_features_index

[]

### Class Weights

In [30]:
# labels_dict : {ind_label: count_label}
# mu : parameter to tune

def create_class_weight(labels_dict, mu=0.15):
    total = np.sum(list(labels_dict.values()))
    keys = labels_dict.keys()
    class_weight = dict()

    for key in keys:
        score = math.log(mu*total/float(labels_dict[key]))
        class_weight[key] = score if score > 1.0 else 1.0

    return class_weight

In [31]:
labels_dict = {0: 22483, 1: 915}

In [32]:
create_class_weight(labels_dict)

{0: 1.0, 1: 1.3443617774461443}

In [33]:
labels_dict = {
    2: 2262087,
    1: 1468136,
    3: 195712,
    7: 62261,
    6: 11426,
    4: 377,
    5: 1
}

In [34]:
labels_dict

{2: 2262087, 1: 1468136, 3: 195712, 7: 62261, 6: 11426, 4: 377, 5: 1}

In [35]:
create_class_weight(labels_dict)

{2: 1.0,
 1: 1.0,
 3: 1.120285464349604,
 7: 2.2655944286548326,
 6: 3.9610381949138755,
 4: 7.372439746750272,
 5: 13.304684934198283}

In [36]:
class_weights = {2: 2262087/4000000, 1: 1468136/4000000, 3: 195712/4000000, 7: 62261/4000000, 6: 11426/4000000, 4: 377/4000000, 5: 1/4000000}

### FEATURE SELECTOR

In [37]:
selector = CatBoostClassifier(
    #     loss_function="CrossEntropy", # class weights takes effect only with Logloss, MultiClass, MultiClassOneVsAll
    loss_function="MultiClass",
    eval_metric="Accuracy",
    class_weights=class_weights,
    one_hot_max_size=31,
    depth=6,
    iterations= 1000,
    l2_leaf_reg= 3,
    learning_rate= 0.03,
    nan_mode="Max",
    cat_features=categorical_features_index
)

In [38]:
feature_names= X_train.columns.to_list()

In [39]:
train_pool = Pool(X_train, y_train, cat_features=categorical_features_index, feature_names=feature_names)
#test_pool = Pool(X_test, y_test, cat_features=categorical_features_index, feature_names=feature_names)

In [40]:
X_test.shape[1]-1

53

In [41]:
'0-' + str(X_test.shape[1]-1)

'0-53'

In [42]:
summary = selector.select_features(
    train_pool, # X_train, y_train
    #eval_set=test_pool, # The validation dataset or datasets used for the following processes: overfitting detector, best iteration selection, monitoring metrics changes
    features_for_select='0-' + str(X_test.shape[1]-1), # Features which participate in the selection.
    num_features_to_select=10, # The number of features to select from features_for_select.
    steps=10, # The number of times for training the model. Use more steps for more accurate selection
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, # the most accurate method
    shap_calc_type=EShapCalcType.Exact, # The method of the SHAP values calculations ordered by accuracy: Approximate, Regular, Exact
    train_final_model=True, # If specified, then the model with selected features will be trained after features selection.
    #logging_level='Silent', # optimized metric, elapsed time of training, remaining time of training
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Step #1 out of 10
0:	learn: 0.9445243	total: 1.75s	remaining: 29m 8s
1:	learn: 0.9460076	total: 3.4s	remaining: 28m 15s
2:	learn: 0.9495158	total: 5s	remaining: 27m 41s
3:	learn: 0.9489258	total: 6.48s	remaining: 26m 54s
4:	learn: 0.9460824	total: 7.91s	remaining: 26m 13s
5:	learn: 0.9460039	total: 9.45s	remaining: 26m 5s
6:	learn: 0.9465193	total: 10.9s	remaining: 25m 43s
7:	learn: 0.9497443	total: 12.5s	remaining: 25m 45s
8:	learn: 0.9496211	total: 14s	remaining: 25m 41s
9:	learn: 0.9497429	total: 15.6s	remaining: 25m 41s
10:	learn: 0.9499528	total: 17.1s	remaining: 25m 37s
11:	learn: 0.9499666	total: 18.7s	remaining: 25m 37s
12:	learn: 0.9500961	total: 20.2s	remaining: 25m 31s
13:	learn: 0.9502051	total: 21.7s	remaining: 25m 29s
14:	learn: 0.9502413	total: 23.3s	remaining: 25m 29s
15:	learn: 0.9500432	total: 24.9s	remaining: 25m 34s
16:	learn: 0.9502103	total: 26.4s	remaining: 25m 27s
17:	learn: 0.9500005	total: 27.9s	remaining: 25m 20s
18:	learn: 0.9503483	total: 29.3s	remaining: 2

153:	learn: 0.9618387	total: 3m 44s	remaining: 20m 31s
154:	learn: 0.9618827	total: 3m 45s	remaining: 20m 30s
155:	learn: 0.9619155	total: 3m 46s	remaining: 20m 27s
156:	learn: 0.9619630	total: 3m 48s	remaining: 20m 24s
157:	learn: 0.9620418	total: 3m 49s	remaining: 20m 22s
158:	learn: 0.9620751	total: 3m 50s	remaining: 20m 20s
159:	learn: 0.9621184	total: 3m 52s	remaining: 20m 18s
160:	learn: 0.9621450	total: 3m 53s	remaining: 20m 15s
161:	learn: 0.9621748	total: 3m 54s	remaining: 20m 14s
162:	learn: 0.9622340	total: 3m 56s	remaining: 20m 12s
163:	learn: 0.9623033	total: 3m 57s	remaining: 20m 10s
164:	learn: 0.9623499	total: 3m 58s	remaining: 20m 7s
165:	learn: 0.9624369	total: 3m 59s	remaining: 20m 5s
166:	learn: 0.9625036	total: 4m 1s	remaining: 20m 4s
167:	learn: 0.9625269	total: 4m 2s	remaining: 20m 3s
168:	learn: 0.9625824	total: 4m 4s	remaining: 20m 2s
169:	learn: 0.9626268	total: 4m 5s	remaining: 19m 59s
170:	learn: 0.9626757	total: 4m 6s	remaining: 19m 57s
171:	learn: 0.962729

304:	learn: 0.9672183	total: 7m 4s	remaining: 16m 7s
305:	learn: 0.9672496	total: 7m 6s	remaining: 16m 6s
306:	learn: 0.9672743	total: 7m 7s	remaining: 16m 4s
307:	learn: 0.9673109	total: 7m 8s	remaining: 16m 3s
308:	learn: 0.9673369	total: 7m 9s	remaining: 16m 1s
309:	learn: 0.9673521	total: 7m 11s	remaining: 15m 59s
310:	learn: 0.9673886	total: 7m 12s	remaining: 15m 57s
311:	learn: 0.9674277	total: 7m 13s	remaining: 15m 56s
312:	learn: 0.9674483	total: 7m 15s	remaining: 15m 55s
313:	learn: 0.9674625	total: 7m 16s	remaining: 15m 53s
314:	learn: 0.9674791	total: 7m 17s	remaining: 15m 52s
315:	learn: 0.9675037	total: 7m 19s	remaining: 15m 50s
316:	learn: 0.9675228	total: 7m 20s	remaining: 15m 49s
317:	learn: 0.9675544	total: 7m 21s	remaining: 15m 47s
318:	learn: 0.9675792	total: 7m 23s	remaining: 15m 45s
319:	learn: 0.9676096	total: 7m 24s	remaining: 15m 44s
320:	learn: 0.9676317	total: 7m 25s	remaining: 15m 42s
321:	learn: 0.9676389	total: 7m 26s	remaining: 15m 41s
322:	learn: 0.967665

KeyboardInterrupt: 