In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
import sys
# Make the project's helper scripts importable without a machine-specific absolute path.
# The notebook is expected to be started from the project root, which is the same
# assumption the os.getcwd()-based config/input paths in this notebook rely on.
sys.path.insert(0, os.path.join(os.getcwd(), "scripts"))
import json
import numpy as np
import pandas as pd
import math
from itertools import product, chain
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
from datetime import datetime

In [3]:
pd.options.display.float_format = "{:.2f}".format

In [4]:
from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

In [5]:
from scripts.paramsearch import paramsearch

In [6]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

In [7]:
# ! python scripts\unzip.py inputs\tabular-playground-series-oct-2021.zip

In [8]:
CURRENT_WORKING_DIRECTORY = os.getcwd()

In [9]:
# Project sub-directories, resolved relative to the working directory.
# os.path.join is used instead of a hard-coded "\\" so the separator follows the platform.
# NOTE(review): other cells refer to lower-case "inputs"/"outputs"/"scripts" folders;
# the upper-case names only resolve on case-insensitive file systems - confirm.
INPUTS = os.path.join(CURRENT_WORKING_DIRECTORY, "INPUTS")
SCRIPTS = os.path.join(CURRENT_WORKING_DIRECTORY, "SCRIPTS")

In [10]:
# Load the JSON run configuration that sits next to the notebook.
CONFIG_FILE = "config.json"
with open(os.path.join(CURRENT_WORKING_DIRECTORY, CONFIG_FILE), encoding='utf-8') as f:
    CONFIG = json.load(f)

In [11]:
TRAIN_PATH = CONFIG["INPUTS"]["TRAIN_PATH"][0]
TEST_PATH = CONFIG["INPUTS"]["TEST_PATH"][0]
TARGET = CONFIG["INPUTS"]["TARGET"]
INDEX_COL = CONFIG["INPUTS"]["INDEX_COLUMNS"]
SEP = CONFIG["INPUTS"]["SEPARATOR"]
DECIMAL = CONFIG["INPUTS"]["DECIMAL"]
ENCODING = CONFIG["INPUTS"]["ENCODING"]
DATE_COLUMNS = CONFIG["INPUTS"]["DATE_COLUMNS"]
FLOAT_PRECISION = CONFIG["INPUTS"]["FLOAT_PRECISION"]
DTYPE = CONFIG["INPUTS"]["DTYPE"]
COLUMNS_WITH_NAN_VALUES = CONFIG["INPUTS"]["COLUMNS_WITH_NAN_VALUES"]

In [12]:
TARGET

'target'

# TRAIN OVERVIEW

In [13]:
# Load the training set.
# - The path is built with os.path.join rather than a hard-coded "\\" separator.
# - infer_datetime_format was dropped: no parse_dates is passed, so it had no effect here
#   (and the option is deprecated in recent pandas releases).
train = pd.read_csv(
    os.path.join(INPUTS, TRAIN_PATH),
    index_col=INDEX_COL,
    sep=SEP,
    encoding=ENCODING,
    engine="c",
    low_memory=False,
    # dtype=DTYPE
)

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Columns: 286 entries, f0 to target
dtypes: float64(240), int64(46)
memory usage: 2.1 GB


In [15]:
train[TARGET].value_counts()

1    500485
0    499515
Name: target, dtype: int64

In [16]:
TRAIN_NUMERIC_COLUMNS = train.select_dtypes(include=["float64", "int64"]).columns
TRAIN_NUMERIC_COLUMNS = TRAIN_NUMERIC_COLUMNS.drop(TARGET)
TRAIN_NUMERIC_COLUMNS

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
       ...
       'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283',
       'f284'],
      dtype='object', length=285)

In [17]:
TRAIN_OBJECT_COLUMNS = train.select_dtypes(include=["object"]).columns
TRAIN_OBJECT_COLUMNS

Index([], dtype='object')

# TEST OVERVIEW

In [18]:
# Load the test set with the same reader settings as the training set.
# - The path is built with os.path.join rather than a hard-coded "\\" separator.
# - infer_datetime_format was dropped: no parse_dates is passed, so it had no effect here
#   (and the option is deprecated in recent pandas releases).
test = pd.read_csv(
    os.path.join(INPUTS, TEST_PATH),
    index_col=INDEX_COL,
    sep=SEP,
    encoding=ENCODING,
    engine="c",
    low_memory=False,
    # dtype=DTYPE
)

In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 1000000 to 1499999
Columns: 285 entries, f0 to f284
dtypes: float64(240), int64(45)
memory usage: 1.1 GB


In [20]:
TEST_NUMERIC_COLUMNS = test.select_dtypes(include=["float64", "int64"]).columns
TEST_NUMERIC_COLUMNS

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
       ...
       'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283',
       'f284'],
      dtype='object', length=285)

In [21]:
TEST_OBJECT_COLUMNS = test.select_dtypes(include=["object"]).columns
TEST_OBJECT_COLUMNS

Index([], dtype='object')

In [22]:
test.sample(5)

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1232754,0.25,0.48,0.01,0.23,0.58,0.48,0.46,0.65,0.57,0.34,...,0,0,0,0,0,0,0,0,0,0
1449089,0.2,0.43,0.09,0.25,0.44,0.43,0.5,0.57,0.55,0.19,...,1,0,0,0,1,0,1,0,0,0
1486856,0.21,0.53,0.45,0.17,0.61,0.34,0.1,0.67,0.58,0.31,...,0,0,0,0,0,0,0,0,0,0
1320681,0.2,0.28,0.09,0.39,0.56,0.42,0.53,0.65,0.43,0.14,...,0,0,0,1,0,0,0,1,0,0
1290238,0.23,0.51,0.02,0.29,0.52,0.41,0.5,0.6,0.49,0.49,...,0,0,0,0,0,0,0,0,0,0


# FEATURES AND TARGET

In [23]:
features = [col for col in train.columns if col not in [TARGET]]

In [24]:
X_train = train[features]

In [25]:
X_train.sample(2)

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
872646,0.23,0.36,0.01,0.51,0.45,0.44,0.43,0.6,0.62,0.39,...,0,0,0,0,0,1,0,0,1,0
355316,0.18,0.62,0.02,0.31,0.43,0.52,0.56,0.65,0.67,0.43,...,0,0,0,0,0,0,1,0,0,0


In [26]:
y_train = train[TARGET]

In [27]:
y_train.value_counts()

1    500485
0    499515
Name: target, dtype: int64

In [28]:
X_test = test[features]

In [29]:
X_test.sample(2)

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1405442,0.28,0.43,0.1,0.45,0.47,0.43,0.07,0.65,0.42,0.28,...,0,1,1,0,0,1,0,0,0,0
1353550,0.23,0.42,0.18,0.26,0.74,0.43,0.5,0.5,0.58,0.16,...,1,1,0,0,0,0,0,1,0,0


In [30]:
print(" X_train shape: ", X_train.shape, "\n", "y_train shape: ", y_train.shape, "\n", "X_test  shape: ", X_test.shape, "\n", "y_test  shape: ", None)

 X_train shape:  (1000000, 285) 
 y_train shape:  (1000000,) 
 X_test  shape:  (500000, 285) 
 y_test  shape:  None


### Categorical Features

In [31]:
categorical_features = TEST_OBJECT_COLUMNS

In [32]:
# Positional index of every categorical column inside X_train (empty when there are none).
categorical_features_index = [
    X_train.columns.get_loc(name) for name in categorical_features
]

In [33]:
categorical_features_index

[]

### Class Weights

In [34]:
# labels_dict : {ind_label: count_label}
# mu : parameter to tune

def create_class_weight(labels_dict, mu=0.15):
    """Derive a log-scaled weight for every class from its sample count.

    For each class the raw weight is ``log(mu * total / count)``, where ``total``
    is the sum of all counts. Weights that do not exceed 1.0 are replaced by 1.0.

    Parameters
    ----------
    labels_dict : dict
        Mapping ``{class_label: sample_count}``.
    mu : float, default 0.15
        Scaling factor applied to the total count before taking the logarithm.

    Returns
    -------
    dict
        Mapping ``{class_label: weight}`` with the same keys, in the same order.
    """
    total = np.sum(list(labels_dict.values()))

    def _floored_log_weight(count):
        raw = math.log(mu * total / float(count))
        return raw if raw > 1.0 else 1.0

    return {label: _floored_log_weight(count) for label, count in labels_dict.items()}

In [35]:
y_train.value_counts()[1]

500485

In [36]:
labels_dict = {0: y_train.value_counts()[0], 1: y_train.value_counts()[1]}

In [37]:
labels_dict

{0: 499515, 1: 500485}

In [38]:
class_weights = create_class_weight(labels_dict)

In [39]:
class_weights

{0: 1.0, 1: 1.0}

### FEATURE SELECTOR

In [47]:
selector = CatBoostClassifier(
    #     loss_function="CrossEntropy", # class weights takes effect only with Logloss, MultiClass, MultiClassOneVsAll
    loss_function="CrossEntropy",
    eval_metric="AUC",
    custom_metric=['AUC:hints=skip_train~false'],
#     custom_metric=['AUC:type=OneVsAll;hints=skip_train~false', 'Accuracy'], # for many classes
#     class_weights=class_weights,
#     one_hot_max_size=31,
    depth=6,
    iterations= 25000,
    l2_leaf_reg= 3,
#     learning_rate= 0.03,
    learning_rate= 0.1,
    nan_mode="Max",
    cat_features=categorical_features_index
)

In [48]:
feature_names= X_train.columns.to_list()

In [49]:
train_pool = Pool(X_train, y_train, cat_features=categorical_features_index, feature_names=feature_names)
#test_pool = Pool(X_test, y_test, cat_features=categorical_features_index, feature_names=feature_names)

In [50]:
X_test.sample(2)

Unnamed: 0_level_0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1046953,0.32,0.47,0.02,0.27,0.72,0.34,0.45,0.56,0.45,0.26,...,0,0,0,1,0,0,0,1,0,1
1439401,0.18,0.4,0.4,0.39,0.38,0.44,0.45,0.65,0.44,0.33,...,0,1,0,0,0,0,1,0,0,0


In [51]:
# X_test.shape[1]-1
X_test.shape[1]

285

In [52]:
'0-' + str(X_test.shape[1])

'0-285'

In [53]:
summary = selector.select_features(
    train_pool, # X_train, y_train
    #eval_set=test_pool, # The validation dataset or datasets used for the following processes: overfitting detector, best iteration selection, monitoring metrics changes
    features_for_select='0-' + str(X_test.shape[1]-1), # Features which participate in the selection.
    num_features_to_select=40, # The number of features to select from features_for_select.
    steps=1, # The number of times for training the model. Use more steps for more accurate selection
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, # the most accurate method
    shap_calc_type=EShapCalcType.Exact, # The method of the SHAP values calculations ordered by accuracy: Approximate, Regular, Exact
    train_final_model=True, # If specified, then the model with selected features will be trained after features selection.
    #logging_level='Silent', # optimized metric, elapsed time of training, remaining time of training
    plot=True
)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Step #1 out of 1
0:	learn: 0.8196709	total: 334ms	remaining: 2h 19m 13s
1:	learn: 0.8256714	total: 644ms	remaining: 2h 14m 13s
2:	learn: 0.8261086	total: 969ms	remaining: 2h 14m 32s
3:	learn: 0.8270506	total: 1.34s	remaining: 2h 20m 3s
4:	learn: 0.8272231	total: 1.69s	remaining: 2h 21m 10s
5:	learn: 0.8285992	total: 2.01s	remaining: 2h 19m 52s
6:	learn: 0.8295996	total: 2.3s	remaining: 2h 16m 55s
7:	learn: 0.8298570	total: 2.59s	remaining: 2h 14m 54s
8:	learn: 0.8305956	total: 2.89s	remaining: 2h 13m 48s
9:	learn: 0.8314642	total: 3.21s	remaining: 2h 13m 31s
10:	learn: 0.8320007	total: 3.57s	remaining: 2h 15m 19s
11:	learn: 0.8322646	total: 3.94s	remaining: 2h 16m 34s
12:	learn: 0.8327947	total: 4.31s	remaining: 2h 18m
13:	learn: 0.8335691	total: 4.64s	remaining: 2h 17m 59s
14:	learn: 0.8338677	total: 5s	remaining: 2h 18m 50s


KeyboardInterrupt: 

In [None]:
summary['loss_graph']['loss_values'][-1]

In [None]:
summary["selected_features_names"]

### FEATURE IMPORTANCE

In [None]:
feature_importance = selector.get_feature_importance(
    prettified=True,
    thread_count=-1,
    verbose=True
)

In [None]:
feature_importance.to_csv('outputs\\feature_importance.csv')

In [None]:
feature_importance

### TRAIN CLASSIFIER

In [None]:
X_train[summary["selected_features_names"]].head()

In [None]:
X_train[summary["selected_features_names"]].describe()

In [None]:
# this function runs shuffled K-fold cross-validation (n_splits folds, 3 by default) with CatBoostClassifier
def crossvaltest(params, train_set, train_label, cat_dims, n_splits=3, random_state=None):
    """Return the mean out-of-fold accuracy of a CatBoostClassifier.

    The data is split with a shuffled ``KFold``; a fresh classifier is fitted on
    each training part and scored on the held-out part.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``CatBoostClassifier``.
    train_set : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    train_label : pandas.Series or pandas.DataFrame
        Labels aligned positionally with ``train_set``.
    cat_dims : list of int or None
        Passed to ``fit`` as ``cat_features``.
    n_splits : int, default 3
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the previous unseeded
        behaviour; pass an integer to make the folds reproducible.

    Returns
    -------
    float
        Mean over folds of the fraction of correctly predicted labels.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
#     kf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    fold_scores = []
    for fit_idx, holdout_idx in kf.split(train_set):
        fit_features = train_set.iloc[fit_idx, :]
        holdout_features = train_set.iloc[holdout_idx, :]

        fit_labels = train_label.iloc[fit_idx]
        holdout_labels = train_label.iloc[holdout_idx]

        clf = CatBoostClassifier(**params)
        clf.fit(fit_features, np.ravel(fit_labels), cat_features=cat_dims)

        # Flatten the predictions as well as the labels: if predict() returns a
        # column-shaped (n, 1) array, comparing it to a flat (n,) array would
        # broadcast to an (n, n) matrix and produce a meaningless score.
        predicted = np.ravel(clf.predict(holdout_features))
        fold_scores.append(np.mean(predicted == np.ravel(holdout_labels)))
    return np.mean(fold_scores)

In [None]:
# this function runs grid search on several parameters
def catboost_param_tune(params, train_set, train_label, cat_dims=None, n_splits=3):
    """Search CatBoost hyper-parameters group by group and return the best set found.

    Candidate parameter sets come from ``paramsearch.grid_search``: first
    ``l2_leaf_reg`` alone, then ``iterations`` together with ``learning_rate``,
    then ``depth`` alone. Each candidate is scored with ``crossvaltest`` and the
    score is registered on the search object.
    NOTE(review): presumably registering results lets later groups start from the
    best values found so far - confirm against scripts/paramsearch.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``paramsearch``.
    train_set, train_label :
        Features and labels forwarded to ``crossvaltest``.
    cat_dims : list of int or None, default None
        Categorical feature indices forwarded to ``crossvaltest``.
    n_splits : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    The value of ``bestparam()`` of the search object after all candidates ran.
    """
    search = paramsearch(params)
    candidates = chain(
        search.grid_search(['l2_leaf_reg']),
        search.grid_search(['iterations', 'learning_rate']),
        search.grid_search(['depth']),
    )
    for candidate in candidates:
        score = crossvaltest(candidate, train_set, train_label, cat_dims, n_splits)
        # Record the score before reporting so the printed "best" includes this candidate.
        search.register_result(score, candidate)
        print(score, candidate, search, 'best:', search.bestscore(), search.bestparam())
    return search.bestparam()

In [None]:
params = {
    "loss_function": "CrossEntropy",
    "eval_metric": "AUC",
    "custom_metric": ['AUC:hints=skip_train~false'],
#     "class_weights": class_weights,
#     "one_hot_max_size": 31,
#     "depth": [3, 1, 2, 6, 4, 5, 7, 8, 9, 10],
    "depth": [5, 7],
    "iterations": [50000],
#     "learning_rate": [0.03, 0.001, 0.01, 0.1],
    "learning_rate": [0.1],
#     "l2_leaf_reg": [3, 1, 5, 10, 100],
    "l2_leaf_reg": [5, 10],
#     "border_count": [32, 5, 10, 20, 50, 100, 200],
#     "ctr_border_count": [50, 5, 10, 20, 100, 200],
    "nan_mode": "Max",
    "thread_count": -1
}

In [None]:
categorical_features_index = None

In [None]:
%timeit
bestparams = catboost_param_tune(
    params=params,
#     train_set=X_train,
    train_set=X_train[summary["selected_features_names"]],
    train_label=y_train,
#     cat_dims=[1],
    n_splits=7
)

In [None]:
bestparams

In [None]:
bestparams.update({'iterations': 1500})

In [None]:
bestparams

In [None]:
# Train the classifier with the tuned parameters.
# class_weights is deliberately not passed: the tuned parameters come from a grid that
# uses loss_function="CrossEntropy", and (as noted on the feature selector above) class
# weights only take effect with Logloss, MultiClass or MultiClassOneVsAll. The weights
# computed earlier are {0: 1.0, 1: 1.0} anyway, so nothing is lost.
model = CatBoostClassifier(
    **bestparams,
#     loss_function="MultiClass",
#     cat_features=[1]
)
# clf.fit(train_set, np.ravel(train_label), cat_features=cat_dims)
# res = clf.predict(test_set)
# print('error:',1-np.mean(res==np.ravel(test_label)))

In [None]:
X_train[summary["selected_features_names"]]

### MODEL 1

In [None]:
model = model.fit(
    X_train[summary["selected_features_names"]],
    y_train,
#     eval_set=(X_test[summary["selected_features_names"]], y_test),
#     use_best_model=True,
    plot=True
)

In [None]:
print('CatBoost model is fitted: ' + str(model.is_fitted()))
print('CatBoost model parameters:')
print(model.get_params())

In [None]:
print(model.get_best_score())

In [None]:
print(model.get_all_params())

In [None]:
metadata = model.get_metadata()

In [None]:
print(metadata["model_guid"])

In [None]:
print(metadata["train_finish_time"])

In [None]:
print(metadata["params"])

### FEATURE IMPORTANCE

In [None]:
feature_importance = model.get_feature_importance(
    prettified=True,
    thread_count=-1,
    verbose=True
)

In [None]:
feature_importance

### PREDICTIONS

In [None]:
y_pred = model.predict(X_test[summary["selected_features_names"]])
y_pred.shape

In [None]:
y_prob = model.predict_proba(X_test[summary["selected_features_names"]])
y_prob.shape

### VISUALIZATIONS

In [None]:
# skplt.metrics.plot_roc(y_test, y_prob)

In [None]:
# roc_auc_score(y_test, y_pred)

In [None]:
# skplt.metrics.plot_precision_recall(y_test, y_prob)

In [None]:
# skplt.metrics.plot_ks_statistic(y_test, y_prob)

In [None]:
# skplt.metrics.plot_lift_curve(y_test, y_prob)
# plt.legend(loc='upper')

In [None]:
# skplt.metrics.plot_cumulative_gain(y_test, y_prob)

In [None]:
# (unique, counts) = np.unique(np.array(y_test), return_counts=True)

In [None]:
# frequencies = np.asarray((unique, counts)).T
# frequencies

In [None]:
# print(classification_report(y_test, y_pred))

In [None]:
# print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))

In [None]:
# print("Balanced score: " + str(balanced_accuracy_score(y_test, y_pred)))

In [None]:
# confusion_matrix(y_test, y_pred)

In [None]:
# _, ax = plt.subplots(figsize=(10,10))
# ax = sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', annot_kws={"size": 40, "weight": "bold"})
# labels = ['NIE LAPS', 'LAPS']
# ax.set_xticklabels(labels, fontsize=25);
# ax.set_yticklabels(labels, fontsize=25);
# ax.set_ylabel('True label', fontsize=30);
# ax.set_xlabel('Predicted label', fontsize=30)

In [None]:
corrmat = X_train[summary["selected_features_names"]].corr()
f, ax = plt.subplots(figsize=(20, 20))
fig = sns.heatmap(corrmat, vmax=1, square=True, annot=True)
fig.figure.savefig("correlation_matrix_selected_features.jpg")
plt.clf()

In [None]:
Image(filename="correlation_matrix_selected_features.jpg")

In [None]:
selected_features_target = summary["selected_features_names"] + [TARGET]

In [None]:
selected_features_target

In [None]:
df = pd.merge(X_train, y_train, left_index=True, right_index=True)

In [None]:
# Correlation of every selected feature with the target. The column is looked up via
# TARGET ("Cover_Type" is not a column of this dataset); [:-1] drops the target's
# correlation with itself, which is last because TARGET was appended last.
corr_target = df[selected_features_target].corr()[TARGET][:-1]

In [None]:
plt.subplots(figsize=(20,20))
sns_plot = sns.heatmap(corr_target.sort_values(ascending=False).to_frame(),annot=True, annot_kws={'size':12},cmap="GnBu")
plt.show()
fig = sns_plot.get_figure()
fig.savefig("correlation_target.jpg")
plt.clf()

In [None]:
correlated_columns = pd.DataFrame(corr_target.sort_values(ascending=False)).T.columns
correlated_columns

In [None]:
# sns.set()
# # fig = sns.pairplot(df[selected_features_target], size = 5, hue=df[selected_features_target].columns[-1])
# fig = sns.pairplot(df[selected_features_target].sample(frac=0.001), size=3, hue=df[selected_features_target].columns[-1])
# plt.show();
# fig.savefig("pair_plots.jpg")
# plt.clf()

In [None]:
Image(filename="pair_plots.jpg")

In [None]:
train[TARGET].value_counts()

In [None]:
train[TARGET].value_counts(normalize=True)

In [None]:
train[TARGET].describe()

In [None]:
pd.DataFrame(y_pred).value_counts()

In [None]:
pd.DataFrame(y_pred).value_counts(normalize=True)

In [None]:
pd.DataFrame(y_pred).describe()

* NOT ALL CLASSES WERE REPRESENTED!!!

In [None]:
# Label the prediction column with TARGET rather than the unrelated "Cover_Type" name.
# NOTE(review): presumably the submission file must use the target column name - verify
# against the competition's sample submission.
y_pred_submission = pd.DataFrame(y_pred, index=X_test.index, columns=[TARGET])

In [None]:
y_pred_submission

In [None]:
y_pred_submission.head()

In [None]:
y_pred_submission.tail()

In [None]:
now = datetime.now()
date_string = now.strftime("%Y%m%d_%H%M%S")

### MODEL 1 PREDICTIONS TO CSV

In [None]:
# Timestamped copy of the submission. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission_" + date_string + ".csv"))

In [None]:
# File picked up by the submit command. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission.csv"))

### MODEL 1 SUBMISSION

In [None]:
! kaggle competitions submit tabular-playground-series-oct-2021 -f outputs\submission.csv -m "Submission"

### GINI

In [None]:
# #The function used in most kernels
# def gini(actual, pred, cmpcol = 0, sortcol = 1):
#     assert( len(actual) == len(pred) )
#     all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
#     all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
#     totalLosses = all[:,0].sum()
#     giniSum = all[:,0].cumsum().sum() / totalLosses
    
#     giniSum -= (len(actual) + 1) / 2.
#     return giniSum / len(actual)
 
# def gini_normalized(a, p):
#     return gini(a, p) / gini(a, a)

In [None]:
# gini_normalized(y_test, y_pred)

### REMOVING FEATURES NOT IMPACTING LOSS

In [None]:
features_to_exclude = {
    "feature_index": [
        "1",
        "2",
        "6",
        "8",
        "14",
        "16",
        "17",
        "18",
        "19",
        "20",
        "21",
        "22",
        "28",
        "38",
        "25"
    ],
    "feature_name": [
        "Aspect",
        "Slope",
        "Hillshade_9am",
        "Hillshade_3am",
        "Soil_Type1",
        "Soil_Type3",
        "Soil_Type4",
        "Soil_Type5",
        "Soil_Type6",
        "Soil_Type7",
        "Soil_Type8",
        "Soil_Type9",
        "Soil_Type15",
        "Soil_Type25",
        "Soil_Type12"
    ]
}

In [None]:
to_exclude = pd.DataFrame.from_dict(features_to_exclude)

In [None]:
to_exclude['feature_name']

In [None]:
X_train.loc[:, ~X_train.columns.isin(to_exclude['feature_name'])]

### MODEL 2

In [None]:
model = model.fit(
    X_train.loc[:, ~X_train.columns.isin(to_exclude['feature_name'])],
    y_train,
#     eval_set=(X_test[summary["selected_features_names"]], y_test),
#     use_best_model=True,
    plot=True
)

In [None]:
print('CatBoost model is fitted: ' + str(model.is_fitted()))
print('CatBoost model parameters:')
print(model.get_params())

In [None]:
print(model.get_best_score())

In [None]:
feature_importance = model.get_feature_importance(
    prettified=True,
    thread_count=-1,
    verbose=True
)

In [None]:
feature_importance

In [None]:
# Build the column mask from X_test itself so the selection cannot drift if the two
# frames ever stop sharing exactly the same columns.
y_pred = model.predict(X_test.loc[:, ~X_test.columns.isin(to_exclude['feature_name'])])
y_pred.shape

In [None]:
train[TARGET].value_counts(normalize=True)

In [None]:
pd.DataFrame(y_pred).value_counts(normalize=True)

In [None]:
# Label the prediction column with TARGET rather than the unrelated "Cover_Type" name.
# NOTE(review): presumably the submission file must use the target column name - verify
# against the competition's sample submission.
y_pred_submission = pd.DataFrame(y_pred, index=X_test.index, columns=[TARGET])

In [None]:
y_pred_submission

In [None]:
y_pred_submission.head()

In [None]:
y_pred_submission.tail()

In [None]:
now = datetime.now()
date_string = now.strftime("%Y%m%d_%H%M%S")

### MODEL 2 PREDICTIONS TO CSV

In [None]:
# Timestamped copy of the submission. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission_" + date_string + ".csv"))

In [None]:
# File picked up by the submit command. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission.csv"))

### MODEL 2  SUBMISSION

In [None]:
! kaggle competitions submit tabular-playground-series-oct-2021 -f outputs\submission.csv -m "Submission"

### MODEL 3 - ONLY 'Elevation'

In [None]:
model = model.fit(
    X_train['Elevation'],
    y_train,
#     eval_set=(X_test[summary["selected_features_names"]], y_test),
#     use_best_model=True,
    plot=True
)

In [None]:
print('CatBoost model is fitted: ' + str(model.is_fitted()))
print('CatBoost model parameters:')
print(model.get_params())

In [None]:
print(model.get_best_score())

In [None]:
feature_importance = model.get_feature_importance(
    prettified=True,
    thread_count=-1,
    verbose=True
)

In [None]:
feature_importance

In [None]:
# Predict with the same single feature the model in this section was fitted on; the
# previous cell content reused the column selection of a different model.
# NOTE(review): 'Elevation' is not among the f0..f284 columns shown for this dataset -
# this section looks carried over from another notebook; confirm it is still wanted.
y_pred = model.predict(X_test[['Elevation']])
y_pred.shape

In [None]:
train[TARGET].value_counts(normalize=True)

In [None]:
pd.DataFrame(y_pred).value_counts(normalize=True)

In [None]:
# Label the prediction column with TARGET rather than the unrelated "Cover_Type" name.
# NOTE(review): presumably the submission file must use the target column name - verify
# against the competition's sample submission.
y_pred_submission = pd.DataFrame(y_pred, index=X_test.index, columns=[TARGET])

In [None]:
y_pred_submission

In [None]:
y_pred_submission.head()

In [None]:
y_pred_submission.tail()

In [None]:
now = datetime.now()
date_string = now.strftime("%Y%m%d_%H%M%S")

### MODEL 3 PREDICTIONS TO CSV

In [None]:
# Timestamped copy of the submission. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission_" + date_string + ".csv"))

In [None]:
# File picked up by the submit command. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission.csv"))

### MODEL 3  SUBMISSION

In [None]:
! kaggle competitions submit tabular-playground-series-oct-2021 -f outputs\submission.csv -m "Submission"

### 'Elevation' is responsible for ~0.88 of Accuracy 

### MODEL 4 -- ADDING Soil_Type23, Soil_Type40, Hillshade_Noon TO MODEL 1

In [None]:
summary["selected_features_names"] + ["Soil_Type23", "Soil_Type40", "Hillshade_Noon"]

In [None]:
X_train[summary["selected_features_names"] + ["Soil_Type23", "Soil_Type40", "Hillshade_Noon"]].sample(5)

In [None]:
model = model.fit(
    X_train[summary["selected_features_names"] + ["Soil_Type23", "Soil_Type40", "Hillshade_Noon"]],
    y_train,
#     eval_set=(X_test[summary["selected_features_names"]], y_test),
#     use_best_model=True,
    plot=True
)

In [None]:
print('CatBoost model is fitted: ' + str(model.is_fitted()))
print('CatBoost model parameters:')
print(model.get_params())

In [None]:
print(model.get_best_score())

In [None]:
feature_importance = model.get_feature_importance(
    prettified=True,
    thread_count=-1,
    verbose=True
)

In [None]:
feature_importance

In [None]:
# Predict with exactly the feature list the model in this section was fitted on
# (selected features plus the three added columns); the previous cell content reused
# the column selection of a different model.
y_pred = model.predict(
    X_test[summary["selected_features_names"] + ["Soil_Type23", "Soil_Type40", "Hillshade_Noon"]]
)
y_pred.shape

In [None]:
train[TARGET].value_counts(normalize=True)

In [None]:
pd.DataFrame(y_pred).value_counts(normalize=True)

In [None]:
# Label the prediction column with TARGET rather than the unrelated "Cover_Type" name.
# NOTE(review): presumably the submission file must use the target column name - verify
# against the competition's sample submission.
y_pred_submission = pd.DataFrame(y_pred, index=X_test.index, columns=[TARGET])

In [None]:
y_pred_submission

In [None]:
y_pred_submission.head()

In [None]:
y_pred_submission.tail()

In [None]:
now = datetime.now()
date_string = now.strftime("%Y%m%d_%H%M%S")

### MODEL 4 PREDICTIONS TO CSV

In [None]:
# Timestamped copy of the submission. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission_" + date_string + ".csv"))

In [None]:
# File picked up by the submit command. os.path.join replaces the "outputs\s..." literal,
# whose "\s" is an invalid escape sequence.
y_pred_submission.to_csv(os.path.join("outputs", "submission.csv"))

### MODEL 4  SUBMISSION

In [None]:
! kaggle competitions submit tabular-playground-series-oct-2021 -f outputs\submission.csv -m "Submission"

### MODEL 5 -- FEATURE ENGINEERING

https://www.kaggle.com/chryzal/features-engineering-for-you

In [None]:
new_names = {
    "Horizontal_Distance_To_Hydrology": "x_dist_hydrlgy",
    "Vertical_Distance_To_Hydrology": "y_dist_hydrlgy",
    "Horizontal_Distance_To_Roadways": "x_dist_rdwys",
    "Horizontal_Distance_To_Fire_Points": "x_dist_firepts"
}

X_train.rename(new_names, axis=1, inplace=True)
X_test.rename(new_names, axis=1, inplace=True)

In [None]:
# Wrap Aspect back into the 0..359 range by shifting out-of-range values by 360.
# .loc is used instead of chained indexing (frame["Aspect"][mask] += ...), which may
# operate on a temporary copy and leave the frame unchanged.
X_train.loc[X_train["Aspect"] < 0, "Aspect"] += 360
X_train.loc[X_train["Aspect"] > 359, "Aspect"] -= 360

X_test.loc[X_test["Aspect"] < 0, "Aspect"] += 360
X_test.loc[X_test["Aspect"] > 359, "Aspect"] -= 360

In [None]:
# Manhattan distance to Hydrology
X_train["mnhttn_dist_hydrlgy"] = np.abs(X_train["x_dist_hydrlgy"]) + np.abs(X_train["y_dist_hydrlgy"])
X_test["mnhttn_dist_hydrlgy"] = np.abs(X_test["x_dist_hydrlgy"]) + np.abs(X_test["y_dist_hydrlgy"])

# Euclidean distance to Hydrology
X_train["ecldn_dist_hydrlgy"] = (X_train["x_dist_hydrlgy"]**2 + X_train["y_dist_hydrlgy"]**2)**0.5
X_test["ecldn_dist_hydrlgy"] = (X_test["x_dist_hydrlgy"]**2 + X_test["y_dist_hydrlgy"]**2)**0.5

In [None]:
# Clamp the three Hillshade columns of both frames to the 0..255 range:
# values below 0 become 0, values above 255 become 255.
for frame in (X_train, X_test):
    for shade_col in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        frame[shade_col] = frame[shade_col].clip(lower=0, upper=255)

In [None]:
features_Hillshade = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
soil_features = [x for x in X_train.columns if x.startswith("Soil_Type")]
wilderness_features = [x for x in X_train.columns if x.startswith("Wilderness_Area")]

def addFeature(X):
    """Add row-wise aggregate columns to ``X`` in place.

    Relies on the module-level lists ``soil_features``, ``wilderness_features``
    and ``features_Hillshade`` to know which columns to aggregate.

    Columns written:
    - ``Soil_Count``: row sum over ``soil_features``.
    - ``Wilderness_Area_Count``: row sum over ``wilderness_features``.
    - ``Hillshade_mean``: row mean over ``features_Hillshade``.
    - ``amp_Hillshade``: row max minus row min over ``features_Hillshade``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to extend; it is modified in place.

    Returns
    -------
    None
    """
    # Thanks @mpwolke : https://www.kaggle.com/mpwolke/tooezy-where-are-you-no-camping-here
    # Vectorised DataFrame.sum replaces apply(sum, axis=1), which calls Python's sum
    # once per row and is very slow on frames with many rows.
    # NOTE(review): .sum skips NaN by default; the indicator columns are presumably
    # NaN-free - confirm if missing values can occur.
    X["Soil_Count"] = X[soil_features].sum(axis=1)

    # Thanks @yannbarthelemy : https://www.kaggle.com/yannbarthelemy/tps-december-first-simple-feature-engineering
    X["Wilderness_Area_Count"] = X[wilderness_features].sum(axis=1)
    X["Hillshade_mean"] = X[features_Hillshade].mean(axis=1)
    X['amp_Hillshade'] = X[features_Hillshade].max(axis=1) - X[features_Hillshade].min(axis=1)

In [None]:
addFeature(X_train)
addFeature(X_test)

In [None]:
cols = [
    "Elevation",
    "Aspect",
    "mnhttn_dist_hydrlgy",
    "ecldn_dist_hydrlgy",
    "Slope",
    "x_dist_hydrlgy",
    "y_dist_hydrlgy",
    "x_dist_rdwys",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "x_dist_firepts",
    "Soil_Count",
    "Wilderness_Area_Count",
    "Hillshade_mean",
    "amp_Hillshade"
]

In [None]:
scaler = RobustScaler()
X_train[cols] = scaler.fit_transform(X_train[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to smaller dtypes to save memory.

    Integer columns are converted to the first of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum. float64 columns whose values
    fit the float32 range are converted to float32. Columns whose dtype is not in
    the ``numerics`` list below are left untouched.

    Notes
    -----
    - Range checks are inclusive, so a column whose extremes sit exactly on a
      dtype limit (e.g. -128 or 127 for int8) still gets that dtype.
    - Only float64 is downcast; narrower float columns are never widened.
    - Converting float64 to float32 reduces precision.
    - The frame is modified in place; the same object is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with columns downcast where possible.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; the first one that fits wins.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.itemsize > 4:
            # NaN/inf extremes fail the comparisons, so such columns stay float64.
            f32 = np.finfo(np.float32)
            if f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

In [None]:
X_train.sample(5)

In [None]:
X_train.info()

In [None]:
X_test.sample(5)

### MODEL 5 - FEATURE SELECTOR

In [None]:
selector = CatBoostClassifier(
    #     loss_function="CrossEntropy", # class weights takes effect only with Logloss, MultiClass, MultiClassOneVsAll
    loss_function="MultiClass",
    eval_metric="Accuracy",
    class_weights=class_weights,
#     one_hot_max_size=31,
    depth=3,
    iterations= 1000,
    l2_leaf_reg= 5,
    learning_rate= 0.03,
    nan_mode="Max"
#     cat_features=categorical_features_index
)

In [None]:
feature_names= X_train.columns.to_list()

In [None]:
# Wrap the training data in a CatBoost Pool for select_features.
# No test pool is built here.
train_pool = Pool(
    data=X_train,
    label=y_train,
    feature_names=feature_names,
    cat_features=categorical_features_index,
)

In [None]:
# Number of columns in X_test.
X_test.shape[1]

In [None]:
# Preview of a "first-last" index-range string built from the X_test column count.
f"0-{X_test.shape[1]}"

In [None]:
# Recursive feature elimination driven by SHAP values, run on the training pool.
# The returned summary dict is read below via summary["selected_features_names"].
summary = selector.select_features(
    train_pool,  # built from X_train / y_train; no eval_set is supplied
    # Candidate features are given as a "first-last" index range. It is derived from
    # X_train (the frame train_pool was built from) rather than X_test, so the range
    # always matches the pool being searched.
    features_for_select=f"0-{X_train.shape[1] - 1}",
    num_features_to_select=15,  # how many features to keep
    steps=3,  # number of training rounds; more steps give a more accurate selection
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Exact,  # accuracy order: Approximate, Regular, Exact
    plot=True,
)

In [None]:
# Names of the features kept by selector.select_features.
summary["selected_features_names"]

In [None]:
# Override the iteration count in the parameter dict (mutates bestparams in place).
bestparams["iterations"] = 1500

In [None]:
# Show the parameter dict that is unpacked into CatBoostClassifier in the next cell.
bestparams

In [None]:
# Model 5 classifier: parameters from bestparams plus the class weights.
model = CatBoostClassifier(class_weights=class_weights, **bestparams)

In [None]:
# Preview X_train restricted to the selected feature columns.
X_train[summary["selected_features_names"]]

### MODEL 5

In [None]:
# Fit Model 5 on the selected feature subset; no eval_set is passed.
model = model.fit(
    X=X_train.loc[:, summary["selected_features_names"]],
    y=y_train,
    plot=True,
)

In [None]:
# Report the fitted flag and the parameters the model was constructed with.
print(f"CatBoost model is fitted: {model.is_fitted()}")
print("CatBoost model parameters:", model.get_params(), sep="\n")

In [None]:
# Print the best metric values CatBoost reports for the fitted model.
print(model.get_best_score())

In [None]:
# Feature importances of the fitted model in prettified form.
feature_importance = model.get_feature_importance(verbose=True, thread_count=-1, prettified=True)

In [None]:
# Display the feature importances computed in the previous cell.
feature_importance

In [None]:
# Predict on the test frame using the same selected columns the model was fitted on.
y_pred = model.predict(data=X_test.loc[:, summary["selected_features_names"]])
y_pred.shape

In [None]:
# Relative frequency of each target value in the training data.
train[TARGET].value_counts(normalize=True)

In [None]:
# Relative frequency of each predicted value.
pd.DataFrame(y_pred).value_counts(normalize=True)

In [None]:
# Submission frame: predictions in a single "Cover_Type" column, indexed like X_test.
y_pred_submission = pd.DataFrame(
    data=y_pred,
    columns=["Cover_Type"],
    index=X_test.index,
)

In [None]:
# Display the submission frame.
y_pred_submission

In [None]:
# First rows of the submission frame.
y_pred_submission.head()

In [None]:
# Last rows of the submission frame.
y_pred_submission.tail()

In [None]:
# Timestamp (YYYYMMDD_HHMMSS) used to name the per-run submission file.
now = datetime.now()
date_string = f"{now:%Y%m%d_%H%M%S}"

### MODEL 5 PREDICTIONS TO CSV

In [None]:
# Keep a timestamped copy of this run's predictions.
# os.path.join replaces the hard-coded "outputs\submission_" literal, whose "\s" is an
# invalid escape sequence, and produces a valid path on any OS.
y_pred_submission.to_csv(os.path.join("outputs", "submission_" + date_string + ".csv"))

In [None]:
# Overwrite the fixed-name file that the Kaggle CLI cell uploads.
# os.path.join replaces the hard-coded "outputs\submission.csv" literal, whose "\s" is an
# invalid escape sequence, and produces a valid path on any OS.
y_pred_submission.to_csv(os.path.join("outputs", "submission.csv"))

### MODEL 5  SUBMISSION

In [None]:
# Shell out to the Kaggle CLI to upload outputs\submission.csv with the message "Submission".
! kaggle competitions submit tabular-playground-series-oct-2021 -f outputs\submission.csv -m "Submission"

### MODEL 6 - ADDING x_dist_hydrlgy, Soil_Type2, Soil_Type10, ecldn_dist_hydrlgy, Soil_Type40

In [None]:
# Columns added on top of the selector's picks for Model 6.
# Written as strings: the bare identifiers previously in this cell are undefined names
# and raise NameError when the cell is executed.
model_6_extra_features = [
    "x_dist_hydrlgy",
    "Soil_Type2",
    "Soil_Type10",
    "ecldn_dist_hydrlgy",
    "Soil_Type40",
]
model_6_extra_features

In [None]:
# Full Model 6 feature list: the selected features followed by the five extra columns.
summary["selected_features_names"] + ["x_dist_hydrlgy", "Soil_Type2", "Soil_Type10", "ecldn_dist_hydrlgy", "Soil_Type40"]

In [None]:
# Set the iteration count for Model 6 (mutates bestparams in place).
bestparams["iterations"] = 1500

In [None]:
# Show the parameter dict that is unpacked into CatBoostClassifier in the next cell.
bestparams

In [None]:
# Model 6 classifier: a fresh, unfitted instance built from bestparams plus the class weights.
model = CatBoostClassifier(class_weights=class_weights, **bestparams)

In [None]:
# Preview X_train restricted to the Model 6 columns (selected features plus the five extras).
X_train[summary["selected_features_names"] + ["x_dist_hydrlgy", "Soil_Type2", "Soil_Type10", "ecldn_dist_hydrlgy", "Soil_Type40"]]

### MODEL 6

In [None]:
# Fit Model 6 on the selected features plus the five extra columns; no eval_set is passed.
model = model.fit(
    X=X_train.loc[
        :,
        summary["selected_features_names"]
        + ["x_dist_hydrlgy", "Soil_Type2", "Soil_Type10", "ecldn_dist_hydrlgy", "Soil_Type40"],
    ],
    y=y_train,
    plot=True,
)

In [None]:
# Report the fitted flag and the parameters the model was constructed with.
print(f"CatBoost model is fitted: {model.is_fitted()}")
print("CatBoost model parameters:", model.get_params(), sep="\n")

In [None]:
# Print the best metric values CatBoost reports for the fitted model.
print(model.get_best_score())

In [None]:
# Feature importances of the fitted Model 6 in prettified form.
feature_importance = model.get_feature_importance(verbose=True, thread_count=-1, prettified=True)

In [None]:
# Display the feature importances computed in the previous cell.
feature_importance

In [None]:
# Predict on the test frame using the same Model 6 columns, in the same order, as the fit.
y_pred = model.predict(
    data=X_test.loc[
        :,
        summary["selected_features_names"]
        + ["x_dist_hydrlgy", "Soil_Type2", "Soil_Type10", "ecldn_dist_hydrlgy", "Soil_Type40"],
    ]
)
y_pred.shape

In [None]:
# Relative frequency of each target value in the training data.
train[TARGET].value_counts(normalize=True)

In [None]:
# Relative frequency of each predicted value.
pd.DataFrame(y_pred).value_counts(normalize=True)

In [None]:
# Submission frame: predictions in a single "Cover_Type" column, indexed like X_test.
y_pred_submission = pd.DataFrame(
    data=y_pred,
    columns=["Cover_Type"],
    index=X_test.index,
)

In [None]:
# Display the submission frame.
y_pred_submission

In [None]:
# First rows of the submission frame.
y_pred_submission.head()

In [None]:
# Last rows of the submission frame.
y_pred_submission.tail()

In [None]:
# Fresh timestamp (YYYYMMDD_HHMMSS) for naming the Model 6 submission file.
now = datetime.now()
date_string = f"{now:%Y%m%d_%H%M%S}"

### MODEL 6 PREDICTIONS TO CSV

In [None]:
# Keep a timestamped copy of this run's predictions.
# os.path.join replaces the hard-coded "outputs\submission_" literal, whose "\s" is an
# invalid escape sequence, and produces a valid path on any OS.
y_pred_submission.to_csv(os.path.join("outputs", "submission_" + date_string + ".csv"))

In [None]:
# Overwrite the fixed-name file that the Kaggle CLI cell uploads.
# os.path.join replaces the hard-coded "outputs\submission.csv" literal, whose "\s" is an
# invalid escape sequence, and produces a valid path on any OS.
y_pred_submission.to_csv(os.path.join("outputs", "submission.csv"))

### MODEL 6  SUBMISSION

In [None]:
# Shell out to the Kaggle CLI to upload outputs\submission.csv with the message "Submission".
! kaggle competitions submit tabular-playground-series-oct-2021 -f outputs\submission.csv -m "Submission"