In [1]:
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from scipy.stats.mstats import winsorize
from featexp import get_trend_stats, get_univariate_plots

from utils import fill_na, reshape_dataset, weighted_roc_auc, fill_start_cluster

In [2]:
# Settings
# True: impute NaNs on the combined train+test frame in a single fill_na call.
# False: impute the train rows and the test rows in separate fill_na calls.
FILL_NAN_ON_ALL_DATA = True

# Combined

In [3]:
# Load the raw train and test tables from their parquet files.
train_df_raw, test_df_raw = (
    pd.read_parquet("data/train_data.pqt"),
    pd.read_parquet("data/test_data.pqt"),
)

In [4]:
# Tag every row with its origin so the stacked frame can be split back later.
for raw_frame, origin_flag in ((train_df_raw, 1), (test_df_raw, 0)):
    raw_frame['is_train'] = origin_flag

# Stack train on top of test under a fresh 0..n-1 index.
combined_df = pd.concat([train_df_raw, test_df_raw], ignore_index=True)

In [5]:
# Boolean row selectors for the train part and the test part of the stacked frame.
train_mask = combined_df['is_train'].eq(1)
test_mask = combined_df['is_train'].eq(0)

Filling NaN values, either on the combined data or separately for the train and test sets, depending on `FILL_NAN_ON_ALL_DATA`

In [6]:
# Take both cluster columns out of the frame (they are re-attached after the NaN
# fill); pop() returns the column and removes it from combined_df in place.
end_cluster = combined_df.pop('end_cluster')
start_cluster = combined_df.pop('start_cluster')

In [7]:
# Impute missing values with utils.fill_na (for_num='mean', for_object='None').
if not FILL_NAN_ON_ALL_DATA:
    # Train rows and test rows are imputed in two independent fill_na calls.
    train_rows = combined_df.loc[train_mask]
    combined_df.loc[train_mask] = fill_na(train_rows, for_num='mean', for_object='None')

    # The filled test rows are re-labelled with the positions that follow the train
    # rows so that the .loc assignment lines up with the test part of the frame.
    n_train_rows = int(train_mask.sum())
    n_test_rows = int(test_mask.sum())
    test_indexes = pd.Index(range(n_train_rows, n_train_rows + n_test_rows))

    filled_test_rows = fill_na(combined_df.loc[test_mask], for_num='mean', for_object='None')
    combined_df.loc[test_mask] = filled_test_rows.set_index(test_indexes)
else:
    # One call over train and test together.
    combined_df = fill_na(combined_df, for_num='mean', for_object='None')

In [8]:
# Re-attach the cluster columns that were set aside before the NaN fill
# (appended in the order start_cluster, end_cluster; aligned on the row index).
combined_df = combined_df.assign(start_cluster=start_cluster, end_cluster=end_cluster)

In [9]:
# Columns removed from the feature set before any further processing.
columns_to_drop = [
    'index_city_code',
    'balance_amt_max',
    'balance_amt_day_avg',
    'city_type',
]
combined_df = combined_df.drop(columns=columns_to_drop)

Handling outliers

In [10]:
# Clip the upper tail of two balance features on the train rows only:
# the top 1% of balance_amt_min and the top 3% of balance_amt_avg.
for balance_col, upper_share in (('balance_amt_min', 0.01), ('balance_amt_avg', 0.03)):
    combined_df.loc[train_mask, balance_col] = winsorize(
        combined_df.loc[train_mask, balance_col], (0, upper_share)
    )

In [11]:
# Relabel months 4-6 as months 1-3 so every row uses the same three month labels.
month_aliases = {
    'month_4': 'month_1',
    'month_5': 'month_2',
    'month_6': 'month_3',
}
combined_df['date'] = combined_df['date'].replace(month_aliases)

# Pivot

Combining rows in the pivot table

In [12]:
# Keep every third train target, starting at position 2.
# NOTE(review): this presumes three consecutive rows per client, ordered by month — confirm.
train_end_cluster = combined_df.loc[train_mask, 'end_cluster']
end_cluster_for_train = train_end_cluster.iloc[2::3]

# The target must not be carried into the pivoted feature table.
combined_df = combined_df.drop(columns=['end_cluster'])

In [13]:
# Pivot the long frame with utils.reshape_dataset; judging by the columns used in
# the following cells, this yields one *_month_N column per feature and month.
combined_df_pivot = reshape_dataset(
    combined_df,
    transform_categories=False,
    drop_month_3=False,
)

In [14]:
# Collapse the three per-month origin flags into a single is_train column:
# a row counts as train if any of its monthly flags is set (missing flags count as 0).
monthly_flag_cols = ['is_train_month_1', 'is_train_month_2', 'is_train_month_3']
combine_mask = combined_df_pivot[monthly_flag_cols].fillna(0).any(axis=1)
combined_df_pivot['is_train'] = combine_mask.astype(int)
combined_df_pivot = combined_df_pivot.drop(columns=monthly_flag_cols)

In [15]:
# Boolean row selectors for the train part and the test part of the pivoted frame.
train_pivot_mask = combined_df_pivot['is_train'].eq(1)
test_pivot_mask = combined_df_pivot['is_train'].eq(0)

Filling start cluster for month_6

In [16]:
# Run utils.fill_start_cluster on the test rows (without the is_train flag) and
# write its non-NaN values back into the pivot; update() aligns on index and columns.
test_rows_pivot = combined_df_pivot.loc[test_pivot_mask].drop(columns=['is_train'])
filled_result = fill_start_cluster(test_rows_pivot)
combined_df_pivot.update(filled_result)

Filling NaN values in the pivot table, either on the combined data or separately for the train and test sets, depending on `FILL_NAN_ON_ALL_DATA`

In [17]:
# Impute the NaNs introduced by the pivot with utils.fill_na (for_num='mean', for_object='None').
if not FILL_NAN_ON_ALL_DATA:
    # Train rows first; per the original author's note this is a no-op because the
    # train part contains no missing values.
    train_pivot_rows = combined_df_pivot.loc[train_pivot_mask]
    combined_df_pivot.loc[train_pivot_mask] = fill_na(train_pivot_rows, for_num='mean',
                                                      for_object='None')

    # The filled test rows are re-labelled with the positions that follow the train
    # rows so that the .loc assignment lines up with the test part of the pivot.
    n_train_pivot = int(train_pivot_mask.sum())
    n_test_pivot = int(test_pivot_mask.sum())
    test_pivot_indexes = pd.Index(range(n_train_pivot, n_train_pivot + n_test_pivot))

    filled_test_pivot = fill_na(combined_df_pivot.loc[test_pivot_mask], for_num='mean',
                                for_object='None')
    combined_df_pivot.loc[test_pivot_mask] = filled_test_pivot.set_index(test_pivot_indexes)
else:
    # One call over train and test together.
    combined_df_pivot = fill_na(combined_df_pivot, for_num='mean', for_object='None')

# Find features to drop

In [18]:
# The id column is dropped so it is not used as a feature.
combined_df_pivot = combined_df_pivot.drop(columns=['id'])

Find trend stats

In [19]:
# Build the frame used for the featexp trend analysis: the train features plus the
# label-encoded target in an 'end_cluster' column.
le = LabelEncoder()
X_trend_stats = combined_df_pivot[combined_df_pivot['is_train'] == 1].drop(columns=['is_train'])

# Targets are paired with feature rows by position (the same pairing the CV loop
# relies on via .iloc), so fail loudly if the row counts disagree.
assert len(X_trend_stats) == len(end_cluster_for_train), "feature/target row count mismatch"

# pd.concat(axis=1) aligns on index labels. Giving the encoded target the feature
# frame's index keeps the pairing positional; a fresh 0..n-1 index would silently
# misalign (or add NaN rows) whenever the train rows are not labelled 0..n-1.
y_trend_stats = pd.DataFrame(le.fit_transform(end_cluster_for_train),
                             index=X_trend_stats.index, columns=['end_cluster'])
data = pd.concat([X_trend_stats, y_trend_stats], axis=1)

In [20]:
# Stratified 70/30 hold-out split of the trend-analysis frame (fixed seed).
train, test = train_test_split(
    data,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=y_trend_stats,
)

In [21]:
# Names of the float32/int32 columns to analyse. Only the names are kept: the
# previous value was a full DataFrame copy, which worked as a features list only
# because iterating a DataFrame yields its column labels.
num_cols = data.select_dtypes(include=['float32', 'int32']).columns.to_list()

In [22]:
# Per-feature trend statistics from featexp, computed on the train split and
# compared against the hold-out split.
stats = get_trend_stats(
    train,
    data_test=test,
    features_list=num_cols,
    target_col='end_cluster',
)

Categorical features ['end_cluster'] ignored. Categorical features not supported yet.
Returning stats for all numeric features


Drop features with low trend correlation

In [23]:
# Features whose trend correlation between the train and hold-out splits falls
# below the threshold are dropped. The filter now uses `threshold` itself, so
# changing the value above actually takes effect (it was hard-coded to 0.9 before).
threshold = 0.9
features_to_drop = stats.loc[stats['Trend_correlation'] < threshold, 'Feature']
features_to_drop

141      ogrn_days_end_month_month_1
142      ogrn_days_end_month_month_2
143      ogrn_days_end_month_month_3
144    ogrn_days_end_quarter_month_1
145    ogrn_days_end_quarter_month_2
146    ogrn_days_end_quarter_month_3
Name: Feature, dtype: object

In [24]:
# Remove the low-trend-correlation features from both the train and test rows.
combined_df_pivot = combined_df_pivot.drop(columns=features_to_drop)

# Split 

In [25]:
# Cast every object/category column to the pandas 'category' dtype; cat_cols is
# later derived from this dtype.
category_columns = combined_df_pivot.select_dtypes(include=['category', 'O']).columns
combined_df_pivot = combined_df_pivot.astype(
    {cat_name: "category" for cat_name in category_columns}
)

In [26]:
# Split the pivot back into train and test feature tables. drop() returns a new
# frame, so neither result shares data with combined_df_pivot.
train_df_pivot = combined_df_pivot.loc[combined_df_pivot['is_train'] == 1].drop(columns=['is_train'])
test_df_pivot = combined_df_pivot.loc[combined_df_pivot['is_train'] == 0].drop(columns=['is_train'])

# Train

In [27]:
# Feature matrix and target used for cross-validated training.
X, y = train_df_pivot, end_cluster_for_train

In [28]:
# Names of the categorical feature columns, passed to CatBoost as cat_features.
cat_cols = train_df_pivot.select_dtypes(include='category').columns.to_list()

In [29]:
# GPU CatBoost classifier; with use_best_model the iteration that scores best on
# the eval set is kept, and progress is logged every 100 iterations.
catboost = CatBoostClassifier(
    iterations=5000,
    learning_rate=0.05,
    cat_features=cat_cols,
    task_type="GPU",
    use_best_model=True,
    verbose=100,
)

In [30]:
# 3-fold stratified cross-validation: fit one model per fold and keep a copy of
# it, keyed by its validation score, for blending in the following cells.
# NOTE: because the dict is keyed by score, two folds with an identical score
# would overwrite each other; the structure is kept as later cells read it.
dct = {}
# Seed the shuffle so the fold assignment (and therefore the fold scores and the
# blend weights derived from them) does not change between runs.
kf = StratifiedKFold(3, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X=X, y=y):
    # X and y are paired by position, hence .iloc on both.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_cols)
    eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_cols)

    # The same estimator object is re-fitted on every fold, so a copy of the
    # fitted model is stored rather than the estimator itself.
    catboost.fit(train_dataset, eval_set=eval_dataset, use_best_model=True)
    y_pred = catboost.predict_proba(X_test)
    clf_checkpoint = catboost.copy()
    score = weighted_roc_auc(y_test, y_pred, labels=catboost.classes_)

    dct[score] = clf_checkpoint
    print(score)

0:	learn: 2.4262678	test: 2.4268191	best: 2.4268191 (0)	total: 75.6ms	remaining: 6m 18s
100:	learn: 0.8425914	test: 0.8520884	best: 0.8520884 (100)	total: 6.8s	remaining: 5m 30s
200:	learn: 0.8092537	test: 0.8283874	best: 0.8283874 (200)	total: 13.2s	remaining: 5m 16s
300:	learn: 0.7914229	test: 0.8195592	best: 0.8195592 (300)	total: 19.6s	remaining: 5m 6s
400:	learn: 0.7807425	test: 0.8154591	best: 0.8154591 (400)	total: 25.6s	remaining: 4m 53s
500:	learn: 0.7719965	test: 0.8128785	best: 0.8128785 (500)	total: 31.6s	remaining: 4m 43s
600:	learn: 0.7651265	test: 0.8111969	best: 0.8111969 (600)	total: 37.5s	remaining: 4m 34s
700:	learn: 0.7585087	test: 0.8099040	best: 0.8099040 (700)	total: 43.3s	remaining: 4m 25s
800:	learn: 0.7523108	test: 0.8088095	best: 0.8088052 (799)	total: 49.1s	remaining: 4m 17s
900:	learn: 0.7461471	test: 0.8080731	best: 0.8080731 (900)	total: 54.9s	remaining: 4m 9s
1000:	learn: 0.7406925	test: 0.8075093	best: 0.8075037 (997)	total: 1m	remaining: 4m 2s
1100:	le

In [31]:
# Order the fold models from best to worst validation score.
scores = sorted(dct, reverse=True)
sorted_dct = {fold_score: dct[fold_score] for fold_score in scores}
sorted_dct

{0.9124481448820831: <catboost.core.CatBoostClassifier at 0x24b0775f860>,
 0.9082670729750258: <catboost.core.CatBoostClassifier at 0x24b00577800>,
 0.8994552638748404: <catboost.core.CatBoostClassifier at 0x24b01139df0>}

In [32]:
# Mean of the per-fold validation scores (the "test" here is the CV hold-out fold).
result_score_on_test = sum(scores) / len(scores)
print(f'Score on test: {result_score_on_test}')

Score on test: 0.9067234939106498


0.9108217807419258

In [33]:
def softmax(values):
    """Return the softmax of ``values``, normalised over all elements.

    Args:
        values: Array-like of real numbers (e.g. a list of fold scores).

    Returns:
        numpy.ndarray of the same shape with non-negative entries that sum to 1.
        An empty input yields an empty array.
    """
    values = np.asarray(values)
    if values.size == 0:
        return values.astype(float)

    # Subtracting the maximum does not change the result mathematically, but it
    # keeps np.exp from overflowing to inf (which would turn the output into NaN).
    exp_values = np.exp(values - np.max(values))

    return exp_values / np.sum(exp_values)


# Turn the descending-sorted fold scores into blend weights that sum to 1
# (same order as the models in sorted_dct).
weights = softmax(scores)
weights

array([0.33524211, 0.33384337, 0.33091452])

In [34]:
# Weighted blend of the fold models' class probabilities on the full training set.
final_pred = sum(
    fold_weight * fold_model.predict_proba(X)
    for fold_weight, fold_model in zip(weights, sorted_dct.values())
)

In [35]:
# In-sample check of the blend: X is the full training set and every fold model was
# fitted on part of it, so this score is optimistic compared with the fold scores.
result_score_on_train = weighted_roc_auc(y, final_pred, labels=catboost.classes_)
print(f'Score on train: {result_score_on_train}')

Score on train: 0.9608663717025702


# Test

In [36]:
# Weighted blend of the fold models' class probabilities on the test features.
test_pred_proba = sum(
    fold_weight * fold_model.predict_proba(test_df_pivot)
    for fold_weight, fold_model in zip(weights, sorted_dct.values())
)

test_pred_proba

array([[1.04184892e-02, 1.38010684e-02, 2.36922858e-02, ...,
        1.83878986e-03, 8.85755932e-01, 5.35173233e-06],
       [5.95239506e-03, 5.15097573e-01, 7.98071615e-04, ...,
        5.41615526e-04, 4.64228192e-01, 6.98009526e-04],
       [5.89246625e-01, 6.07388789e-03, 3.87994298e-03, ...,
        6.32092872e-02, 1.07028279e-01, 1.82014542e-05],
       ...,
       [3.64979470e-02, 3.53690477e-02, 2.99999380e-02, ...,
        2.17838434e-02, 7.62211915e-01, 1.07412020e-05],
       [1.31077631e-01, 1.17278670e-01, 1.80178026e-02, ...,
        1.06963863e-02, 5.98808431e-01, 1.83014559e-03],
       [2.14085450e-02, 7.43187579e-02, 1.21330837e-02, ...,
        4.68684222e-03, 7.98761672e-01, 1.86583441e-04]])

In [37]:
# Label the probability columns with the model's class names, then put the
# columns into sorted class order.
test_pred_proba_df = pd.DataFrame(test_pred_proba, columns=catboost.classes_)

sorted_classes = sorted(test_pred_proba_df.columns)

test_pred_proba_df = test_pred_proba_df.loc[:, sorted_classes]
test_pred_proba_df.head(2)

Unnamed: 0,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,0.010418,0.013801,0.023692,0.019799,0.004646,0.000195,0.00576,0.00014,0.010103,0.004036,0.017848,0.000527,0.001423,1.1e-05,0.001839,0.885756,5e-06
1,0.005952,0.515098,0.000798,0.001797,0.00065,0.000196,0.000403,1.3e-05,0.00148,0.005531,0.001384,0.00058,0.00065,2e-06,0.000542,0.464228,0.000698


In [38]:
# Fill the sample submission's class columns with the blended probabilities and
# write the result to disk.
sample_submission_df = pd.read_csv('submissions/sample_submission.csv')

# The column assignment below aligns on the row index; with differing row counts
# it would silently leave NaNs (or drop predictions), so check before writing.
assert len(sample_submission_df) == len(test_pred_proba_df), (
    f"sample submission has {len(sample_submission_df)} rows, "
    f"predictions have {len(test_pred_proba_df)}"
)

# NOTE(review): assumes the sample submission rows are in the same order as the
# rows of test_df_pivot — confirm against the id column.
sample_submission_df[sorted_classes] = test_pred_proba_df
sample_submission_df.to_csv("submissions/blending_new.csv", index=False)