In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold

In [17]:
import pandas as pd
import numpy as np
import os

def set_path(new_path: str):
    """Store ``new_path`` in the ``DATA_PATH`` environment variable read by the data loaders."""
    os.environ.update({"DATA_PATH": new_path})

def get_data_path():
    """Return the ``DATA_PATH`` environment variable, or ``"data/"`` when it is unset."""
    return os.getenv("DATA_PATH", "data/")

def get_cluster_weights():
    """
    Load ``cluster_weights.xlsx`` from the data directory, indexed by ``cluster``.

    The path is built with ``os.path.join`` so the directory returned by
    ``get_data_path()`` works with or without a trailing separator.
    """
    weights_file = os.path.join(get_data_path(), "cluster_weights.xlsx")
    return pd.read_excel(weights_file).set_index("cluster")

def get_train_data():
    """
    Load the training table from ``train_data.pqt`` in the data directory.

    The path is built with ``os.path.join`` so the directory returned by
    ``get_data_path()`` works with or without a trailing separator.
    """
    return pd.read_parquet(os.path.join(get_data_path(), "train_data.pqt"))

def get_test_data():
    """
    Load the test table from ``test_data.pqt`` in the data directory.

    The path is built with ``os.path.join`` so the directory returned by
    ``get_data_path()`` works with or without a trailing separator.
    """
    return pd.read_parquet(os.path.join(get_data_path(), "test_data.pqt"))

def get_sample_submission():
    """
    Load ``sample_submission.csv`` from the data directory.

    The path is built with ``os.path.join`` so the directory returned by
    ``get_data_path()`` works with or without a trailing separator.
    """
    return pd.read_csv(os.path.join(get_data_path(), "sample_submission.csv"))

def get_final_proba(test_start_cluster_proba: pd.DataFrame, transition_proba: np.array):
    """
    Combine start-cluster probabilities with per-sample transition probabilities.

    Shapes (s = samples, c = start clusters, d = end clusters):
        test_start_cluster_proba: (s, c)
        transition_proba:         (s, c, d)
        result:                   (s, d)

    For each sample the start-cluster axis is summed out:
    result[s, d] = sum_c start[s, c] * transition[s, c, d].
    """
    contraction = "sc,scd->sd"
    return np.einsum(contraction, test_start_cluster_proba, transition_proba)

# Cluster labels. The order matters: the submission cell asserts that the
# sample submission columns are exactly ['id'] + clusters.
clusters = [
    '{other}',
    '{}',
    '{α, β}',
    '{α, γ}',
    '{α, δ}',
    '{α, ε, η}',
    '{α, ε, θ}',
    '{α, ε, ψ}',
    '{α, ε}',
    '{α, η}',
    '{α, θ}',
    '{α, λ}',
    '{α, μ}',
    '{α, π}',
    '{α, ψ}',
    '{α}',
    '{λ}'
]

In [18]:
# Point the data loaders at the Kaggle competition input directory.
set_path('/kaggle/input/it-purple-hack-alpha/')

In [19]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """
    Weighted average of per-class one-vs-rest ROC AUC.

    Args:
        y_true: true class labels.
        y_pred: predicted class scores, one column per entry of ``labels``.
        labels: class labels, in the column order of ``y_pred``.
        weights_dict: mapping label -> unnormalised weight.

    Returns:
        Sum of per-class AUC values weighted by ``weights_dict`` after the
        weights have been normalised to sum to 1.

    Side effect: prints a dict of the per-class AUC values.
    """
    raw_weights = np.array(list(map(weights_dict.__getitem__, labels)))
    norm_weights = raw_weights / raw_weights.sum()

    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )

    print({label: auc for label, auc in zip(labels, per_class_auc)})

    # Sequential Python sum keeps the same accumulation order as summing the product array.
    return sum(weight * auc for weight, auc in zip(norm_weights, per_class_auc))

# Load the per-cluster weights once and keep them as a {cluster: unnorm_weight} dict for score().
cluster_weights_ = get_cluster_weights()
weights_dict_ = cluster_weights_["unnorm_weight"].to_dict()

def score(y_true, y_pred):
    """Weighted ROC AUC over the module-level ``clusters`` using the loaded cluster weights."""
    return weighted_roc_auc(y_true, y_pred, clusters, weights_dict_)

In [20]:
# Load the training table and switch to pandas' nullable dtypes; the Imputer
# below selects its columns by the 'number' / 'string' dtypes.
train = get_train_data()
train = train.convert_dtypes()

In [21]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
3,1,month_1,-0.081586,-0.09186,-0.11404,-0.08089,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
4,1,month_2,-0.094962,-0.100504,-0.119302,-0.094307,channel_code_2,city_14,city_type_0,,...,0.946066,0.43075,0.067275,0.559928,0.696576,-0.183854,0.255545,0.495419,{other},{other}


In [1]:
from sklearn.preprocessing import LabelEncoder


# Name of the label column.
target_feature = 'end_cluster'
# Columns skipped by the Imputer; also passed to the estimators, which drop
# them right before model fitting.
to_drop = [
    'id', 'date'
]


class Imputer:
    """
    Fills missing values and encodes string columns so the frame is model-ready.

    Numeric columns: NaNs are replaced by the mode seen in ``fit`` and that mode
    is then subtracted, so a value of 0 means "equal to the mode" (which includes
    originally missing values).
    String columns: NaNs become '', values outside the 20 most frequent ones
    seen in ``fit`` collapse to 'Other', and the result is label-encoded.
    Columns listed in the module-level ``to_drop`` are left untouched.
    """

    def fit(self, X):
        """
        Learn per-column modes, top-20 categories and label encoders from ``X``.

        Returns ``self`` so the call can be chained. ``X`` is not modified.
        """
        X = X.copy()
        self.number_columns = [column for column in X.select_dtypes('number').columns if column not in to_drop]
        self.string_columns = [column for column in X.select_dtypes('string').columns if column not in to_drop]
        self.modes = np.array([X[column].mode()[0] for column in self.number_columns])
        self.les = []
        self.top_20s = []
        for column in self.string_columns:
            values = X[column].fillna('')
            top_20 = values.value_counts().nlargest(20).index
            values = values.where(values.isin(top_20), 'Other')
            self.top_20s.append(top_20)
            # 'Other' is appended so the encoder always knows the fallback category.
            self.les.append(LabelEncoder().fit(pd.concat([values, pd.Series(['Other'])], axis=0)))
        return self

    def transform(self, X):
        """
        Return an imputed and encoded copy of ``X``.

        Encoded string columns come back with an integer dtype.
        """
        X = X.copy()
        for ind, column in enumerate(self.number_columns):
            X.loc[:, column] = X[column].fillna(self.modes[ind])
        X.loc[:, self.number_columns] = X[self.number_columns].sub(self.modes, axis=1)
        for ind, column in enumerate(self.string_columns):
            values = X[column].fillna('')
            values = values.where(values.isin(self.top_20s[ind]), 'Other')
            # Replace the whole column instead of writing integers into a
            # string-dtype column through .loc, which triggers pandas'
            # incompatible-dtype warning.
            X[column] = self.les[ind].transform(values)
        return X

    def fit_transform(self, X):
        """Fit on ``X`` and return its transformed copy."""
        self.fit(X)
        return self.transform(X)


class FeatureExtractor:
    """
    Adds hand-crafted features on top of an already imputed frame.
    """

    def fit(self, X, imputer):
        # Nothing is learned from X; only the fitted imputer is remembered.
        self.imputer = imputer

    def transform(self, X, train):
        """
        Return a copy of ``X`` with the extra feature columns appended.

        ``train`` selects how ``is_new`` is derived: from all-zero month_1 rows
        (True) or from ids that occur exactly twice (False).
        """
        X = X.copy()
        numeric_cols = self.imputer.number_columns
        # Number of numeric features that are non-zero in each row.
        nonzero_cnt = X[numeric_cols].astype(bool).sum(axis=1)

        if train:
            # ids whose month_1 row has every numeric feature equal to zero
            blank_first_month = nonzero_cnt[(X['date'] == 'month_1') & (nonzero_cnt == 0)].index
            flagged_ids = X.loc[blank_first_month, 'id']
        else:
            # ids that appear in exactly two rows
            ids, occurrences = np.unique(X['id'], return_counts=True)
            flagged_ids = ids[occurrences == 2]

        X['is_new'] = 0
        X.loc[X['id'].isin(flagged_ids), 'is_new'] = 1

        X['cnt_null_features'] = nonzero_cnt

        # Indicator for rows below the line 31.25 * cnt_e + cnt_g = 0.0125.
        line_value = X['cnt_cred_e_oper_3m'] * 31.25 + X['cnt_cred_g_oper_3m'] - 0.0125
        X['line_cnt_e_cnt_g'] = (line_value < 0).astype(int)

        X['balance_amt_diff'] = X['balance_amt_max'] - X['balance_amt_min']
        # Credit minus debit monthly sums for operation types d..h.
        for op in 'defgh':
            X[f'{op}_oper_1m_diff'] = X[f'sum_cred_{op}_oper_1m'] - X[f'sum_deb_{op}_oper_1m']

        return X

    def fit_transform(self, X, imputer, train):
        self.fit(X, imputer)
        return self.transform(X, train)


# Columns created by FeatureExtractor that are appended to the categorical feature list.
additional_cat_features = [
    'is_new',
    'line_cnt_e_cnt_g'
]

In [2]:
def stratified_train_test_split(X, y, test_size, random_state, group):
    """
    Hand-written stratified train/test split that keeps every value of the
    ``group`` column (e.g. a client id) entirely in one of the two parts,
    to avoid leakage.

    Each group is stratified by the rarest class among its rows.
    Returns ``X_train, X_test, y_train, y_test``.
    """
    # Classes ordered from rarest to most frequent; position == rarity rank.
    classes_by_rarity = y.value_counts(ascending=True).index

    def rarest_label(labels):
        return min(labels, key=classes_by_rarity.get_loc)

    per_group = (
        pd.concat([X, y], axis=1)
        .groupby(group)
        .agg(stratified=(y.name, rarest_label))
    )

    train_groups, test_groups = train_test_split(
        per_group,
        test_size=test_size,
        random_state=random_state,
        stratify=per_group.stratified,
    )

    # The per-group table is indexed by group value, so .index holds the ids.
    X_train = X[X[group].isin(train_groups.index)]
    X_test = X[X[group].isin(test_groups.index)]

    return X_train, X_test, y.loc[X_train.index], y.loc[X_test.index]

In [87]:
def stratified_k_fold(X, y, n_splits, random_state, group):
    """
    Hand-written stratified K-fold that keeps every value of the ``group``
    column (e.g. a client id) entirely inside one fold, to avoid leakage.

    Each group is stratified by the rarest class among its rows.

    Yields:
        ``X_train, X_test, y_train, y_test`` for each of the ``n_splits`` folds.
    """
    # Classes ordered from rarest to most frequent; position == rarity rank.
    srt_values = y.value_counts(ascending=True)

    def srt_func(x):
        return min(x, key=srt_values.index.get_loc)

    X_srt = pd.concat([X, y], axis=1).groupby(group).agg(
        stratified=(y.name, srt_func)
    )
    # The per-group table is indexed by the group values themselves.
    group_ids = X_srt.index

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_pos, test_pos in kf.split(X=X_srt, y=X_srt.stratified):
        # kf.split yields positional indices into X_srt; translate them to
        # actual group values before filtering, otherwise the split is only
        # correct when the ids happen to be 0..n-1.
        X_train = X[X[group].isin(group_ids[train_pos])]
        X_test = X[X[group].isin(group_ids[test_pos])]
        y_train = y.loc[X_train.index]
        y_test = y.loc[X_test.index]
        yield X_train, X_test, y_train, y_test

In [3]:
from catboost import CatBoostClassifier


class CatboostEstimator:
    """
    Trains CatBoost classifiers: either one model per cross-validation fold
    (``fit``) or a single model with recursive feature selection
    (``fit_select_features``).
    """

    def fit(self, X, y, n_splits, cat_features, to_drop):
        """
        Train one CatBoost model per group-aware stratified fold.

        ``X`` must still contain the ``id`` column, which is used to keep a
        client's rows in one fold; the columns in ``to_drop`` are removed
        right before fitting. Prints the weighted ROC AUC of every fold, then
        the mean, the std and mean - std over folds.
        """
        self.one_model = False
        self.models = []
        scores = []
        for ind, (X_train, X_val, y_train, y_val) in enumerate(stratified_k_fold(X, y, n_splits,
                                                                                 random_state=42,
                                                                                 group='id')):
            X_train = X_train.drop(to_drop, axis=1)
            X_val = X_val.drop(to_drop, axis=1)

            model = CatBoostClassifier(cat_features=cat_features, verbose=500,
                                       loss_function='MultiClassOneVsAll',
                                       eval_metric='AUC', task_type='GPU',
                                       iterations=2000)

            model.fit(X_train, y_train, verbose=500, eval_set=(X_val, y_val))

            self.models.append(model)
            y_pred = model.predict_proba(X_val)
            # Rows are renormalised to sum to 1 before scoring.
            scores.append(score(y_val, y_pred / y_pred.sum(axis=1).reshape(-1, 1)))
            print(f'model {ind}: score = {scores[-1].round(4)}')

        scores = np.array(scores)
        print(f'mean score = {scores.mean().round(4)}, std = {scores.std().round(4)}')
        print(f'overall score = {(scores.mean() - scores.std()).round(4)}')

    def fit_select_features(self, X, y, cat_features, to_drop):
        """
        Train a single CatBoost model on an 80/20 group-aware split, using
        recursive feature selection down to 30 features.

        The final model is stored in ``self.model``.
        """
        self.one_model = True

        X_train, X_val, y_train, y_val = stratified_train_test_split(X,
                                                                     y,
                                                                     test_size=0.2,
                                                                     random_state=42,
                                                                     group='id')
        X_train = X_train.drop(to_drop, axis=1)
        X_val = X_val.drop(to_drop, axis=1)

        self.model = CatBoostClassifier(cat_features=cat_features, verbose=150,
                                        loss_function='MultiClassOneVsAll',
                                        eval_metric='AUC', task_type='GPU',
                                        iterations=2000)

        # Candidate features must come from the frame that is actually fitted:
        # X still contains the to_drop columns, X_train does not.
        self.model.select_features(X_train, y_train, verbose=500, eval_set=(X_val, y_val), steps=10,
                                   num_features_to_select=30, features_for_select=list(X_train.columns),
                                   algorithm='RecursiveByLossFunctionChange', train_final_model=True)

    def predict(self, X):
        """
        Return class scores for ``X``.

        In single-model mode this is the model's ``predict_proba`` output.
        Otherwise it is the mean of the fold models' ``predict_proba`` outputs.

        Raises:
            RuntimeError: if fold mode is active but no model has been trained.
        """
        if self.one_model:
            return self.model.predict_proba(X)

        if not self.models:
            raise RuntimeError('No trained models available; call fit() first.')

        y_pred = None
        for model in self.models:
            fold_proba = np.asarray(model.predict_proba(X), dtype=float)
            if y_pred is None:
                y_pred = fold_proba.copy()
            else:
                y_pred += fold_proba
        # Average over the number of models (not the number of classes).
        y_pred /= len(self.models)

        return y_pred

In [4]:
from lightgbm import LGBMClassifier


class LightGBMEstimator:
    """
    Trains one LightGBM classifier per cross-validation fold and averages
    their predictions.
    """

    def fit(self, X, y, n_splits, cat_features, to_drop):
        """
        Train one LightGBM model per group-aware stratified fold.

        ``X`` must still contain the ``id`` column, which is used to keep a
        client's rows in one fold; the columns in ``to_drop`` are removed
        right before fitting. Prints the weighted ROC AUC of every fold, then
        the mean, the std and mean - std over folds.
        """
        self.models = []
        scores = []
        for ind, (X_train, X_val, y_train, y_val) in enumerate(stratified_k_fold(X, y, n_splits,
                                                                                 random_state=42,
                                                                                 group='id')):
            X_train = X_train.drop(to_drop, axis=1)
            X_val = X_val.drop(to_drop, axis=1)

            model = LGBMClassifier(
                objective="multiclass",
                metric='multi_logloss',
                n_estimators=10,
                num_class=len(clusters),
                seed=42,
                # categorical columns are passed by position in the fitted frame
                cat_feature=[X_train.columns.get_loc(c) for c in cat_features]
            )

            model.fit(
                X_train,
                y_train,
                eval_set=[(X_val, y_val)]
            )

            self.models.append(model)
            y_pred = model.predict_proba(X_val)
            # Rows are renormalised to sum to 1 before scoring.
            scores.append(score(y_val, y_pred / y_pred.sum(axis=1).reshape(-1, 1)))
            print(f'model {ind}: score = {scores[-1].round(4)}')

        scores = np.array(scores)
        print(f'mean score = {scores.mean().round(4)}, std = {scores.std().round(4)}')
        print(f'overall score = {(scores.mean() - scores.std()).round(4)}')

    def predict(self, X):
        """
        Return the mean of the fold models' ``predict_proba`` outputs for ``X``.

        Raises:
            RuntimeError: if no model has been trained.
        """
        if not self.models:
            raise RuntimeError('No trained models available; call fit() first.')

        y_pred = None
        for model in self.models:
            fold_proba = np.asarray(model.predict_proba(X), dtype=float)
            if y_pred is None:
                y_pred = fold_proba.copy()
            else:
                y_pred += fold_proba
        # Average over the number of models (not the number of classes).
        y_pred /= len(self.models)

        return y_pred

AttributeError: module 'pandas.core.strings' has no attribute 'StringMethods'

In [141]:
# Categorical columns handed to the models: raw string columns plus the
# indicator columns created by FeatureExtractor.
cat_features = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year"
] + additional_cat_features

# Split the training table into features X and label y.
# NOTE(review): target_feature is reassigned to 'end_cluster_2' in another cell;
# re-running this cell after that one would look up a column that train does
# not have — confirm the intended execution order.
X = train.copy()
y = X[target_feature]
X = X.drop([target_feature], axis=1)

imputer = Imputer()
feature_extractor = FeatureExtractor()

# Impute/encode first, then add engineered features (train=True selects the
# month_1-based definition of is_new).
X = imputer.fit_transform(X)
X = feature_extractor.fit_transform(X, imputer, train=True)

  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])


In [120]:
# Inspect the imputed frame with the engineered feature columns.
X

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,start_cluster,is_new,cnt_null_features,line_cnt_e_cnt_g,balance_amt_diff,d_oper_1m_diff,e_oper_1m_diff,f_oper_1m_diff,g_oper_1m_diff,h_oper_1m_diff
0,0,month_1,0.901557,0.910179,1.413202,0.904349,16,10,2,1,...,4,0,49,0,-0.503023,0.0,0.110371,-0.048345,0.129693,0.129696
1,0,month_2,1.206317,1.036603,2.584604,1.210053,16,10,2,1,...,4,0,53,0,-1.548002,-0.040304,-0.035483,-0.071767,0.00833,0.003318
2,0,month_3,0.849365,0.94494,0.556037,0.851996,16,10,2,1,...,4,0,48,0,0.388903,0.0,-0.335959,-0.048345,0.0,-0.685725
3,1,month_1,0.075126,0.112827,0.011955,0.075358,11,5,2,0,...,1,0,57,0,0.100872,-0.098033,0.017147,-0.009383,-0.059819,-0.335305
4,1,month_2,0.06175,0.104183,0.006693,0.061941,11,5,2,0,...,1,0,57,1,0.09749,-0.698146,-0.023558,-0.010552,-0.064223,-0.099576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,199998,month_2,-0.000063,-0.000273,0.000008,-0.000063,20,5,2,0,...,2,0,8,1,-0.000281,0.0,0.0,0.0,0.0,0.0
599996,199998,month_3,-0.000064,-0.000273,0.0,-0.000064,20,5,2,0,...,16,0,13,1,-0.000273,0.0,-0.000001,0.0,0.0,0.0
599997,199999,month_1,0.0,-0.000226,0.000164,0.0,6,1,2,1,...,2,0,8,1,-0.00039,0.0,0.0,0.0,0.0,0.0
599998,199999,month_2,0.0,-0.000226,0.000164,0.0,6,1,2,1,...,2,0,8,1,-0.00039,0.0,0.0,0.0,0.0,0.0


In [37]:
# Build a "next month" target: every row is labelled with the end_cluster of
# the following row of the same client. The shift is done per id so a label
# can never leak in from the next client, regardless of row order.
# NOTE: this cell overwrites X, y and target_feature, so it is not idempotent;
# re-create X and y before running it again.
y.name = 'end_cluster_2'
y = y.groupby(X['id']).shift(-1)
# month_3 rows have no following month in the training data, so drop them.
X = X[X.date != 'month_3']
y = y.loc[X.index]

target_feature = 'end_cluster_2'

In [142]:
# 5-fold group-aware cross-validation; per-fold and aggregate scores are printed by fit().
clf = CatboostEstimator()
clf.fit(X, y, 5, cat_features, to_drop)

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6978080	best: 0.6978080 (0)	total: 169ms	remaining: 5m 38s
500:	test: 0.9031250	best: 0.9031250 (500)	total: 29.1s	remaining: 1m 26s
1000:	test: 0.8979383	best: 0.9031250 (500)	total: 57s	remaining: 56.9s
1500:	test: 0.8989684	best: 0.9031250 (500)	total: 1m 24s	remaining: 28.1s
1999:	test: 0.8986782	best: 0.9031250 (500)	total: 1m 52s	remaining: 0us
bestTest = 0.9031249534
bestIteration = 500
Shrink model to first 501 iterations.
{'{other}': 0.897987048315328, '{}': 0.9041073839246393, '{α, β}': 0.8756751277594426, '{α, γ}': 0.9110797935700224, '{α, δ}': 0.8915737534323811, '{α, ε, η}': 0.9229917470256667, '{α, ε, θ}': 0.8962586805870671, '{α, ε, ψ}': 0.927572794111044, '{α, ε}': 0.8777694023465438, '{α, η}': 0.9551774943744197, '{α, θ}': 0.8689809480214247, '{α, λ}': 0.8690113380340454, '{α, μ}': 0.8397458346009778, '{α, π}': 0.36366186932451094, '{α, ψ}': 0.9300135467957878, '{α}': 0.852707570438814, '{λ}': 0.9073733275859606}
model 0: score = 0.8814


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7212351	best: 0.7212351 (0)	total: 158ms	remaining: 5m 16s
500:	test: 0.9330258	best: 0.9330258 (500)	total: 28.9s	remaining: 1m 26s
1000:	test: 0.9402558	best: 0.9402729 (985)	total: 57s	remaining: 56.8s
1500:	test: 0.9418660	best: 0.9419535 (1465)	total: 1m 24s	remaining: 28.1s
1999:	test: 0.9417455	best: 0.9420309 (1725)	total: 1m 52s	remaining: 0us
bestTest = 0.9420308607
bestIteration = 1725
Shrink model to first 1726 iterations.
{'{other}': 0.906537800190754, '{}': 0.9104077482653836, '{α, β}': 0.8821178420244926, '{α, γ}': 0.9183858250660047, '{α, δ}': 0.8972960457557422, '{α, ε, η}': 0.9429240539184319, '{α, ε, θ}': 0.8879038567684778, '{α, ε, ψ}': 0.9537411809354509, '{α, ε}': 0.8796311819484135, '{α, η}': 0.9584599892053537, '{α, θ}': 0.8696248356605689, '{α, λ}': 0.8943804407144091, '{α, μ}': 0.8646495724925206, '{α, π}': 0.9961137917336822, '{α, ψ}': 0.9423598002619993, '{α}': 0.8579120473136062, '{λ}': 0.8997000424527881}
model 1: score = 0.9087


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7320155	best: 0.7320155 (0)	total: 150ms	remaining: 4m 59s
500:	test: 0.9347039	best: 0.9347039 (500)	total: 28.8s	remaining: 1m 26s
1000:	test: 0.9415048	best: 0.9416337 (985)	total: 56.6s	remaining: 56.5s
1500:	test: 0.9418825	best: 0.9423844 (1265)	total: 1m 24s	remaining: 28s
1999:	test: 0.9441209	best: 0.9443350 (1930)	total: 1m 51s	remaining: 0us
bestTest = 0.9443350309
bestIteration = 1930
Shrink model to first 1931 iterations.
{'{other}': 0.8995781311013615, '{}': 0.9088691870217402, '{α, β}': 0.8843548579082668, '{α, γ}': 0.9166635921017651, '{α, δ}': 0.8968196227037905, '{α, ε, η}': 0.9441800916005866, '{α, ε, θ}': 0.9160825399194215, '{α, ε, ψ}': 0.9346802185081746, '{α, ε}': 0.8864018500190453, '{α, η}': 0.9594525081205674, '{α, θ}': 0.8643117965630603, '{α, λ}': 0.9210193387982435, '{α, μ}': 0.871116532826935, '{α, π}': 0.9282821380356339, '{α, ψ}': 0.9480958111982274, '{α}': 0.856899907501462, '{λ}': 0.9732537300991915}
model 2: score = 0.9128


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7077521	best: 0.7077521 (0)	total: 155ms	remaining: 5m 9s
500:	test: 0.9223202	best: 0.9223525 (495)	total: 28.8s	remaining: 1m 26s
1000:	test: 0.9300421	best: 0.9300571 (985)	total: 56.9s	remaining: 56.8s
1500:	test: 0.9323376	best: 0.9323376 (1500)	total: 1m 24s	remaining: 28.1s
1999:	test: 0.9323418	best: 0.9327025 (1700)	total: 1m 51s	remaining: 0us
bestTest = 0.9327024853
bestIteration = 1700
Shrink model to first 1701 iterations.
{'{other}': 0.9058123602283604, '{}': 0.911306525211984, '{α, β}': 0.8863593445712651, '{α, γ}': 0.9194221880927145, '{α, δ}': 0.9018860536839133, '{α, ε, η}': 0.9466117175768718, '{α, ε, θ}': 0.9195848020299777, '{α, ε, ψ}': 0.912265154200638, '{α, ε}': 0.8816310283345288, '{α, η}': 0.960541941757482, '{α, θ}': 0.8767100925840814, '{α, λ}': 0.8522780665853615, '{α, μ}': 0.8735406221991254, '{α, π}': 0.8863845970076283, '{α, ψ}': 0.9290088790700634, '{α}': 0.8583100422082656, '{λ}': 0.9178955697130619}
model 3: score = 0.9013


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6980729	best: 0.6980729 (0)	total: 148ms	remaining: 4m 55s
500:	test: 0.9261186	best: 0.9261186 (500)	total: 29s	remaining: 1m 26s
1000:	test: 0.9345497	best: 0.9345497 (1000)	total: 56.7s	remaining: 56.6s
1500:	test: 0.9356424	best: 0.9359900 (1440)	total: 1m 24s	remaining: 28s
1999:	test: 0.9344260	best: 0.9359900 (1440)	total: 1m 52s	remaining: 0us
bestTest = 0.9359900397
bestIteration = 1440
Shrink model to first 1441 iterations.
{'{other}': 0.9059876751186441, '{}': 0.9077137212220833, '{α, β}': 0.8753699643727982, '{α, γ}': 0.9224526476705515, '{α, δ}': 0.8891915227862168, '{α, ε, η}': 0.9540549369291786, '{α, ε, θ}': 0.895330805778232, '{α, ε, ψ}': 0.9608383461781277, '{α, ε}': 0.8769858470284895, '{α, η}': 0.9586063093197112, '{α, θ}': 0.8912478797812453, '{α, λ}': 0.9297135087901649, '{α, μ}': 0.8509712183385587, '{α, π}': 0.9583649318721612, '{α, ψ}': 0.9271846451469592, '{α}': 0.8569486066813203, '{λ}': 0.9192978991602027}
model 4: score = 0.9101
mean score = 0.90

In [144]:
# Score the estimator on a validation part, renormalising rows to sum to 1.
# NOTE(review): X_val and y_val are not defined at top level in the visible
# cells (they are only local variables inside the estimator methods) —
# presumably left over from an earlier kernel state; confirm and define them
# explicitly so the notebook survives Restart & Run All.
predictions = clf.predict(X_val)
score(y_val, predictions / predictions.sum(axis=1).reshape(-1, 1)).round(4)

{'{other}': 0.8999071693373473, '{}': 0.9072328419351423, '{α, β}': 0.8806086218817131, '{α, γ}': 0.9154504232905452, '{α, δ}': 0.90494671230305, '{α, ε, η}': 0.9330267065487007, '{α, ε, θ}': 0.9131255740168657, '{α, ε, ψ}': 0.9248734176381884, '{α, ε}': 0.8734116779340073, '{α, η}': 0.9611213086438922, '{α, θ}': 0.8856308288660313, '{α, λ}': 0.8849825291068488, '{α, μ}': 0.8561198352291737, '{α, π}': 0.972307641024359, '{α, ψ}': 0.9370152219430486, '{α}': 0.857196417213885, '{λ}': 0.8627691230526868}


0.9021

In [145]:
# Feature importances of the single model, sorted from most to least important.
# NOTE(review): clf.model is only set by CatboostEstimator.fit_select_features();
# after clf.fit() alone the estimator holds clf.models instead — confirm which
# training call is expected to precede this cell.
df_features = clf.model.get_feature_importance(prettified=True).set_index('Feature Id')
df_features = df_features.sort_values('Importances', ascending=False)

In [148]:
# Show the 30 most important features.
df_features.head(30)

Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
start_cluster,21.494838
okved,8.72453
max_founderpres,6.278127
ogrn_exist_months,6.090779
balance_amt_min,4.862454
channel_code,4.787789
sum_of_paym_1y,4.421366
segment,4.320693
cnt_null_features,4.013097
balance_amt_max,3.723152


# Предсказания для теста, используя вероятности классов для month_6 уже обученной модели

In [144]:
# Load the test table with the same nullable dtypes as the training table.
test = get_test_data()
test = test.convert_dtypes()

In [145]:
# Preview the first rows of the test table.
test.head()

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,sum_cred_g_oper_3m,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster
0,200000,month_4,-0.096224,0.335496,-0.125995,-0.095578,channel_code_12,city_14,city_type_0,,...,0.010952,0.946066,0.407762,-0.15395,0.548895,0.54102,0.031742,0.257278,0.561353,{α}
1,200000,month_5,-0.024255,-0.059806,-0.124295,-0.023381,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.150505,0.549468,0.552131,0.237817,0.264211,0.715199,{α}
2,200000,month_6,0.045988,0.049418,-0.125995,0.047079,channel_code_12,city_14,city_type_0,,...,0.006812,0.945281,0.396267,-0.1528,0.549468,0.54102,0.387566,0.268543,0.836079,
3,200001,month_4,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}
4,200001,month_5,-0.156722,-0.20492,-0.125856,-0.156258,channel_code_9,city_76,city_type_0,,...,-0.028584,,,-0.165588,,,-0.201123,,,{α}


In [146]:
# Precomputed array that is later passed to get_final_proba() as the
# start-cluster probabilities.
# presumably month_6 start_cluster probabilities of shape (n_samples, n_clusters)
# produced by a separately trained model — TODO confirm
probs = np.load('/kaggle/input/probabilities-0/prob0.npy')

# Reuse the imputer fitted on train; train=False derives is_new from the
# number of rows per id.
test = imputer.transform(test)
test = feature_extractor.transform(test, train=False)
# Only month_6 rows are predicted.
test = test[test.date == 'month_6']
test = test.drop(to_drop, axis=1)

# Repeat every row once per cluster and give each copy a different candidate
# start_cluster, in the order of `clusters`.
# NOTE: this cell overwrites `test`, so it is not idempotent.
test = test.loc[test.index.repeat(len(clusters))]
test['start_cluster'] = clusters * (test.shape[0] // len(clusters))

  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])
  X.loc[:, column] = self.les[ind].transform(X[column])


In [147]:
# Encode the candidate start clusters with the encoder fitted on train.
# assumes start_cluster is the last string column seen by the imputer, so that
# imputer.les[-1] is its encoder — TODO confirm
test.loc[:, 'start_cluster'] = imputer.les[-1].transform(test['start_cluster']).astype(int)

In [148]:
# Make sure the encoded column really has an integer dtype.
test['start_cluster'] = test['start_cluster'].astype(int)

In [149]:
# Predict end-cluster scores for every (row, candidate start cluster) pair and
# renormalise each row to sum to 1.
predictions = clf.predict(test)
predictions = predictions / predictions.sum(axis=1).reshape(-1, 1)
# Fold the flat rows back into (n_samples, start_cluster, end_cluster); the
# sample count is inferred instead of being hard-coded.
# NOTE(review): assumes the model's class order matches `clusters` — confirm.
predictions = predictions.reshape((-1, len(clusters), len(clusters)))
# Weight each candidate start cluster by its probability and sum it out.
predictions = get_final_proba(probs, predictions)

In [151]:
# Fill the sample submission with the final probabilities and save it.
sample_submission = get_sample_submission()
# Guard: the prediction columns are written positionally in `clusters` order.
assert sample_submission.columns.tolist() == ['id'] + clusters

sample_submission.loc[:, clusters] = predictions

sample_submission.to_csv('submission.csv', index=False)

In [44]:
# Preview the first rows of the submission.
sample_submission.head()

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.013394,0.009504,0.022087,0.029783,0.007132,0.000393,0.002276,0.000773,0.014474,0.004992,0.019094,0.00057,0.001962,1.1e-05,0.002971,0.870573,1e-05
1,200001,0.007447,0.504575,0.000628,0.001795,0.000462,0.000128,0.000328,2.4e-05,0.001744,0.00859,0.001223,0.000297,0.001158,7e-06,0.00054,0.470957,9.8e-05
2,200002,0.577468,0.005104,0.005103,0.130668,0.010659,0.001265,0.002723,0.009384,0.019038,0.006043,0.013294,0.009871,0.001987,1.7e-05,0.052434,0.154908,3.4e-05
3,200003,0.031014,0.629658,0.000732,0.001709,0.000513,0.000751,0.000285,5.7e-05,0.001191,0.027887,0.00238,6.6e-05,0.001258,6e-06,0.000385,0.302087,2.1e-05
4,200004,0.061991,0.22822,0.013942,0.038092,0.003518,0.002798,0.001412,0.000172,0.006334,0.089316,0.010204,0.000903,0.003924,2e-06,0.002423,0.536519,0.00023
