In [2]:
import pandas as pd
import numpy as np
import os

def set_path(new_path: str):
    """Store ``new_path`` in the ``DATA_PATH`` environment variable read by the loaders."""
    os.environ.update({"DATA_PATH": new_path})

def get_data_path():
    """Return the data directory, always terminated by a path separator.

    The directory is read from the ``DATA_PATH`` environment variable and
    defaults to ``"data/"``. The loaders build file names by plain string
    concatenation (``path + "file"``), so a value without a trailing separator
    would otherwise yield a broken path such as ``"/inputtrain_data.pqt"``.

    Returns:
        str: directory path ending with a separator (an empty value stays empty).
    """
    # os.path.join(p, "") appends a separator only when p does not already end with one.
    return os.path.join(os.environ.get("DATA_PATH", "data/"), "")

def get_cluster_weights():
    """Read ``cluster_weights.xlsx`` from the data directory, indexed by the ``cluster`` column."""
    workbook = f"{get_data_path()}cluster_weights.xlsx"
    weights = pd.read_excel(workbook)
    return weights.set_index("cluster")

def get_train_data():
    """Read ``train_data.pqt`` from the data directory as a DataFrame."""
    return pd.read_parquet(f"{get_data_path()}train_data.pqt")

def get_test_data():
    """Read ``test_data.pqt`` from the data directory as a DataFrame."""
    return pd.read_parquet(f"{get_data_path()}test_data.pqt")

def get_sample_submission():
    """Read ``sample_submission.csv`` from the data directory as a DataFrame."""
    return pd.read_csv(f"{get_data_path()}sample_submission.csv")

def get_final_proba(test_start_cluster_proba: pd.DataFrame, transition_proba: np.ndarray) -> np.ndarray:
    """Combine start-cluster probabilities with per-sample transition matrices.

    For every sample ``i`` computes ``sum_j start[i, j] * transition[i, j, k]``.

    Args:
        test_start_cluster_proba: array-like (DataFrame or ndarray) of shape
            (n_samples, n_clusters).
        transition_proba: array-like of shape (n_samples, n_clusters, n_clusters),
            indexed as [sample, start cluster, end cluster].

    Returns:
        np.ndarray of shape (n_samples, n_clusters).

    Raises:
        ValueError: raised by ``np.einsum`` when the operand shapes do not match
            the ``"ij,ijk->ik"`` subscripts.
    """
    # Coerce explicitly so DataFrames and plain arrays are handled the same way.
    start = np.asarray(test_start_cluster_proba)
    transition = np.asarray(transition_proba)
    return np.einsum("ij,ijk->ik", start, transition)

# Cluster labels in the fixed order used throughout the notebook: this list is
# passed as `labels` to the metric and is asserted to match the submission
# columns (after `id`) when the submission file is written.
clusters = [
    '{other}',
    '{}',
    '{α, β}',
    '{α, γ}',
    '{α, δ}',
    '{α, ε, η}',
    '{α, ε, θ}',
    '{α, ε, ψ}',
    '{α, ε}',
    '{α, η}',
    '{α, θ}',
    '{α, λ}',
    '{α, μ}',
    '{α, π}',
    '{α, ψ}',
    '{α}',
    '{λ}'
]

In [5]:
# Point the loaders at the Kaggle input directory. The same call is repeated
# later behind the KAGGLE flag.
set_path('/kaggle/input/it-purple-hack-alpha/')

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

def weighted_roc_auc(y_true, y_pred, labels, weights_dict):
    """Weighted mean of the one-vs-rest ROC AUC of each class.

    ``weights_dict`` maps every label to an unnormalized weight. The weights
    are rescaled to sum to one before averaging the per-class AUC values, which
    are computed with ``average=None`` in the order given by ``labels``.
    """
    raw_weights = np.array([weights_dict[name] for name in labels])
    normalized = raw_weights / raw_weights.sum()
    per_class_auc = roc_auc_score(
        y_true,
        y_pred,
        labels=labels,
        multi_class="ovr",
        average=None,
    )
    weighted_terms = normalized * per_class_auc
    return sum(weighted_terms)

# Load the per-cluster weights once; `score` below looks them up by cluster label.
cluster_weights_ = get_cluster_weights()
weights_dict_ = cluster_weights_["unnorm_weight"].to_dict()

def score(y_true, y_pred):
    """Weighted ROC AUC over the notebook's cluster list using the loaded cluster weights."""
    return weighted_roc_auc(y_true, y_pred, clusters, weights_dict_)

In [110]:
class Imputer:
    """Fill missing values: column mode for non-numeric columns, a constant elsewhere.

    Args:
        ignore_columns: columns excluded from the mode computation. They are
            still filled with ``number_fill_value`` in ``transform`` if they
            contain missing values, as in the original implementation.
        number_fill_value: constant used for every value still missing after
            the mode fill.
    """

    def __init__(self, ignore_columns=('start_cluster',), number_fill_value=0):
        self.ignore_columns = tuple(ignore_columns)
        self.number_fill_value = number_fill_value

    def fit(self, X, y=None):
        """Learn the per-column mode of the non-numeric columns of ``X``.

        Returns:
            self, so calls can be chained.
        """
        # errors='ignore': frames without the ignored columns must not raise KeyError.
        candidates = (
            X.drop(columns=list(self.ignore_columns), errors='ignore')
            .select_dtypes(exclude='number')
        )
        if candidates.shape[1] == 0:
            # No non-numeric columns: nothing to learn, and mode().iloc[0] would fail.
            self.string_fill_value_ = pd.Series(dtype=object)
        else:
            modes = candidates.mode()
            # mode() has no rows when the frame itself is empty.
            self.string_fill_value_ = modes.iloc[0] if len(modes) else pd.Series(dtype=object)
        self.number_fill_value_ = self.number_fill_value
        return self

    def transform(self, X, y=None):
        """Return a filled copy of ``X``; the input frame is left untouched."""
        X = X.copy()
        if len(self.string_fill_value_):
            # A Series value fills column-wise, matched by column label.
            X = X.fillna(self.string_fill_value_)
        X = X.fillna(self.number_fill_value_)
        return X

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its filled copy."""
        self.fit(X)
        return self.transform(X)
    

class FeatureExtractor:
    """Select a fixed set of columns from a DataFrame."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; present for a fit/transform interface."""
        return None

    def transform(self, X, y=None):
        """Return ``X`` restricted to the configured columns."""
        selected = self.columns
        return X[selected]

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the selected columns of ``X``."""
        self.fit(X)
        projected = self.transform(X)
        return projected
    

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Environment switch: when True the data path points at the Kaggle input
# directory. The CatBoost estimators below also read this flag to choose
# GPU vs CPU, verbosity and the iteration count.
KAGGLE = True
if KAGGLE:
    set_path('/kaggle/input/it-purple-hack-alpha/')

In [112]:
# Load both splits and convert columns to pandas' nullable dtypes; the
# estimators below select categorical features with select_dtypes('string').
train = get_train_data().convert_dtypes()
test = get_test_data().convert_dtypes()

In [113]:
from sklearn.model_selection import train_test_split
# shuffle=False keeps rows in file order so consecutive rows stay together;
# the assert checks both parts can be cut into groups of three rows.
# NOTE(review): this rebinds `train`, so re-running the cell splits the
# already-split frame again.
train, val = train_test_split(train, test_size=0.2, shuffle=False)
assert len(train) % 3 == len(val) % 3 == 0

In [114]:
# Preview the first validation rows.
val.head()

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
480000,160000,month_1,-0.140437,-0.173369,-0.104965,-0.139923,channel_code_12,city_88,city_type_0,,...,0.945281,0.407762,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{}
480001,160000,month_2,-0.148302,-0.198397,-0.106751,-0.147812,channel_code_12,city_88,city_type_0,,...,0.944889,0.396267,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{}
480002,160000,month_3,-0.149538,-0.199147,-0.10943,-0.149052,channel_code_12,city_88,city_type_0,,...,0.944889,0.396267,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454,{α},{}
480003,160001,month_1,-0.151228,-0.185547,-0.125976,-0.150748,channel_code_0,city_6,city_type_0,,...,0.946458,0.442244,-0.11111,0.550615,0.596576,-0.11314,0.254101,0.473441,{α},{α}
480004,160001,month_2,-0.14948,-0.178898,-0.125842,-0.148994,channel_code_0,city_6,city_type_0,,...,0.946066,0.43075,-0.111074,0.550471,0.596576,-0.152004,0.253234,0.451463,{α},{α}


In [115]:
# Distribution of the number of rows per id in the test set.
test.id.value_counts().value_counts()

count
3    90120
2     9880
Name: count, dtype: Int64

In [116]:
# Add a row with date=month_4 for those ids where it is missing (copied from month_5).
month5_mask = test.date == 'month_5'
all_ids = test.loc[month5_mask, 'id'].unique()
month4_ids = test.loc[test.date == 'month_4', 'id']
missing_ids = list(set(all_ids) - set(month4_ids))

print(len(missing_ids))

# Filter first, then build the synthetic rows with .assign on a fresh frame:
# this avoids writing into a slice of `test` and only touches the needed rows.
month5_df = (
    test.loc[month5_mask & test.id.isin(missing_ids)]
    .assign(date='month_4', start_cluster='{}')
)
assert month5_df.shape[0] == len(missing_ids)

test = pd.concat([test, month5_df], axis=0).reset_index(drop=True).sort_values(by=['id', 'date'])
len(test)

9880


300000

In [117]:
# Fit the imputer on the training part only, then apply it to every split.
imputer = Imputer()
imputer.fit(train)

train, val, test = (imputer.transform(frame) for frame in (train, val, test))

# Every column except the identifiers and the target is a feature.
columns = train.columns.drop(['id', 'date', 'end_cluster']).tolist()

feature_extractor = FeatureExtractor(columns)
feature_extractor.fit(train)

X_train, X_val, X_test = (
    feature_extractor.transform(frame) for frame in (train, val, test)
)

y_train, y_val = train['end_cluster'], val['end_cluster']

### Модель краткосрочного предсказания

In [118]:
def get_month_dataset(X, n_months=3):
    """Pivot a long frame (``n_months`` consecutive rows per entity) into a wide frame.

    Row ``k`` of the result holds rows ``n_months*k .. n_months*k + n_months - 1``
    of ``X`` side by side; each column ``col`` of month ``i`` is renamed to
    ``f"{col}_{i}"`` with ``i`` starting at 1.

    Args:
        X: DataFrame whose rows are grouped in consecutive blocks of ``n_months``.
        n_months: number of rows per block (default 3, the original behavior).

    Returns:
        DataFrame with ``len(X) // n_months`` rows and ``n_months * X.shape[1]`` columns.

    Raises:
        ValueError: if ``n_months`` is not positive or ``len(X)`` is not a
            multiple of ``n_months``.
    """
    if n_months < 1:
        raise ValueError(f"n_months must be positive, got {n_months}")
    if len(X) % n_months != 0:
        raise ValueError(
            f"number of rows ({len(X)}) is not a multiple of n_months ({n_months})"
        )

    # Positional slicing: row order, not the index, defines the month of a row.
    monthly = [
        X.iloc[offset::n_months, :].reset_index(drop=True)
        for offset in range(n_months)
    ]

    result = pd.concat(monthly, axis=1)
    result.columns = [f'{col}_{i}' for i in range(1, n_months + 1) for col in X.columns]

    return result


month_X_train = get_month_dataset(X_train)
month_X_val = get_month_dataset(X_val)
month_X_test = get_month_dataset(X_test)

# The third month's start cluster is the target of the short-term model.
month_X_train, month_y_train = month_X_train.drop(columns=['start_cluster_3']), month_X_train['start_cluster_3']
month_X_val, month_y_val = month_X_val.drop(columns=['start_cluster_3']), month_X_val['start_cluster_3']

# Drop the target column from the test features as well, so the frame passed
# to predict has exactly the columns the model was trained on.
month_X_test = month_X_test.drop(columns=['start_cluster_3'])
assert month_X_test.columns.tolist() == month_X_train.columns.tolist()

In [119]:
# Preview the wide (one row per id) training features.
month_X_train.head()

Unnamed: 0,balance_amt_avg_1,balance_amt_max_1,balance_amt_min_1,balance_amt_day_avg_1,channel_code_1,city_1,city_type_1,index_city_code_1,ogrn_days_end_month_1,ogrn_days_end_quarter_1,...,cnt_days_deb_g_oper_3m_3,sum_cred_g_oper_3m_3,cnt_cred_g_oper_3m_3,cnt_days_cred_g_oper_3m_3,sum_deb_h_oper_3m_3,cnt_deb_h_oper_3m_3,cnt_days_deb_h_oper_3m_3,sum_cred_h_oper_3m_3,cnt_cred_h_oper_3m_3,cnt_days_cred_h_oper_3m_3
0,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,-0.488553,-0.135063,...,0.728507,0.033607,0.946458,0.442244,0.87705,0.551044,0.663243,0.810065,0.294829,0.956958
1,-0.081586,-0.09186,-0.11404,-0.08089,channel_code_2,city_14,city_type_0,index_city_code_46,0.324343,1.258747,...,0.772463,-0.019422,0.948027,0.488221,0.043221,0.560788,0.707687,-0.167905,0.259011,0.605309
2,-0.154685,-0.186795,-0.122805,-0.154215,channel_code_12,city_613,city_type_306,index_city_code_46,-0.256297,-1.257854,...,0.728507,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.176302,0.252368,0.429485
3,-0.156643,-0.204861,-0.12566,-0.156179,channel_code_14,city_21,city_type_0,index_city_code_46,-1.185321,-0.367365,...,0.728507,-0.028584,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.37454
4,-0.138847,-0.182486,-0.12563,-0.138328,channel_code_8,city_21,city_type_0,index_city_code_46,-1.417577,-0.444799,...,0.871364,0.053048,0.957051,0.637647,-0.078297,0.556919,0.64102,-0.172533,0.253523,0.462452


Для начала попробуем обычную матрицу вероятностей

In [120]:
class AAA:
    """Bare attribute holder; the instance below exposes the cluster list as ``data.clusters``."""

    clusters = None


data = AAA()
setattr(data, "clusters", clusters)

In [122]:
class DummyPredictor:
    """Baseline: predict that the third-month cluster equals the second-month cluster.

    Args:
        clusters: ordered list of cluster labels defining the output columns.
            Defaults to ``data.clusters`` (looked up at prediction time).
    """

    def __init__(self, clusters=None):
        self.clusters = clusters

    def fit(self, X, y):
        """Nothing to learn."""
        pass

    def predict(self, X):
        """Return a one-hot DataFrame of ``X['start_cluster_2']`` over all clusters.

        ``pd.get_dummies`` alone only emits columns for the values present in
        ``X``, which would misalign the columns with the label order expected
        by the metric; reindexing guarantees one column per cluster, in order.
        """
        labels = list(data.clusters if self.clusters is None else self.clusters)
        one_hot = pd.get_dummies(X['start_cluster_2'])
        return one_hot.reindex(columns=labels, fill_value=0)

    
class MonthProbabilityEstimator:
    """Empirical P(cluster in month 3 | clusters in months 1 and 2).

    Args:
        clusters: ordered list of cluster labels defining the output columns.
            Defaults to ``data.clusters`` (looked up when needed).
    """

    def __init__(self, clusters=None):
        self.clusters = clusters

    def _labels(self):
        # Resolve lazily so the default follows the notebook-level cluster list.
        return list(data.clusters if self.clusters is None else self.clusters)

    @staticmethod
    def _pair_key(X):
        # Key of a row: "<cluster month 1>_<cluster month 2>".
        return X['start_cluster_1'].astype(str) + '_' + X['start_cluster_2'].astype(str)

    def fit(self, X, y):
        """Estimate the transition table from (month 1, month 2) pairs to ``y``."""
        table = pd.crosstab(self._pair_key(X), y, normalize='index')
        # crosstab only has columns for labels seen in y; align to the full,
        # ordered cluster list so every row has one probability per cluster.
        self.probs = table.reindex(columns=self._labels(), fill_value=0.0)

    def predict(self, X):
        """Return an (n_rows, n_clusters) array of probabilities.

        Pairs never seen during ``fit`` fall back to "stay in the month-2
        cluster"; if that cluster is not a known label, a uniform distribution
        is used. The fitted table is not modified.
        """
        labels = self._labels()
        keys = self._pair_key(X)
        probs = self.probs.reindex(columns=labels, fill_value=0.0)

        unseen = [key for key in pd.unique(keys) if key not in probs.index]
        if unseen:
            fallback = np.zeros((len(unseen), len(labels)))
            for row, key in enumerate(unseen):
                last_cluster = key.split('_')[1]
                if last_cluster in labels:
                    fallback[row, labels.index(last_cluster)] = 1
                else:
                    fallback[row, :] = 1 / len(labels)
            # One concat for all unseen keys instead of growing the table per key.
            probs = pd.concat([probs, pd.DataFrame(fallback, columns=labels, index=unseen)])

        return probs.loc[keys].values
    
    
from catboost import CatBoostClassifier
class CatboostEstimator:
    """CatBoost one-vs-all classifier for the short-term (month 3) cluster."""

    def fit(self, X, y, X_val, y_val):
        """Train on ``(X, y)`` using ``(X_val, y_val)`` as the evaluation set."""
        X = X.copy()

        # Settings depend on the notebook-level KAGGLE flag.
        if KAGGLE:
            init_verbose, device, n_iterations, fit_verbose = 0, 'GPU', None, 500
        else:
            init_verbose, device, n_iterations, fit_verbose = 1, 'CPU', 1, 1

        categorical = X.select_dtypes('string').columns.tolist()
        self.model = CatBoostClassifier(
            cat_features=categorical,
            verbose=init_verbose,
            loss_function='MultiClassOneVsAll',
            eval_metric='AUC',
            task_type=device,
            iterations=n_iterations,
        )

        self.model.fit(X, y, verbose=fit_verbose, eval_set=(X_val, y_val))

    def predict(self, X):
        """Return the model's ``predict_proba`` output for ``X``."""
        probabilities = self.model.predict_proba(X)
        return probabilities

# Fit the three short-term predictors on the wide monthly training data.
dummy_predictor = DummyPredictor()
dummy_predictor.fit(month_X_train, month_y_train)

month_probability_estimator = MonthProbabilityEstimator()
month_probability_estimator.fit(month_X_train, month_y_train)

# The CatBoost model additionally receives the validation split as its eval set.
catboost_estimator = CatboostEstimator()
catboost_estimator.fit(month_X_train, month_y_train, month_X_val, month_y_val)

Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.7718740	best: 0.7718740 (0)	total: 109ms	remaining: 1m 48s
500:	test: 0.9945296	best: 0.9945296 (500)	total: 23.1s	remaining: 23.1s
999:	test: 0.9959888	best: 0.9959888 (999)	total: 43.1s	remaining: 0us
bestTest = 0.9959887685
bestIteration = 999


In [125]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Compare the three short-term predictors on the validation split.
month_y_val_pred = dummy_predictor.predict(month_X_val)
dummy_score = score(month_y_val, month_y_val_pred)
print('Dummy predictor score:', dummy_score)

month_y_val_pred = month_probability_estimator.predict(month_X_val)
probability_score = score(month_y_val, month_y_val_pred)
print('Probability estimator score:', probability_score)

# The CatBoost output is rescaled so every row sums to one before scoring.
month_y_val_pred = catboost_estimator.predict(month_X_val)
month_y_val_pred = month_y_val_pred / month_y_val_pred.sum(axis=1, keepdims=True)
catboost_score = score(month_y_val, month_y_val_pred)
print('Catboost estimator score:', catboost_score)

Dummy predictor score: 0.9134134410143546
Probability estimator score: 0.9444630401101588
Catboost estimator score: 0.9876391228948176


In [138]:
# Short-term prediction for the test set, rescaled so every row sums to one.
test_month6_estimation = catboost_estimator.predict(month_X_test)
test_month6_estimation = test_month6_estimation / test_month6_estimation.sum(axis=1, keepdims=True)

In [137]:
# Short-term prediction for the validation set, rescaled so every row sums to one.
val_month3_estimation = catboost_estimator.predict(month_X_val)
val_month3_estimation = val_month3_estimation / val_month3_estimation.sum(axis=1, keepdims=True)
val_month3_estimation.shape

(40000, 17)

### Долгосрочное предсказание

In [69]:
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
from catboost import CatBoostClassifier

class YearProbabilityEstimator:
    """One CatBoost model per start cluster, predicting the end cluster.

    Args:
        clusters: ordered list of cluster labels; defines which per-cluster
            models are trained and the column order of ``predict``. Defaults to
            ``data.clusters`` (looked up when needed).
    """

    def __init__(self, clusters=None):
        self.clusters = clusters

    def _labels(self):
        # Resolve lazily so the default follows the notebook-level cluster list.
        return list(data.clusters if self.clusters is None else self.clusters)

    def fit(self, X, y, X_val, y_val):
        """Train one model per start cluster on the rows that start in it.

        For each cluster the evaluation set is restricted to validation rows
        with the same start cluster whose target was seen in that cluster's
        training subset.
        """
        start_cluster = X['start_cluster']
        start_cluster_val = X_val['start_cluster']
        X = X.drop(columns=['start_cluster'])
        X_val = X_val.drop(columns=['start_cluster'])

        # Loop invariants: identical for every per-cluster model.
        cat_features = X.select_dtypes('string').columns.tolist()
        init_verbose = 1 if not KAGGLE else 0
        fit_verbose = 1 if not KAGGLE else 500

        self.models = {}
        for cluster in tqdm(self._labels()):
            model = CatBoostClassifier(cat_features=cat_features, verbose=init_verbose,
                                       loss_function='MultiClassOneVsAll', eval_metric='AUC',
                                       task_type='GPU' if KAGGLE else 'CPU',
                                       iterations=1 if not KAGGLE else None)

            train_mask = start_cluster == cluster
            X_train_new = X[train_mask]
            y_train_new = y[train_mask]

            # Keep only validation targets known to this cluster's training subset.
            val_mask = (start_cluster_val == cluster) & y_val.isin(y_train_new.unique())
            X_val_new = X_val[val_mask]
            y_val_new = y_val[val_mask]

            model.fit(X_train_new, y_train_new, verbose=fit_verbose,
                      eval_set=(X_val_new, y_val_new))

            self.models[cluster] = model

    def predict(self, X):
        """Return an (n_rows, n_clusters) float array of end-cluster scores.

        Each row is scored by the model of its ``start_cluster``; columns for
        classes a model never saw, and rows whose start cluster has no model,
        stay at 0. Rows are addressed by position, so a non-unique index in
        ``X`` is handled correctly.
        """
        labels = self._labels()
        column_of = {label: position for position, label in enumerate(labels)}
        start_cluster = X['start_cluster']
        features = X.drop(columns=['start_cluster'])

        # A float array from the start avoids an object-dtype frame and the
        # fillna(0) downcast it required.
        result = np.zeros((len(X), len(labels)), dtype=float)

        for cluster, model in self.models.items():
            mask = (start_cluster == cluster).to_numpy(dtype=bool, na_value=False)
            if not mask.any():
                continue
            columns = [column_of[label] for label in model.classes_]
            result[np.ix_(np.flatnonzero(mask), columns)] = model.predict_proba(features[mask])

        return result

In [70]:
# Train the per-start-cluster models; the validation split serves as eval set.
probability_estimator = YearProbabilityEstimator()
probability_estimator.fit(X_train, y_train, X_val, y_val)

  0%|          | 0/17 [00:00<?, ?it/s]Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6186580	best: 0.6186580 (0)	total: 52.3ms	remaining: 52.3s
500:	test: 0.7391796	best: 0.7394312 (495)	total: 12.2s	remaining: 12.1s


  6%|▌         | 1/17 [00:24<06:27, 24.19s/it]

999:	test: 0.7451736	best: 0.7456299 (890)	total: 23.2s	remaining: 0us
bestTest = 0.745629934
bestIteration = 890
Shrink model to first 891 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5124808	best: 0.5124808 (0)	total: 57.6ms	remaining: 57.5s
500:	test: 0.5452845	best: 0.5490341 (420)	total: 11.2s	remaining: 11.2s


 12%|█▏        | 2/17 [00:47<05:56, 23.76s/it]

999:	test: 0.5305284	best: 0.5490341 (420)	total: 21.9s	remaining: 0us
bestTest = 0.5490340533
bestIteration = 420
Shrink model to first 421 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2897564	best: 0.2897564 (0)	total: 42.1ms	remaining: 42s
500:	test: 0.4010966	best: 0.4010966 (500)	total: 9.81s	remaining: 9.78s


 18%|█▊        | 3/17 [01:07<05:09, 22.08s/it]

999:	test: 0.4063927	best: 0.4068318 (650)	total: 19.2s	remaining: 0us
bestTest = 0.4068318264
bestIteration = 650
Shrink model to first 651 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.3896041	best: 0.3896041 (0)	total: 35.4ms	remaining: 35.4s
500:	test: 0.4704280	best: 0.4704280 (500)	total: 10.6s	remaining: 10.6s


 24%|██▎       | 4/17 [01:30<04:51, 22.43s/it]

999:	test: 0.4585890	best: 0.4711861 (615)	total: 21.8s	remaining: 0us
bestTest = 0.4711861234
bestIteration = 615
Shrink model to first 616 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.3506344	best: 0.3506344 (0)	total: 43.1ms	remaining: 43.1s
500:	test: 0.4124482	best: 0.4163020 (275)	total: 9.58s	remaining: 9.55s


 29%|██▉       | 5/17 [01:50<04:18, 21.51s/it]

999:	test: 0.4083274	best: 0.4163020 (275)	total: 19s	remaining: 0us
bestTest = 0.4163019762
bestIteration = 275
Shrink model to first 276 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1494326	best: 0.1494326 (0)	total: 31.7ms	remaining: 31.7s
500:	test: 0.1991223	best: 0.2002645 (415)	total: 8.51s	remaining: 8.47s


 35%|███▌      | 6/17 [02:08<03:44, 20.41s/it]

999:	test: 0.1950131	best: 0.2002645 (415)	total: 17.5s	remaining: 0us
bestTest = 0.2002644998
bestIteration = 415
Shrink model to first 416 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1546447	best: 0.1546447 (0)	total: 23.5ms	remaining: 23.4s
500:	test: 0.2210885	best: 0.2251697 (350)	total: 9s	remaining: 8.96s


 41%|████      | 7/17 [02:28<03:21, 20.11s/it]

999:	test: 0.2158237	best: 0.2251697 (350)	total: 18.7s	remaining: 0us
bestTest = 0.2251697168
bestIteration = 350
Shrink model to first 351 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2485230	best: 0.2485230 (0)	total: 27.6ms	remaining: 27.6s
500:	test: 0.3882218	best: 0.3882218 (500)	total: 9.1s	remaining: 9.07s


 47%|████▋     | 8/17 [02:45<02:53, 19.32s/it]

999:	test: 0.3907445	best: 0.3923760 (955)	total: 16.9s	remaining: 0us
bestTest = 0.3923760146
bestIteration = 955
Shrink model to first 956 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5483010	best: 0.5483010 (0)	total: 52ms	remaining: 51.9s
500:	test: 0.6371953	best: 0.6417777 (485)	total: 9.95s	remaining: 9.91s


 53%|█████▎    | 9/17 [03:07<02:39, 19.93s/it]

999:	test: 0.6363686	best: 0.6440266 (820)	total: 20.4s	remaining: 0us
bestTest = 0.6440266272
bestIteration = 820
Shrink model to first 821 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.5407475	best: 0.5407475 (0)	total: 49.6ms	remaining: 49.5s
500:	test: 0.5821079	best: 0.5924660 (390)	total: 10.3s	remaining: 10.2s


 59%|█████▉    | 10/17 [03:28<02:23, 20.48s/it]

999:	test: 0.5687394	best: 0.5924660 (390)	total: 20.5s	remaining: 0us
bestTest = 0.5924660083
bestIteration = 390
Shrink model to first 391 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.4736446	best: 0.4736446 (0)	total: 51.2ms	remaining: 51.2s
500:	test: 0.5219943	best: 0.5269959 (400)	total: 10.6s	remaining: 10.5s


 65%|██████▍   | 11/17 [03:50<02:04, 20.69s/it]

999:	test: 0.5209824	best: 0.5269959 (400)	total: 20.4s	remaining: 0us
bestTest = 0.5269958596
bestIteration = 400
Shrink model to first 401 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.3593726	best: 0.3593726 (0)	total: 41.1ms	remaining: 41s
500:	test: 0.4355444	best: 0.4531497 (55)	total: 9.32s	remaining: 9.28s


 71%|███████   | 12/17 [04:08<01:40, 20.01s/it]

999:	test: 0.4287836	best: 0.4531497 (55)	total: 17.7s	remaining: 0us
bestTest = 0.4531497005
bestIteration = 55
Shrink model to first 56 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.1929718	best: 0.1929718 (0)	total: 26ms	remaining: 26s
500:	test: 0.2336467	best: 0.2383824 (360)	total: 9.08s	remaining: 9.04s


 76%|███████▋  | 13/17 [04:26<01:17, 19.49s/it]

999:	test: 0.2307084	best: 0.2383824 (360)	total: 17.5s	remaining: 0us
bestTest = 0.2383823683
bestIteration = 360
Shrink model to first 361 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.0000000	best: 0.0000000 (0)	total: 13.6ms	remaining: 13.5s
500:	test: 0.1000000	best: 0.1000000 (25)	total: 6.77s	remaining: 6.75s


 82%|████████▏ | 14/17 [04:41<00:54, 18.06s/it]

999:	test: 0.1000000	best: 0.1000000 (25)	total: 14s	remaining: 0us
bestTest = 0.1
bestIteration = 25
Shrink model to first 26 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2746480	best: 0.2746480 (0)	total: 26.1ms	remaining: 26s
500:	test: 0.3017646	best: 0.3092263 (465)	total: 8.62s	remaining: 8.59s


 88%|████████▊ | 15/17 [04:59<00:35, 17.94s/it]

999:	test: 0.2877955	best: 0.3092263 (465)	total: 16.8s	remaining: 0us
bestTest = 0.3092263221
bestIteration = 465
Shrink model to first 466 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.6118200	best: 0.6118200 (0)	total: 163ms	remaining: 2m 42s
500:	test: 0.7731584	best: 0.7731584 (500)	total: 22.9s	remaining: 22.8s
999:	test: 0.7878065	best: 0.7922729 (895)	total: 45.5s	remaining: 0us
bestTest = 0.7922728817
bestIteration = 895
Shrink model to first 896 iterations.


 94%|█████████▍| 16/17 [05:48<00:27, 27.47s/it]Default metric period is 5 because AUC is/are not implemented for GPU
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2500000	best: 0.2500000 (0)	total: 20.5ms	remaining: 20.5s
500:	test: 0.2182540	best: 0.2500000 (0)	total: 6.89s	remaining: 6.87s


100%|██████████| 17/17 [06:03<00:00, 21.36s/it]

999:	test: 0.1944444	best: 0.2500000 (0)	total: 13.4s	remaining: 0us
bestTest = 0.25
bestIteration = 0
Shrink model to first 1 iterations.





In [74]:
# Validation score of the long-term model using the true start clusters.
predictions = probability_estimator.predict(X_val)
score(y_val, predictions / predictions.sum(axis=1, keepdims=True))

  return result.fillna(0).values


0.898878155162233

### Комбинация вероятностей

In [89]:
import warnings
warnings.filterwarnings('ignore')

# val_prob[n, i, :] = end-cluster scores for sample n if it started in cluster i.
# The last-month slice does not change between iterations, so take it once.
val_last_month = X_val.iloc[2::3]
val_prob = np.zeros((len(val_last_month), len(data.clusters), len(data.clusters)))
# Wrapping the list (not the enumerate) lets tqdm know the total.
for i, cluster in enumerate(tqdm(data.clusters)):
    val_to_predict = val_last_month.assign(start_cluster=cluster)
    val_prob[:, i, :] = probability_estimator.predict(val_to_predict)

17it [00:11,  1.46it/s]


In [141]:
# Normalize every transition row, then mix the rows with the short-term prediction.
val_transition = val_prob / val_prob.sum(axis=2, keepdims=True)
result = get_final_proba(val_month3_estimation, val_transition)
score(y_val.iloc[2::3], result)

0.8972407397818536

In [105]:
# test_prob[n, i, :] = normalized end-cluster scores for sample n if it started in cluster i.
# Size the array from the data instead of a hard-coded row count.
test_last_month = X_test.iloc[2::3]
test_prob = np.zeros((len(test_last_month), len(data.clusters), len(data.clusters)))
# Wrapping the list (not the enumerate) lets tqdm know the total.
for i, cluster in enumerate(tqdm(data.clusters)):
    test_to_predict = test_last_month.assign(start_cluster=cluster)
    result = probability_estimator.predict(test_to_predict)
    result /= result.sum(axis=1).reshape(-1, 1)
    test_prob[:, i, :] = result

17it [00:29,  1.75s/it]


In [142]:
# Mix the per-start-cluster transition rows with the short-term prediction.
result = get_final_proba(test_month6_estimation, test_prob)
result.shape

(100000, 17)

In [143]:
sample_submission = get_sample_submission()
# The prediction columns follow data.clusters, so the file must use the same order.
assert sample_submission.columns.tolist() == ['id'] + data.clusters

# NOTE(review): rows are written positionally; this assumes the sample
# submission ids are in the same order as the sorted test ids — confirm.
sample_submission.loc[:, data.clusters] = result

sample_submission.to_csv('submission.csv', index=False)

In [144]:
# Preview the first submission rows.
sample_submission.head()

Unnamed: 0,id,{other},{},"{α, β}","{α, γ}","{α, δ}","{α, ε, η}","{α, ε, θ}","{α, ε, ψ}","{α, ε}","{α, η}","{α, θ}","{α, λ}","{α, μ}","{α, π}","{α, ψ}",{α},{λ}
0,200000,0.013816,0.021262,0.016709,0.025686,0.01092,0.001039,0.002969,0.000786,0.022095,0.005956,0.014436,0.001295,0.002043,1.9e-05,0.0037,0.857202,6.7e-05
1,200001,0.007638,0.596125,0.000843,0.002271,0.00059,0.000188,0.000416,5.1e-05,0.00195,0.010276,0.001561,0.000181,0.000859,1.1e-05,0.000618,0.376338,8.4e-05
2,200002,0.619606,0.003267,0.002878,0.066757,0.012371,0.001807,0.004401,0.010169,0.066949,0.004721,0.023269,0.004221,0.002522,6e-06,0.064609,0.112074,0.000372
3,200003,0.021033,0.653097,0.001123,0.001626,0.000487,0.000658,0.00025,7.1e-05,0.001306,0.021795,0.002925,0.000163,0.000883,1.6e-05,0.000511,0.294003,5.4e-05
4,200004,0.060303,0.163008,0.00776,0.016375,0.002499,0.00181,0.000412,8.6e-05,0.005512,0.11099,0.005574,0.000302,0.010122,1.7e-05,0.000854,0.614313,6.3e-05


In [145]:
# Every submission row must be a probability distribution (sum to 1 within 1e-6).
row_totals = sample_submission.drop(columns='id').sum(axis=1)
assert row_totals.sub(1).abs().lt(1e-6).all()