In [None]:
import json
import os
from glob import glob
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
from google.colab import drive
drive.mount('/content/myDrive')

Drive already mounted at /content/myDrive; to attempt to forcibly remount, call drive.mount("/content/myDrive", force_remount=True).


In [None]:
# os.chdir('myDrive/MyDrive/ufru')

In [None]:
# !rm -rf public_data
# !rm -rf train_data

In [None]:
# !unzip public_data.zip > public_data_std_out
# !unzip train_data.zip > train_data_std_out
# !rm public_data_std_out
# !rm train_data_std_out

### Read y_train

In [None]:
y = pd.read_csv('train_anwers.csv', sep='\t').astype(int)
y_train = y.target
y.head()

Unnamed: 0,name,target
0,29970,1
1,18654,0
2,29133,0
3,20927,0
4,5526,0


### Read train data as jsons

In [None]:
raw_data = {}
for name in tqdm(y['name']):
    with open(f'train_data/{name}.json', 'r', encoding='utf-8') as inp:
        raw_data[name] = json.load(inp)

  0%|          | 0/8000 [00:00<?, ?it/s]

### Remove nesting in dict and get keys

In [None]:
flatten_keys = defaultdict(int)
def get_flatten_keys(d, parent_key='', sep='.'):
    if d is None:
        return
    next_sep = sep
    sep = sep if parent_key else ''
    if isinstance(d, dict):
        for k, v in d.items():
            get_flatten_keys(v, f"{parent_key}{sep}{k}", sep=next_sep)
    elif isinstance(d, list):
        for i, item in enumerate(d):
            get_flatten_keys(item, f"{parent_key}{sep}{i}", sep=next_sep)
    else:
        flatten_keys[parent_key] += 1

In [None]:
for data in tqdm(raw_data.values()):
    get_flatten_keys(data)
flatten_keys = dict(flatten_keys)

  0%|          | 0/8000 [00:00<?, ?it/s]

In [None]:
len(flatten_keys)

93270

### Delete keys where NaN more than 20%

In [None]:
useless_keys = set()
ln = 8000
for key, val in flatten_keys.items():
    if val < ln * 0.8:
        useless_keys.add(key)

In [None]:
len(useless_keys)

92746

In [None]:
for key in useless_keys:
    del flatten_keys[key]

In [None]:
len(flatten_keys)

524

### Get data by keys

In [None]:
def extract_features(data, features_names):
    def get_value(obj, keys_list):
        if not keys_list:
            return obj
        key = keys_list.pop(0)
        key = int(key) if key.isdigit() else key
        try:
            return get_value(obj[key], keys_list)
        except:
            return None
        
    train_data = defaultdict(list)

    for i, game_data in tqdm(enumerate(data.values())):
        for key_str in features_names:
            keys = key_str.split('.')
            val = get_value(game_data, keys)
                    
            train_data[key_str].append(val)
    df = pd.DataFrame(train_data)
    return df
        

In [None]:
def preprocess_features(data):
    df = data.copy()
    # replace nans with median value
    for col in tqdm(df.columns):
        if df[col].isna().sum() > 0:
            df[col].fillna(df[col].median(), inplace=True)

    # bool features to int
    for col in df.select_dtypes(include=['object']).columns.tolist():
        df[col] = df[col].astype(int)

    return df

In [None]:
X_train = extract_features(raw_data, flatten_keys)

0it [00:00, ?it/s]

In [None]:
X_train2 = preprocess_features(X_train)

  0%|          | 0/524 [00:00<?, ?it/s]

In [None]:
X_train2

Unnamed: 0,cluster,dire_score,duration,engine,first_blood_time,game_mode,human_players,leagueid,lobby_type,match_seq_num,...,players.9.kills_per_min,players.9.kda,players.9.abandons,players.9.is_subscriber,patch,region,players.7.ability_upgrades_arr.0,players.7.ability_upgrades_arr.1,players.7.ability_upgrades_arr.2,players.7.ability_upgrades_arr.3
0,183,34,2022,1,100,0,10,0,0,5780373336,...,0.059347,0.0,0.0,0,51,8,5143.0,5146.0,5147.0,5154.0
1,-123,47,2648,0,54,22,10,0,7,0,...,0.226586,2.0,0.0,0,51,3,5651.0,5653.0,5652.0,5652.0
2,273,65,2684,1,101,18,10,0,0,5780373039,...,0.166436,1.0,0.0,0,51,3,5143.0,5146.0,5147.0,5154.0
3,184,83,0,0,20,22,-1,0,7,5780347103,...,0.328125,2.0,0.0,0,51,8,5359.0,5357.0,5359.0,5358.0
4,154,65,0,1,101,22,10,0,7,5780418598,...,0.246533,0.0,0.0,0,0,-1,5143.0,5146.0,5147.0,5154.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-40,56,2531,1,110,22,0,0,7,5780379689,...,0.189648,2.0,0.0,0,51,9,5143.0,5146.0,5147.0,5154.0
7996,184,33,1158,1,158,23,10,0,0,5780312140,...,-0.175013,0.0,0.0,0,-1,-3,-1618.0,6483.0,0.0,6483.0
7997,274,40,88,1,104,22,0,0,7,5780365863,...,-0.017416,0.0,0.0,0,0,3,5228.0,5227.0,5228.0,5227.0
7998,153,20,2200,0,76,7,10,0,7,5780378245,...,0.109091,1.0,0.0,0,51,0,5450.0,5448.0,5450.0,7320.0


### Remove features that correlates with target less that 0.05

In [None]:
df_corr = X_train2.copy()
df_corr.insert(0, 'y', y['target'].astype(int))
corrs = df_corr.corr()['y'].abs() > 0.05
to_drop = []
for col in corrs.index:
    if not corrs[col]:
        to_drop.append(col)

In [None]:
X_train3 = X_train2.drop(columns=to_drop)

In [None]:
final_features = X_train3.columns

In [None]:
X_train3

Unnamed: 0,dire_score,radiant_score,players.0.assists,players.0.deaths,players.0.gold,players.0.gold_per_min,players.0.gold_spent,players.0.hero_damage,players.0.hero_healing,players.0.item_1,...,players.9.leaver_status,players.9.level,players.9.net_worth,players.9.tower_damage,players.9.xp_per_min,players.9.total_gold,players.9.total_xp,players.9.kills_per_min,players.9.kda,players.9.abandons
0,34,63,8.0,7.0,661.0,470.0,6072.0,9691.0,0.0,125.0,...,0.0,15.0,9268.0,1374.0,-91.0,1055.0,8797.0,0.059347,0.0,0.0
1,47,40,6.0,6.0,4116.0,720.0,23460.0,-24498.0,0.0,65.0,...,0.0,25.0,20277.0,3023.0,790.0,14457.0,20434.0,0.226586,2.0,0.0
2,65,0,10.0,6.0,1252.0,460.0,14400.0,17896.0,0.0,100.0,...,0.0,22.0,14559.0,512.0,685.0,13800.0,22679.0,0.166436,1.0,0.0
3,83,40,8.0,14.0,390.0,477.0,18395.0,28543.0,0.0,100.0,...,0.0,26.0,21259.0,8046.0,858.0,22976.0,36608.0,0.328125,2.0,0.0
4,65,85,-11.0,12.0,0.0,505.0,25800.0,31391.0,9054.0,244.0,...,0.0,30.0,33631.0,5916.0,777.0,41860.0,8280.0,0.246533,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,56,39,21.0,0.0,342.0,-57.0,10870.0,14321.0,0.0,188.0,...,0.0,0.0,12994.0,5618.0,762.0,10358.0,32143.0,0.189648,2.0,0.0
7996,33,18,10.0,6.0,624.0,-95.0,9640.0,10344.0,0.0,23.0,...,0.0,27.0,21213.0,2952.0,2298.0,22195.0,44351.0,-0.175013,0.0,0.0
7997,40,54,17.0,12.0,303.0,409.0,15105.0,27578.0,0.0,36.0,...,0.0,0.0,8697.0,0.0,520.0,4299.0,21736.0,-0.017416,0.0,0.0
7998,20,41,15.0,4.0,281.0,432.0,15055.0,12823.0,0.0,114.0,...,0.0,22.0,16249.0,167.0,-221.0,15906.0,15339.0,0.109091,1.0,0.0


## Data augmentation

In [None]:
x_as_dict = X_train3.to_dict(orient='index')

In [None]:
sep = '.'
aug_x = {}
for k in tqdm(x_as_dict.keys()):
    j, z = np.random.randint(5, 10, size=2) # будем свапать игроков из ондной группы в reversed_game
    reversed_game = {}
    for key in x_as_dict[k].keys():
        parts_of_key = key.split(sep)
        if parts_of_key[0] == 'players':
            i = int(parts_of_key[1])
            w = i
            if i == j:
                w = z
            elif i == z:
                w = j
            reversed_key = parts_of_key[0] + sep + str(9 - i) + sep + sep.join(parts_of_key[2:])
            if reversed_key not in x_as_dict[k]:
                reversed_game[key] = None
            else:
                reversed_game[key] = x_as_dict[k][reversed_key]
        else:
            reversed_game[key] = x_as_dict[k][key]
    reversed_game['radiant_score'] = x_as_dict[k]['dire_score']
    reversed_game['dire_score'] = x_as_dict[k]['radiant_score']
    x_as_dict[k]['target'] = y_train[k]
    reversed_game['target'] = 1 - y_train[k]
    aug_x[f'{k}-0'] = x_as_dict[k]
    aug_x[f'{k}-1'] = reversed_game

  0%|          | 0/8000 [00:00<?, ?it/s]

In [None]:
from sklearn.utils import shuffle
X = shuffle(pd.DataFrame.from_dict(aug_x, orient='index'))
X

Unnamed: 0,dire_score,radiant_score,players.0.assists,players.0.deaths,players.0.gold,players.0.gold_per_min,players.0.gold_spent,players.0.hero_damage,players.0.hero_healing,players.0.item_1,...,players.9.level,players.9.net_worth,players.9.tower_damage,players.9.xp_per_min,players.9.total_gold,players.9.total_xp,players.9.kills_per_min,players.9.kda,players.9.abandons,target
4163-1,66,16,6.0,13.0,0.0,204.0,4955.0,5542.0,199.0,,...,15.0,9003.0,2677.0,465.0,9343.0,6283.0,0.234506,2.0,0.0,0
996-1,40,22,6.0,8.0,1881.0,781.0,14035.0,10934.0,0.0,,...,30.0,24501.0,6153.0,2194.0,24911.0,43002.0,0.459184,2.0,0.0,0
6526-0,22,10,10.0,6.0,1252.0,460.0,14400.0,17896.0,0.0,100.0,...,22.0,14559.0,512.0,685.0,13800.0,22679.0,0.166436,1.0,0.0,0
3540-1,62,51,-7.0,5.0,6070.0,743.0,27990.0,58830.0,0.0,,...,-6.0,22318.0,-638.0,-389.0,26323.0,38416.0,0.164722,3.0,0.0,1
7997-0,40,54,17.0,12.0,303.0,409.0,15105.0,27578.0,0.0,36.0,...,0.0,8697.0,0.0,520.0,4299.0,21736.0,-0.017416,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143-1,28,0,14.0,6.0,1746.0,817.0,21255.0,22593.0,0.0,,...,27.0,21620.0,1729.0,1483.0,6719.0,40387.0,0.293758,0.0,0.0,1
5600-0,17,48,3.0,5.0,263.0,487.0,14565.0,22903.0,991.0,1.0,...,0.0,5494.0,0.0,540.0,14200.0,18171.0,0.029718,0.0,0.0,1
6642-1,0,-23,23.0,1.0,2187.0,0.0,-12578.0,68390.0,0.0,,...,30.0,34870.0,554.0,-341.0,32906.0,49315.0,0.358102,4.0,0.0,1
6865-1,20,38,8.0,0.0,1968.0,392.0,11255.0,23239.0,4228.0,,...,14.0,6296.0,0.0,344.0,4760.0,11329.0,0.091093,0.0,0.0,1


In [None]:
X_train = X[X.columns[:-1]]
y_train = X['target']

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
X_train = imputer.fit_transform(X_train)

# Training

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

In [None]:
rf_param_grid = {
    "n_estimators": [100, 200, 500],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

cat_param_grid = {
    "iterations": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [4, 6, 8],
}

lgb_param_grid = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [-1, 5, 10],
}

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=42
# )

In [None]:
big_X_train = X_train
big_y_train = y_train

In [None]:
X_train = X_train[:500]
y_train = y_train[:500]

In [None]:
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, n_jobs=16, verbose=10)

In [None]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [None]:
catboost = CatBoostClassifier(random_state=42, silent=True, task_type="GPU")

In [None]:
result = catboost.grid_search(
    cat_param_grid,
    X_train, y_train,
    cv=5,
    partition_random_seed=42,
    refit=False,
    shuffle=False
)

bestTest = 0.3343429184
bestIteration = 99
0:	loss: 0.3343429	best: 0.3343429 (0)	total: 14s	remaining: 6m 3s
bestTest = 0.1759800148
bestIteration = 99
1:	loss: 0.1759800	best: 0.1759800 (1)	total: 16.1s	remaining: 3m 20s
bestTest = 0.1408140564
bestIteration = 99
2:	loss: 0.1408141	best: 0.1408141 (2)	total: 18s	remaining: 2m 23s
bestTest = 0.2402734375
bestIteration = 199
3:	loss: 0.2402734	best: 0.1408141 (2)	total: 22s	remaining: 2m 6s
bestTest = 0.1434482956
bestIteration = 199
4:	loss: 0.1434483	best: 0.1408141 (2)	total: 28.8s	remaining: 2m 6s
bestTest = 0.1214274406
bestIteration = 193
5:	loss: 0.1214274	best: 0.1214274 (5)	total: 32.6s	remaining: 1m 54s
bestTest = 0.1631298256
bestIteration = 499
6:	loss: 0.1631298	best: 0.1214274 (5)	total: 44.8s	remaining: 2m 7s
bestTest = 0.122412796
bestIteration = 499
7:	loss: 0.1224128	best: 0.1214274 (5)	total: 57.7s	remaining: 2m 17s
bestTest = 0.110448513
bestIteration = 333
8:	loss: 0.1104485	best: 0.1104485 (8)	total: 1m 8s	remaini

In [None]:
result['params']

{'depth': 6, 'iterations': 500, 'learning_rate': 0.05}

In [None]:
catboost = CatBoostClassifier(
    random_seed=42,
    iterations=result['params']['iterations'],
    learning_rate=result['params']['learning_rate'],
    depth=result['params']['depth']
)

In [None]:
lgb_grid = GridSearchCV(lgb.LGBMClassifier(random_state=42, n_jobs=-1), lgb_param_grid, cv=5, n_jobs=-1, verbose=10)


In [None]:
lgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
rf = RandomForestClassifier(**rf_grid.best_params_)
lgb = lgb.LGBMClassifier(**lgb_grid.best_params_, n_jobs=-1)

In [None]:
X_train = big_X_train
y_train = big_y_train

In [None]:
rf.fit(X_train, y_train)
lgb.fit(X_train, y_train)
catboost.fit(X_train, y_train)

0:	learn: 0.6376350	total: 64.6ms	remaining: 32.2s
1:	learn: 0.5881904	total: 112ms	remaining: 27.8s
2:	learn: 0.5466944	total: 159ms	remaining: 26.4s
3:	learn: 0.5074040	total: 203ms	remaining: 25.2s
4:	learn: 0.4767468	total: 249ms	remaining: 24.6s
5:	learn: 0.4482002	total: 299ms	remaining: 24.6s
6:	learn: 0.4232248	total: 344ms	remaining: 24.2s
7:	learn: 0.4031594	total: 392ms	remaining: 24.1s
8:	learn: 0.3838021	total: 437ms	remaining: 23.8s
9:	learn: 0.3659458	total: 489ms	remaining: 24s
10:	learn: 0.3511442	total: 541ms	remaining: 24.1s
11:	learn: 0.3383266	total: 587ms	remaining: 23.9s
12:	learn: 0.3242402	total: 632ms	remaining: 23.7s
13:	learn: 0.3116775	total: 681ms	remaining: 23.6s
14:	learn: 0.3007113	total: 728ms	remaining: 23.5s
15:	learn: 0.2908411	total: 776ms	remaining: 23.5s
16:	learn: 0.2818677	total: 821ms	remaining: 23.3s
17:	learn: 0.2725373	total: 864ms	remaining: 23.1s
18:	learn: 0.2641679	total: 911ms	remaining: 23.1s
19:	learn: 0.2569452	total: 956ms	remainin

<catboost.core.CatBoostClassifier at 0x7fce78f1b5e0>

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.pipeline import Pipeline

# model = GradientBoostingClassifier(random_state=0)


# # preprocessor = ColumnTransformer(transformers=[
# #     ('numeric', num_transformer)
# # ])

# my_pipeline = Pipeline(steps=[
#     ('preprocessor', imputer),
#     ('model', model)
# ])
# my_pipeline.fit(X_train, y_train)

In [None]:
raw_data_to_predict = {}
for file in tqdm(glob('public_data/*')):
    key = int(os.path.basename(file).split('.')[0])
    with open(file, 'r', encoding='utf-8') as inp:
        raw_data_to_predict[key] = json.load(inp) 

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
X_test = extract_features(raw_data_to_predict, final_features)
X_to_predict = preprocess_features(X_test)

0it [00:00, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

In [None]:
x_test_as_dict = X_to_predict.to_dict(orient='index')

In [None]:
sep = '.'
aug_x = {}
for k in tqdm(x_test_as_dict.keys()):
    j, z = np.random.randint(5, 10, size=2) # будем свапать игроков из ондной группы в reversed_game
    reversed_game = {}
    for key in x_test_as_dict[k].keys():
        parts_of_key = key.split(sep)
        if parts_of_key[0] == 'players':
            i = int(parts_of_key[1])
            w = i
            if i == j:
                w = z
            elif i == z:
                w = j
            reversed_key = parts_of_key[0] + sep + str(9 - i) + sep + sep.join(parts_of_key[2:])
            if reversed_key not in x_test_as_dict[k]:
                reversed_game[key] = None
            else:
                reversed_game[key] = x_test_as_dict[k][reversed_key]
        else:
            reversed_game[key] = x_test_as_dict[k][key]
    reversed_game['radiant_score'] = x_test_as_dict[k]['dire_score']
    reversed_game['dire_score'] = x_test_as_dict[k]['radiant_score']
    aug_x[f'{k}-0'] = x_test_as_dict[k]
    aug_x[f'{k}-1'] = reversed_game

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
X_test = pd.DataFrame.from_dict(aug_x, orient='index')
X_test

Unnamed: 0,dire_score,radiant_score,players.0.assists,players.0.deaths,players.0.gold,players.0.gold_per_min,players.0.gold_spent,players.0.hero_damage,players.0.hero_healing,players.0.item_1,...,players.9.leaver_status,players.9.level,players.9.net_worth,players.9.tower_damage,players.9.xp_per_min,players.9.total_gold,players.9.total_xp,players.9.kills_per_min,players.9.kda,players.9.abandons
0-0,55,46,13,10,476,528,23835,33411,0,116.0,...,0,28,30370,24183,892,33698,43187,0.309811,3,0
0-1,46,55,12,6,1645,696,32265,45925,0,,...,0,24,22351,2142,695,25564,33649,0.185886,2,0
1-0,55,58,27,10,1049,367,14630,19329,195,108.0,...,0,28,23610,4283,927,31595,45129,0.328655,2,0
1-1,58,55,13,13,1035,649,23185,45172,0,,...,0,26,13849,648,770,17866,37486,0.225950,3,0
2-0,47,17,7,9,297,715,19095,24226,0,123.0,...,0,30,25085,2689,1723,25688,45257,0.304569,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997-1,60,20,9,14,900,243,7230,11556,2730,,...,0,20,9621,1263,645,11116,22908,0.084467,7,0
998-0,24,39,23,3,2326,461,15720,14834,0,125.0,...,0,14,6615,101,295,8448,11028,0.053500,1,0
998-1,39,24,14,8,2340,226,6005,9595,0,,...,0,20,17006,4682,617,17233,23065,0.053500,6,0
999-0,57,23,6,13,1342,226,6445,28798,195,178.0,...,0,25,20568,8225,944,22092,34471,0.246463,2,0


In [None]:
X_test = preprocess_features(X_test)
X_test

  0%|          | 0/225 [00:00<?, ?it/s]

Unnamed: 0,dire_score,radiant_score,players.0.assists,players.0.deaths,players.0.gold,players.0.gold_per_min,players.0.gold_spent,players.0.hero_damage,players.0.hero_healing,players.0.item_1,...,players.9.leaver_status,players.9.level,players.9.net_worth,players.9.tower_damage,players.9.xp_per_min,players.9.total_gold,players.9.total_xp,players.9.kills_per_min,players.9.kda,players.9.abandons
0-0,55,46,13,10,476,528,23835,33411,0,116.0,...,0,28,30370,24183,892,33698,43187,0.309811,3,0
0-1,46,55,12,6,1645,696,32265,45925,0,116.0,...,0,24,22351,2142,695,25564,33649,0.185886,2,0
1-0,55,58,27,10,1049,367,14630,19329,195,108.0,...,0,28,23610,4283,927,31595,45129,0.328655,2,0
1-1,58,55,13,13,1035,649,23185,45172,0,116.0,...,0,26,13849,648,770,17866,37486,0.225950,3,0
2-0,47,17,7,9,297,715,19095,24226,0,123.0,...,0,30,25085,2689,1723,25688,45257,0.304569,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997-1,60,20,9,14,900,243,7230,11556,2730,116.0,...,0,20,9621,1263,645,11116,22908,0.084467,7,0
998-0,24,39,23,3,2326,461,15720,14834,0,125.0,...,0,14,6615,101,295,8448,11028,0.053500,1,0
998-1,39,24,14,8,2340,226,6005,9595,0,116.0,...,0,20,17006,4682,617,17233,23065,0.053500,6,0
999-0,57,23,6,13,1342,226,6445,28798,195,178.0,...,0,25,20568,8225,944,22092,34471,0.246463,2,0


In [None]:
rf_preds = rf.predict_proba(X_test)
cat_preds = catboost.predict_proba(X_test)
lgb_preds = lgb.predict_proba(X_test)
probs = (rf_preds + cat_preds + lgb_preds) / 3

In [None]:
# probs = my_pipeline.predict_proba(X_test)

In [None]:
probs_for_game = probs[0::2][:, 1]
probs_for_reversed_game = probs[1::2][:, 0]
probs = (probs_for_game + probs_for_reversed_game) / 2

In [None]:
problem_indexes = np.where(np.abs(probs - 0.5) <= 0.2)[0]
len(problem_indexes)

29

In [None]:
answer_i_2_df_i = {i: j for i, j in enumerate(X_test.index)}

In [None]:
import itertools

sep = '.'
sustainable_probs = []
for i in tqdm(problem_indexes):
    problem_index_augmentated = {}
    df_i = answer_i_2_df_i[2 * i] # умножаем на 2 по построению dataframe
    problem_game = X_test.loc[df_i].to_dict()
    new_problem_game = problem_game.copy()
    indicies = [0, 1, 2, 3, 4]
    k = 0
    for permutation in itertools.permutations(indicies):
        for key in problem_game.keys():
            key_parts = key.split(sep)
            if key_parts[0] == 'players' and int(key_parts[1]) < 5: # перестановка внутри первой команды
                new_key = key_parts[0] + sep + str(permutation[int(key_parts[1])]) + sep + sep.join(key_parts[2:])
                if new_key not in problem_game:
                    new_problem_game[key] = None
                else:
                    new_problem_game[key] = problem_game[new_key]
        for permutation2 in itertools.permutations(indicies):
            new_problem_game2 = new_problem_game.copy()
            for key in problem_game.keys():
                key_parts = key.split(sep)
                if key_parts[0] == 'players' and int(key_parts[1]) >= 5: # перестановка внутри второй команды
                    new_key = key_parts[0] + sep + str(permutation[int(key_parts[1]) - 5] + 5) + sep + sep.join(key_parts[2:])
                    if new_key not in problem_game:
                        new_problem_game2[key] = None
                    else:
                        new_problem_game2[key] = problem_game[new_key]
            problem_index_augmentated[k] = new_problem_game2
            k += 1
    X_problem = shuffle(pd.DataFrame.from_dict(problem_index_augmentated, orient='index'))
    X_problem = preprocess_features(X_problem)

    rf_preds = rf.predict_proba(X_problem)
    cat_preds = catboost.predict_proba(X_problem)
    lgb_preds = lgb.predict_proba(X_problem)
    preds_problem = (rf_preds + cat_preds + lgb_preds) / 3

    sustainable_probs.append(np.mean(preds_problem[:, 1]))

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

  0%|          | 0/225 [00:00<?, ?it/s]

In [None]:
sustainable_probs

[0.6898051316094539,
 0.32072852087936254,
 0.4873612683509516,
 0.42237232470641695,
 0.5401850019729002,
 0.5359303219643703,
 0.2366018012712437,
 0.6861265339181442,
 0.2396618178160131,
 0.4320222195515231,
 0.7566939927748478,
 0.46746557173671655,
 0.6680508989668165,
 0.27871125215249715,
 0.6505416512509592,
 0.5169797547223912,
 0.332026254059192,
 0.5248654138389949,
 0.7628061758981337,
 0.49947198411911786,
 0.3079266245249001,
 0.6007772657760494,
 0.6189608046243886,
 0.6401814528149771,
 0.59396123295397,
 0.37966077834195655,
 0.6264852665884546,
 0.650550264652856,
 0.49934781408541984]

In [None]:
for i, pr in enumerate(sustainable_probs):
    probs[problem_indexes[i]] = pr

In [None]:
labels = (probs >= 0.5).astype(int)
labels[:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 1])

In [None]:
index = [id[:-2] for id in X_test.index[0::2]]

In [None]:
answer = pd.DataFrame()
answer['name'] = raw_data_to_predict.keys()
answer['target'] = labels

In [None]:
answer.to_csv('simple_answer.csv', sep='\t', index=False)

In [None]:
answer

Unnamed: 0,name,target
0,10013,0
1,10020,0
2,10037,0
3,10059,0
4,10113,1
...,...,...
995,9808,0
996,9860,1
997,9862,1
998,9866,1
