In [1]:
import json
import os
from glob import glob
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
import torch

train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

CUDA is available!  Training on GPU ...


In [3]:
from google.colab import drive
drive.mount('/content/myDrive')

Mounted at /content/myDrive


In [4]:
os.chdir('myDrive/MyDrive/ufru')

In [15]:
# !rm -rf public_data
!rm -rf train_data

In [16]:
# !unzip public_data.zip > public_data_std_out
!unzip train_data.zip > train_data_std_out
# !rm public_data_std_out
!rm train_data_std_out

### Read y_train

In [17]:
y = pd.read_csv('train_anwers.csv', sep='\t').astype(int)
y_train = y.target
y

Unnamed: 0,name,target
0,29970,1
1,18654,0
2,29133,0
3,20927,0
4,5526,0
...,...,...
7995,5139,0
7996,19046,0
7997,20987,1
7998,32929,1


### Read train data as jsons

In [18]:
raw_data = {}
for name in tqdm(y['name']):
    with open(f'train_data/{name}.json', 'r', encoding='utf-8') as inp:
        raw_data[name] = json.load(inp)

  0%|          | 0/8000 [00:00<?, ?it/s]

### Remove nesting in dict and get keys

In [20]:
flatten_keys = defaultdict(int)
def get_flatten_keys(d, parent_key='', sep='.'):
    if d is None:
        return
    next_sep = sep
    sep = sep if parent_key else ''
    if isinstance(d, dict):
        for k, v in d.items():
            get_flatten_keys(v, f"{parent_key}{sep}{k}", sep=next_sep)
    elif isinstance(d, list):
        for i, item in enumerate(d):
            get_flatten_keys(item, f"{parent_key}{sep}{i}", sep=next_sep)
    else:
        flatten_keys[parent_key] += 1

In [21]:
for data in tqdm(raw_data.values()):
    get_flatten_keys(data)
flatten_keys = dict(flatten_keys)

  0%|          | 0/8000 [00:00<?, ?it/s]

In [22]:
len(flatten_keys)

93270

### Delete keys where NaN more than 70%

In [23]:
useless_keys = set()
ln = 8000
for key, val in flatten_keys.items():
    if val < ln * 0.3:
        useless_keys.add(key)

In [24]:
len(useless_keys)

92351

In [25]:
for key in useless_keys:
    del flatten_keys[key]

In [26]:
len(flatten_keys)

919

### Get data by keys

In [27]:
def extract_features(data, features_names):
    def get_value(obj, keys_list):
        if not keys_list:
            return obj
        key = keys_list.pop(0)
        key = int(key) if key.isdigit() else key
        try:
            return get_value(obj[key], keys_list)
        except:
            return None
        
    train_data = defaultdict(list)

    for i, game_data in tqdm(enumerate(data.values())):
        for key_str in features_names:
            keys = key_str.split('.')
            val = get_value(game_data, keys)
                    
            train_data[key_str].append(val)
    print(train_data)
    df = pd.DataFrame(train_data)
    return df
        

In [93]:
def preprocess_features(data):
    df = data.copy()
    # replace nans with median value
    for col in tqdm(df.columns):
        if df[col].isna().sum() > 0:
            if df[col].dtype in ['int64', 'float64', 'bool']:
                df[col].fillna(df[col].median(), inplace=True)
            else:
                ind = df[col].value_counts().index
                if len(ind) != 0:
                    df[col].fillna(ind[0], inplace=True)
                df[col].fillna(0, inplace=True)

    # bool features to int
    for col in df.select_dtypes(include=['bool']).columns.tolist():
        df[col] = df[col].astype(int)

    return df

In [29]:
X_train = extract_features(raw_data, flatten_keys)

0it [00:00, ?it/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [30]:
X_train2 = preprocess_features(X_train)

  0%|          | 0/919 [00:00<?, ?it/s]

In [31]:
X_train2

Unnamed: 0,cluster,dire_score,duration,engine,first_blood_time,game_mode,human_players,leagueid,lobby_type,match_seq_num,...,players.4.ability_upgrades_arr.18,players.0.permanent_buffs.0.permanent_buff,players.0.permanent_buffs.0.stack_count,players.0.permanent_buffs.0.grant_time,players.1.benchmarks.stuns_per_min.raw,players.1.benchmarks.stuns_per_min.pct,players.4.personaname,players.4.rank_tier,players.3.personaname,players.3.rank_tier
0,183,34,2022,1,100,0,10,0,0,5780373336,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
1,-123,47,2648,0,54,22,10,0,7,0,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
2,273,65,2684,1,101,18,10,0,0,5780373039,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
3,184,83,0,0,20,22,-1,0,7,5780347103,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
4,154,65,0,1,101,22,10,0,7,5780418598,...,-313.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-40,56,2531,1,110,22,0,0,7,5780379689,...,6090.0,6.0,1.0,2047.0,0.0,0.00,< blank >,31.0,.,31.0
7996,184,33,1158,1,158,23,10,0,0,5780312140,...,6090.0,12.0,0.0,810.0,0.0,0.00,< blank >,31.0,.,31.0
7997,274,40,88,1,104,22,0,0,7,5780365863,...,6090.0,6.0,0.0,1012.5,0.0,0.02,< blank >,31.0,.,31.0
7998,153,20,2200,0,76,7,10,0,7,5780378245,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,Perfect Blue,0.0


### Remove features that correlates with target less that 0.05

In [32]:
len(y_train)

8000

In [33]:
df_corr = X_train2.copy()
df_corr['target'] = y_train
corrs = df_corr.corr()['target'].abs() > 0.05
to_drop = []
for col in corrs.index:
    if not corrs[col]:
        if col != 'target':
            to_drop.append(col)

In [34]:
X_train3 = X_train2.drop(columns=to_drop)

In [35]:
final_features = X_train3.columns

In [36]:
X_train3

Unnamed: 0,dire_score,radiant_score,players.0.assists,players.0.deaths,players.0.gold,players.0.gold_per_min,players.0.gold_spent,players.0.hero_damage,players.0.hero_healing,players.0.item_1,...,players.4.benchmarks.hero_healing_per_min.raw,players.5.benchmarks.xp_per_min.raw,players.5.benchmarks.xp_per_min.pct,players.7.personaname,players.7.benchmarks.kills_per_min.raw,players.7.benchmarks.kills_per_min.pct,players.1.ability_upgrades_arr.18,players.4.ability_upgrades_arr.18,players.4.personaname,players.3.personaname
0,34,63,8.0,7.0,661.0,470.0,6072.0,9691.0,0.0,125.0,...,0.000000,689.0,0.718750,.,0.158103,0.500000,6072.0,6090.0,< blank >,.
1,47,40,6.0,6.0,4116.0,720.0,23460.0,-24498.0,0.0,65.0,...,0.000000,689.0,0.718750,.,0.158103,0.500000,6072.0,6090.0,< blank >,.
2,65,0,10.0,6.0,1252.0,460.0,14400.0,17896.0,0.0,100.0,...,0.000000,689.0,0.718750,.,0.158103,0.500000,6072.0,6090.0,< blank >,.
3,83,40,8.0,14.0,390.0,477.0,18395.0,28543.0,0.0,100.0,...,0.000000,825.0,1.000000,Спуди Мун,0.773438,1.000000,6072.0,6090.0,< blank >,.
4,65,85,-11.0,12.0,0.0,505.0,25800.0,31391.0,9054.0,244.0,...,0.000000,626.0,0.834483,.,0.323575,0.800000,6501.0,-313.0,< blank >,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,56,39,21.0,0.0,342.0,-57.0,10870.0,14321.0,0.0,188.0,...,10.904781,689.0,0.718750,.,0.426709,0.030604,6280.0,6090.0,< blank >,.
7996,33,18,10.0,6.0,624.0,-95.0,9640.0,10344.0,0.0,23.0,...,10.103627,689.0,0.718750,.,0.051813,0.174419,6072.0,6090.0,< blank >,.
7997,40,54,17.0,12.0,303.0,409.0,15105.0,27578.0,0.0,36.0,...,0.000000,563.0,0.851852,Hƴdαяηeṧ,0.158103,0.500000,1072.0,6090.0,< blank >,.
7998,20,41,15.0,4.0,281.0,432.0,15055.0,12823.0,0.0,114.0,...,0.000000,498.0,0.000000,吃藕丑,-0.007182,0.420690,730.0,6090.0,< blank >,Perfect Blue


## Data augmentation

In [37]:
x_as_dict = X_train3.to_dict(orient='index')

In [38]:
y_aug = np.zeros(32000)
for i, x in enumerate(y_train):
    y_aug[4 * i] = x
    y_aug[4 * i + 1] = 1- x
    y_aug[4 * i + 2] = x
    y_aug[4 * i + 3] = 1 - x
y_aug.shape

(32000,)

In [39]:
from sklearn.utils import shuffle

In [97]:
def get_w(i, j, z):
    if i == j:
        return z
    elif i == z:
        return j
    return i


def make_augmented_dict(x_as_dict, x4=True):
    sep = '.'
    aug_x = {}

    for k in tqdm(x_as_dict.keys()):
        j1, z1 = np.random.randint(0, 5, size=2)
        j2, z2 = np.random.randint(5, 10, size=2) # будем свапать игроков из ондной группы в reversed_game
        game2 = {}
        reversed_game = {}
        reversed_game2 = {}
        for key in x_as_dict[k].keys():
            parts_of_key = key.split(sep)
            if parts_of_key[0] == 'players':
                i = int(parts_of_key[1])

                if x4:
                    w1 = get_w(i, j1, z1)
                    new_key = parts_of_key[0] + sep + str(w1) + sep + sep.join(parts_of_key[2:])
                    if new_key not in x_as_dict[k]:
                        game2[key] = None
                    else:
                        game2[key] = x_as_dict[k][new_key]

                w2 = get_w(i, j2, z2)
                reversed_key = parts_of_key[0] + sep + str(9 - w2) + sep + sep.join(parts_of_key[2:])
                if reversed_key not in x_as_dict[k]:
                    reversed_game[key] = None
                else:
                    reversed_game[key] = x_as_dict[k][reversed_key]
                
                if x4:
                    if reversed_key not in game2:
                        reversed_game2[key] = None
                    else:
                        reversed_game2[key] = game2[reversed_key]
            else:
                val = x_as_dict[k][key]
                reversed_game[key] = val
                if x4:
                    reversed_game2[key] = val
                    game2[key] = val
        reversed_game['radiant_score'] = x_as_dict[k]['dire_score']
        reversed_game['dire_score'] = x_as_dict[k]['radiant_score']
        if x4:
            reversed_game2['radiant_score'] = x_as_dict[k]['dire_score']
            reversed_game2['dire_score'] = x_as_dict[k]['radiant_score']
        aug_x[f'{k}-0'] = x_as_dict[k]
        aug_x[f'{k}-1'] = reversed_game
        if x4:
            aug_x[f'{k}-2'] = game2
            aug_x[f'{k}-3'] = reversed_game2
    return aug_x

In [41]:
aug_x = make_augmented_dict(x_as_dict)
X_standart = pd.DataFrame.from_dict(aug_x, orient='index')

  0%|          | 0/8000 [00:00<?, ?it/s]

In [42]:
cat_cols = [col for col in X_standart.columns if X_standart[col].dtype not in ['int64', 'float64']]
X_standart.drop(columns=cat_cols, inplace=True)
X_standart = preprocess_features(X_standart)

  0%|          | 0/368 [00:00<?, ?it/s]

In [43]:
X_standart.reset_index(drop=True, inplace=True)

In [49]:
indicies = shuffle(np.arange(X_standart.shape[0]))
X_train = X_standart.loc[indicies]
y_train = y_aug[indicies]

# Training

In [50]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

In [52]:
rf_param_grid = {
    "n_estimators": [100, 200, 500],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

cat_param_grid = {
    "iterations": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [4, 6, 8],
}

lgb_param_grid = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [-1, 5, 10],
}

In [53]:
big_X_train = X_train
big_y_train = y_train

In [60]:
X_train = big_X_train[:500]
y_train = big_y_train[:500]

In [61]:
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, n_jobs=-1, verbose=10)

In [62]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [63]:
catboost = CatBoostClassifier(random_state=42, silent=True, task_type="GPU")

In [64]:
result = catboost.grid_search(
    cat_param_grid,
    X_train, y_train,
    cv=5,
    partition_random_seed=42,
    refit=False,
    shuffle=False
)

bestTest = 0.3928816986
bestIteration = 99
0:	loss: 0.3928817	best: 0.3928817 (0)	total: 9.55s	remaining: 4m 8s
bestTest = 0.2076425362
bestIteration = 99
1:	loss: 0.2076425	best: 0.2076425 (1)	total: 11.8s	remaining: 2m 28s
bestTest = 0.1733242989
bestIteration = 99
2:	loss: 0.1733243	best: 0.1733243 (2)	total: 14s	remaining: 1m 52s
bestTest = 0.2986716843
bestIteration = 199
3:	loss: 0.2986717	best: 0.1733243 (2)	total: 20.6s	remaining: 1m 58s
bestTest = 0.1613334656
bestIteration = 199
4:	loss: 0.1613335	best: 0.1613335 (4)	total: 25s	remaining: 1m 50s
bestTest = 0.1561271858
bestIteration = 188
5:	loss: 0.1561272	best: 0.1561272 (5)	total: 29.4s	remaining: 1m 42s
bestTest = 0.2110348511
bestIteration = 499
6:	loss: 0.2110349	best: 0.1561272 (5)	total: 43s	remaining: 2m 2s
bestTest = 0.1395226765
bestIteration = 499
7:	loss: 0.1395227	best: 0.1395227 (7)	total: 56.2s	remaining: 2m 13s
bestTest = 0.1384287071
bestIteration = 499
8:	loss: 0.1384287	best: 0.1384287 (8)	total: 1m 9s	rem

In [65]:
result['params']

{'depth': 8, 'iterations': 500, 'learning_rate': 0.1}

In [66]:
catboost = CatBoostClassifier(
    random_seed=42,
    iterations=result['params']['iterations'],
    learning_rate=result['params']['learning_rate'],
    depth=result['params']['depth']
)

In [67]:
lgb_grid = GridSearchCV(lgb.LGBMClassifier(random_state=42, n_jobs=-1), lgb_param_grid, cv=5, n_jobs=-1, verbose=10)


In [68]:
lgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [69]:
rf = RandomForestClassifier(**rf_grid.best_params_)
lgb = lgb.LGBMClassifier(**lgb_grid.best_params_, n_jobs=-1)

In [70]:
X_train = big_X_train
y_train = big_y_train

In [71]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


estimators = [
    ('rf', rf),
    ('svr', lgb),
    ('catboost', catboost)
]
model = StackingClassifier(
    estimators=estimators, final_estimator=GradientBoostingClassifier()
)

In [72]:
model.fit(X_train, y_train)

0:	learn: 0.5715565	total: 335ms	remaining: 2m 47s
1:	learn: 0.4937886	total: 578ms	remaining: 2m 23s
2:	learn: 0.4336272	total: 823ms	remaining: 2m 16s
3:	learn: 0.3866671	total: 1.08s	remaining: 2m 13s
4:	learn: 0.3485065	total: 1.32s	remaining: 2m 10s
5:	learn: 0.3162718	total: 1.57s	remaining: 2m 9s
6:	learn: 0.2902221	total: 1.81s	remaining: 2m 7s
7:	learn: 0.2701200	total: 2.13s	remaining: 2m 10s
8:	learn: 0.2512866	total: 2.54s	remaining: 2m 18s
9:	learn: 0.2361271	total: 2.97s	remaining: 2m 25s
10:	learn: 0.2227409	total: 3.42s	remaining: 2m 32s
11:	learn: 0.2127058	total: 3.86s	remaining: 2m 37s
12:	learn: 0.2033507	total: 4.28s	remaining: 2m 40s
13:	learn: 0.1955661	total: 4.72s	remaining: 2m 43s
14:	learn: 0.1884058	total: 5.17s	remaining: 2m 47s
15:	learn: 0.1825340	total: 5.71s	remaining: 2m 52s
16:	learn: 0.1766716	total: 6.08s	remaining: 2m 52s
17:	learn: 0.1716530	total: 6.33s	remaining: 2m 49s
18:	learn: 0.1668103	total: 6.58s	remaining: 2m 46s
19:	learn: 0.1622493	tot

In [164]:
import pickle
model_dump = pickle.dumps(model)
with open('stacking_dump.pt', 'wb') as f:
    f.write(model_dump)

In [166]:
with open('stacking_dump.pt', 'rb') as f:
    readed_dump = f.read()
readed_model = pickle.loads(readed_dump)

In [73]:
!rm -rf public_data
!unzip public_data.zip > public_data_std_out
!rm public_data_std_out

In [74]:
raw_data_to_predict = {}
for file in tqdm(glob('public_data/*')):
    key = int(os.path.basename(file).split('.')[0])
    with open(file, 'r', encoding='utf-8') as inp:
        raw_data_to_predict[key] = json.load(inp)
assert len(raw_data_to_predict.keys()) == 1000

  0%|          | 0/1000 [00:00<?, ?it/s]

In [75]:
X_test = extract_features(raw_data_to_predict, final_features)
X_to_predict = preprocess_features(X_test)

0it [00:00, ?it/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/378 [00:00<?, ?it/s]

In [98]:
x_test_as_dict = X_to_predict.to_dict(orient='index')

In [99]:
# получить X_standart_test
aug_x_test = make_augmented_dict(x_test_as_dict, x4=False)
X_standart_test = pd.DataFrame.from_dict(aug_x_test, orient='index')
cat_cols = [col for col in X_standart_test.columns if X_standart_test[col].dtype not in ['int64', 'float64']]
X_standart_test.drop(columns=cat_cols, inplace=True)
X_standart_test = preprocess_features(X_standart_test)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

In [100]:
X_test = X_standart_test

In [126]:
probs = model.predict_proba(X_test)

In [127]:
probs_for_game = probs[0::2][:, 1]
probs_for_reversed_game = probs[1::2][:, 0]
probs = (probs_for_game + probs_for_reversed_game) / 2

In [128]:
problem_indexes = np.where(np.abs(probs - 0.5) <= 0.2)[0]
len(problem_indexes)

22

In [129]:
from numpy.random import default_rng

rng = default_rng()

In [130]:
def make_reverse_game(game):
    sep = '.'
    aug_x = {}

    reversed_game = {}
    for key in game.keys():
        parts_of_key = key.split(sep)
        if parts_of_key[0] == 'players':
            i = int(parts_of_key[1])

            reversed_key = parts_of_key[0] + sep + str(9 - i) + sep + sep.join(parts_of_key[2:])
            if reversed_key not in game:
                reversed_game[key] = None
            else:
                reversed_game[key] = game[reversed_key]
        else:
            reversed_game[key] = game[key]
    reversed_game['radiant_score'] = game['dire_score']
    reversed_game['dire_score'] = game['radiant_score']
    return reversed_game

In [131]:
import itertools

sep = '.'
sustainable_probs = []
for i in tqdm(problem_indexes):
    problem_augmentated = {}
    problem_augmentated_reversed = {}
    problem_game = x_test_as_dict[i]
    new_problem_game = problem_game.copy()
    reversed_problem_game = make_reverse_game(new_problem_game)
    new_reversed_problem_game = reversed_problem_game.copy()
    indicies = [0, 1, 2, 3, 4]
    k = 0
    skip_indicies = rng.choice(120, size=60, replace=False)
    for perm_index, permutation in enumerate(itertools.permutations(indicies)):
        # if perm_index in skip_indicies:
        #     continue
        for key in problem_game.keys():
            key_parts = key.split(sep)
            if key_parts[0] == 'players' and int(key_parts[1]) < 5: # перестановка внутри первой команды
                new_key = key_parts[0] + sep + str(permutation[int(key_parts[1])]) + sep + sep.join(key_parts[2:])
                if new_key not in problem_game:
                    new_problem_game[key] = None
                else:
                    new_problem_game[key] = problem_game[new_key]
                
                if new_key not in reversed_problem_game:
                    new_reversed_problem_game[key] = None
                else:
                    new_reversed_problem_game[key] = reversed_problem_game[new_key]
        skip_indicies2 = rng.choice(120, size=60, replace=False)
        for perm2_index, permutation2 in enumerate(itertools.permutations(indicies)):
            # if perm2_index in skip_indicies2:
            #     continue
            new_problem_game2 = new_problem_game.copy()
            new_reversed_problem_game2 = new_reversed_problem_game.copy()
            for key in problem_game.keys():
                key_parts = key.split(sep)
                if key_parts[0] == 'players' and int(key_parts[1]) >= 5: # перестановка внутри второй команды
                    new_key = key_parts[0] + sep + str(permutation[int(key_parts[1]) - 5] + 5) + sep + sep.join(key_parts[2:])
                    if new_key not in problem_game:
                        new_problem_game2[key] = None
                    else:
                        new_problem_game2[key] = problem_game[new_key]
                    
                    
                    if new_key not in reversed_problem_game:
                        new_reversed_problem_game2[key] = None
                    else:
                        new_reversed_problem_game2[key] = reversed_problem_game[new_key]
            problem_augmentated[k] = new_problem_game2
            problem_augmentated_reversed[k] = new_reversed_problem_game2
            k += 1
    X_problem = pd.DataFrame.from_dict(problem_augmentated, orient='index')
    cat_cols = [col for col in X_problem.columns if X_problem[col].dtype not in ['int64', 'float64']]
    X_problem.drop(columns=cat_cols, inplace=True)
    X_problem = preprocess_features(X_problem)
    X_problem.reset_index(drop=True, inplace=True)
    preds_problem = model.predict_proba(X_problem)


    X_problem = pd.DataFrame.from_dict(problem_augmentated_reversed, orient='index')
    X_problem.drop(columns=cat_cols, inplace=True)
    X_problem = preprocess_features(X_problem)
    X_problem.reset_index(drop=True, inplace=True)
    
    preds_problem_reversed = model.predict_proba(X_problem)

    final_preds = (np.mean(preds_problem[:, 1]) + np.mean(preds_problem_reversed[:, 0])) / 2
    final_preds = np.mean(preds_problem[:, 1])
    sustainable_probs.append(final_preds)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

In [138]:
sustainable_probs

[0.682029712988723,
 0.39192306185986,
 0.7352598612963865,
 0.03863787330598246,
 0.4063479143718305,
 0.16697400739132598,
 0.6279412023975912,
 0.33250982977498783,
 0.09692377091596564,
 0.2446997030755833,
 0.49314620336392323,
 0.1289914473913153,
 0.10113108557144557,
 0.7385901667131457,
 0.5682092085777225,
 0.6928098727045228,
 0.5213802786331283,
 0.37208744779517244,
 0.9992873520519677,
 0.16979678945046578,
 0.9103705193869948,
 0.25207823701192467]

In [139]:
for i, pr in enumerate(sustainable_probs):
    probs[problem_indexes[i]] = pr

In [150]:
labels = (probs >= 0.5).astype(int)
labels[:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 1])

In [151]:
answer = pd.DataFrame()
answer['name'] = raw_data_to_predict.keys()
answer['target'] = labels

In [152]:
answer.to_csv('simple_answer.csv', sep='\t', index=False)

In [153]:
answer

Unnamed: 0,name,target
0,10013,0
1,10020,0
2,10037,0
3,10059,0
4,10113,1
...,...,...
995,9808,0
996,9860,1
997,9862,1
998,9866,1
