In [None]:
import json
import os
from glob import glob
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
import torch

# Detect once whether a CUDA device is usable; later cells read this flag.
train_on_gpu = torch.cuda.is_available()

# Tell the reader which device the training cells will run on.
print('CUDA is available!  Training on GPU ...' if train_on_gpu
      else 'CUDA is not available.  Training on CPU ...')

CUDA is not available.  Training on CPU ...


In [None]:
from google.colab import drive
drive.mount('/content/myDrive')

Drive already mounted at /content/myDrive; to attempt to forcibly remount, call drive.mount("/content/myDrive", force_remount=True).


In [None]:
# Use the absolute path below the Drive mount point ('/content/myDrive') so
# the cell can be executed again: the relative form only resolves while the
# working directory is still /content and fails after the first chdir.
os.chdir('/content/myDrive/MyDrive/ufru')

In [None]:
!rm -rf public_data
!rm -rf train_data

In [None]:
!unzip public_data.zip > public_data_std_out
!unzip train_data.zip > train_data_std_out
!rm public_data_std_out
!rm train_data_std_out

### Read y_train

In [None]:
y = pd.read_csv('train_anwers.csv', sep='\t').astype(int)
y_train = y.target
y.head()

Unnamed: 0,name,target
0,29970,1
1,18654,0
2,29133,0
3,20927,0
4,5526,0


### Read train data as jsons

In [None]:
# Parse the JSON document of every game listed in the answers file,
# keyed by the game name.
raw_data = {}
for name in tqdm(y['name']):
    json_path = f'train_data/{name}.json'
    with open(json_path, 'r', encoding='utf-8') as inp:
        raw_data[name] = json.load(inp)

  0%|          | 0/8000 [00:00<?, ?it/s]

### Remove nesting in dict and get keys

In [None]:
# Global counter: dotted key path -> number of non-None leaf values seen.
flatten_keys = defaultdict(int)


def get_flatten_keys(d, parent_key='', sep='.'):
    """Count every leaf of a nested dict/list structure in ``flatten_keys``.

    Dict keys and list positions are joined with ``sep`` to form the path of
    each leaf (e.g. ``players.0.gold``). ``None`` values are skipped, and
    empty containers contribute nothing.

    Args:
        d: nested structure (dict, list, scalar or None).
        parent_key: path accumulated so far; empty at the top level.
        sep: separator placed between path components.
    """
    if d is None:
        return

    if isinstance(d, dict):
        children = d.items()
    elif isinstance(d, list):
        children = enumerate(d)
    else:
        # Scalar leaf: record one occurrence of this path.
        flatten_keys[parent_key] += 1
        return

    # No separator in front of top-level names.
    prefix = f"{parent_key}{sep}" if parent_key else ''
    for name, child in children:
        get_flatten_keys(child, f"{prefix}{name}", sep=sep)

In [None]:
for data in tqdm(raw_data.values()):
    get_flatten_keys(data)
flatten_keys = dict(flatten_keys)

  0%|          | 0/8000 [00:00<?, ?it/s]

In [None]:
len(flatten_keys)

93270

### Delete keys that are present in fewer than 30% of the games (missing in more than 70%)

In [None]:
# Collect the keys that hold a value in fewer than 30% of the loaded games.
# The game count is taken from the data instead of a hard-coded 8000 so the
# threshold stays right if the training set changes.
ln = len(raw_data)
useless_keys = {key for key, val in flatten_keys.items() if val < ln * 0.3}

In [None]:
len(useless_keys)

92351

In [None]:
# Rebuild the mapping without the rare keys. Unlike deleting key by key,
# this does not raise KeyError when the cell is executed a second time.
flatten_keys = {key: val for key, val in flatten_keys.items() if key not in useless_keys}

In [None]:
len(flatten_keys)

919

### Get data by keys

In [None]:
def extract_features(data, features_names):
    """Build a feature table by resolving dotted key paths in nested game data.

    Args:
        data: mapping of game id -> nested structure of dicts and lists
            (the parsed game JSON).
        features_names: iterable of dotted paths such as ``"players.0.gold"``.

    Returns:
        pandas.DataFrame with one row per game, in the iteration order of
        ``data`` and with the default integer index, and one column per
        requested path. A path that cannot be resolved for a game yields
        ``None`` in that cell.
    """
    def get_value(obj, keys_list):
        # Follow the path one component at a time without mutating keys_list.
        for key in keys_list:
            try:
                if isinstance(obj, dict) and key in obj:
                    # Dict keys are matched as strings first, so digit-named
                    # keys of a dict are found as well.
                    obj = obj[key]
                else:
                    # Sequences are addressed by position when the component
                    # is numeric.
                    obj = obj[int(key)] if key.isdigit() else obj[key]
            except (KeyError, IndexError, TypeError):
                # Missing key, position out of range, or a scalar/None where
                # a container was expected.
                return None
        return obj

    features_names = list(features_names)
    paths = [name.split('.') for name in features_names]  # split once, reuse per game
    train_data = defaultdict(list)

    # Passing the total lets tqdm draw a real progress bar.
    for game_data in tqdm(data.values(), total=len(data)):
        for key_str, path in zip(features_names, paths):
            train_data[key_str].append(get_value(game_data, path))

    # The collected values are intentionally not printed: dumping the whole
    # dataset floods the notebook output channel.
    return pd.DataFrame(train_data)
        

In [None]:
def preprocess_features(data):
    """Return a copy of ``data`` with missing values imputed and bools as ints.

    Columns of dtype int64/float64/bool are filled with their median; every
    other column is filled with its most frequent value. The statistics are
    computed from the frame that is passed in. A non-numeric column without
    any observed value is left untouched because there is nothing to impute
    from.

    Args:
        data: pandas.DataFrame of raw features; it is not modified.

    Returns:
        A new pandas.DataFrame of the same shape.
    """
    df = data.copy()

    for col in tqdm(df.columns):
        column = df[col]
        if not column.isna().any():
            continue

        if column.dtype in ['int64', 'float64', 'bool']:
            fill_value = column.median()
        else:
            counts = column.value_counts()
            if counts.empty:
                continue
            fill_value = counts.index[0]

        # Assign the filled column back. ``df[col].fillna(..., inplace=True)``
        # works on an intermediate object and does not update ``df`` when
        # pandas copy-on-write is active.
        df[col] = column.fillna(fill_value)

    # bool features to int
    for col in df.select_dtypes(include=['bool']).columns.tolist():
        df[col] = df[col].astype(int)

    return df

In [None]:
X_train = extract_features(raw_data, flatten_keys)

0it [00:00, ?it/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
X_train2 = preprocess_features(X_train)

  0%|          | 0/919 [00:00<?, ?it/s]

In [None]:
X_train2

Unnamed: 0,cluster,dire_score,duration,engine,first_blood_time,game_mode,human_players,leagueid,lobby_type,match_seq_num,...,players.4.ability_upgrades_arr.18,players.0.permanent_buffs.0.permanent_buff,players.0.permanent_buffs.0.stack_count,players.0.permanent_buffs.0.grant_time,players.1.benchmarks.stuns_per_min.raw,players.1.benchmarks.stuns_per_min.pct,players.4.personaname,players.4.rank_tier,players.3.personaname,players.3.rank_tier
0,183,34,2022,1,100,0,10,0,0,5780373336,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
1,-123,47,2648,0,54,22,10,0,7,0,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
2,273,65,2684,1,101,18,10,0,0,5780373039,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
3,184,83,0,0,20,22,-1,0,7,5780347103,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
4,154,65,0,1,101,22,10,0,7,5780418598,...,-313.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,.,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-40,56,2531,1,110,22,0,0,7,5780379689,...,6090.0,6.0,1.0,2047.0,0.0,0.00,< blank >,31.0,.,31.0
7996,184,33,1158,1,158,23,10,0,0,5780312140,...,6090.0,12.0,0.0,810.0,0.0,0.00,< blank >,31.0,.,31.0
7997,274,40,88,1,104,22,0,0,7,5780365863,...,6090.0,6.0,0.0,1012.5,0.0,0.02,< blank >,31.0,.,31.0
7998,153,20,2200,0,76,7,10,0,7,5780378245,...,6090.0,6.0,0.0,1012.5,0.0,0.00,< blank >,31.0,Perfect Blue,0.0


### Remove features whose absolute correlation with the target does not exceed 0.05

In [None]:
y_train

0       1
1       0
2       0
3       0
4       0
       ..
7995    0
7996    0
7997    1
7998    1
7999    1
Name: target, Length: 8000, dtype: int64

In [None]:
# Correlate only the numeric/bool columns with the target. Text columns are
# excluded explicitly: DataFrame.corr() raises on them in pandas >= 2.0
# instead of silently skipping them, and they are never dropped here anyway.
df_corr = X_train2.select_dtypes(include=['number', 'bool']).copy()
df_corr['target'] = y_train
corrs = df_corr.corr()['target'].abs() > 0.05
# A NaN correlation (e.g. a constant column) compares as False, so such
# columns are dropped together with the weakly correlated ones.
to_drop = [col for col in corrs.index if col != 'target' and not corrs[col]]

In [None]:
X_train3 = X_train2.drop(columns=to_drop)

In [None]:
final_features = X_train3.columns

In [None]:
X_train3

Unnamed: 0,dire_score,radiant_score,players.0.assists,players.0.deaths,players.0.gold,players.0.gold_per_min,players.0.gold_spent,players.0.hero_damage,players.0.hero_healing,players.0.item_1,...,players.4.benchmarks.hero_healing_per_min.raw,players.5.benchmarks.xp_per_min.raw,players.5.benchmarks.xp_per_min.pct,players.7.personaname,players.7.benchmarks.kills_per_min.raw,players.7.benchmarks.kills_per_min.pct,players.1.ability_upgrades_arr.18,players.4.ability_upgrades_arr.18,players.4.personaname,players.3.personaname
0,34,63,8.0,7.0,661.0,470.0,6072.0,9691.0,0.0,125.0,...,0.000000,689.0,0.718750,.,0.158103,0.500000,6072.0,6090.0,< blank >,.
1,47,40,6.0,6.0,4116.0,720.0,23460.0,-24498.0,0.0,65.0,...,0.000000,689.0,0.718750,.,0.158103,0.500000,6072.0,6090.0,< blank >,.
2,65,0,10.0,6.0,1252.0,460.0,14400.0,17896.0,0.0,100.0,...,0.000000,689.0,0.718750,.,0.158103,0.500000,6072.0,6090.0,< blank >,.
3,83,40,8.0,14.0,390.0,477.0,18395.0,28543.0,0.0,100.0,...,0.000000,825.0,1.000000,Спуди Мун,0.773438,1.000000,6072.0,6090.0,< blank >,.
4,65,85,-11.0,12.0,0.0,505.0,25800.0,31391.0,9054.0,244.0,...,0.000000,626.0,0.834483,.,0.323575,0.800000,6501.0,-313.0,< blank >,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,56,39,21.0,0.0,342.0,-57.0,10870.0,14321.0,0.0,188.0,...,10.904781,689.0,0.718750,.,0.426709,0.030604,6280.0,6090.0,< blank >,.
7996,33,18,10.0,6.0,624.0,-95.0,9640.0,10344.0,0.0,23.0,...,10.103627,689.0,0.718750,.,0.051813,0.174419,6072.0,6090.0,< blank >,.
7997,40,54,17.0,12.0,303.0,409.0,15105.0,27578.0,0.0,36.0,...,0.000000,563.0,0.851852,Hƴdαяηeṧ,0.158103,0.500000,1072.0,6090.0,< blank >,.
7998,20,41,15.0,4.0,281.0,432.0,15055.0,12823.0,0.0,114.0,...,0.000000,498.0,0.000000,吃藕丑,-0.007182,0.420690,730.0,6090.0,< blank >,Perfect Blue


## Data augmentation

In [None]:
x_as_dict = X_train3.to_dict(orient='index')

In [None]:
players_cols = [x for x in X_train3.columns if x.startswith('players')]
other_cols = [x for x in X_train3.columns if not x.startswith('players')]

In [None]:
def make_players_dict_for_tensor(x_as_dict, augmentation=False):
    """Regroup flat ``players.<slot>.<feature>`` entries into per-player dicts.

    Args:
        x_as_dict: mapping of game id -> {column name: value}.
        augmentation: when True, a mirrored copy of every game is produced in
            which slot ``i`` receives the features of slot ``9 - i`` (the
            teams trade places).

    Returns:
        dict with an entry ``"<game id>-0"`` per game and, with augmentation,
        an additional ``"<game id>-1"`` for the mirrored game. Each entry maps
        a player slot to ``{feature name: value}``; a feature that the source
        slot does not have is stored as ``None``. Columns that do not start
        with ``players`` are ignored.
    """
    sep = '.'
    players = {}

    for game_id in tqdm(x_as_dict.keys()):
        row = x_as_dict[game_id]
        # Two slots drawn from 5..9 that trade places in the mirrored game,
        # i.e. a random swap of two players inside one team.
        j, z = np.random.randint(5, 10, size=2)
        game_players = {}
        reversed_game_players = {}  # player features of the mirrored game

        for key, value in row.items():
            parts_of_key = key.split(sep)
            if parts_of_key[0] != 'players':
                continue

            i = int(parts_of_key[1])
            feature_tail = sep.join(parts_of_key[2:])
            game_players.setdefault(i, {})[feature_tail] = value

            if not augmentation:
                continue

            # Apply the j <-> z swap, then mirror the slot number.
            if i == j:
                w = z
            elif i == z:
                w = j
            else:
                w = i
            reversed_key = parts_of_key[0] + sep + str(9 - w) + sep + feature_tail
            reversed_game_players.setdefault(i, {})[feature_tail] = row.get(reversed_key)

        players[f'{game_id}-0'] = game_players
        if augmentation:
            players[f'{game_id}-1'] = reversed_game_players
    return players

In [None]:
players = make_players_dict_for_tensor(x_as_dict, augmentation=True)

  0%|          | 0/8000 [00:00<?, ?it/s]

### Запишем всю инфу в тензор. Теперь одна игра это картинка 10x42, так как 10 игроков, 42 фичи для каждого

In [None]:
def make_players_tensor(players):
    """Stack per-player feature dicts into a ``(games, 10, features)`` tensor.

    Args:
        players: mapping of game key -> {player slot: {feature: value}}, as
            produced by ``make_players_dict_for_tensor``. Every game must
            contribute exactly 10 player rows.

    Returns:
        float32 torch.Tensor of shape ``(len(players), 10, n_features)``.
        The ``personaname`` column is removed if present, missing ``item*``
        values become 0, and other gaps are filled with the column median
        (or 0 when the column has no value at all).

    Raises:
        ValueError: if the number of player rows is not ``10 * len(players)``.
    """
    players_per_game = 10

    images = []
    for key in tqdm(players.keys()):
        images.append(pd.DataFrame.from_dict(players[key], orient='index'))
    images_as_df = pd.concat(images, axis=0).reset_index(drop=True)
    # The name column is text and cannot go into the tensor; tolerate inputs
    # where it has already been removed.
    images_as_df = images_as_df.drop(columns=['personaname'], errors='ignore')

    items = [col for col in images_as_df.columns if col.startswith('item')]
    images_as_df[items] = images_as_df[items].fillna(0)
    for col in images_as_df.columns:
        images_as_df[col] = images_as_df[col].fillna(images_as_df[col].median())
        if images_as_df[col].isna().sum() != 0:
            # Median is NaN when the whole column is empty.
            images_as_df[col] = images_as_df[col].fillna(0)

    # Fail loudly instead of silently shifting players between games.
    expected_rows = len(players) * players_per_game
    if images_as_df.shape[0] != expected_rows:
        raise ValueError(
            f"expected {expected_rows} player rows ({players_per_game} per game), "
            f"got {images_as_df.shape[0]}"
        )

    # Rows are already grouped game by game, so one reshape replaces the
    # per-game copy loop.
    values = images_as_df.to_numpy(dtype=np.float32)
    players_tensor = torch.tensor(values).reshape(len(players), players_per_game, -1)
    return players_tensor

In [None]:
players_tensor = make_players_tensor(players)

  0%|          | 0/16000 [00:00<?, ?it/s]

  0%|          | 0/16000 [00:00<?, ?it/s]

In [None]:
players_tensor = players_tensor.permute(0, 2, 1)
players_tensor.shape

torch.Size([16000, 42, 10])

### CNN

In [None]:
import torch.nn as nn


class CNN(nn.Module):
    """1-D CNN over the players axis of a game.

    Input is ``[batch, in_channels, length]`` where channels are per-player
    features and ``length`` is the number of players. Three conv/pool stages
    halve the length each time; the flattened result must have 64 values, so
    the length has to shrink to 1 (the notebook uses 10 players).

    Args:
        n_classes: number of output logits.
        in_channels: number of features per player (42 in this notebook).
    """

    def __init__(self, n_classes=2, in_channels=42):
        super().__init__()
        # Output of ``linear1`` from the most recent forward pass.
        self._features = None
        # [bs, in_channels, 10]
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels=in_channels, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.BatchNorm1d(16)
        )
        # [bs, 16, 5]
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.BatchNorm1d(32)
        )
        # [bs, 32, 2]
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2),
            nn.BatchNorm1d(64)
        )
        # [bs, 64, 1]
        self.linear1 = nn.Linear(64, 32)
        self.out = nn.Linear(32, n_classes)

    def forward(self, x):
        """Return ``[batch, n_classes]`` logits and cache the 32-d features."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)

        # [bs, 64, 1] -> [bs, 64]
        x = x.view(x.size(0), -1)

        x = self.linear1(x)
        # [bs, 32]
        self._features = x
        logits = self.out(x)
        return logits

    def get_features(self):
        """Return the ``linear1`` output of the last forward pass (or None)."""
        return self._features

In [None]:
DEVICE = torch.device("cuda") if train_on_gpu else torch.device("cpu")


def fit_epoch(model, inputs, labels, criterion, optimizer):
    """Run one optimisation step on the given batch.

    The positive-class score is the sigmoid of logit column 1, which is what
    ``criterion`` receives together with ``labels``.

    Args:
        model: network returning ``[batch, n_classes]`` logits.
        inputs: input batch for ``model``.
        labels: float tensor of 0/1 targets, one per sample.
        criterion: loss taking ``(probabilities, labels)``.
        optimizer: optimizer over ``model`` parameters.

    Returns:
        ``(loss, accuracy)``: loss as a Python float, accuracy as a 0-dim
        tensor using a 0.5 decision threshold.
    """
    # eval_epoch switches the model to eval mode; switch back so that
    # BatchNorm uses batch statistics and updates its running averages.
    model.train()

    optimizer.zero_grad()

    outputs = torch.sigmoid(model(inputs)[:, 1])
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    preds = (outputs >= 0.5).int()

    return loss.item(), torch.sum(preds == labels.data) / len(preds)

In [None]:
cnn = CNN()
cnn.forward(players_tensor[:64])
cnn.get_features().shape

torch.Size([64, 32])

In [None]:
def eval_epoch(model, inputs, labels, criterion):
    """Score the model on a batch without updating it.

    Puts ``model`` into eval mode (it is left in that mode), takes the sigmoid
    of logit column 1 as the positive-class probability and compares it with
    ``labels`` at a 0.5 threshold.

    Returns:
        ``(loss, accuracy)``: loss as a Python float, accuracy as a 0-dim
        tensor.
    """
    model.eval()

    with torch.no_grad():
        probabilities = torch.sigmoid(model(inputs)[:, 1])
        loss_value = criterion(probabilities, labels).item()
        predicted = (probabilities >= 0.5).int()

    accuracy = torch.sum(predicted == labels.data) / len(predicted)
    return loss_value, accuracy

In [None]:
def train(inputs_train, labels_train, inputs_eval, labels_eval, model, epochs):
    """Train ``model`` for ``epochs`` steps and evaluate after each one.

    Every epoch is a single ``fit_epoch`` call on the whole training set
    followed by ``eval_epoch`` on the evaluation set. AdamW (lr=0.001) and
    BCELoss are created here.

    Returns:
        list of ``(train_loss, train_acc, val_loss, val_acc)``, one tuple per
        epoch.
    """
    log_template = ("\nEpoch {ep:03d} train_loss: {t_loss:0.4f} "
                    "    val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}")
    history = []

    with tqdm(desc="epoch", total=epochs) as pbar_outer:
        opt = torch.optim.AdamW(model.parameters(), lr=0.001)
        criterion = nn.BCELoss()

        for epoch_number in range(1, epochs + 1):
            train_loss, train_acc = fit_epoch(model, inputs_train, labels_train, criterion, opt)
            print("loss", train_loss)

            val_loss, val_acc = eval_epoch(model, inputs_eval, labels_eval, criterion)
            history.append((train_loss, train_acc, val_loss, val_acc))

            pbar_outer.update(1)
            message = log_template.format(ep=epoch_number, t_loss=train_loss,
                                          v_loss=val_loss, t_acc=train_acc, v_acc=val_acc)
            tqdm.write(message)

    return history

In [None]:
# Labels for the augmented set, interleaved in the same order as the games in
# `players` ("<id>-0" followed by "<id>-1"): even positions hold the label of
# the original game, odd positions the opposite label for its mirrored copy.
# The length follows the data instead of a hard-coded 16000.
original_labels = np.asarray(y_train, dtype=float)
y_aug = np.empty(2 * len(original_labels))
y_aug[0::2] = original_labels
y_aug[1::2] = 1 - original_labels
y_aug.shape

(16000,)

In [None]:
from sklearn.utils import shuffle

# Shuffle games and labels with one shared permutation. The size comes from
# the labels rather than a hard-coded 16000, and the seed is fixed so the
# train/validation split made from this order is reproducible.
indicies = shuffle(np.arange(len(y_aug)), random_state=42)
new_players_tensor = players_tensor[indicies]
new_y = y_aug[indicies]

In [None]:
# The first `train_size` shuffled games are used for fitting, the rest for
# validation.
train_size = 15000
train_part = slice(None, train_size)
eval_part = slice(train_size, None)

# Fresh network on the selected device.
cnn = CNN().to(DEVICE)

# Inputs and labels as float tensors on the same device as the network.
inputs_train = torch.FloatTensor(new_players_tensor[train_part]).to(DEVICE)
labels_train = torch.FloatTensor(new_y[train_part]).to(DEVICE)
inputs_eval = torch.FloatTensor(new_players_tensor[eval_part]).to(DEVICE)
labels_eval = torch.FloatTensor(new_y[eval_part]).to(DEVICE)

In [None]:
history = train(inputs_train, labels_train, inputs_eval, labels_eval, model=cnn, epochs=150)

epoch:   0%|          | 0/150 [00:00<?, ?it/s]

loss 0.7052014470100403

Epoch 001 train_loss: 0.7052     val_loss 0.7004 train_acc 0.5088 val_acc 0.5100
loss 0.7056057453155518

Epoch 002 train_loss: 0.7056     val_loss 0.6787 train_acc 0.5009 val_acc 0.6050
loss 0.6802384257316589

Epoch 003 train_loss: 0.6802     val_loss 0.6690 train_acc 0.5970 val_acc 0.6030
loss 0.668199360370636

Epoch 004 train_loss: 0.6682     val_loss 0.6626 train_acc 0.6151 val_acc 0.5240
loss 0.6608624458312988

Epoch 005 train_loss: 0.6609     val_loss 0.6515 train_acc 0.5478 val_acc 0.5440
loss 0.64956134557724

Epoch 006 train_loss: 0.6496     val_loss 0.6348 train_acc 0.5674 val_acc 0.6610
loss 0.6334905624389648

Epoch 007 train_loss: 0.6335     val_loss 0.6161 train_acc 0.6541 val_acc 0.7690
loss 0.6158823370933533

Epoch 008 train_loss: 0.6159     val_loss 0.5974 train_acc 0.7675 val_acc 0.8230
loss 0.5983362197875977

Epoch 009 train_loss: 0.5983     val_loss 0.5779 train_acc 0.8189 val_acc 0.8260
loss 0.5798856616020203

Epoch 010 train_loss: 0.

In [None]:
def make_augmented_dict(x_as_dict):
    """Pair every game with a mirrored copy in which the teams trade places.

    In the mirrored game the ``players.<i>.*`` columns take the values of
    slot ``9 - i`` (after a random swap of two slots drawn from 5..9), the
    ``radiant_score`` and ``dire_score`` columns are exchanged, and all other
    columns are copied unchanged. A player feature that the source slot does
    not have becomes ``None``.

    Args:
        x_as_dict: mapping of game id -> {column name: value}; every game
            must contain ``radiant_score`` and ``dire_score``.

    Returns:
        dict with ``"<game id>-0"`` (the original row) and ``"<game id>-1"``
        (the mirrored row) for every game, in that order.
    """
    sep = '.'
    aug_x = {}

    for game_id in tqdm(x_as_dict.keys()):
        row = x_as_dict[game_id]
        # Two slots of one team that trade places in the mirrored game.
        j, z = np.random.randint(5, 10, size=2)
        reversed_game = {}

        for key, value in row.items():
            parts_of_key = key.split(sep)
            if parts_of_key[0] != 'players':
                # Non-player columns are carried over as they are.
                reversed_game[key] = value
                continue

            i = int(parts_of_key[1])
            if i == j:
                w = z
            elif i == z:
                w = j
            else:
                w = i
            reversed_key = sep.join([parts_of_key[0], str(9 - w), sep.join(parts_of_key[2:])])
            reversed_game[key] = row.get(reversed_key)

        reversed_game['radiant_score'] = row['dire_score']
        reversed_game['dire_score'] = row['radiant_score']
        aug_x[f'{game_id}-0'] = row
        aug_x[f'{game_id}-1'] = reversed_game
    return aug_x

In [None]:
aug_x = make_augmented_dict(x_as_dict)
X_standart = pd.DataFrame.from_dict(aug_x, orient='index')

  0%|          | 0/8000 [00:00<?, ?it/s]

In [None]:
cat_cols = [col for col in X_standart.columns if X_standart[col].dtype not in ['int64', 'float64']]
X_standart.drop(columns=cat_cols, inplace=True)
X_standart = preprocess_features(X_standart)

  0%|          | 0/368 [00:00<?, ?it/s]

In [None]:
# Run the trained CNN over all augmented games and keep the 32-d activations
# cached by its forward pass. The input is moved to the model's device, no
# gradients are tracked, and the result is brought back to the CPU before the
# NumPy conversion (which fails on CUDA tensors).
cnn.eval()
with torch.no_grad():
    cnn(players_tensor.to(DEVICE))
features = cnn.get_features().detach().cpu().numpy()
X_cnn_features = pd.DataFrame(features, index=X_standart.index, columns=[str(x) for x in range(features.shape[1])])
X_cnn_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0-0,-0.911390,-1.832451,1.538542,4.490929,-5.625395,-1.646562,2.261098,3.093775,3.846343,-4.708835,...,-2.391081,-0.049591,-3.488404,0.183131,1.458383,2.875709,-3.462032,-4.367146,0.386624,-3.272480
0-1,2.356487,3.217848,-6.022466,-2.283538,-1.301587,3.464529,5.128327,-2.169663,1.377150,-0.794839,...,3.311063,2.403677,2.567648,6.448635,-2.456280,-2.510772,0.603591,2.594390,-1.958952,4.483195
1-0,1.979865,2.010241,-5.123304,-0.736231,-2.482010,2.021299,4.930630,-0.734211,1.281071,-1.796522,...,2.919312,2.154733,2.223374,6.035308,-1.682058,-1.631124,-0.772538,1.351878,-1.700432,2.977622
1-1,-0.653508,-1.215879,0.138825,3.877326,-6.017920,-0.889959,3.140182,2.704417,4.232992,-4.976400,...,-1.005397,-0.476914,-1.973592,2.739127,0.874217,2.162786,-3.688410,-3.074884,-0.575263,-1.491382
2-0,0.397368,-0.029136,-1.689969,1.412369,-2.579473,-0.151218,2.453276,0.413054,1.724412,-2.152044,...,0.445227,0.250533,0.179661,2.903827,-0.438603,0.463269,-1.582438,-0.976498,-0.549799,0.606969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997-1,3.548205,3.362902,-8.349366,-2.901870,-1.101904,2.796231,6.613019,-2.528656,0.536794,-0.699577,...,5.823636,2.913874,4.389775,9.501709,-3.250012,-3.398377,0.649354,3.247562,-2.220257,5.968434
7998-0,-1.877629,-4.007197,4.358930,7.596986,-8.254025,-3.593861,1.871211,4.919135,5.970925,-7.367979,...,-5.183195,-2.053093,-6.131748,-1.882890,3.160294,5.975877,-6.304434,-7.544742,0.536172,-5.736194
7998-1,4.106575,5.112864,-9.815290,-4.183114,-0.650725,4.615342,7.081157,-4.107001,-0.348086,0.252865,...,6.409194,3.451411,5.380387,10.117524,-3.649011,-4.563091,2.056247,4.362735,-1.411954,8.160917
7999-0,-0.327746,-1.509487,0.775821,3.629793,-4.568142,-1.467005,1.617450,2.357815,3.012903,-3.852591,...,-2.111464,-0.744068,-2.115219,1.125474,1.173102,2.582439,-2.781500,-3.352735,-0.202052,-1.587737


In [None]:
X_extended = pd.concat((X_standart, X_cnn_features), axis=1)

In [None]:
X_extended.reset_index(drop=True, inplace=True)

In [None]:
# Shuffle rows and labels with one shared, seeded permutation whose size is
# taken from the data instead of a hard-coded 16000.
# NOTE(review): this rebinds X_train / y_train, which earlier cells use for
# the raw feature frame and the un-augmented labels, so those earlier cells
# cannot be re-run after this one without restarting from the top.
indicies = shuffle(np.arange(len(X_extended)), random_state=42)
X_train = X_extended.loc[indicies]
y_train = y_aug[indicies]

In [None]:
from sklearn.impute import SimpleImputer

# Training

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

In [None]:
rf_param_grid = {
    "n_estimators": [100, 200, 500],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

cat_param_grid = {
    "iterations": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [4, 6, 8],
}

lgb_param_grid = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [-1, 5, 10],
}

In [None]:
big_X_train = X_train
big_y_train = y_train

In [None]:
X_train = big_X_train[:500]
y_train = big_y_train[:500]

In [None]:
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, n_jobs=-1, verbose=10)

In [None]:
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [None]:
# Use the GPU for the CatBoost search only when one was detected at the top
# of the notebook; a hard-coded "GPU" task type fails on CPU-only runtimes.
catboost = CatBoostClassifier(random_state=42, silent=True, task_type="GPU" if train_on_gpu else "CPU")

In [None]:
result = catboost.grid_search(
    cat_param_grid,
    X_train, y_train,
    cv=5,
    partition_random_seed=42,
    refit=False,
    shuffle=False
)


bestTest = 0.1566704103
bestIteration = 99

0:	loss: 0.1566704	best: 0.1566704 (0)	total: 6.66s	remaining: 2m 53s

bestTest = 0.08258291298
bestIteration = 99

1:	loss: 0.0825829	best: 0.0825829 (1)	total: 11.2s	remaining: 2m 19s

bestTest = 0.06431133883
bestIteration = 96

2:	loss: 0.0643113	best: 0.0643113 (2)	total: 13.7s	remaining: 1m 49s

bestTest = 0.1061281708
bestIteration = 199

3:	loss: 0.1061282	best: 0.0643113 (2)	total: 19.8s	remaining: 1m 53s

bestTest = 0.06660419633
bestIteration = 192

4:	loss: 0.0666042	best: 0.0643113 (2)	total: 26.7s	remaining: 1m 57s

bestTest = 0.06431133883
bestIteration = 96

5:	loss: 0.0643113	best: 0.0643113 (2)	total: 31.7s	remaining: 1m 50s

bestTest = 0.08510452469
bestIteration = 498

6:	loss: 0.0851045	best: 0.0643113 (2)	total: 46.3s	remaining: 2m 12s

bestTest = 0.06626217437
bestIteration = 235

7:	loss: 0.0662622	best: 0.0643113 (2)	total: 1m 1s	remaining: 2m 25s

bestTest = 0.06431133883
bestIteration = 96

8:	loss: 0.0643113	best:

In [None]:
result['params']

{'depth': 4, 'iterations': 100, 'learning_rate': 0.1}

In [None]:
catboost = CatBoostClassifier(
    random_seed=42,
    iterations=result['params']['iterations'],
    learning_rate=result['params']['learning_rate'],
    depth=result['params']['depth']
)

In [None]:
lgb_grid = GridSearchCV(lgb.LGBMClassifier(random_state=42, n_jobs=-1), lgb_param_grid, cv=5, n_jobs=-1, verbose=10)


In [None]:
lgb_grid.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [None]:
from lightgbm import LGBMClassifier

# Rebuild the final models from the tuned hyper-parameters with the same seed
# that was used during the search. The LightGBM class is imported by name
# because the assignment below rebinds `lgb` from the module to the estimator
# that later cells use; going through `lgb.LGBMClassifier` here would fail
# the second time this cell is executed.
rf = RandomForestClassifier(**rf_grid.best_params_, random_state=42)
lgb = LGBMClassifier(**lgb_grid.best_params_, random_state=42, n_jobs=-1)

In [None]:
X_train = big_X_train
y_train = big_y_train

In [None]:
rf.fit(X_train, y_train)
lgb.fit(X_train, y_train)
catboost.fit(X_train, y_train)

0:	learn: 0.5210042	total: 55.5ms	remaining: 5.49s
1:	learn: 0.4047378	total: 104ms	remaining: 5.11s
2:	learn: 0.2960700	total: 154ms	remaining: 4.99s
3:	learn: 0.2342691	total: 205ms	remaining: 4.93s
4:	learn: 0.1889000	total: 262ms	remaining: 4.99s
5:	learn: 0.1615405	total: 324ms	remaining: 5.08s
6:	learn: 0.1434263	total: 374ms	remaining: 4.97s
7:	learn: 0.1314960	total: 424ms	remaining: 4.87s
8:	learn: 0.1220398	total: 478ms	remaining: 4.83s
9:	learn: 0.1150527	total: 529ms	remaining: 4.76s
10:	learn: 0.1094468	total: 582ms	remaining: 4.71s
11:	learn: 0.1040004	total: 631ms	remaining: 4.63s
12:	learn: 0.1020953	total: 679ms	remaining: 4.54s
13:	learn: 0.0986061	total: 732ms	remaining: 4.5s
14:	learn: 0.0951425	total: 783ms	remaining: 4.44s
15:	learn: 0.0929799	total: 828ms	remaining: 4.35s
16:	learn: 0.0909332	total: 886ms	remaining: 4.32s
17:	learn: 0.0890743	total: 940ms	remaining: 4.28s
18:	learn: 0.0868693	total: 991ms	remaining: 4.22s
19:	learn: 0.0857637	total: 1.04s	remaini

<catboost.core.CatBoostClassifier at 0x7f2fbfac6cd0>

In [None]:
raw_data_to_predict = {}
for file in tqdm(glob('public_data/*')):
    key = int(os.path.basename(file).split('.')[0])
    with open(file, 'r', encoding='utf-8') as inp:
        raw_data_to_predict[key] = json.load(inp) 

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
X_test = extract_features(raw_data_to_predict, final_features)
X_to_predict = preprocess_features(X_test)

0it [00:00, ?it/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  0%|          | 0/378 [00:00<?, ?it/s]

In [None]:
x_test_as_dict = X_to_predict.to_dict(orient='index')

In [None]:
# получить X_standart_test
aug_x_test = make_augmented_dict(x_test_as_dict)
X_standart_test = pd.DataFrame.from_dict(aug_x_test, orient='index')
cat_cols = [col for col in X_standart_test.columns if X_standart_test[col].dtype not in ['int64', 'float64']]
X_standart_test.drop(columns=cat_cols, inplace=True)
X_standart_test = preprocess_features(X_standart_test)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

In [None]:
# Build X_cnn_features_test: CNN activations for every test game and its
# mirrored copy.
players_test = make_players_dict_for_tensor(aug_x_test, augmentation=False)
players_tensor_test = make_players_tensor(players_test)
# The CNN expects channel-first input: [games, features, players].
players_tensor_test = players_tensor_test.permute(0, 2, 1)
# Inference only: eval mode, no gradient tracking, input on the model's
# device, result back on the CPU for the NumPy conversion.
cnn.eval()
with torch.no_grad():
    cnn(players_tensor_test.to(DEVICE))
features_test = cnn.get_features().detach().cpu().numpy()
X_cnn_features_test = pd.DataFrame(features_test, index=X_standart_test.index, columns=[str(x) for x in range(features_test.shape[1])])

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
# получить X_extended_test
X_extended_test = pd.concat((X_standart_test, X_cnn_features_test), axis=1)
X_extended_test.reset_index(drop=True, inplace=True)

In [None]:
X_test = X_extended_test

In [None]:
rf_preds = rf.predict_proba(X_test)
cat_preds = catboost.predict_proba(X_test)
lgb_preds = lgb.predict_proba(X_test)
probs = (rf_preds + cat_preds + lgb_preds) / 3

In [None]:
# Rows alternate between a game (even positions) and its mirrored copy (odd
# positions), in the order produced by make_augmented_dict.
# Class-1 probability predicted for the game itself:
probs_for_game = probs[0::2][:, 1]
# Class-0 probability predicted for the mirrored copy of the same game:
probs_for_reversed_game = probs[1::2][:, 0]
# Average the two estimates into one probability per game.
probs = (probs_for_game + probs_for_reversed_game) / 2

In [None]:
problem_indexes = np.where(np.abs(probs - 0.5) <= 0.2)[0]
len(problem_indexes)

9

In [None]:
from numpy.random import default_rng

# Seeded generator so the sampled permutations, and therefore the predictions
# derived from them, are reproducible.
rng = default_rng(42)
# Example draw: 88 distinct values out of range(120).
rng.choice(120, size=88, replace=False)

array([ 85,  39,   5, 111,  83, 112,  51, 115, 105,  53,  23,  13,  31,
        94,  73,  66,  15, 100,  98,  16,  29,  54,  86, 103, 104,  97,
        95,   0,  62, 117,  61,   1, 101, 114,  91,  20,  88, 113,  41,
        90,  92,  21, 106,  99,  10,  35,  93,  25,   2,  56,  58,   9,
        55,   3,  47,  17,  24,   8,  72,  64,  96,  34,  68,  78, 102,
        71,  80, 110,  57,  49,  74,  70,  36,  48,  89,  44,  38,  84,
        32,   6,  43,   7,  12, 118,  63,  33,  87, 108])

In [None]:
import itertools

sep = '.'
indicies = [0, 1, 2, 3, 4]


def _permuted_team(problem_game, base_game, permutation, first_slot):
    """Copy ``base_game`` with the five slots of one team re-ordered.

    Slot ``first_slot + p`` receives the features that ``problem_game`` holds
    for slot ``first_slot + permutation[p]``; a feature the source slot does
    not have becomes ``None``. Slots of the other team are left as they are
    in ``base_game``.
    """
    permuted = base_game.copy()
    for key in problem_game.keys():
        key_parts = key.split(sep)
        if key_parts[0] != 'players':
            continue
        slot = int(key_parts[1])
        if not first_slot <= slot < first_slot + 5:
            continue
        source_slot = permutation[slot - first_slot] + first_slot
        new_key = key_parts[0] + sep + str(source_slot) + sep + sep.join(key_parts[2:])
        permuted[key] = problem_game.get(new_key)
    return permuted


# For every ambiguous game, average the ensemble prediction over random
# re-orderings of the players inside each team.
sustainable_probs = []
for i in tqdm(problem_indexes):
    problem_index_augmentated = {}
    problem_game = x_test_as_dict[i]
    k = 0
    # 88 of the 120 permutations are skipped, i.e. 32 are kept per team.
    skip_indicies = set(rng.choice(120, size=88, replace=False).tolist())
    for perm_index, permutation in enumerate(itertools.permutations(indicies)):
        if perm_index in skip_indicies:
            continue
        # permutation inside the first team (slots 0..4)
        new_problem_game = _permuted_team(problem_game, problem_game, permutation, 0)

        skip_indicies2 = set(rng.choice(120, size=88, replace=False).tolist())
        for perm2_index, permutation2 in enumerate(itertools.permutations(indicies)):
            if perm2_index in skip_indicies2:
                continue
            # permutation inside the second team (slots 5..9); this must use
            # the inner permutation, not the one of the first team
            problem_index_augmentated[k] = _permuted_team(problem_game, new_problem_game, permutation2, 5)
            k += 1

    problem_dict_for_tensor = make_players_dict_for_tensor(problem_index_augmentated, augmentation=False)
    # The CNN expects channel-first input: [games, features, players].
    problem_tensor = make_players_tensor(problem_dict_for_tensor).permute(0, 2, 1)

    cnn.eval()
    with torch.no_grad():
        cnn(problem_tensor.to(DEVICE))
    features = cnn.get_features().detach().cpu().numpy()

    # The tree models were fitted on the tabular columns followed by the CNN
    # features, so the same layout is rebuilt for the permuted games.
    # NOTE(review): assumes X_standart_test has the same columns, in the same
    # order, as the training frame X_standart - confirm.
    problem_frame = pd.DataFrame.from_dict(problem_index_augmentated, orient='index')
    problem_standart = problem_frame[X_standart_test.columns].astype(float)
    # Gaps created by the permutations are imputed; columns left completely
    # empty fall back to 0.
    problem_standart = preprocess_features(problem_standart).fillna(0)
    problem_cnn = pd.DataFrame(features, index=problem_standart.index,
                               columns=[str(x) for x in range(features.shape[1])])
    problem_extended = pd.concat((problem_standart, problem_cnn), axis=1).reset_index(drop=True)

    rf_preds = rf.predict_proba(problem_extended)
    cat_preds = catboost.predict_proba(problem_extended)
    lgb_preds = lgb.predict_proba(problem_extended)
    preds_problem = (rf_preds + cat_preds + lgb_preds) / 3

    sustainable_probs.append(np.mean(preds_problem[:, 1]))

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1024 [00:00<?, ?it/s]

  0%|          | 0/1024 [00:00<?, ?it/s]

  0%|          | 0/1024 [00:00<?, ?it/s]

RuntimeError: ignored

In [None]:
sustainable_probs

[0.5833078577871071,
 0.8499930377069209,
 0.5351551035045741,
 0.905672534385458,
 0.8688703851605486,
 0.08557495456453945,
 0.6912735466740558,
 0.4445490959742112,
 0.6535273929275056,
 0.35775344735251946,
 0.5772904096098225,
 0.8553106237787756,
 0.3834919351000415,
 0.6636861467385039,
 0.475503266274795,
 0.6602333380432515,
 0.7865362149007975,
 0.11999530942808012]

In [None]:
for i, pr in enumerate(sustainable_probs):
    probs[problem_indexes[i]] = pr

In [None]:
# NOTE(review): this inverts the probabilities of the ambiguous games right
# after they were replaced by the permutation-averaged class-1 estimates, and
# it is not idempotent - executing the cell twice restores the previous
# values. Confirm that the inversion is intended.
probs[problem_indexes] = 1 - probs[problem_indexes]

In [None]:
labels = (probs >= 0.5).astype(int)
labels[:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

In [None]:
# Submission table: game names in the insertion order of raw_data_to_predict
# (the order in which the feature rows were extracted) next to the predicted
# labels.
answer = pd.DataFrame({
    'name': list(raw_data_to_predict.keys()),
    'target': labels,
})

In [None]:
answer.to_csv('simple_answer.csv', sep='\t', index=False)

In [None]:
answer

Unnamed: 0,name,target
0,10013,0
1,10020,0
2,10037,0
3,10059,0
4,10113,1
...,...,...
995,9808,0
996,9860,1
997,9862,1
998,9866,1
