In [1]:
import json
import os
from glob import glob

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

## Подключение google drive

In [2]:
from google.colab import drive
drive.mount('/content/myDrive')

Drive already mounted at /content/myDrive; to attempt to forcibly remount, call drive.mount("/content/myDrive", force_remount=True).


In [3]:
os.chdir('myDrive/MyDrive/ufru')

In [4]:
!ls

public_data	 simple_answer.csv  train_data
public_data.zip  train_anwers.csv   train_data.zip


## Разархивирование данных

In [21]:
!unzip public_data.zip > public_data_std_out

In [22]:
!unzip train_data.zip > train_data_std_out

In [23]:
!rm public_data_std_out
!rm train_data_std_out

## Загрузка json

In [5]:
y = pd.read_csv('train_anwers.csv', sep='\t')
print(y.dtypes)
y.set_index('name', inplace=True)
y.sort_values(by='name', axis=0, inplace=True)
y.head()

name      int64
target     bool
dtype: object


Unnamed: 0_level_0,target
name,Unnamed: 1_level_1
1,True
7,False
16,False
24,False
29,True


In [6]:
from tqdm import tqdm


raw_data = {}
for name in tqdm(y.index):
    with open(f'train_data/{name}.json', 'r', encoding='utf-8') as inp:
        raw_data[name] = json.load(inp)

100%|██████████| 8000/8000 [00:20<00:00, 399.03it/s]


## Выбор важных численных признаков у катки (без players) по корреляции с таргетом

In [7]:
for k, v in raw_data[y.index[0]].items():
    if k != 'players':
        print(f'{k}:  {v}')

chat:  None
cluster:  186
cosmetics:  None
dire_score:  38
dire_team_id:  None
draft_timings:  None
duration:  1625
engine:  0
first_blood_time:  9
game_mode:  5
human_players:  -4
leagueid:  0
lobby_type:  0
match_seq_num:  5780322685
negative_votes:  0
objectives:  None
picks_bans:  [{'is_pick': True, 'hero_id': 31, 'team': 0, 'order': 0}, {'is_pick': True, 'hero_id': 11, 'team': 1, 'order': 1}, {'is_pick': True, 'hero_id': 14, 'team': 0, 'order': 2}, {'is_pick': True, 'hero_id': 22, 'team': 0, 'order': 0}, {'is_pick': True, 'hero_id': 106, 'team': 1, 'order': 4}, {'is_pick': True, 'hero_id': 30, 'team': 0, 'order': 5}, {'is_pick': True, 'hero_id': 39, 'team': 0, 'order': 2}, {}, {}, {'is_pick': True, 'hero_id': 83, 'team': 1, 'order': 9}, {'is_pick': False, 'hero_id': 35, 'team': 0, 'order': 10}, {'is_pick': False, 'hero_id': 9, 'team': 0, 'order': 11}, {'is_pick': False, 'hero_id': 119, 'team': 0, 'order': 12}, {'is_pick': False, 'hero_id': 74, 'team': 0, 'order': 13}, {}]
positive

In [8]:
games_matrix_as_dict = {}
for k in tqdm(raw_data.keys()):
    game_without_players = raw_data[k].copy()
    del game_without_players['players']
    game_without_players['target'] = int(y.loc[k])

    games_matrix_as_dict[k] = game_without_players

100%|██████████| 8000/8000 [00:00<00:00, 11640.29it/s]


In [9]:
game_matrix = pd.DataFrame.from_dict(games_matrix_as_dict, orient='index')
game_matrix

Unnamed: 0,chat,cluster,cosmetics,dire_score,dire_team_id,draft_timings,duration,engine,first_blood_time,game_mode,...,series_type,patch,region,target,all_word_counts,my_word_counts,comeback,stomp,throw,loss
1,,186,,38,,,1625,0,9,5,...,0.0,51,0,1,,,,,,
7,,151,,48,,,3337,1,73,4,...,0.0,2,5,0,,,,,,
16,,152,,63,,,2055,1,156,22,...,0.0,51,5,0,,,,,,
24,,182,,57,,,2601,1,7,22,...,0.0,19,8,0,,,,,,
29,,274,,51,,,0,0,82,22,...,0.0,0,3,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35540,,274,,0,,,1233,1,12,23,...,0.0,51,3,1,,,,,,
35541,,182,,60,,,2660,1,0,22,...,0.0,51,8,0,,,,,,
35545,,251,,34,,,993,1,72,22,...,0.0,51,15,1,,,,,,
35550,,181,,-2,,,1931,1,92,22,...,0.0,51,8,1,,,,,,


In [10]:
num_cols_game = [col for col in game_matrix.columns
            if game_matrix[col].dtype in ['int64', 'float64']]

cat_cols_game = [col for col in game_matrix.columns if
            game_matrix[col].dtype == "object"]

bool_cols_game = [col for col in game_matrix.columns if
             game_matrix[col].dtype == 'bool']
print(f'num_cols_game: {len(num_cols_game)}')
print(f'cat_cols_game: {len(cat_cols_game)}')
print(f'bool_cols_game: {len(bool_cols_game)}')
assert len(num_cols_game) + len(cat_cols_game) + len(bool_cols_game) == game_matrix.shape[1]

num_cols_game: 25
cat_cols_game: 13
bool_cols_game: 0


In [11]:
na_count = game_matrix[num_cols_game].isna().sum().sort_values(ascending=False)
na_count

stomp               7966
comeback            7966
loss                7963
throw               7963
version             7929
series_type           54
series_id             54
replay_salt           54
start_time             0
target                 0
region                 0
patch                  0
cluster                0
dire_score             0
positive_votes         0
negative_votes         0
match_seq_num          0
lobby_type             0
leagueid               0
human_players          0
game_mode              0
first_blood_time       0
engine                 0
duration               0
radiant_score          0
dtype: int64

In [12]:
num_cols_game_without_na = na_count[na_count <= game_matrix.shape[0] - 100].index
num_cols_game_without_na

Index(['series_type', 'series_id', 'replay_salt', 'start_time', 'target',
       'region', 'patch', 'cluster', 'dire_score', 'positive_votes',
       'negative_votes', 'match_seq_num', 'lobby_type', 'leagueid',
       'human_players', 'game_mode', 'first_blood_time', 'engine', 'duration',
       'radiant_score'],
      dtype='object')

In [13]:
corr = game_matrix[num_cols_game_without_na].corr()['target'].sort_values(ascending=False)
corr

target              1.000000
radiant_score       0.287782
match_seq_num       0.018218
game_mode           0.012491
start_time          0.009667
engine              0.006457
cluster             0.001642
human_players      -0.002494
region             -0.002527
first_blood_time   -0.008031
replay_salt        -0.014683
lobby_type         -0.016479
patch              -0.021456
duration           -0.027636
dire_score         -0.330406
series_type              NaN
series_id                NaN
positive_votes           NaN
negative_votes           NaN
leagueid                 NaN
Name: target, dtype: float64

Убедимся, что не можем использовать колонки в которых не посчиталась корреляция.

In [14]:
for x in corr[corr.isna()].index:
    print(np.unique(game_matrix[x]))

[ 0. nan]
[ 0. nan]
[0]
[0]
[0]


In [15]:
corr = corr[corr.isna() != True]

In [16]:
important_num_game_cols = corr[np.abs(corr).sort_values(ascending=False) >= 0.1].index[1:]
important_num_game_cols # важные числовые признаки game

Index(['radiant_score', 'dire_score'], dtype='object')

radiant_score - скор команды с нижней базы

dire_score - скор команды с верхней базы

## Выбор важных численных признаков у players по корреляции с таргетом.

In [17]:
players_matrix_as_dict = {}
for k in tqdm(raw_data.keys()):
    for i, player in enumerate(raw_data[k]['players']):
        if i >= 5:
            break
        player['target'] = int(y.loc[k])
        players_matrix_as_dict[f'{k}-{i}'] = player

100%|██████████| 8000/8000 [00:06<00:00, 1311.28it/s]


In [18]:
players_matrix = pd.DataFrame.from_dict(players_matrix_as_dict, orient='index')
players_matrix

Unnamed: 0,player_slot,ability_targets,ability_upgrades_arr,ability_uses,actions,additional_units,assists,backpack_0,backpack_1,backpack_2,...,item_usage,purchase_tpscroll,actions_per_min,life_state_dead,purchase_ward_observer,lane_efficiency,lane_efficiency_pct,purchase_ward_sentry,purchase_gem,purchase_rapier
1-0,0,,"[5175, 5173, 5173, 5174, 5175, 5176, 5175, 517...",,,,17,218,0,0,...,,,,,,,,,,
1-1,1,,"[5110, 5111, 641, 5111, 5111, 5113, 5110, 641,...",,,,-5,0,0,0,...,,,,,,,,,,
1-2,2,,"[5075, 5076, 5076, 5074, 5076, 5077, 5076, 507...",,,,10,41,216,39,...,,,,,,,,,,
1-3,3,,"[5395, 5397, 5395, 5397, -318, 5398, 5395, 539...",,,,25,0,0,0,...,,,,,,,,,,
1-4,0,,"[5140, 5138, 5138, 5140, 5138, 0, 5138, 219, 5...",,,,19,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35552-0,0,,"[5260, 1645, 5260, 5259, 5259, 5262, 5259, 525...",,,,7,0,0,0,...,,,,,,,,,,
35552-1,1,,"[5164, 0, 5164, 0, 5164]",,,,-1,0,0,0,...,,,,,,,,,,
35552-2,0,,"[5341, 5340, 5341, 5340, 5339, 2353, 5341, 533...",,,,0,0,0,0,...,,,,,,,,,,
35552-3,3,,"[5486, 1694, 5486, 5488, 5486, 5487]",,,,2,0,0,0,...,,,,,,,,,,


In [19]:
num_cols_players = [col for col in players_matrix.columns
            if players_matrix[col].dtype in ['int64', 'float64']]

cat_cols_players = [col for col in players_matrix.columns if
            players_matrix[col].dtype == "object"]

bool_cols_players = [col for col in players_matrix.columns if
             players_matrix[col].dtype == 'bool']
print(f'num_cols: {len(num_cols_players)}')
print(f'cat_cols: {len(cat_cols_players)}')
print(f'bool_cols: {len(bool_cols_players)}')
assert len(num_cols_players) + len(cat_cols_players) + len(bool_cols_players) == players_matrix.shape[1]

num_cols: 78
cat_cols: 55
bool_cols: 3


In [20]:
na_count = players_matrix[num_cols_players].isna().sum().sort_values(ascending=False)
na_count

purchase_rapier           35842
purchase_gem              35838
purchase_ward_sentry      35701
purchase_ward_observer    35681
purchase_tpscroll         35659
                          ...  
start_time                    0
duration                      0
cluster                       0
lobby_type                    0
xp_per_min                    0
Length: 78, dtype: int64

In [21]:
num_cols_without_na = na_count[na_count <= players_matrix.shape[0] - 100].index

In [22]:
corr = players_matrix[num_cols_without_na].corr()['target'].sort_values(ascending=False)
corr

target                1.000000
tower_damage          0.399670
kda                   0.382888
towers_killed         0.329270
tower_kills           0.267105
                        ...   
buyback_count        -0.068369
leaver_status        -0.105042
life_state_dead      -0.154737
deaths               -0.285179
necronomicon_kills         NaN
Name: target, Length: 76, dtype: float64

In [23]:
important_num_player_cols = corr[np.abs(corr).sort_values(ascending=False) >= 0.1].index[1:]
important_num_player_cols # важные числовые признаки player 

Index(['tower_damage', 'kda', 'towers_killed', 'tower_kills', 'gold',
       'kills_per_min', 'assists', 'kills', 'roshan_kills', 'net_worth',
       'roshans_killed', 'hero_kills', 'gold_per_min', 'lane_efficiency_pct',
       'rune_pickups', 'total_gold', 'courier_kills', 'gold_spent',
       'actions_per_min', 'lane_kills', 'total_xp', 'purchase_tpscroll',
       'xp_per_min', 'pings', 'camps_stacked', 'leaver_status',
       'life_state_dead', 'deaths'],
      dtype='object')

## Создание X_train с важными числовыми признаками. Data augmentation

In [25]:
X_dict = {}
for k in tqdm(raw_data.keys()):
    game = {}
    reversed_game = {} # меняем команды местами
    radiant_score, dire_score = important_num_game_cols
    game[radiant_score] = raw_data[k][radiant_score]
    game[dire_score] = raw_data[k][dire_score]
    reversed_game[radiant_score] = game[dire_score]
    reversed_game[dire_score] = game[radiant_score]

    j, z = np.random.randint(5, 10, size=2) # будем свапать игроков из ондной группы в reversed_game
    for i, player in enumerate(raw_data[k]['players']):
        w = i
        if i == j:
            w = z
        elif i == z:
            w = j
        for col in important_num_player_cols:
            if col not in player:
                game[f'{col}-{i}'] = None
                reversed_game[f'{col}-{9 - w}'] = None
            else:
                game[f'{col}-{i}'] = player[col]
                reversed_game[f'{col}-{9 - w}'] = player[col]
    game['target'] = int(y.loc[k])
    reversed_game['target'] = 1 - game['target']
    X_dict[f'{k}-0'] = game
    X_dict[f'{k}-1'] = reversed_game

100%|██████████| 8000/8000 [00:06<00:00, 1273.98it/s]


Аугментированные данные необходимо перешивать

In [27]:
from sklearn.utils import shuffle
X = shuffle(pd.DataFrame.from_dict(X_dict, orient='index'))
X

Unnamed: 0,radiant_score,dire_score,tower_damage-0,kda-0,towers_killed-0,tower_kills-0,gold-0,kills_per_min-0,assists-0,kills-0,...,lane_kills-9,total_xp-9,purchase_tpscroll-9,xp_per_min-9,pings-9,camps_stacked-9,leaver_status-9,life_state_dead-9,deaths-9,target
19208-1,61,0,6823.0,0.0,,,5264.0,0.038032,14.0,0.0,...,,1972.0,,-155.0,,,0.0,,14.0,0
1267-0,0,32,5001.0,5.0,,,1663.0,0.355030,11.0,12.0,...,,16224.0,,480.0,,,0.0,,12.0,1
25363-0,53,64,3850.0,2.0,,,4223.0,0.270270,22.0,9.0,...,,5418.0,,0.0,,,0.0,,5.0,0
10896-1,50,10,0.0,2.0,,,3906.0,0.206508,13.0,11.0,...,,22882.0,,674.0,,,0.0,,13.0,1
14565-0,23,53,165.0,0.0,,,375.0,0.083449,11.0,3.0,...,,4569.0,,757.0,,,0.0,,6.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1712-0,43,26,486.0,3.0,,,883.0,0.263833,9.0,0.0,...,,26649.0,,781.0,,,0.0,,11.0,1
15532-0,34,0,539.0,0.0,,,339.0,0.002983,0.0,3.0,...,,44352.0,,1584.0,,,0.0,,5.0,0
14743-1,20,56,0.0,0.0,,,0.0,0.207678,10.0,11.0,...,,4637.0,,878.0,,,0.0,,7.0,0
20820-0,5,58,817.0,0.0,,,448.0,0.310766,13.0,14.0,...,,35679.0,,0.0,,,0.0,,10.0,0


In [28]:
X_train = X[X.columns[:-1]]
y_train = X['target']

## Создание пайплайна и кросс валидация

In [80]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

model = GradientBoostingClassifier(random_state=0)
num_transformer = SimpleImputer(strategy="median")


# preprocessor = ColumnTransformer(transformers=[
#     ('numeric', num_transformer)
# ])

my_pipeline = Pipeline(steps=[
    ('preprocessor', num_transformer),
    ('model', model)
])
my_pipeline

In [30]:
cross_val_score(my_pipeline, X_train, y_train, scoring='f1')

array([0.95554865, 0.96671851, 0.95589615, 0.95673981, 0.96142992])

In [94]:
cross_val_score(my_pipeline, X_train, y_train, scoring='f1')

array([0.95893872, 0.96340319, 0.95532646, 0.95587321, 0.96270976])

In [None]:
cross_val_score(my_pipeline, X_train, y_train, scoring='f1')

array([0.98175182, 0.9848393 , 0.97949337, 0.97652017, 0.79281902])

## Подготовка тестовых данных

In [31]:
raw_test_data = {}
for _, _, files in os.walk('public_data'):
    for file in files:
        with open(f'public_data/{file}', 'r', encoding='utf-8') as inp:
            raw_test_data[int(file.split('.')[0])] = json.load(inp)

In [32]:
X_test_dict = {}
for k in tqdm(raw_test_data.keys()):
    game = {}
    reversed_game = {}
    radiant_score, dire_score = important_num_game_cols
    game[radiant_score] = raw_test_data[k][radiant_score]
    game[dire_score] = raw_test_data[k][dire_score]
    reversed_game[radiant_score] = game[dire_score]
    reversed_game[dire_score] = game[radiant_score]
    for i, player in enumerate(raw_test_data[k]['players']):
        for col in important_num_player_cols:
            if col not in player:
                game[f'{col}-{i}'] = None
                reversed_game[f'{col}-{9 - i}'] = None
            else:
                game[f'{col}-{i}'] = player[col]
                reversed_game[f'{col}-{9 - i}'] = player[col]
    X_test_dict[f'{k}-0'] = game
    X_test_dict[f'{k}-1'] = reversed_game

100%|██████████| 1000/1000 [00:00<00:00, 1688.12it/s]


In [33]:
X_test = pd.DataFrame.from_dict(X_test_dict, orient='index')
X_test

Unnamed: 0,radiant_score,dire_score,tower_damage-0,kda-0,towers_killed-0,tower_kills-0,gold-0,kills_per_min-0,assists-0,kills-0,...,actions_per_min-9,lane_kills-9,total_xp-9,purchase_tpscroll-9,xp_per_min-9,pings-9,camps_stacked-9,leaver_status-9,life_state_dead-9,deaths-9
10013-0,46,55,2142,2,,,476,0.185886,13,9,...,,,43187,,892,,,0,,6
10013-1,55,46,24183,3,,,1645,0.309811,12,15,...,,,33649,,695,,,0,,10
10020-0,58,55,648,3,,,1049,0.225950,27,11,...,,,45129,,927,,,0,,13
10020-1,55,58,4283,2,,,1035,0.328655,13,16,...,,,37486,,770,,,0,,10
10037-0,17,47,25,0,,,297,0.076142,7,2,...,,,45257,,1723,,,0,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9862-1,20,60,20,0,,,900,,9,0,...,,,22908,,645,,,0,,3
9866-0,39,24,4682,6,,,2326,0.053500,23,2,...,,,11028,,295,,,0,,8
9866-1,24,39,101,1,,,2340,0.053500,14,2,...,,,23065,,617,,,0,,3
9924-0,23,57,0,0,,,1342,0.054770,6,2,...,,,34471,,944,,,0,,8


## Обучение модели

In [81]:
my_pipeline.fit(X_train, y_train)

## Предсказание модели

In [82]:
probs = my_pipeline.predict_proba(X_test)

In [83]:
probs

array([[0.93810947, 0.06189053],
       [0.09342714, 0.90657286],
       [0.95246055, 0.04753945],
       ...,
       [0.96582234, 0.03417766],
       [0.99562891, 0.00437109],
       [0.0058105 , 0.9941895 ]])

Получим более устойчивые предсказания, взяв среднее из вероятности первой команды победить и второй команды проиграть

In [84]:
probs_for_game = probs[0::2][:, 1]
probs_for_reversed_game = probs[1::2][:, 0]
probs = (probs_for_game + probs_for_reversed_game) / 2

Для некоторых объектов модель дает ответ неуверенно. Для каждого такого объекта сгенерируем 120*120 = 14400 объектов, полученных перестановками игроков внутри команды. Посчитаем их вероятности и усредним.

In [85]:
problem_indexes = np.where(np.abs(probs - 0.5) <= 0.1)[0]
problem_indexes

array([115, 151, 307, 360, 454, 490, 582, 625, 709, 908, 935, 949])

In [86]:
answer_i_2_df_i = {i: j for i, j in enumerate(X_test.index)}

In [112]:
import itertools

sustainable_probs = []
for i in problem_indexes:
    problem_index_augmentated = {}
    df_i = answer_i_2_df_i[2 * i] # умножаем на 2 по построению dataframe
    problem_game = X_test.loc[df_i].to_dict()
    new_problem_game = problem_game.copy()
    indicies = [0, 1, 2, 3, 4]
    k = 0
    for permutation in itertools.permutations(indicies):
        for key in problem_game.keys():
            if key[-1].isdigit() and int(key[-1]) < 5: # перестановка внутри первой команды
                new_problem_game[key] = problem_game[key[:-1] + str(permutation[int(key[-1])])]
        for permutation2 in itertools.permutations(indicies):
            new_problem_game2 = new_problem_game.copy()
            for key in problem_game.keys():
                if key[-1].isdigit() and int(key[-1]) >= 5: # перестановка внутри второй команды
                    new_problem_game2[key] = problem_game[key[:-1] + str(permutation[int(key[-1]) - 5] + 5)]
            problem_index_augmentated[k] = new_problem_game2
            k += 1
    X_problem = shuffle(pd.DataFrame.from_dict(problem_index_augmentated, orient='index'))
    preds_problem = my_pipeline.predict_proba(X_problem)
    sustainable_probs.append(np.mean(preds_problem[:, 1]))
    break

(14400, 282)


In [110]:
sustainable_probs

[0.4687824814855701,
 0.3835349251423075,
 0.4454150990488217,
 0.4009852567998851,
 0.5926201631276893,
 0.47027128175213956,
 0.537816220726901,
 0.40256508925568896,
 0.7402370007755655,
 0.6789577493813014,
 0.6356937840417666,
 0.5244011716436118]

In [114]:
for i, pr in enumerate(sustainable_probs):
    probs[problem_indexes[i]] = pr

In [117]:
labels = (probs >= 0.5).astype(int)
labels[:10]

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 1])

Приведем индекс к исходному состоянию

In [118]:
index = [id[:-2] for id in X_test.index[0::2]]

In [119]:
answer = pd.DataFrame()
answer['name'] = index
answer['target'] = labels

In [120]:
answer.to_csv('simple_answer.csv', sep='\t', index=False)

In [121]:
answer

Unnamed: 0,name,target
0,10013,0
1,10020,0
2,10037,0
3,10059,0
4,10113,1
...,...,...
995,9808,0
996,9860,1
997,9862,1
998,9866,1
