In [312]:
import numpy as np
import pandas as pd

In [8]:
# Load the raw data dumps stored under chgk/.
# `tournaments` and `results` are used below as dicts keyed by tournament id;
# `players` is loaded but not referenced in the cells visible here.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
players = pd.read_pickle('chgk/players.pkl')
results = pd.read_pickle('chgk/results.pkl')
tournaments = pd.read_pickle('chgk/tournaments.pkl')

In [87]:
# Split tournaments by the calendar year of their start date: 2019 -> train, 2020 -> test.
# The year is matched as a prefix instead of as a substring, so the digits "2019"/"2020"
# occurring anywhere else in the timestamp can never select a tournament by accident.
# assumes dateStart is an ISO-8601 string that begins with the year — TODO confirm
train_t = {t_id: info for t_id, info in tournaments.items() if info['dateStart'].startswith('2019')}
test_t = {t_id: info for t_id, info in tournaments.items() if info['dateStart'].startswith('2020')}

In [88]:
# Number of tournaments in the train and test splits.
len(train_t), len(test_t)

(687, 418)

In [89]:
# Keep only the results that belong to training tournaments.
# (parse_info performs the same filtering internally from the ids it is given.)
tournaments_ids = set(train_t)
result_filter = {
    tournament_id: teams
    for tournament_id, teams in results.items()
    if tournament_id in tournaments_ids
}

In [105]:
def parse_info(results, tournaments_ids):
    """Flatten tournament results into one row per (tournament, team, player).

    Parameters
    ----------
    results : dict
        Maps a tournament id to a list of team entries. Team entries without a
        ``'mask'`` key are skipped. The remaining ones must provide ``'team'``
        (with ``'id'``), ``'position'``, ``'questionsTotal'``, ``'mask'`` and
        ``'teamMembers'``; each member must provide ``'player'`` (with ``'id'``,
        ``'name'``, ``'surname'``, ``'patronymic'``), ``'rating'``,
        ``'usedRating'`` and ``'flag'``.
    tournaments_ids : container
        Ids of the tournaments to keep (checked with ``in``).

    Returns
    -------
    pandas.DataFrame
        One row per team member, with the tournament id in ``game_id``, the
        ``player_*`` columns taken from the member and the ``team_*`` columns
        taken from the team entry.
    """
    header = [
        'game_id',
        'player_id',
        'player_name',
        'player_surname',
        'player_patronymic',
        'player_rating',
        'player_usedRating',
        'player_flag',
        'team_id',
        'team_position',
        'team_questionsTotal',
        'team_mask',
    ]
    rows = []
    for tournament_id, teams in results.items():
        if tournament_id not in tournaments_ids:
            continue
        for entry in teams:
            # Teams without an answer mask carry no per-question information.
            if 'mask' not in entry:
                continue
            rows.extend(
                [
                    tournament_id,
                    member['player']['id'],
                    member['player']['name'],
                    member['player']['surname'],
                    member['player']['patronymic'],
                    member['rating'],
                    member['usedRating'],
                    member['flag'],
                    entry['team']['id'],
                    entry['position'],
                    entry['questionsTotal'],
                    entry['mask'],
                ]
                for member in entry['teamMembers']
            )
    return pd.DataFrame(rows, columns=header)

In [160]:
# Build the player-level tables for the 2019 (train) and 2020 (test) tournaments.
train_ids = set(train_t)
test_ids = set(test_t)
train_data = parse_info(results, train_ids)
test_data = parse_info(results, test_ids)

In [161]:
def parse_t_info(t):
    """Build a table with one row of metadata per tournament.

    Parameters
    ----------
    t : dict
        Maps a tournament key to its info dict. Each info dict must provide
        ``'id'``, ``'name'``, ``'dateStart'``, ``'dateEnd'``, ``'type'`` (with
        ``'id'`` and ``'name'``) and ``'season'``. ``'questionQty'`` is optional
        and may be ``None``; when present it is a dict whose values are summed.

    Returns
    -------
    pandas.DataFrame
        Columns ``game_id``, ``game_name``, ``game_dateStart``, ``game_dateEnd``,
        ``game_type_id``, ``game_type_name``, ``game_season``, ``game_num_parts``
        (number of ``questionQty`` entries) and ``game_total_gues`` (sum of the
        ``questionQty`` values); both counts are 0 when ``questionQty`` is
        missing or ``None``.
    """
    header = [
        'game_id',
        'game_name',
        'game_dateStart',
        'game_dateEnd',
        'game_type_id',
        'game_type_name',
        'game_season',
        'game_num_parts',
        'game_total_gues',
    ]
    records = []
    for info in t.values():
        per_part = info.get('questionQty')
        if per_part is None:
            # Missing or null question counts are treated as "no parts".
            per_part = {}
        tournament_type = info['type']
        records.append([
            info['id'],
            info['name'],
            info['dateStart'],
            info['dateEnd'],
            tournament_type['id'],
            tournament_type['name'],
            info['season'],
            len(per_part),
            sum(per_part.values()),
        ])
    return pd.DataFrame(records, columns=header)

In [162]:
# Tournament metadata for both splits; on a key clash the test entry wins.
all_tournaments = dict(train_t)
all_tournaments.update(test_t)
t_info = parse_t_info(all_tournaments)

In [163]:
def parse_mask(mask):
    """Convert an answer-mask string into a list of integer codes.

    Every digit character becomes its integer value; ``'X'`` is encoded as 2
    and ``'?'`` as 3. Any other non-digit character raises ``ValueError``.
    """
    normalized = mask.replace('X', '2').replace('?', '3')
    return [int(symbol) for symbol in normalized]

In [164]:
# Drop rows whose team has no answer mask. `.copy()` makes each result an independent
# frame, so the column assignments that follow do not write into a slice of the
# original frame (the cause of pandas' SettingWithCopyWarning).
train_data = train_data[train_data.team_mask.notna()].copy()
test_data = test_data[test_data.team_mask.notna()].copy()

In [177]:
# For each row add the decoded answer mask (`mask_parse`) and the matching list of
# zero-based question indices (`ques_range`), so both can be exploded together later.
for frame in (train_data, test_data):
    frame['mask_parse'] = frame.team_mask.map(parse_mask)
    frame['ques_range'] = frame.mask_parse.map(lambda answers: list(range(len(answers))))

In [178]:
# Turn every (player, tournament) row into one row per question by exploding the two
# parallel list columns together. `ignore_index=True` gives the result a fresh
# 0..n-1 index; without it every exploded row keeps the label of its source row,
# leaving heavily duplicated index labels for all later index-aligned operations.
train_data = train_data.explode(column=['mask_parse', 'ques_range'], ignore_index=True)
test_data = test_data.explode(column=['mask_parse', 'ques_range'], ignore_index=True)

In [195]:
# Cast the per-question answer codes to integers.
train_data['mask_parse'] = train_data['mask_parse'].astype(int)
test_data['mask_parse'] = test_data['mask_parse'].astype(int)

In [200]:
# Keep only questions answered 0 or 1; codes 2 and 3 are what parse_mask assigns
# to 'X' and '?' respectively.
train_data = train_data.loc[train_data['mask_parse'].lt(2)]
test_data = test_data.loc[test_data['mask_parse'].lt(2)]

In [201]:
# Row/column counts of the per-question tables after filtering.
train_data.shape, test_data.shape

((20910740, 14), (4469664, 14))

In [223]:
# Level 1 — per (tournament, team, question): mean rating of the team's players and
# the mean of the answer code.
team_question_keys = ['game_id', 'team_id', 'ques_range']
ques_info_1 = (
    train_data[team_question_keys + ['player_rating', 'player_usedRating', 'mask_parse']]
    .groupby(team_question_keys)
    .mean()
    .reset_index()
    .rename(columns={
        'player_rating': 'mean_player_rating',
        'player_usedRating': 'mean_player_usedRating',
    })
)

# Team-level rows without the team id, de-duplicated. Teams that share the same mean
# ratings and answer on a question therefore collapse into a single row before the
# averages below are taken.
# NOTE(review): confirm this de-duplication is intended — it changes the weights of the means.
team_level = ques_info_1[
    ['game_id', 'ques_range', 'mean_player_rating', 'mean_player_usedRating', 'mask_parse']
].drop_duplicates()

# Level 2 — per (tournament, question): averages over the team-level rows.
ques_info_2 = (
    team_level
    .groupby(['game_id', 'ques_range'])
    .mean()
    .reset_index()
    .rename(columns={
        'mean_player_rating': 'mean_mean_player_rating',
        'mean_player_usedRating': 'mean_mean_player_usedRating',
        'mask_parse': 'mean_mask_parse',
    })
)

# Level 3 — per (tournament, question, answer): the same averages, split by answer.
ques_info_3 = (
    team_level
    .groupby(['game_id', 'ques_range', 'mask_parse'])
    .mean()
    .reset_index()
    .rename(columns={
        'mean_player_rating': 'mask_mean_mean_player_rating',
        'mean_player_usedRating': 'mask_mean_mean_player_usedRating',
    })
)



In [224]:
# Row count before the feature merges, for comparison with the merged table.
train_data.shape

(20910740, 14)

In [225]:
# Attach the three levels of aggregated question features to every training row
# (inner joins on the shared keys).
question_keys = ['game_id', 'ques_range']
train_data_feat = train_data.merge(
    ques_info_1, on=['game_id', 'team_id', 'ques_range', 'mask_parse']
)
train_data_feat = train_data_feat.merge(ques_info_2, on=question_keys)
train_data_feat = train_data_feat.merge(ques_info_3, on=question_keys + ['mask_parse'])

In [313]:
# Columns available for modelling.
# NOTE(review): the saved output already lists 'predict', which is only assigned in a
# later cell — the notebook appears to have been executed out of order.
train_data_feat.columns

Index(['game_id', 'player_id', 'player_name', 'player_surname',
       'player_patronymic', 'player_rating', 'player_usedRating',
       'player_flag', 'team_id', 'team_position', 'team_questionsTotal',
       'team_mask', 'mask_parse', 'ques_range', 'mean_player_rating',
       'mean_player_usedRating', 'mean_mean_player_rating',
       'mean_mean_player_usedRating', 'mean_mask_parse',
       'mask_mean_mean_player_rating', 'mask_mean_mean_player_usedRating',
       'predict'],
      dtype='object')

In [314]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): several of these features are derived from the label itself —
# 'mask_mean_mean_*' come from ques_info_3, which is grouped by and merged on
# mask_parse (the target fitted below), and 'mean_mask_parse' averages that same
# target. 'team_position' and 'team_questionsTotal' are tournament outcomes as well.
# This likely explains the very high training scores; confirm it is intended.
feature_columns = [
    'player_usedRating',
    'team_position',
    'team_questionsTotal',
    'mean_player_rating',
    'mean_player_usedRating',
    'mean_mean_player_rating',
    'mean_mean_player_usedRating',
    'mean_mask_parse',
    'mask_mean_mean_player_rating',
    'mask_mean_mean_player_usedRating',
]
x = train_data_feat[feature_columns]

# Standardise every feature to zero mean and unit variance.
scaler = StandardScaler()
x_s = scaler.fit_transform(x)

In [315]:
# Class balance of the target (0 vs 1 answer codes).
train_data_feat.mask_parse.value_counts()

0    11821102
1     9089638
Name: mask_parse, dtype: int64

In [316]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the scaled features with mask_parse as the target.
model = LogisticRegression(random_state=0, max_iter=1000)
clf = model.fit(x_s, train_data_feat['mask_parse'])

In [317]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Both scores are computed on the same rows the model was fitted on.
y_true = train_data_feat['mask_parse']
print(accuracy_score(y_true, clf.predict(x_s)))
print(roc_auc_score(y_true, clf.predict_proba(x_s)[:, 1]))

0.963722852467201
0.9934120258378619


In [318]:
# Store the predicted probability of class 1 for every training row.
class_probabilities = clf.predict_proba(x_s)
train_data_feat['predict'] = class_probabilities[:, 1]

In [319]:
# Per-player score: the mean predicted probability over all of the player's training
# rows, sorted from highest to lowest.
# Grouping by `player_id` alone guarantees exactly one row per player. Grouping by
# (id, name, surname) silently drops players with a missing name component (groupby
# discards NaN keys by default) and yields several rows for an id stored with more
# than one spelling, which would duplicate rows in the later left merge on `player_id`.
rating = (
    train_data_feat
    .groupby('player_id', as_index=False)
    .agg(
        player_name=('player_name', 'first'),
        player_surname=('player_surname', 'first'),
        predict=('predict', 'mean'),
    )
    .sort_values(by='predict', ascending=False)
)

Сначала попробуем отранжировать команды по среднему рейтингу их игроков. Если рейтинг игрока неизвестен (игрок не встречался в обучающей выборке), заменяем его на 0.34 (средний рейтинг).

In [320]:
# Distinct test players that have no entry in the rating table.
rated_players = set(rating.player_id)
unrated_players = set(test_data.player_id) - rated_players
print('Кол-во игроков с неизвестным рейтингом -', len(unrated_players))

Кол-во игроков с неизвестным рейтингом - 4382


In [321]:
# Attach each player's score to the test rows; players without a score get 0.34.
player_scores = rating[['player_id', 'predict']]
test_data_predict = test_data.merge(player_scores, how='left', on='player_id')
test_data_predict = test_data_predict.fillna({'predict': 0.34})

In [322]:
# Team score per tournament: the mean player score (and the mean of team_position)
# over the team's rows, ordered by tournament and score, both descending.
team_rows = test_data_predict.loc[:, ['game_id', 'team_id', 'team_position', 'predict']]
test_data_team_rating = (
    team_rows
    .groupby(['game_id', 'team_id'])
    .mean()
    .reset_index()
    .sort_values(['game_id', 'predict'], ascending=False)
)

In [323]:
# Predicted place inside each tournament: 1 for the first row of the group (rows are
# already ordered by descending score), 2 for the next, and so on.
order_within_game = test_data_team_rating.groupby('game_id').cumcount()
test_data_team_rating['position_predict'] = order_within_game + 1

In [324]:
import scipy.stats as stats

# Kendall rank correlation between actual and predicted places, computed over the
# rows of all test tournaments pooled together.
# NOTE(review): a per-tournament correlation averaged over tournaments may be the
# intended metric — confirm.
actual_position = test_data_team_rating['team_position']
predicted_position = test_data_team_rating['position_predict']
tau, _ = stats.kendalltau(actual_position, predicted_position)
print(tau)

0.7240688317678857


In [325]:
# Spearman rank correlation between actual and predicted places, pooled over all
# test tournaments.
spearman_result = stats.spearmanr(
    test_data_team_rating['team_position'],
    test_data_team_rating['position_predict'],
)
cor, _ = spearman_result
print(cor)

0.8859030730309484
