In [1]:
import pathlib
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import linear_model
from scipy import sparse, stats
from tqdm import tqdm
from itertools import repeat
from numba import jit

import torch

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org


In [2]:
%load_ext autoreload

%autoreload 2

In [3]:
from rating_model import TeamResults
from rating_model import PICKLE_PROTOCOL
from rating_model import EMRatingModel

In [4]:
tours_datapath = pathlib.Path("data", "pickle_data", "tournaments-dt.pickle")

In [5]:
tours = pd.read_pickle(str(tours_datapath))

In [6]:
players_datapath = pathlib.Path("data", "pickle_data", "players-dt.pickle")

In [7]:
players_info = pd.read_pickle(players_datapath)

In [8]:
team_res_datapath = pathlib.Path(
    "data", "team_res", "train_team_results.pickle")

In [9]:
with open(team_res_datapath, "rb") as dump_file:
    team_res = pickle.load(dump_file)

In [10]:
total_unknown_team_players = 0
total_unknown_answers = 0
for tour_id in team_res.tours:
    for team_id in team_res[tour_id]:
        team = team_res[tour_id][team_id]
        if not team.members:
            total_unknown_team_players += 1
        if not team.mask:
            total_unknown_answers += 1

In [11]:
print("Количество команд без состава команды: ", total_unknown_team_players,
      "Количество команд с неизвестными повопроснами результатами: ", total_unknown_answers, sep="\n")

Количество команд без состава команды: 
109
Количество команд с неизвестными повопроснами результатами: 
173


In [12]:
tours.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1105 entries, 4628 to 6485
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   name          1105 non-null   object             
 1   dateStart     1105 non-null   datetime64[ns, UTC]
 2   dateEnd       1105 non-null   datetime64[ns, UTC]
 3   type          1105 non-null   object             
 4   season        1015 non-null   object             
 5   orgcommittee  1105 non-null   object             
 6   synchData     669 non-null    object             
 7   questionQty   1105 non-null   object             
dtypes: datetime64[ns, UTC](2), object(6)
memory usage: 77.7+ KB


In [13]:
players = team_res.to_player_dataframe(filter_by_mask=True)

Convert to dataframe: 100%|██████████████████████████████████████████████████████████| 682/682 [01:50<00:00,  6.16it/s]


In [14]:
assert players.index.is_monotonic_increasing

In [15]:
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17949880 entries, 0 to 17949879
Data columns (total 5 columns):
 #   Column           Dtype
---  ------           -----
 0   tour_id          int64
 1   team_id          int64
 2   player_id        int64
 3   answer_id        int64
 4   is_right_answer  bool 
dtypes: bool(1), int64(4)
memory usage: 564.9 MB


In [16]:
players.head()

Unnamed: 0,tour_id,team_id,player_id,answer_id,is_right_answer
0,4772,45556,6212,0,True
1,4772,45556,6212,1,True
2,4772,45556,6212,2,True
3,4772,45556,6212,3,True
4,4772,45556,6212,4,True


In [17]:
target = players["is_right_answer"].astype(np.int8).to_numpy()

In [18]:
# players["tour_team_id"] = (players["tour_id"].astype(str) + " " + players["team_id"].astype(str)).factorize()[0]
# player_indices_in_team_by_round = []

# for group, data in players.groupby("tour_team_id"):
#     player_indices_in_team_by_round.append(data.index.to_list())

# max_length = len(max(player_indices_in_team_by_round, key=len))
# # pad_index это фейковый индекс и нужен только для того чтобы использовать функцию np.take
# # Значение по этому индексу всегда равно 0
# PAD_INDEX = len(target)
# for i in range(len(player_indices_in_team_by_round)):
#     indices = player_indices_in_team_by_round[i]
#     if len(indices) < max_length:
#         zeroing_mask[indices] = 1
#         player_indices_in_team_by_round[i].extend(repeat(PAD_INDEX, max_length - len(indices)))

In [19]:
# players.drop("tour_team_id", axis="columns", inplace=True)

In [20]:
# player_indices_in_team_by_round = np.array(player_indices_in_team_by_round)

## Построение логистической регрессии для ранжирования игроков

In [21]:
players["is_right_answer"].value_counts(normalize=True)

False    0.568645
True     0.431355
Name: is_right_answer, dtype: float64

In [22]:
feature_dtype = np.float64

In [23]:
skils_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [24]:
skils_features = skils_encoder.fit_transform(
    players["player_id"].to_numpy().reshape(-1, 1))

In [25]:
questione_complex_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [26]:
questions_complex = questione_complex_encoder.fit_transform(
    players["answer_id"].to_numpy().reshape(-1, 1))

In [27]:
features = sparse.hstack((skils_features, questions_complex))
features = sparse.csr_matrix(features)
del skils_features
del questions_complex

In [28]:
features

<17949880x91104 sparse matrix of type '<class 'numpy.float64'>'
	with 35899760 stored elements in Compressed Sparse Row format>

In [29]:
dump_model_filepath = pathlib.Path("model", "log-reg")

In [30]:
dump_model_filepath.mkdir(parents=True, exist_ok=True)

In [31]:
force_train = False

In [32]:
model_dump = dump_model_filepath / "log-reg.pickle"

In [33]:
regression = None

In [34]:
if not force_train and model_dump.exists():
    with open(model_dump, "rb") as dump_file:
        regression = pickle.load(dump_file)

In [35]:
if regression is None:
    regression = linear_model.LogisticRegression(
        penalty="none", verbose=2, max_iter=200)
    regression.fit(features, target)

In [36]:
with open(model_dump, "wb") as dump_file:
    pickle.dump(regression,  dump_file, protocol=PICKLE_PROTOCOL)

In [37]:
def get_rating(skill_encoder, coefs) -> pd.DataFrame:
    rows = []
    all_players_ids = skill_encoder.categories_[0]
    for player_id in all_players_ids:
        rows.append({"player_id": player_id, "skill": coefs[np.where(
            all_players_ids == player_id)[0][0]]})
    return pd.DataFrame.from_records(rows, index="player_id",)

In [38]:
player_ratings = get_rating(skils_encoder, regression.coef_[0])

In [39]:
player_ratings.sort_values("skill", inplace=True)

In [40]:
player_ratings.nlargest(5, "skill")

Unnamed: 0_level_0,skill
player_id,Unnamed: 1_level_1
27403,4.053972
4270,3.904938
28751,3.768449
30260,3.658079
30152,3.650698


## Оценка результатов

Для сравнение команд воспользуемся простым правилом. Для каждой команды в турнире возьмём игроков в отсортированном по убыванию силе игроков и отсортируем команды в лексикографическом порядке.

In [41]:
test_team_res_datapath = pathlib.Path(
    "data", "team_res", "test_team_results.pickle")

In [42]:
with open(test_team_res_datapath, "rb") as dump_file:
    team_res_test = pickle.load(dump_file)

In [43]:
test_team_ratings = team_res_test.to_team_rating_by_tour()

In [44]:
test_team_ratings.dropna(axis="index", inplace=True)

In [45]:
test_team_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21914 entries, 0 to 22430
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   tour_id      21914 non-null  int64  
 1   members      21914 non-null  object 
 2   team_id      21914 non-null  int64  
 3   tour_rating  21914 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 856.0+ KB


In [46]:
test_team_ratings.sort_values(["tour_id", "tour_rating"], inplace=True)

In [47]:
test_team_ratings.head()

Unnamed: 0,tour_id,members,team_id,tour_rating
0,4957,"(30152, 30270, 27822, 28751, 27403, 4270)",49804,1.0
1,4957,"(34936, 40877, 25177, 113703, 33792, 107161)",4109,2.0
2,4957,"(33620, 21346, 13857, 46339, 37836, 19632)",3875,3.0
3,4957,"(32901, 28689, 19541, 13689, 9801, 18194)",77418,4.0
4,4957,"(6482, 34846, 36120, 32458, 25882, 30475)",2,5.5


In [48]:
def player2ratings(players_id, player_ratings):
    ratings = []
    for player_id in players_id:
        try:
            ratings.append(player_ratings.loc[player_id, "skill"])
        except KeyError:
            pass
    ratings.sort(reverse=True)
    return tuple(ratings)

In [49]:
def rank_teams(teams, player_skills):
    ranking_teams = teams.copy()
    ranking_teams["player_skils"] = ranking_teams["members"].apply(
        lambda x: player2ratings(x, player_skills))
    ranking_teams.sort_values("player_skils", ascending=False, inplace=True)
    ranking_teams.drop("player_skils", axis="columns", inplace=True)
    return ranking_teams

In [50]:
def estimate_rank(team_res, player_ratings):
    kendall_values = []
    for tour_id, teams in team_res.groupby("tour_id"):
        new_teams = teams[["members", "tour_rating"]].copy()
        new_teams.reset_index(inplace=True)
        original_order = new_teams.index.to_numpy()
        new_teams = rank_teams(new_teams, player_ratings)
        rank_order = new_teams.index.to_numpy()
        kendall_values.append(stats.kendalltau(original_order, rank_order)[0])
    return np.nanmean(kendall_values)

### Коэффициент ранговой корреляции Кендалла

In [51]:
estimate_rank(test_team_ratings, player_ratings)

0.5752683352089869

In [52]:
del test_team_ratings
del player_ratings

## EM алгоритм

Рассмртрим ответы команды на вопросы. Если команда $t$ ответила на вопрос $q$, то это означает, что хотя бы один игрок ответил на вопрос. Если команда не ответила на вопрос, то это означает, что ни один игрок также не оветил на вопрос.

Таким образом введём скрытые переменные: $h_{i,q}$- игорок под номером $i$ ответил на вопрос $q$. Они связаны с $x_{t,q}$ следующим соотношением:
$$
x_{t,q} = 
\begin{cases}
0, \text{ то } h_{i,q} = 0 \text{ для всех игроков в команде } t,\\
1, h_{i,q}=1 \text{ для хотя бы одного игрока в команде } t. 
\end{cases}
$$

Тогда веротяность $p\left(h_{i,q} \vert s_i, c_q\right) \sim \sigma\left(b + s_i + c_q\right), s_i-$ сила игрока $i$, $c_q-$ сложность вопроса, $b \in \mathbb{R}-$ глобальное смещение. Условную веротяность будем моделировать с помощью сигмоиды.

Рассмотрим итерация EM-алгоритма для $m \geq 0$.

## E-шаг

$$
\mathrm{M} \left[ h^{(m+1)}_{i,q} \right] = 
\begin{cases}
0, x_{t,q} = 0,\\
p\left( h^{(m)}_{i,q} = 1 \vert \exists j \in t, h^{(m)}_{j,k} = 1\right) =
\dfrac{\sigma \left(b^{(m)} + s^{(m)}_i + c^{(m)}_q\right)}{1-\prod\limits_{k \in t} \left(1 - \sigma\left(b^{(m)} + s^{(m)}_k + c^{(m)}_q\right)\right)}, \text{ если } x_{t,q} = 1.
\end{cases}
$$

## М-шаг

Происходит обучение обучение логистичексой регрессии при известных $\mathrm{M} \left[ h^{(m+1)}_{i,q} \right]$ и уточнение параметров:
$$
\mathrm{M} \left[ h^{(m+1)}_{i,q} \right] \sim \sigma\left(b^{(m+1)} + s^{(m+1)}_k + c^{(m+1)}_q\right)
$$


Пусть параметры модели образуют составляют вектор $w = \left(s_1,s_2,\ldots,s_P, c_1, c_2, \ldots, c_A, b \right)^T,$ где $P-$ общее число игроков, $A-$ общее число вопросов.

In [53]:
# def sigmoid(x):
#     return 1 / (1 + np.exp(-x))

In [54]:
# def update_hidden_values(hidden_variables, indices_player_in_one_team_by_round, predicted_proba, pad_index):
#     predicted_proba_by_groups = np.take(predicted_proba, indices_player_in_one_team_by_round)
#     predicted_proba_by_groups /= (1 - np.prod(1 - predicted_proba_by_groups, axis=1).reshape(-1, 1))
    
#     for i, index in enumerate(indices_player_in_one_team_by_round):
#         not_fake_mask = index != pad_index
#         not_fake_indices = index[not_fake_mask]
#         hidden_variables[not_fake_indices] = predicted_proba_by_groups[i, not_fake_mask]
    
#     np.nan_to_num(hidden_variables, copy=False)

In [55]:
# def expectation(hidden_variables, target, indices_player_in_one_team_by_round, features, w, b, pad_index):
#     hidden_variables.fill(0)
#     predicted_proba = (features @ w).astype(np.float32)
#     predicted_proba += b
#     # Add fake value for vectorizing idexing operations
#     predicted_proba = np.append(predicted_proba, 0)
#     update_hidden_values(hidden_variables, indices_player_in_one_team_by_round, predicted_proba, pad_index)

In [56]:
# hidden_variables = np.zeros_like(target, dtype=np.float32)

In [57]:
# expectation(hidden_variables, target, player_indices_in_team_by_round, features, regression.coef_[0].astype(np.float32), regression.intercept_.astype(np.float32), PAD_INDEX)

In [62]:
coo_features = features.tocoo(copy=False)

In [63]:
coo_features

<17949880x91104 sparse matrix of type '<class 'numpy.float64'>'
	with 35899760 stored elements in COOrdinate format>

In [64]:
model = EMRatingModel(coo_features, target, players, 3, 1e-2, 10, torch.device("cuda"))

2021-03-29 01:25:57,393 INFO em_algo _build_player_team_round_indices Build indices masks


100%|█████████████████████████████████████████████████████████████████████████| 86964/86964 [00:07<00:00, 11299.59it/s]


In [65]:
model.fit()

EM algorithm:   0%|                                                                              | 0/3 [00:00<?, ?it/s]

2021-03-29 01:27:00,879 INFO em_algo _expectation Expectation step


EM algorithm:   0%|                                                                              | 0/3 [00:02<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 548.00 MiB (GPU 0; 6.00 GiB total capacity; 4.18 GiB already allocated; 482.50 MiB free; 4.19 GiB reserved in total by PyTorch)