In [1]:
import pathlib
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import linear_model
from scipy import sparse, stats
from tqdm import tqdm
import torch
import shutil

                the kernel may be left running.  Please let us know
                about your system (bitness, Python, etc.) at
                ipython-dev@scipy.org


In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell execution and edits to local project modules
# are picked up without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [3]:
from rating_model import TeamResults
from rating_model import PICKLE_PROTOCOL
from rating_model import EMRatingModel, get_player_skills, estimate_rank
from utils import load_pickle, dump_pickle, optimize_dataframe_numeric_dtypes

In [4]:
# Relative location of the pickled tournaments table.
tours_datapath = pathlib.Path("data") / "pickle_data" / "tournaments-dt.pickle"

In [5]:
# Load the tournaments table. `pd.read_pickle` accepts path objects directly,
# so the path is passed as-is (same as the players table load in this notebook).
# NOTE(review): unpickling executes arbitrary code — only load trusted dumps.
tours = pd.read_pickle(tours_datapath)

In [6]:
# Relative location of the pickled players table.
players_datapath = pathlib.Path("data") / "pickle_data" / "players-dt.pickle"

In [7]:
# Load the pickled players table.
# NOTE(review): `players_info` is not referenced again in the visible part of
# this notebook — confirm it is still needed.
players_info = pd.read_pickle(players_datapath)

In [8]:
# Relative location of the pickled training team results.
team_res_datapath = pathlib.Path("data") / "team_res" / "train_team_results.pickle"

In [9]:
# Load the training team results through the project's `load_pickle` helper.
team_res = load_pickle(team_res_datapath)

In [10]:
# Count teams that lack a roster (`members` is falsy) and teams that lack
# per-question results (`mask` is falsy) across all training tournaments.
total_unknown_team_players = 0
total_unknown_answers = 0
for tour_id in team_res.tours:
    tour_teams = team_res[tour_id]
    for team_id in tour_teams:
        team = tour_teams[team_id]
        # `not x` yields a bool; adding it as 0/1 replaces the explicit branch.
        total_unknown_team_players += int(not team.members)
        total_unknown_answers += int(not team.mask)

In [11]:
# Report both counters, one item per output line.
print(
    "Количество команд без состава команды: ",
    total_unknown_team_players,
    "Количество команд с неизвестными повопроснами результатами: ",
    total_unknown_answers,
    sep="\n",
)

Количество команд без состава команды: 
69
Количество команд с неизвестными повопроснами результатами: 
173


In [12]:
# Print the schema of the tournaments table: columns, non-null counts, dtypes.
tours.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1105 entries, 4628 to 6485
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   name          1105 non-null   object             
 1   dateStart     1105 non-null   datetime64[ns, UTC]
 2   dateEnd       1105 non-null   datetime64[ns, UTC]
 3   type          1105 non-null   object             
 4   season        1015 non-null   object             
 5   orgcommittee  1105 non-null   object             
 6   synchData     669 non-null    object             
 7   questionQty   1105 non-null   object             
dtypes: datetime64[ns, UTC](2), object(6)
memory usage: 77.7+ KB


In [13]:
# Cache file for the per-answer player table; make sure its directory exists.
players_dump = pathlib.Path("dump") / "players.pickle"
players_dump.parent.mkdir(parents=True, exist_ok=True)

In [14]:
# Set to True to rebuild the player table even if a cached dump already exists.
force_get_players = False

In [15]:
# Placeholder; the actual table is assigned by the cache-or-build cell that follows.
players = None

In [16]:
# Reuse the cached player table when it exists and a rebuild is not forced;
# otherwise build it from the team results and cache it.
if players_dump.exists() and not force_get_players:
    players = load_pickle(players_dump)
else:
    players = team_res.to_player_dataframe(filter_by_mask=True)
    # Return value is ignored, so this presumably shrinks dtypes in place — TODO confirm.
    optimize_dataframe_numeric_dtypes(players)
    dump_pickle(players_dump, players)

In [17]:
# Sanity check: the row index must be non-decreasing.
# NOTE(review): presumably later steps rely on this row order — confirm.
assert players.index.is_monotonic_increasing

In [18]:
# Print the schema and memory footprint of the per-answer player table.
players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17823516 entries, 0 to 17823515
Data columns (total 5 columns):
 #   Column           Dtype
---  ------           -----
 0   tour_id          int16
 1   team_id          int32
 2   player_id        int32
 3   answer_id        int32
 4   is_right_answer  bool 
dtypes: bool(1), int16(1), int32(3)
memory usage: 255.0 MB


In [19]:
# Preview the first rows of the player table.
players.head()

Unnamed: 0,tour_id,team_id,player_id,answer_id,is_right_answer
0,4772,45556,6212,0,True
1,4772,45556,6212,1,True
2,4772,45556,6212,2,True
3,4772,45556,6212,3,True
4,4772,45556,6212,4,True


In [20]:
# Training target: the `is_right_answer` flag of every row as an int8 array.
target = players["is_right_answer"].to_numpy(dtype=np.int8)

## Построение логистической регрессии для ранжирования игроков

In [21]:
# Class balance of the target: share of rows with and without a right answer.
players["is_right_answer"].value_counts(normalize=True)

False    0.562729
True     0.437271
Name: is_right_answer, dtype: float64

In [22]:
# dtype handed to the one-hot encoders that build the feature matrix.
feature_dtype = np.float64

In [23]:
# One-hot encoder for player ids (fitted on the `player_id` column further down).
# NOTE(review): the name has a typo ("skils"); renaming it would require
# updating every cell that references it.
skils_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [24]:
# Fit the encoder on player ids and encode every row. The encoder expects a
# 2-D input, which the single-column frame selection provides as shape (n_rows, 1).
skils_features_one_hot = skils_encoder.fit_transform(players[["player_id"]].to_numpy())

In [25]:
# One-hot encoder for the `answer_id` column (fitted in the next cell).
questione_complex_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [26]:
# Fit the encoder on answer ids and encode every row. The single-column frame
# selection yields the 2-D (n_rows, 1) input the encoder expects.
question_complices_one_hot = questione_complex_encoder.fit_transform(
    players[["answer_id"]].to_numpy())

In [27]:
# Final design matrix: player one-hot columns followed by answer one-hot
# columns, stored as CSR.
features = sparse.hstack(
    [skils_features_one_hot, question_complices_one_hot],
    format="csr",
)
# Drop the two intermediate matrices so their names no longer hold the memory.
del skils_features_one_hot, question_complices_one_hot

In [28]:
# Guard: the stacked feature matrix must be in CSR format.
assert sparse.isspmatrix_csr(features)

In [29]:
# Display the matrix summary (shape, dtype, number of stored elements).
features

<17823516x91209 sparse matrix of type '<class 'numpy.float64'>'
	with 35647032 stored elements in Compressed Sparse Row format>

In [30]:
# Directory that holds the cached logistic-regression model.
dump_model_filepath = pathlib.Path("model") / "log-reg"

In [31]:
# Create the model directory (and any missing parents); no error if it already exists.
dump_model_filepath.mkdir(parents=True, exist_ok=True)

In [32]:
# Set to True to retrain the logistic regression even if a cached model exists.
force_train = False

In [33]:
# File inside the model directory where the fitted regression is cached.
model_dump = dump_model_filepath.joinpath("log-reg.pickle")

In [34]:
# Placeholder; stays None until a cached model is loaded or a new one is trained.
regression = None

In [35]:
# Reuse the cached model from disk unless retraining is forced. The flag is
# checked first, so the filesystem is not queried when retraining is forced.
# NOTE(review): `load_pickle` presumably unpickles the file — only load dumps
# produced by this notebook.
if not force_train and model_dump.exists():
    regression = load_pickle(model_dump)

In [36]:
# Train from scratch only when no cached model was loaded above.
if regression is None:
    regression = linear_model.LogisticRegression(
        penalty="l1",
        C=100,
        solver="liblinear",
        verbose=2,
        # NOTE(review): 10 iterations is a small budget; check the solver's
        # convergence output before trusting the coefficients.
        max_iter=10,
        fit_intercept=True,
        # liblinear shuffles the data internally; a fixed seed makes a retrain
        # reproducible across kernel restarts.
        random_state=0,
    )
    regression.fit(features, target)
    # Cache the fitted model so later runs can skip training.
    dump_pickle(model_dump, regression)

In [37]:
# Extract per-player skills from the fitted coefficients (first row of `coef_`)
# and order them by ascending skill.
player_ratings = get_player_skills(skils_encoder, regression.coef_[0]).sort_values(
    "skill", ascending=True)

In [38]:
# Show the ten players with the highest estimated skill.
player_ratings.nlargest(10, "skill")

Unnamed: 0,skill
27403,3.102096
4270,2.965024
28751,2.912417
27822,2.824027
30270,2.752742
40411,2.749181
30152,2.73985
38175,2.679079
20691,2.642352
199963,2.597687


## Оценка результатов

Для сравнения команд воспользуемся простым правилом: для каждой команды в турнире упорядочим её игроков по убыванию силы, а затем отсортируем команды в лексикографическом порядке по получившимся наборам сил игроков.

In [39]:
# Relative location of the pickled test team results.
test_team_res_datapath = pathlib.Path("data") / "team_res" / "test_team_results.pickle"

In [40]:
# Read the test team results from their pickle dump.
# NOTE(review): unpickling executes arbitrary code — only load trusted dumps.
with test_team_res_datapath.open("rb") as dump_file:
    team_res_test = pickle.load(dump_file)

In [41]:
# Build the per-tournament team rating table for the test tournaments.
test_team_ratings = team_res_test.to_team_rating_by_tour()

In [42]:
# Keep only rows without missing values.
test_team_ratings = test_team_ratings.dropna(axis="index")

In [43]:
# Print the schema of the test rating table after dropping incomplete rows.
test_team_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21062 entries, 0 to 21578
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   tour_id      21062 non-null  int64  
 1   members      21062 non-null  object 
 2   team_id      21062 non-null  int64  
 3   tour_rating  21062 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 822.7+ KB


In [44]:
# Order rows by tournament, then by the team's rating within the tournament.
test_team_ratings = test_team_ratings.sort_values(["tour_id", "tour_rating"])

In [45]:
# Preview the first ten rows of the sorted test rating table.
test_team_ratings.head(n=10)

Unnamed: 0,tour_id,members,team_id,tour_rating
0,4957,"(30152, 30270, 27822, 28751, 27403, 4270)",49804,1.0
1,4957,"(34936, 40877, 25177, 113703, 33792, 107161)",4109,2.0
2,4957,"(33620, 21346, 13857, 46339, 37836, 19632)",3875,3.0
3,4957,"(32901, 28689, 19541, 13689, 9801, 18194)",77418,4.0
4,4957,"(6482, 34846, 36120, 32458, 25882, 30475)",2,5.5
5,4957,"(23178, 19915, 10695, 74382, 26911)",45367,5.5
6,4957,"(16609, 35857, 57018, 34476, 48340, 66471)",4622,7.5
7,4957,"(52183, 93424, 29425, 13345, 34417)",4174,7.5
8,4957,"(21233, 183104, 34395, 9342, 143822, 24006, 10...",75600,9.0
9,4957,"(27009, 54574, 88505, 30236, 105369, 116628)",59580,12.0


### Коэффициент ранговой корреляции Кендалла

In [46]:
# Score the logistic-regression player ratings against the test team ratings.
# The result is later passed to the EM model as the baseline to compare with.
baseline = estimate_rank(test_team_ratings, player_ratings)

In [47]:
# Display the baseline rank-correlation scores.
baseline

{'Kendall': 0.5806720044580711, 'Spearman': 0.7448398338463774}

In [48]:
# Drop the ratings frame; it is not used after the baseline has been computed.
del player_ratings

## EM алгоритм

Рассмотрим ответы команды на вопросы. Если команда $t$ ответила на вопрос $q$, то это означает, что хотя бы один игрок команды ответил на этот вопрос. Если команда не ответила на вопрос, то это означает, что ни один её игрок не ответил на этот вопрос.

Таким образом, введём скрытые переменные $h_{i,q}$: $h_{i,q} = 1$, если игрок под номером $i$ ответил на вопрос $q$, и $h_{i,q} = 0$ иначе. Они связаны с наблюдаемым результатом команды $x_{t,q}$ следующим соотношением:
$$
x_{t,q} = 
\begin{cases}
0, & \text{если } h_{i,q} = 0 \text{ для всех игроков } i \text{ в команде } t,\\
1, & \text{если } h_{i,q} = 1 \text{ хотя бы для одного игрока } i \text{ в команде } t.
\end{cases}
$$

Тогда вероятность $p\left(h_{i,q} = 1 \mid s_i, c_q\right) = \sigma\left(b + s_i + c_q\right)$, где $s_i$ — сила игрока $i$, $c_q$ — сложность вопроса $q$, $b \in \mathbb{R}$ — глобальное смещение. То есть условную вероятность будем моделировать с помощью сигмоиды.

Рассмотрим итерацию EM-алгоритма с номером $m \geq 0$.

## E-шаг

$$
\mathrm{M} \left[ h^{(m+1)}_{i,q} \right] = 
\begin{cases}
0, & \text{если } x_{t,q} = 0,\\
p\left( h^{(m)}_{i,q} = 1 \mid \exists j \in t\colon h^{(m)}_{j,q} = 1\right) =
\dfrac{\sigma \left(b^{(m)} + s^{(m)}_i + c^{(m)}_q\right)}{1-\prod\limits_{k \in t} \left(1 - \sigma\left(b^{(m)} + s^{(m)}_k + c^{(m)}_q\right)\right)}, & \text{если } x_{t,q} = 1.
\end{cases}
$$

## М-шаг

Происходит обучение логистической регрессии при известных $\mathrm{M} \left[ h^{(m+1)}_{i,q} \right]$ и уточнение параметров:
$$
\mathrm{M} \left[ h^{(m+1)}_{i,q} \right] \sim \sigma\left(b^{(m+1)} + s^{(m+1)}_i + c^{(m+1)}_q\right)
$$


Пусть параметры модели составляют вектор $w = \left(s_1,s_2,\ldots,s_P, c_1, c_2, \ldots, c_A, b \right)^T$, где $P$ — общее число игроков, $A$ — общее число вопросов.

In [49]:
# Convert the CSR feature matrix to COO format for the EM model.
# copy=False allows SciPy to share the underlying data with `features`.
coo_features = features.tocoo(copy=False)

In [50]:
# Display the summary of the COO matrix (shape, dtype, stored elements).
coo_features

<17823516x91209 sparse matrix of type '<class 'numpy.float64'>'
	with 35647032 stored elements in COOrdinate format>

In [56]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [57]:
# Start from an empty log directory: any logs left by a previous run are
# removed before the directory is recreated.
log_dir = pathlib.Path("em_train") / "logs"
if log_dir.exists():
    shutil.rmtree(log_dir)
log_dir.mkdir(parents=True, exist_ok=True)

# EM rating model configuration.
# NOTE(review): `em_num_iter` and `log_reg_num_iter` presumably set the number
# of outer EM iterations and of inner regression steps — confirm in rating_model.
em_model = EMRatingModel(
    em_num_iter=6,
    lr=1e-4,
    log_reg_num_iter=20,
    device=device,
    log_dir=log_dir,
)

2021-04-03 23:54:59,649 INFO em_algo __init__ Will train logistic regression on cuda


In [58]:
# Run the EM training on the full feature matrix.
# NOTE(review): the fitted regression's coefficients and intercept are passed
# last, presumably as the starting point for the EM parameters — confirm.
em_model.fit(
    coo_features,
    target,
    players,
    skils_encoder,
    test_team_ratings,
    baseline,
    regression.coef_,
    regression.intercept_,
)

2021-04-03 23:55:10,024 INFO em_algo _build_player_team_round_indices Building mask for zeroing hidden variables
2021-04-03 23:55:10,041 INFO em_algo _build_player_team_round_indices Use 100000 as base value for grouping


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80385/80385 [00:06<00:00, 11542.71it/s]
M step:   0%|                                                                                                                        | 0/6 [00:06<?, ?it/s]
Train logistic regression:   0%|                                                                                                    | 0/20 [00:00<?, ?it/s][A
Train logistic regression:   0%|                                                          | 0/20 [00:00<?, ?it/s, Binary cross entropy=0.464, MAE=0.000149][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:00<00:12,  1.55it/s, Binary cross entropy=0.464, MAE=0.000149][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:01<00:12,  1.55it/s, Binary cross entropy=0.464, MAE=0.000167][A
Train logistic regression:  10%|█████               

2021-04-03 23:56:00,665 INFO em_algo _validate Absolute difference relative to baseline:
2021-04-03 23:56:00,666 INFO em_algo _validate Kendall +0.000052
2021-04-03 23:56:00,667 INFO em_algo _validate Spearman +0.000076
2021-04-03 23:56:00,677 INFO em_algo _validate Corr coefficients: {'Kendall': 0.5807244838116733, 'Spearman': 0.7449160097307868}


M step:  17%|██████████████████▋                                                                                             | 1/6 [00:29<01:47, 21.52s/it]
Train logistic regression:   0%|                                                                                                    | 0/20 [00:00<?, ?it/s][A
Train logistic regression:   0%|                                                          | 0/20 [00:00<?, ?it/s, Binary cross entropy=0.464, MAE=0.000149][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:00<00:12,  1.51it/s, Binary cross entropy=0.464, MAE=0.000149][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:01<00:12,  1.51it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████                                             | 2/20 [00:01<00:11,  1.55it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████            

2021-04-03 23:56:23,146 INFO em_algo _validate Absolute difference relative to baseline:
2021-04-03 23:56:23,148 INFO em_algo _validate Kendall +0.000054
2021-04-03 23:56:23,151 INFO em_algo _validate Spearman +0.000082
2021-04-03 23:56:23,153 INFO em_algo _validate Absolute difference relative to previous params:
2021-04-03 23:56:23,155 INFO em_algo _validate Kendall +0.000002
2021-04-03 23:56:23,156 INFO em_algo _validate Spearman +0.000006
2021-04-03 23:56:23,158 INFO em_algo _validate Corr coefficients: {'Kendall': 0.5807263445137043, 'Spearman': 0.7449217629397125}


M step:  33%|█████████████████████████████████████▎                                                                          | 2/6 [00:50<01:28, 22.09s/it]
Train logistic regression:   0%|                                                                                                    | 0/20 [00:00<?, ?it/s][A
Train logistic regression:   0%|                                                          | 0/20 [00:00<?, ?it/s, Binary cross entropy=0.464, MAE=0.000149][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:00<00:12,  1.56it/s, Binary cross entropy=0.464, MAE=0.000149][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:01<00:12,  1.56it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████                                             | 2/20 [00:01<00:11,  1.55it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████            

2021-04-03 23:56:45,193 INFO em_algo _validate Absolute difference relative to baseline:
2021-04-03 23:56:45,196 INFO em_algo _validate Kendall +0.000056
2021-04-03 23:56:45,197 INFO em_algo _validate Spearman +0.000087
2021-04-03 23:56:45,198 INFO em_algo _validate Absolute difference relative to previous params:
2021-04-03 23:56:45,201 INFO em_algo _validate Kendall +0.000001
2021-04-03 23:56:45,205 INFO em_algo _validate Spearman +0.000005
2021-04-03 23:56:45,208 INFO em_algo _validate Corr coefficients: {'Kendall': 0.5807277897810972, 'Spearman': 0.7449268433775905}


M step:  50%|████████████████████████████████████████████████████████                                                        | 3/6 [01:12<01:06, 22.07s/it]
Train logistic regression:   0%|                                                                                                    | 0/20 [00:00<?, ?it/s][A
Train logistic regression:   0%|                                                          | 0/20 [00:00<?, ?it/s, Binary cross entropy=0.464, MAE=0.000148][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:00<00:12,  1.53it/s, Binary cross entropy=0.464, MAE=0.000148][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:01<00:12,  1.53it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████                                             | 2/20 [00:01<00:11,  1.54it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████            

2021-04-03 23:57:07,369 INFO em_algo _validate Absolute difference relative to baseline:
2021-04-03 23:57:07,370 INFO em_algo _validate Kendall +0.000038
2021-04-03 23:57:07,372 INFO em_algo _validate Spearman +0.000080
2021-04-03 23:57:07,375 INFO em_algo _validate Absolute difference relative to previous params:
2021-04-03 23:57:07,377 INFO em_algo _validate Kendall -0.000018
2021-04-03 23:57:07,379 INFO em_algo _validate Spearman -0.000007
2021-04-03 23:57:07,382 INFO em_algo _validate Corr coefficients: {'Kendall': 0.5807100811371994, 'Spearman': 0.7449199182100034}


M step:  67%|██████████████████████████████████████████████████████████████████████████▋                                     | 4/6 [01:35<00:44, 22.11s/it]
Train logistic regression:   0%|                                                                                                    | 0/20 [00:00<?, ?it/s][A
Train logistic regression:   0%|                                                          | 0/20 [00:00<?, ?it/s, Binary cross entropy=0.464, MAE=0.000148][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:00<00:12,  1.55it/s, Binary cross entropy=0.464, MAE=0.000148][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:01<00:12,  1.55it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████                                             | 2/20 [00:01<00:11,  1.55it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████            

2021-04-03 23:57:28,979 INFO em_algo _validate Absolute difference relative to baseline:
2021-04-03 23:57:28,980 INFO em_algo _validate Kendall +0.000038
2021-04-03 23:57:28,981 INFO em_algo _validate Spearman +0.000082
2021-04-03 23:57:28,982 INFO em_algo _validate Absolute difference relative to previous params:
2021-04-03 23:57:28,982 INFO em_algo _validate Kendall -0.000000
2021-04-03 23:57:28,983 INFO em_algo _validate Spearman +0.000002
2021-04-03 23:57:28,984 INFO em_algo _validate Corr coefficients: {'Kendall': 0.5807097689527164, 'Spearman': 0.7449223103869334}


M step:  83%|█████████████████████████████████████████████████████████████████████████████████████████████▎                  | 5/6 [01:56<00:21, 21.93s/it]
Train logistic regression:   0%|                                                                                                    | 0/20 [00:00<?, ?it/s][A
Train logistic regression:   0%|                                                          | 0/20 [00:00<?, ?it/s, Binary cross entropy=0.464, MAE=0.000148][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:00<00:12,  1.53it/s, Binary cross entropy=0.464, MAE=0.000148][A
Train logistic regression:   5%|██▌                                               | 1/20 [00:01<00:12,  1.53it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████                                             | 2/20 [00:01<00:11,  1.55it/s, Binary cross entropy=0.464, MAE=0.000166][A
Train logistic regression:  10%|█████            

2021-04-03 23:57:50,784 INFO em_algo _validate Absolute difference relative to baseline:
2021-04-03 23:57:50,785 INFO em_algo _validate Kendall +0.000042
2021-04-03 23:57:50,786 INFO em_algo _validate Spearman +0.000086
2021-04-03 23:57:50,789 INFO em_algo _validate Absolute difference relative to previous params:
2021-04-03 23:57:50,792 INFO em_algo _validate Kendall +0.000004
2021-04-03 23:57:50,793 INFO em_algo _validate Spearman +0.000003
2021-04-03 23:57:50,796 INFO em_algo _validate Corr coefficients: {'Kendall': 0.5807141753648156, 'Spearman': 0.7449254155011309}


M step: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [02:11<00:00, 21.94s/it]
