In [1]:
import pickle
import pandas as pd
import numpy as np
import math as mt
import torch
from torch import nn
from torch import optim
from scipy.stats import kendalltau, spearmanr
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the three pickled inputs. Each file is opened in a `with` block so its handle
# is closed deterministically instead of being left for the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code -- only load trusted files.
with open('results.pkl', 'rb') as results_file:
    results = pickle.load(results_file)
with open('players.pkl', 'rb') as players_file:
    players = pickle.load(players_file)
with open('tournaments.pkl', 'rb') as tournaments_file:
    tournaments = pickle.load(tournaments_file)

***Data example***

In [3]:
# Show one team's record (index 1) from the results stored under tournament key 22.
results[22][1]

{'team': {'id': 2, 'name': 'Афина', 'town': {'id': 201, 'name': 'Москва'}},
 'mask': '011111110101101001010111011111111111001101111101011101111101111110110101111100100001111111',
 'current': {'name': 'Афина', 'town': {'id': 201, 'name': 'Москва'}},
 'questionsTotal': 65,
 'synchRequest': None,
 'position': 2.5,
 'controversials': [],
 'flags': [],
 'teamMembers': [{'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 707,
    'name': 'Елена',
    'patronymic': 'Андреевна',
    'surname': 'Александрова'}},
  {'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 13551,
    'name': 'Вадим',
    'patronymic': 'Леонидович',
    'surname': 'Карлинский'}},
  {'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 15442,
    'name': 'Дмитрий',
    'patronymic': 'Николаевич',
    'surname': 'Коноваленко'}},
  {'flag': None,
   'usedRating': 0,
   'rating': 0,
   'player': {'id': 25882,
    'name': 'Максим',
    'patronymic': 'Оскарович',
    'sur

In [4]:
# Show the player record stored under key 1.
players[1]

{'id': 1, 'name': 'Алексей', 'patronymic': None, 'surname': 'Абабилов'}

In [5]:
# Show the tournament record stored under key 1.
tournaments[1]

{'id': 1,
 'name': 'Чемпионат Южного Кавказа',
 'dateStart': '2003-07-25T00:00:00+04:00',
 'dateEnd': '2003-07-27T00:00:00+04:00',
 'type': {'id': 2, 'name': 'Обычный'},
 'season': '/seasons/1',
 'orgcommittee': [],
 'synchData': None,
 'questionQty': None}

***START***

***1. Data preparation***

In [6]:
# Flatten the raw results into one record per (tournament, team, player).
# Tournaments that started in 2019 feed the training set, those from 2020 the test set.
train_data_list = []
test_data_list = []
for tournament, teams in results.items():
    if not teams:
        continue
    tournament_info = tournaments[tournament]
    # The year belongs to the tournament, so it is computed once here rather than per team.
    tournament_year = tournament_info["dateStart"][:4]
    if tournament_year not in ('2019', '2020'):
        continue
    is_train = tournament_year == '2019'
    if is_train:
        # questionQty can be None in the raw data; treat that as zero known questions,
        # which makes the length check below reject the tournament instead of crashing.
        number_of_questions = sum((tournament_info.get("questionQty") or {}).values())
    for team in teams:
        # Teams without a per-question answer mask cannot be used.
        if not team.get("mask", None):
            continue
        for member in team['teamMembers']:
            player = member["player"]
            add = {
                "tournament_id": tournament,
                "tournament_name": tournament_info.get("name"),
                "team_id": team["team"]["id"],
                "team_name": team["team"]["name"],
                "player_id": player["id"],
                # Name parts may be None (e.g. a missing patronymic); fall back to "".
                "player_name": (player["surname"] or "") + " " + (player["name"] or "") + " " + (player["patronymic"] or ""),
                "position": team.get("position", None),
            }
            if is_train:
                add["number_of_questions"] = number_of_questions
                add["questions_mask"] = team["mask"]
                # Keep only records whose mask length matches the declared question count.
                if number_of_questions == len(team["mask"]):
                    train_data_list.append(add)
            else:
                test_data_list.append(add)

In [7]:
test_data = pd.DataFrame(test_data_list)

# Free the large raw structures as soon as they are no longer needed.
del results
del test_data_list

# Expand every training record into one row per (player, question).
# The question id is "<tournament_id>_<position in mask>".
data_by_question = []
for item in train_data_list:
    # A real name is used for the index: `_` is IPython's last-output variable and
    # conventionally marks a value that is thrown away, which this one is not.
    for question_idx, result in enumerate(item["questions_mask"]):
        # Only "0"/"1" are kept; any other mask character is skipped
        # (presumably it marks a question without a valid outcome -- TODO confirm).
        if result in ["0", "1"]:
            data_by_question.append({
                "tournament_id": item["tournament_id"],
                "team_id": item["team_id"],
                "player_id": item["player_id"],
                "question": str(item["tournament_id"]) + "_" + str(question_idx),
                "basic_label": int(result),
            })

del train_data_list

***2. Training Logistic regression***

Let's make several assumptions:
- if a team doesn't answer the question, then no player of the team has answered the question
- if a team answered the question, then every player answered the question (this is quite naive, but we will use it for this step)

Our goal is to build and train a model predicting the probability of player i answering question j

In [21]:
df_by_question = pd.DataFrame(data_by_question).astype({"tournament_id": np.int32,
                                                       "team_id": np.int32,
                                                       "player_id": np.int32,
                                                       "basic_label": np.uint8})

# One-hot encode the (player, question) pair of every row.
encoder = OneHotEncoder()
train_data = encoder.fit_transform(df_by_question[["player_id", "question"]])

# Convert to COO once and take indices AND values from the same object so they are
# guaranteed to line up. The shape is passed explicitly rather than inferred from the
# largest index, and the deprecated torch.sparse.FloatTensor constructor is avoided.
train_coo = train_data.tocoo()
x = torch.sparse_coo_tensor(
    torch.as_tensor(np.vstack((train_coo.row, train_coo.col)), dtype=torch.long),
    torch.as_tensor(train_coo.data, dtype=torch.float32),
    size=train_coo.shape,
)
# Column vector of the team-level 0/1 labels, one per row of x.
y = torch.as_tensor(df_by_question["basic_label"].values, dtype=torch.float32).view(-1, 1)

In [9]:
class LogisticRegression(nn.Module):
    """Logistic regression: one linear layer with a single output, followed by a sigmoid."""

    def __init__(self, n_features):
        """Build the model for inputs that have ``n_features`` columns."""
        super().__init__()
        self.lin = nn.Linear(in_features=n_features, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``sigmoid(lin(x))`` for the input ``x``."""
        logits = self.lin(x)
        probabilities = self.sigmoid(logits)
        return probabilities
    
def train_step(model, x, y, lr=1, n_iter=100):
    """Re-initialise ``model.lin`` and fit the model to ``(x, y)`` with Adam.

    Runs ``n_iter`` full-batch updates minimising binary cross-entropy between
    ``model(x)`` and ``y``. The model is modified in place; nothing is returned.
    """
    # Every call starts from freshly initialised parameters.
    model.lin.reset_parameters()
    loss_fn = nn.BCELoss()
    adam = optim.Adam(model.parameters(), lr=lr)
    for _ in range(n_iter):
        adam.zero_grad()
        predictions = model(x)
        batch_loss = loss_fn(predictions, y)
        batch_loss.backward()
        adam.step()

In [10]:
# Seed torch before the first model is created: parameter initialisation is random, so
# without a seed the numbers below change on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Baseline: fit the logistic regression on the team-level labels.
n_features = x.shape[1]
model = LogisticRegression(n_features)
train_step(model, x, y, lr=0.4, n_iter=100)

  allow_unreachable=True)  # allow_unreachable flag


***3. Building top of the players and calculating correlation between train and test data***

If we want to compare our results with the test dataset, we need to calculate each team's strength.
Then we can rank the teams and compare the ranking with their real positions in 2020.

A team's strength can be described as the probability that the team answers an average question. 
Moreover, this probability equals 1 minus the probability that no one in the team answers the question. 
Assuming that the players of a team answer the question independently, we can define the team's strength as:

Let $p_{ij}$ be the strength of player $j$ in team $i$ of size $N$, $q_{avg}$ the weight of an average question, and $b$ the bias.

$\large{team\_probability_i = 1 - \prod_{j = 1}^{N}(1 - \sigma(p_{ij} + q_{avg} + b))}$

In [11]:
# Snapshot of the learned parameters as NumPy arrays: [feature weights, bias].
# Tensor.numpy() shares memory with the live parameters, which are later re-initialised
# and trained in place, so .copy() is needed to keep this snapshot stable.
weights = [model.lin.weight.detach()[0].numpy().copy(), model.lin.bias.detach()[0].numpy().copy()]

In [12]:
def correlations(data, encoder, parameters):
    """Print mean per-tournament Spearman and Kendall correlations for the model.

    Args:
        data: DataFrame with columns ``tournament_id``, ``team_id``, ``player_id``
            and ``position``. It is modified in place: the columns
            ``player_weights`` and ``player_probability`` are added or overwritten.
        encoder: fitted one-hot encoder whose first encoded column is the player id
            and whose second is the question id; only ``encoder.categories_`` is read.
        parameters: ``[feature_weights, bias]`` where ``feature_weights`` is ordered
            like the encoder output (all player features first, then all questions).

    Returns:
        None. The two averaged correlations are printed.
    """
    # Read the player categories straight from the encoder instead of parsing generated
    # feature names: get_feature_names() no longer exists in current scikit-learn and
    # the "x0_" prefix depends on how the encoder was fitted.
    player_ids = encoder.categories_[0]
    n_players = len(player_ids)
    feature_weights = np.asarray(parameters[0], dtype=np.float64)
    player_vector = feature_weights[:n_players]
    question_vector = feature_weights[n_players:]

    player_weights = {int(pid): float(w) for pid, w in zip(player_ids, player_vector)}
    mean_player_weight = float(player_vector.mean())
    mean_question_weight = float(question_vector.mean())
    bias = float(np.asarray(parameters[1]).reshape(-1)[0])

    # Predicting player probabilities. Players unseen in training get the average
    # player weight. The column is assigned directly: fillna(inplace=True) on a selected
    # column is a chained assignment and is not reliable under pandas copy-on-write.
    data["player_weights"] = data["player_id"].map(player_weights).fillna(mean_player_weight)
    logits = data["player_weights"] + mean_question_weight + bias
    data["player_probability"] = 1 / (1 + np.exp(-logits))

    # Calculating probability for the whole team and grouping by tournament to compare
    # with real team positions: the team answers unless every member fails.
    team_keys = ["tournament_id", "team_id"]
    probas = data.groupby(team_keys)["player_probability"].apply(lambda p: 1 - np.prod(1 - p))
    position = data.groupby(team_keys)["position"].first()
    group_data = pd.concat([probas, position], axis=1).sort_values(
        ["tournament_id", "player_probability"], ascending=[True, True]
    )

    # Correlations, negated so that a positive value means teams with a higher
    # predicted probability tend to have a numerically smaller position.
    per_tournament = group_data.groupby("tournament_id")
    spearman = per_tournament.apply(
        lambda g: - spearmanr(g["position"], g["player_probability"]).correlation
    ).mean()
    kendall = per_tournament.apply(
        lambda g: - kendalltau(g["position"], g["player_probability"]).correlation
    ).mean()
    print("Spearman correlation: ", spearman)
    print("Kendall correlation: ", kendall)

# Baseline score: evaluate the plain logistic-regression weights on the 2020 test data.
correlations(test_data, encoder, weights)

Spearman correlation:  0.7841490583742511
Kendall correlation:  0.6272188903866396


In [13]:
def get_top_players_with_amounts_of_plays(encoder, parameters, df_by_question, n=20, players_by_id=None):
    """List the ``n`` players with the largest learned weights.

    Args:
        encoder: fitted one-hot encoder whose first encoded column is the player id;
            only ``encoder.categories_[0]`` is read.
        parameters: feature-weight vector ordered like the encoder output, so its
            first ``len(encoder.categories_[0])`` entries are the player weights.
        df_by_question: DataFrame with ``player_id`` and ``tournament_id`` columns,
            used to count the distinct tournaments each listed player appears in.
        n: number of players to return.
        players_by_id: mapping ``player_id -> {"surname", "name", "patronymic"}``.
            Defaults to the notebook-level ``players`` dict.

    Returns:
        List of strings ``"<rank>: <surname> <name> <patronymic> played in <k> tournaments"``,
        best player first.
    """
    if players_by_id is None:
        players_by_id = players

    # Take player ids from the encoder categories rather than from generated feature
    # names: get_feature_names() is gone in current scikit-learn and the "x0_" prefix
    # depends on how the encoder was fitted.
    player_weights = {int(pid): parameters[i] for i, pid in enumerate(encoder.categories_[0])}
    top_ids = [pid for pid, _weight in sorted(player_weights.items(), key=lambda kv: - kv[1])[:n]]

    # Count tournaments for the listed players only, in a single pass over the frame.
    listed_rows = df_by_question[df_by_question["player_id"].isin(top_ids)]
    tournaments_played = listed_rows.groupby("player_id")["tournament_id"].nunique()

    lines = []
    for rank, pid in enumerate(top_ids, start=1):
        person = players_by_id[pid]
        # Name parts may be None (e.g. no patronymic); render them as empty strings.
        full_name = " ".join(person[part] or "" for part in ("surname", "name", "patronymic"))
        lines.append(str(rank) + ": " + full_name + " played in " +
                     str(int(tournaments_played.get(pid, 0))) + " tournaments")
    return lines


# Top 20 players according to the current feature weights (weights[0]).
get_top_players_with_amounts_of_plays(encoder, weights[0], df_by_question, 20)

['1: Руссо Максим Михайлович played in 55 tournaments',
 '2: Брутер Александра Владимировна played in 67 tournaments',
 '3: Семушин Иван Николаевич played in 95 tournaments',
 '4: Савченков Михаил Владимирович played in 79 tournaments',
 '5: Кудинов Дмитрий Сергеевич played in 1 tournaments',
 '6: Спешков Сергей Леонидович played in 87 tournaments',
 '7: Сорожкин Артём Сергеевич played in 124 tournaments',
 '8: Пилипенко Максим Игоревич played in 1 tournaments',
 '9: Мереминский Станислав Григорьевич played in 37 tournaments',
 '10: Подюкова Валентина  played in 1 tournaments',
 '11: Левандовский Михаил Ильич played in 33 tournaments',
 '12: Николенко Сергей Игоревич played in 50 tournaments',
 '13: Прокофьева Ирина Сергеевна played in 26 tournaments',
 '14: Новиков Илья Сергеевич played in 34 tournaments',
 '15: Царёв Михаил Сергеевич played in 12 tournaments',
 '16: Саксонов Антон Владимирович played in 31 tournaments',
 '17: Либер Александр Витальевич played in 92 tournaments',
 '18

***4. Using EM algorithm to correct our predictions with players answers dependency knowledge***

If $t_{i}$ denotes the (observed) answer of team $i$, then the answers of the individual players, $z_{ij}$ for player $j$ in team $i$, are hidden variables.

${p(t_{i} = 1|z_{ij} = 1) = 1}$


Then, what we want to compute on the E-step is:
${p(z_{ij} = 1|t_{i} = y)}$, where $y$ is the observed team answer.


Since ${p(z_{ij} = 1|t_{i} = 0) = 0}$, we only need to find:

${p(z_{ij} = 1|t_{i} = 1) = \large\frac{p(t_{i} = 1|z_{ij} = 1)p(z_{ij} = 1)}{p(t_{i} = 1)}}$


Then, using predictions from M-step:

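$${p(z_{ij} = 1|t_{i} = 1) = \frac{\sigma(p_{ij} + q + \mu)}{1 - \prod_{k=1}^{N}(1 - \sigma(p_{ik} + q + \mu))}}$$

where $p_{ij}$ is the strength of player $j$ in team $i$ of size $N$, $q$ is the weight of the question and $\mu$ is the bias.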

In [14]:
def e_step(data, predicts):
    """E-step: turn model predictions into posterior per-player labels.

    Args:
        data: DataFrame with columns ``basic_label`` (team-level 0/1 outcome),
            ``team_id`` and ``question``. It is modified in place: the ``label``
            column is added or overwritten.
        predicts: one predicted probability per row of ``data``, in row order.

    Returns:
        The same ``data`` object. ``label`` is 0 where the team failed
        (``basic_label == 0``); where the team answered (``basic_label == 1``) it is
        the prediction divided by the probability that at least one listed member of
        that (team, question) group answers, clipped to [0, 1].
    """
    data["label"] = predicts
    data.loc[data["basic_label"] == 0, "label"] = 0

    answered = data["basic_label"] == 1
    if answered.any():
        prior = data.loc[answered, "label"]
        group_keys = [data.loc[answered, "team_id"], data.loc[answered, "question"]]
        # Probability that no member of the group answers, via the built-in grouped
        # product rather than a Python lambda evaluated once per group.
        nobody_answers = (1 - prior).groupby(group_keys).transform("prod")
        # If every prediction in a group saturates to 0 the denominator is 0 and the
        # label would be NaN (0/0), which poisons the loss; floor it so the label is 0.
        team_probability = (1 - nobody_answers).clip(lower=np.finfo(np.float32).tiny)
        posterior = (prior / team_probability).clip(0, 1)
        # Positional assignment: rows were selected with the same mask, in the same order.
        data.loc[answered, "label"] = posterior.to_numpy()
    return data


def m_step(model, x, y, lr=1e-4, n_iter=50):
    """M-step: refit the model on the soft labels produced by the E-step.

    The procedure is the same as ``train_step`` (re-initialise ``model.lin``, then
    ``n_iter`` full-batch Adam updates on binary cross-entropy); only the default
    ``lr`` and ``n_iter`` differ. It delegates to ``train_step`` so the training loop
    exists in one place. The model is modified in place; nothing is returned.
    """
    train_step(model, x, y, lr=lr, n_iter=n_iter)

In [15]:
# Let's do one EM step
print("First EM step")
# E-step: predictions of the baseline model become soft per-player labels.
with torch.no_grad():
    predicts = model(x).numpy().flatten()
question_answer_data__ = e_step(df_by_question.copy(), predicts)
# The soft labels get their own name so the hard team-level labels in `y` stay intact
# and the baseline training cell can still be re-run correctly.
y_soft = torch.as_tensor(question_answer_data__["label"].values, dtype=torch.float32).view(-1, 1)
# M-step: refit the model on the soft labels.
m_step(model, x, y_soft, lr=1, n_iter=100)
# .copy() decouples the snapshot from the live parameters, which later M-steps
# re-initialise and update in place.
weights = [model.lin.weight.detach()[0].numpy().copy(), model.lin.bias.detach()[0].numpy().copy()]
correlations(test_data, encoder, weights)

First EM step
Spearman correlation:  0.7939742284850609
Kendall correlation:  0.638210608081932


In [16]:
# Repeat several times
NUMBER_OF_EM_STEPS = 8
for i in range(NUMBER_OF_EM_STEPS):
    # The first EM step was run in the previous cell, so counting starts at 2.
    print("Iteration: ", i + 2)
    # E-step: refresh the soft labels from the current model.
    with torch.no_grad():
        predicts = model(x).numpy().flatten()
    question_answer_data__ = e_step(question_answer_data__, predicts)
    # The soft labels get their own name so the hard labels in `y` are not overwritten.
    y_soft = torch.as_tensor(question_answer_data__["label"].values, dtype=torch.float32).view(-1, 1)
    # M-step: refit on the soft labels, then score the new weights on the test data.
    m_step(model, x, y_soft, lr=1, n_iter=100)
    # .copy() keeps the snapshot stable when the next M-step resets the parameters in place.
    weights = [model.lin.weight.detach()[0].numpy().copy(), model.lin.bias.detach()[0].numpy().copy()]
    correlations(test_data, encoder, weights)

Iteration:  2
Spearman correlation:  0.7940165038369098
Kendall correlation:  0.6381864092709209
Iteration:  3
Spearman correlation:  0.7968135701095866
Kendall correlation:  0.6403350102670844
Iteration:  4
Spearman correlation:  0.7972285131208035
Kendall correlation:  0.6411090201440234
Iteration:  5
Spearman correlation:  0.7998539536546474
Kendall correlation:  0.6448975747469757
Iteration:  6
Spearman correlation:  0.8013588764635735
Kendall correlation:  0.6463833627792295
Iteration:  7
Spearman correlation:  0.8013211652449049
Kendall correlation:  0.6462529804481234
Iteration:  8
Spearman correlation:  0.8010457653707796
Kendall correlation:  0.6459347109064342
Iteration:  9
Spearman correlation:  0.8012001305518531
Kendall correlation:  0.6458371814404334


As we can see, the correlation grows slightly over the EM iterations (and levels off after about iteration 6), so everything is OK

***5. Creating tournament rating and players top***

In [17]:
# Map every question id to its learned weight.
# The encoder emits all player features first and all question features second, so the
# question weights are the tail of the weight vector. Reading encoder.categories_
# avoids get_feature_names(), which is gone in current scikit-learn, and the "x1_"
# name prefix, which depends on how the encoder was fitted.
n_player_features = len(encoder.categories_[0])
question_weights = dict(zip(encoder.categories_[1], weights[0][n_player_features:]))

In [18]:
# Tournament names ordered by the mean learned weight of their questions, ascending.
# The mean runs over the rows of question_answer_data__, i.e. one entry per
# (player, question) row rather than one per distinct question.
tournament_weights = (
    question_answer_data__
    .groupby("tournament_id")["question"]
    .apply(lambda questions: np.mean([question_weights[name] for name in questions]))
    .sort_values()
    .reset_index()["tournament_id"]
    .apply(lambda tournament_id: tournaments[tournament_id]["name"])
)

In [19]:
# First 20 tournament names in the ordering built above (lowest mean question weight first).
tournament_weights.head(20)

0               Чемпионат Санкт-Петербурга. Первая лига
1                                Чемпионат Таджикистана
2                                           Угрюмый Ёрш
3                Зеркало мемориала памяти Михаила Басса
4                            Синхрон высшей лиги Москвы
5                                    Воображаемый музей
6                          Первенство правого полушария
7                       Чемпионат Мира. Этап 2 Группа С
8                      Чемпионат Мира. Этап 2. Группа В
9                      Чемпионат Мира. Этап 3. Группа С
10                        Мемориал памяти Михаила Басса
11    Седьмая октава: Кубок Равноденствия. Лига Наци...
12                                               День D
13                                     Записки охотника
14              Чемпионат Санкт-Петербурга. Высшая лига
15                                     Знание – Сила VI
16                                     Чемпионат России
17                     Чемпионат Мира. Этап 1. Г

In [20]:
# Top 20 players again, now using the weights from the last EM iteration.
get_top_players_with_amounts_of_plays(encoder, weights[0], df_by_question, 20)

['1: Пилипенко Максим Игоревич played in 1 tournaments',
 '2: Немец Илья Сергеевич played in 2 tournaments',
 '3: Бровченко Елена Борисовна played in 1 tournaments',
 '4: Козлова Ольга Александровна played in 1 tournaments',
 '5: Подюкова Валентина  played in 1 tournaments',
 '6: Голудина Мария Юрьевна played in 1 tournaments',
 '7: Кудинов Дмитрий Сергеевич played in 1 tournaments',
 '8: Фрайман Феликс  played in 1 tournaments',
 '9: Лазарева Галина Вячеславовна played in 1 tournaments',
 '10: Петров Игорь  played in 1 tournaments',
 '11: Иоаннидис Андриан  played in 1 tournaments',
 '12: Королихин Анатолий  played in 1 tournaments',
 '13: Луконин Дмитрий  played in 1 tournaments',
 '14: Петров Роман  played in 1 tournaments',
 '15: Спектор Евгений Павлович played in 8 tournaments',
 '16: Прохоров Константин  played in 1 tournaments',
 '17: Войцеховская Станислава Валерьевна played in 1 tournaments',
 '18: Ишин Вадим Александрович played in 1 tournaments',
 '19: Шобанов Андрей  played