In [185]:
import pathlib
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import linear_model
from scipy import sparse, stats

In [186]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [187]:
from rating_model import TeamResults

In [188]:
tours_datapath = pathlib.Path("data", "pickle_data", "tournaments-dt.pickle")

In [189]:
tours = pd.read_pickle(str(tours_datapath))

In [190]:
players_datapath = pathlib.Path("data", "pickle_data", "players-dt.pickle")

In [191]:
players_info = pd.read_pickle(players_datapath)

In [192]:
team_res_datapath = pathlib.Path("data", "team_res", "train_team_results.pickle")

In [193]:
with open(team_res_datapath, "rb") as dump_file:
    team_res = pickle.load(dump_file)

In [194]:
total_unknown_team_players = 0
total_unknown_answers = 0
for tour_id in team_res.tours:
    for team_id in team_res[tour_id]:
        team = team_res[tour_id][team_id]
        if not team.members:
            total_unknown_team_players += 1
        if not team.mask:
            total_unknown_answers += 1

In [195]:
print("Количество команд без состава команды: ", total_unknown_team_players, "Количество команд с неизвестными повопроснами результатами: ", total_unknown_answers, sep="\n")

Количество команд без состава команды: 
109
Количество команд с неизвестными повопроснами результатами: 
173


In [196]:
tours.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1105 entries, 4628 to 6485
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   name          1105 non-null   object             
 1   dateStart     1105 non-null   datetime64[ns, UTC]
 2   dateEnd       1105 non-null   datetime64[ns, UTC]
 3   type          1105 non-null   object             
 4   season        1015 non-null   object             
 5   orgcommittee  1105 non-null   object             
 6   synchData     669 non-null    object             
 7   questionQty   1105 non-null   object             
dtypes: datetime64[ns, UTC](2), object(6)
memory usage: 77.7+ KB


In [197]:
players = team_res.to_player_dataframe(filter_by_mask=True)

Convert to dataframe: 100%|██████████| 682/682 [01:52<00:00,  6.08it/s]


In [198]:
players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17949880 entries, 0 to 419579
Data columns (total 5 columns):
 #   Column           Dtype
---  ------           -----
 0   tour_id          int64
 1   team_id          int64
 2   player_id        int64
 3   answer_id        int64
 4   is_right_answer  bool 
dtypes: bool(1), int64(4)
memory usage: 701.9 MB


In [199]:
players.head()

Unnamed: 0,tour_id,team_id,player_id,answer_id,is_right_answer
0,4772,45556,6212,0,True
1,4772,45556,6212,1,True
2,4772,45556,6212,2,True
3,4772,45556,6212,3,True
4,4772,45556,6212,4,True


## Построение логистической регрессии для ранжирования игроков

In [200]:
players["is_right_answer"].value_counts(normalize=True)

False    0.568645
True     0.431355
Name: is_right_answer, dtype: float64

In [201]:
feature_dtype = np.float32

In [202]:
skils_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [203]:
skils_features = skils_encoder.fit_transform(players["player_id"].to_numpy().reshape(-1, 1))

In [204]:
questione_complex_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [205]:
questions_complex = questione_complex_encoder.fit_transform(players["answer_id"].to_numpy().reshape(-1, 1))

In [206]:
features = sparse.hstack((skils_features, questions_complex))
del skils_features
del questions_complex

In [207]:
features

<17949880x91104 sparse matrix of type '<class 'numpy.float32'>'
	with 35899760 stored elements in COOrdinate format>

In [208]:
regression = linear_model.LogisticRegression(penalty="none", verbose=3, n_jobs=-1)

In [209]:
target = players["is_right_answer"].astype(np.int32).to_numpy()

In [210]:
regression.fit(features, target)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.3min finished


LogisticRegression(n_jobs=-1, penalty='none', verbose=3)

In [212]:
def get_rating(skill_encoder, coefs) -> pd.DataFrame:
    rows = []
    all_players_ids = skill_encoder.categories_[0]
    for player_id in all_players_ids:
        rows.append({"player_id": player_id, "skill": coefs[np.where(all_players_ids == player_id)[0][0]]})
    return pd.DataFrame.from_records(rows, index="player_id",)

In [213]:
player_ratings = get_rating(skils_encoder, regression.coef_[0])

In [214]:
player_ratings.sort_values("skill", inplace=True)

In [215]:
player_ratings.nlargest(10, "skill")

Unnamed: 0_level_0,skill
player_id,Unnamed: 1_level_1
27403,3.961836
4270,3.943308
27822,3.545685
28751,3.518501
37047,3.420863
22799,3.351339
34328,3.348794
56647,3.333272
30270,3.29508
3843,3.279501


## Оценка результатов

Для сравнение команд воспользуемся простым правилом. Для каждой команды в турнире возьмём игроков в отсортированном по убыванию силе игроков и отсортируем команды в лексикографическом порядке.

In [292]:
test_team_res_datapath = pathlib.Path("data", "team_res", "test_team_results.pickle")

In [293]:
with open(test_team_res_datapath, "rb") as dump_file:
    team_res_test = pickle.load(dump_file)

In [294]:
test_team_ratings = team_res_test.to_team_rating_by_tour()

In [295]:
test_team_ratings.dropna(axis="index", inplace=True)

In [296]:
test_team_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21914 entries, 0 to 22430
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   tour_id      21914 non-null  int64  
 1   members      21914 non-null  object 
 2   team_id      21914 non-null  int64  
 3   tour_rating  21914 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 856.0+ KB


In [297]:
test_team_ratings.sort_values(["tour_id", "tour_rating"], inplace=True)

In [298]:
test_team_ratings.head()

Unnamed: 0,tour_id,members,team_id,tour_rating
0,4957,"(30152, 30270, 27822, 28751, 27403, 4270)",49804,1.0
1,4957,"(34936, 40877, 25177, 113703, 33792, 107161)",4109,2.0
2,4957,"(33620, 21346, 13857, 46339, 37836, 19632)",3875,3.0
3,4957,"(32901, 28689, 19541, 13689, 9801, 18194)",77418,4.0
4,4957,"(6482, 34846, 36120, 32458, 25882, 30475)",2,5.5


In [299]:
def player2ratings(players_id, player_ratings):
    ratings = []
    for player_id in players_id:
        try:
            ratings.append(player_ratings.loc[player_id, "skill"])
        except KeyError:
            pass
    ratings.sort(reverse=True)
    return tuple(ratings)

In [303]:
def rank_teams(teams, player_skills):
    ranking_teams = teams.copy()
    ranking_teams["player_skils"] = ranking_teams["members"].apply(lambda x: player2ratings(x, player_skills))
    ranking_teams.sort_values("player_skils", ascending=False, inplace=True)
    ranking_teams.drop("player_skils", axis="columns", inplace=True)
    return ranking_teams

In [304]:
def rank_commad(team_res, player_ratings):
    kendall_values = []
    for tour_id, teams in team_res.groupby("tour_id"):
        new_teams = teams[["members", "tour_rating"]].copy()
        new_teams.reset_index(inplace=True)
        original_order = new_teams.index.to_numpy()
        new_teams = rank_teams(new_teams, player_ratings)
        rank_order = new_teams.index.to_numpy()
        kendall_values.append(stats.kendalltau(original_order, rank_order)[0])
    return np.nanmean(kendall_values)

In [305]:
rank_commad(test_team_ratings, player_ratings)

0.5652927245499592