In [1]:
import pathlib
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import linear_model
from scipy import sparse

from rating_model import TeamResults

In [2]:
# Relative path (resolved against the working directory) to the pickled tournaments table.
tours_datapath = pathlib.Path("data", "pickle_data", "tournaments-dt.pickle")

In [3]:
# Load the tournaments table. pd.read_pickle accepts a path object directly,
# so no str() conversion is needed (same call style as the players table load).
tours = pd.read_pickle(tours_datapath)

In [4]:
# Relative path (resolved against the working directory) to the pickled players table.
players_datapath = pathlib.Path("data", "pickle_data", "players-dt.pickle")

In [5]:
# Load the players table. It is not referenced again in the cells visible here.
players_info = pd.read_pickle(players_datapath)

In [6]:
# Relative path (resolved against the working directory) to the pickled training team results.
team_res_datapath = pathlib.Path("data", "team_res", "train_team_results.pickle")

In [7]:
# Load the pickled team results object.
# Presumably a rating_model.TeamResults instance, which would be why that class
# is imported although never called directly — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load dumps that come from a trusted source.
with open(team_res_datapath, "rb") as dump_file:
    team_res = pickle.load(dump_file)

In [8]:
# Count teams whose roster (`members`) is empty and teams whose per-question
# result mask (`mask`) is empty, across every tournament in `team_res`.
total_unknown_team_players = 0
total_unknown_answers = 0
for tour_id in team_res.tours:
    tour_teams = team_res[tour_id]
    for team_id in tour_teams:
        team = tour_teams[team_id]
        # A falsy attribute contributes 1, a truthy one contributes 0.
        total_unknown_team_players += int(not team.members)
        total_unknown_answers += int(not team.mask)

In [9]:
# Report both counters; sep="\n" puts every label and every number on its own line.
# NOTE(review): "повопроснами" in the second label looks like a typo for
# "повопросными"; left unchanged so the recorded output still matches the code.
print("Количество команд без состава команды: ", total_unknown_team_players, "Количество команд с неизвестными повопроснами результатами: ", total_unknown_answers, sep="\n")

Количество команд без состава команды: 
109
Количество команд с неизвестными повопроснами результатами: 
173


In [10]:
# Column names, non-null counts and dtypes of the tournaments frame.
tours.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1105 entries, 4628 to 6485
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   name          1105 non-null   object             
 1   dateStart     1105 non-null   datetime64[ns, UTC]
 2   dateEnd       1105 non-null   datetime64[ns, UTC]
 3   type          1105 non-null   object             
 4   season        1015 non-null   object             
 5   orgcommittee  1105 non-null   object             
 6   synchData     669 non-null    object             
 7   questionQty   1105 non-null   object             
dtypes: datetime64[ns, UTC](2), object(6)
memory usage: 77.7+ KB


In [11]:
# Flatten the team results into a long frame with one row per
# (tour_id, team_id, player_id, answer_id) and an is_right_answer flag.
# Presumably filter_by_mask=True skips teams without per-question results —
# TODO confirm in rating_model.
players = team_res.to_player_dataframe(filter_by_mask=True)

Convert to dataframe: 100%|██████████| 689/689 [02:15<00:00,  5.09it/s]


In [12]:
# Row count, dtypes and memory footprint of the per-answer frame.
players.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17949880 entries, 0 to 419579
Data columns (total 5 columns):
 #   Column           Dtype
---  ------           -----
 0   tour_id          int64
 1   team_id          int64
 2   player_id        int64
 3   answer_id        int64
 4   is_right_answer  bool 
dtypes: bool(1), int64(4)
memory usage: 701.9 MB


In [13]:
# Peek at the first rows of the per-answer frame.
players.head()

Unnamed: 0,tour_id,team_id,player_id,answer_id,is_right_answer
0,4772,45556,6212,0,True
1,4772,45556,6212,1,True
2,4772,45556,6212,2,True
3,4772,45556,6212,3,True
4,4772,45556,6212,4,True


In [14]:
# Class balance of the target: normalize=True returns fractions instead of counts.
players["is_right_answer"].value_counts(normalize=True)

False    0.568645
True     0.431355
Name: is_right_answer, dtype: float64

In [15]:
# dtype passed to the one-hot encoders for their output matrices.
feature_dtype = np.float32

In [16]:
# One-hot encoder that is fitted on player ids (one indicator column per distinct id).
skils_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [17]:
# OneHotEncoder expects 2-D input, hence the reshape of the id column to shape (n_rows, 1).
skils_features = skils_encoder.fit_transform(players["player_id"].to_numpy().reshape(-1, 1))

In [18]:
# One-hot encoder that is fitted on answer ids (one indicator column per distinct id).
questione_complex_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [19]:
# OneHotEncoder expects 2-D input, hence the reshape of the id column to shape (n_rows, 1).
# NOTE(review): this encodes answer_id alone. Confirm that answer_id is unique
# across tournaments; if it is a per-tournament index, questions from different
# tournaments would share one indicator column.
questions_complex = questione_complex_encoder.fit_transform(players["answer_id"].to_numpy().reshape(-1, 1))

In [20]:
# Place the player indicator columns first and the answer indicator columns after them.
features = sparse.hstack((skils_features, questions_complex))
# Drop the names of the two intermediate matrices so their memory can be reclaimed.
del skils_features
del questions_complex

In [21]:
# Show the stacked matrix repr (shape, dtype, number of stored elements, storage format).
features

<17949880x91104 sparse matrix of type '<class 'numpy.float32'>'
	with 35899760 stored elements in COOrdinate format>

In [22]:
# Unregularised logistic regression over the one-hot player/answer indicators.
# NOTE(review): newer scikit-learn releases spell this option `penalty=None`
# and no longer accept the string "none" — confirm against the pinned version
# before upgrading.
regression = linear_model.LogisticRegression(penalty="none", verbose=3, n_jobs=-1)

In [23]:
# Binary target as a 0/1 int32 array (1 = right answer).
target = players["is_right_answer"].astype(np.int32).to_numpy()

In [24]:
# Fit the model on the stacked sparse one-hot features.
regression.fit(features, target)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  2.7min finished


LogisticRegression(n_jobs=-1, penalty='none', verbose=3)

In [25]:
def get_rating(skill_encoder, coefs) -> pd.DataFrame:
    """Build a per-player skill table from model coefficients.

    Entry ``i`` of ``coefs`` is taken as the skill of the player whose id is
    ``skill_encoder.categories_[0][i]``; any coefficients beyond the number of
    player ids are ignored.

    Parameters
    ----------
    skill_encoder
        Fitted encoder exposing ``categories_``; ``categories_[0]`` is the
        sequence of player ids (assumed unique, as a fitted encoder provides).
    coefs
        1-D sequence of coefficients, at least as long as ``categories_[0]``.

    Returns
    -------
    pd.DataFrame
        Columns ``player_id`` and ``skill``, one row per player id, in the
        order of ``categories_[0]``. Both columns are present even when there
        are no players.

    Raises
    ------
    IndexError
        If ``coefs`` has fewer entries than there are player ids.
    """
    player_ids = np.asarray(skill_encoder.categories_[0])
    skills = np.asarray(coefs)
    n_players = len(player_ids)
    # Slicing would silently truncate, so keep the out-of-range failure explicit.
    if len(skills) < n_players:
        raise IndexError(
            f"coefs has {len(skills)} entries but {n_players} player ids were given"
        )
    # The i-th category owns the i-th coefficient, so a positional slice
    # replaces the per-player np.where scan.
    return pd.DataFrame({"player_id": player_ids, "skill": skills[:n_players]})

In [26]:
# coef_[0] is the first coefficient row of the fitted model (the only one for a binary target).
player_ratings = get_rating(skils_encoder, regression.coef_[0])

In [27]:
# Order players by ascending skill. Rebinding the name instead of sorting in
# place avoids mutating a frame produced by an earlier cell.
player_ratings = player_ratings.sort_values("skill")

In [28]:
# Ten players with the highest fitted skill coefficient.
player_ratings.nlargest(10, "skill")

Unnamed: 0,player_id,skill
3888,27403,3.961836
609,4270,3.943308
3962,27822,3.545685
4080,28751,3.518501
5221,37047,3.420863
3247,22799,3.351339
4828,34328,3.348794
6782,56647,3.333272
4290,30270,3.29508
549,3843,3.279501
