In [None]:
import pathlib
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import linear_model
from scipy import sparse

from rating_model import TeamResults

In [None]:
# Relative location of the pickled tournaments data.
tours_datapath = pathlib.Path("data") / "pickle_data" / "tournaments-dt.pickle"

In [None]:
# Load the tournaments data. read_pickle takes the path object as-is, so the
# str() conversion is dropped to match how players_info is loaded.
tours = pd.read_pickle(tours_datapath)

In [None]:
# Relative location of the pickled players data.
players_datapath = pathlib.Path("data") / "pickle_data" / "players-dt.pickle"

In [None]:
# Load the pickled players data with pandas.
# NOTE(review): this object is later indexed by label via .loc, so it is
# presumably keyed by player id — confirm.
players_info = pd.read_pickle(players_datapath)

In [None]:
# Relative location of the pickled training team results.
team_res_datapath = pathlib.Path("data") / "team_res" / "train_team_results.pickle"

In [None]:
# Unpickle the training team results object from disk.
# NOTE(review): presumably this is a rating_model.TeamResults instance, which
# would explain why that class is imported even though it is never referenced
# by name in this notebook — confirm.
# pickle.load can run arbitrary code from the file; only load trusted dumps.
with open(team_res_datapath, "rb") as dump_file:
    team_res = pickle.load(dump_file)

In [None]:
# Print a summary of the loaded tournaments data.
tours.info()

In [None]:
# Build the per-player table used for modelling; the cells below read its
# "player_id", "answer_id" and "is_right_answer" columns.
# NOTE(review): the exact effect of filter_by_mask is defined in rating_model
# and is not visible here — confirm what gets filtered out.
players = team_res.to_player_dataframe(filter_by_mask=True)

In [None]:
# Print a summary of the per-player table.
players.info()

In [None]:
# Preview the first rows of the per-player table.
players.head()

In [None]:
# Class balance of the target: normalize=True reports each value's share
# of rows instead of raw counts.
players["is_right_answer"].value_counts(normalize=True)

In [None]:
# Element dtype handed to both one-hot encoders below.
feature_dtype = np.float32

In [None]:
# One-hot encoder that is fitted on player ids below (one indicator column
# per distinct player).
# NOTE(review): "skils" is a typo for "skills"; the name is referenced by
# later cells, so any rename has to be done everywhere at once.
skils_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [None]:
# The encoder expects a 2-D input, so the id vector is turned into a single
# column before fitting and transforming in one step.
skils_features = skils_encoder.fit_transform(
    players["player_id"].to_numpy()[:, np.newaxis]
)

In [None]:
# One-hot encoder that is fitted on answer ids below (one indicator column
# per distinct answer_id).
# NOTE(review): "questione" looks like a typo for "question"; the name is
# referenced by the next cell, so rename both together if at all.
questione_complex_encoder = preprocessing.OneHotEncoder(dtype=feature_dtype)

In [None]:
# Same treatment as the player ids: make the answer ids a single column,
# then fit and transform in one step.
questions_complex = questione_complex_encoder.fit_transform(
    players["answer_id"].to_numpy()[:, np.newaxis]
)

In [None]:
# Place the player indicators and the question indicators side by side, player
# block first, so the leading columns of the result are the player columns.
# format="csr" pins the result to the row-compressed layout instead of relying
# on hstack's default output format (which can be COO); CSR is the sparse
# layout scikit-learn estimators work with, so no re-conversion is needed later.
features = sparse.hstack((skils_features, questions_complex), format="csr")
# Only the stacked matrix is used from here on; release the two inputs.
del skils_features, questions_complex

In [None]:
# Show the repr of the stacked feature matrix as a quick sanity check.
features

In [None]:
# Unregularised logistic regression over the one-hot player/question columns.
# penalty=None replaces the string "none", which scikit-learn deprecated in 1.2
# and rejects from 1.4 on (this spelling requires scikit-learn >= 1.2).
regression = linear_model.LogisticRegression(penalty=None, verbose=3, n_jobs=-1)

In [None]:
# Labels for the classifier: the is_right_answer column cast to int32 and
# exported as a plain NumPy array.
target = (
    players["is_right_answer"]
    .astype(np.int32)
    .to_numpy()
)

In [None]:
# Fit the model on the stacked indicator matrix; the learned weights are read
# back from regression.coef_ further down.
regression.fit(features, target)

In [None]:
def get_rating(skill_encoder, coefs) -> pd.DataFrame:
    """Pair every encoded player id with its fitted coefficient.

    Args:
        skill_encoder: Fitted encoder whose ``categories_[0]`` lists the
            player ids; the i-th id is matched with the i-th coefficient.
        coefs: 1-D sequence of coefficients. Only the leading
            ``len(categories_[0])`` entries are used; any trailing entries
            (coefficients of other feature blocks) are ignored.

    Returns:
        DataFrame with the columns ``player_id`` and ``skill``, one row per
        player, in the order of ``categories_[0]``. An encoder with no
        categories yields an empty frame that still has both columns.

    Raises:
        IndexError: If ``coefs`` has fewer entries than there are players.
    """
    all_players_ids = np.asarray(skill_encoder.categories_[0])
    coefs = np.asarray(coefs)
    n_players = len(all_players_ids)
    if len(coefs) < n_players:
        raise IndexError(
            f"expected at least {n_players} coefficients, got {len(coefs)}"
        )
    # The position of an id inside categories_[0] is its coefficient index, so
    # a plain slice replaces the per-player np.where scan (quadratic overall).
    return pd.DataFrame(
        {"player_id": all_players_ids, "skill": coefs[:n_players]}
    )

In [None]:
# Extract per-player skills from the first row of the fitted coefficients.
# The player columns were stacked first when building the feature matrix, so
# the leading coefficients line up with the encoder's player categories.
player_ratings = get_rating(skils_encoder, regression.coef_[0])

In [None]:
# Order the ratings by skill, lowest first, rebinding the name to the
# sorted frame.
player_ratings = player_ratings.sort_values("skill")

In [None]:
# Show the ten rows with the highest skill values.
player_ratings.nlargest(10, "skill")

In [None]:
# Look up a single record in players_info by index label.
# NOTE(review): 4270 is hard-coded — presumably the player_id of a top-rated
# player taken from the previous cell's output; confirm, or derive it from
# player_ratings so the cell stays valid when the data changes.
players_info.loc[4270, :]