In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the per-player statistics from the JSON file into a DataFrame and display it.
player_stats = pd.read_json("./data/player_all_stat.json")
player_stats

In [None]:
# Summary statistics for every column, transposed so that each column becomes one row.
player_stats.describe(include="all").transpose()

In [None]:
# Column dtypes as inferred by read_json.
player_stats.dtypes

In [None]:
# True if at least two rows share the same `nr` value.
# NOTE(review): `nr` is used as a games-played count further down (`nr >= n_games_threshold`),
# so repeated values are to be expected. If the intent is to check for duplicate players,
# `id` is presumably the column to test — confirm.
player_stats.nr.duplicated().any()

In [None]:
# All players, best mean first.
player_stats.sort_values(by="mean", ascending=False)

In [None]:
# Keep players with at least 5 games, a mean of at least 75 and a std of at most 50.
promising_players = player_stats[
    (player_stats["nr"] >= 5)
    & (player_stats["mean"] >= 75)
    & (player_stats["std"] <= 50)
]
promising_players

In [None]:
# Promising players, best mean first (display only; the frame itself is not reordered).
promising_players.sort_values(by="mean", ascending=False)

In [None]:
# Pull out the row with index label 0 as a Series before it is dropped from the frame.
# NOTE(review): presumably label 0 is the shared anonymous account — confirm. This raises
# KeyError if that row did not pass the filter that built promising_players.
anonymous_player = promising_players.loc[0]
anonymous_player

In [None]:
# Remove the row labelled 0 (kept separately as `anonymous_player`) so that it does not
# take part in the scaling and scoring below.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a second time
# raises KeyError because label 0 is already gone.
promising_players = promising_players.drop(index=[0], errors="ignore")
promising_players

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of promising_players (scaler defaults), then wrap the
# resulting array in a DataFrame that keeps the original column names and row labels.
scaler = MinMaxScaler()
scaler.fit(promising_players)
scaled = pd.DataFrame(
    scaler.transform(promising_players),
    index=promising_players.index,
    columns=promising_players.columns,
)
scaled

In [None]:
# Sanity check: does the scaled mean column contain any NaN?
scaled['mean'].isna().any()

In [None]:
# Sanity check: does the square root of the scaled nr column contain any NaN?
np.sqrt(scaled['nr']).isna().any()

In [None]:
# Sanity check: does the inverse-std factor 1 / (std + 1) contain any NaN?
(1 / (scaled['std'] + 1)).isna().any()

In [None]:
# Per-player score built from the scaled columns:
#   mean * 1 / (std + 1) * sqrt(nr)
# i.e. it grows with the mean and the number of games and shrinks with the std.
# The operations are applied in exactly that left-to-right order.
inverse_std = 1 / scaled['std'].add(1)
scores = scaled['mean'].mul(inverse_std).mul(np.sqrt(scaled['nr']))
scores.isna().any()

In [None]:
# Normalise by the largest score so that the best player ends up with a score of 1.
scores = scores.div(scores.max())

In [None]:
# Inspect the scores.
scores

In [None]:
# List any entries whose score is NaN.
scores[scores.isna()]  # expected to be empty (no NaN)

In [None]:
# Number of scored players.
scores.shape

In [None]:
# Rationale behind the score column:
#   - a higher mean game score is better
#   - a lower std is better
#   - more games played is better
# The scores are aligned to promising_players by index label.
promising_players = promising_players.assign(score=scores)
promising_players

In [None]:
# Check that every row received a score (index alignment would otherwise leave NaN).
promising_players['score'].isna().any()

In [None]:
# Turn the anonymous_player Series back into a one-row DataFrame, restore integer dtypes
# for id and nr, and give the row a fixed score of 1.
# NOTE(review): the other scores are divided by their maximum, so 1 presumably ties this
# row with the best-scored player — confirm that this is intended.
anonymous_player_df = (
    anonymous_player
    .to_frame()
    .T
    .astype({'id': int, 'nr': int})
    .assign(score=1)
)
anonymous_player_df

In [None]:
# Put the anonymous row back in front of the scored players (it was left out of the scaling).
promising_players = pd.concat([anonymous_player_df, promising_players])
promising_players

In [None]:
# Rank the players: highest score first.
promising_players = promising_players.sort_values(by='score', ascending=False)
promising_players

In [None]:
# Persist the ranked players; the DataFrame index is written as the first CSV column
# (to_csv default).
promising_players.to_csv("./data/promising_players.csv")

Now, instead of using only the promising players, use the n best players.

In [None]:
from jass.game.const import card_strings

# Column layout of the header-less trump CSV: the card_strings entries, then FH,
# followed by user and trump.
features = np.append(card_strings, ['FH'])
cols = np.append(features, ['user', 'trump'])

# For some reason this dataset stores FH inverted (0 = push is an option, 1 = push is not
# an option), whereas push should only be an option if FH=1 — so flip it right after loading.
trumps = pd.read_csv(
    "./data/2018_10_18_trump.csv",
    header=None,
    names=cols,
).assign(FH=lambda frame: 1 - frame['FH'])
trumps

In [None]:
# Players with more than 100 games, best mean first.
player_stats.sort_values('mean', ascending=False).query("nr > 100")

In [None]:
# Round every numeric column to 0 decimals.
# NOTE(review): presumably so that the multi-key ranking further down can tie on whole-number
# mean/std values and fall through to the next sort key — confirm.
rounded_stats = player_stats.round(0)
rounded_stats

In [None]:
# Minimum number of games (nr) a player needs in order to be considered for the top-n selection.
n_games_threshold = 1000

In [None]:
# Players with at least n_games_threshold games, ranked by mean (descending),
# then std (ascending), then number of games (descending).
top_threshold = (
    rounded_stats[rounded_stats['nr'] >= n_games_threshold]
    .sort_values(by=['mean', 'std', 'nr'], ascending=[False, True, False])
)
top_threshold

In [None]:
# Summary statistics of the thresholded players, one row per column.
top_threshold.describe(include="all").transpose()

The 75th percentile of the mean score for players with at least 1000 games is 79 (rounded). For the number of games, the 75th percentile is 5000.

In [None]:
# Number of top-ranked players whose games are kept.
top_n = 250

For the top 500, the worst selected player has a mean score of 79 with 4400 games played. The top 1000 drops down to 77, and the top 100 gives 80.

According to this sorting (mean DESC, std ASC, nr DESC), the anonymous players are at position 715. For fine-tuning, I only want to choose games from players that are considerably better than average, and I'm making the assumption that the average player might play anonymously instead of creating their own account.

In the end I settled on 250, which gives the worst selected player a mean score of 80 over 2700 games and results in ~900 samples for each category (combination of trump and FH) for fine-tuning.

In [None]:
# The first top_n rows of the ranking, i.e. the top_n best players.
best = top_threshold.iloc[:top_n]
best

In [None]:
# Total number of games played by the selected players.
best['nr'].sum()

In [None]:
# Keep only the trump rows whose user is one of the selected best players.
trumps_from_best = trumps.loc[trumps['user'].isin(best['id'])]
trumps_from_best

In [None]:
# Number of rows per (FH, trump) combination among the selected players' games.
value_counts = trumps_from_best[['FH', 'trump']].value_counts()
value_counts

In [None]:
# Size of the smallest (FH, trump) category; used as the per-category sample size when balancing.
min_n = value_counts.min()
min_n

In [None]:
# Seed value for reproducible random sampling.
random_state = 42

In [None]:
# Build a class-balanced sample: draw exactly min_n rows for every (FH, trump) combination.
# range(6 + fh) covers trump values 0..5 when FH == 0 and additionally 6 when FH == 1.
selected = []
for fh in [1, 0]:
    for trump in range(6 + fh):
        category_rows = trumps_from_best.query(f"FH == {fh} & trump == {trump}")
        # Seed from the notebook-level `random_state` rather than a hard-coded literal,
        # so that changing the seed variable actually changes the draw.
        selected.append(category_rows.sample(min_n, random_state=random_state))

balanced = pd.concat(selected)
balanced

In [None]:
# Total number of rows in the balanced sample.
len(balanced)

In [None]:
# Write the balanced sample to CSV; the file name embeds top_n and the DataFrame index is
# written as the first column (to_csv default).
balanced.to_csv(f"./data/trump_top{top_n}_balanced.csv")