In [22]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def add_basic_stats(row, moves, pref=""):
    """Write summary statistics of the per-move deltas in ``moves`` into ``row``.

    Args:
        row: Mutable mapping (for example a pandas Series or a dict); it is
            modified in place and nothing is returned.
        moves: Object exposing ``winrate_delta``, ``score_delta``,
            ``selfplay_delta`` and ``utility_delta`` sequences.
        pref: Prefix prepended to every key written into ``row``.
    """
    def positive_share(deltas):
        # Fraction of entries that are strictly greater than zero.
        return np.mean([delta > 0 for delta in deltas])

    # (key suffix, reducer, attribute of ``moves``) in the order the keys are written.
    recipe = (
        ('winrate_mean', np.mean, 'winrate_delta'),
        ('score_mean', np.mean, 'score_delta'),
        ('score_var', np.var, 'score_delta'),
        ('selfplay_mean', np.mean, 'selfplay_delta'),
        ('utility_mean', np.mean, 'utility_delta'),
        ('winrate_beauty_percent', positive_share, 'winrate_delta'),
        ('score_beauty_percent', positive_share, 'score_delta'),
    )
    for suffix, reducer, attribute in recipe:
        row[pref + suffix] = reducer(getattr(moves, attribute))


def add_advanced_stats(row, moves, pref=""):
    """Write order statistics of the per-move deltas in ``moves`` into ``row``.

    Quantiles are taken by direct indexing into the sorted values at
    ``int(moves.cnt_moves * q)`` (no interpolation).

    Args:
        row: Mutable mapping (pandas Series or dict) that receives the
            statistics; modified in place.
        moves: Object exposing ``score_delta``, ``winrate_delta``,
            ``stddev_delta`` and ``cnt_moves``. Its arrays are NOT modified.
        pref: Prefix prepended to every key written into ``row``.

    Raises:
        IndexError: if the delta sequences are empty.
    """
    # Sort copies instead of sorting in place, so the caller's sequences keep
    # their per-move (chronological) order after this call.
    score = np.sort(moves.score_delta)
    winrate = np.sort(moves.winrate_delta)
    stddev = np.sort(moves.stddev_delta)

    q25 = int(moves.cnt_moves * 0.25)
    q50 = int(moves.cnt_moves * 0.5)
    q75 = int(moves.cnt_moves * 0.75)

    row[pref + 'score25p'] = score[q25]
    row[pref + 'score75p'] = score[q75]
    row[pref + 'score_max'] = np.max(score)
    row[pref + 'score_min'] = np.min(score)
    row[pref + 'winrate25p'] = winrate[q25]
    row[pref + 'winrate75p'] = winrate[q75]

    # Means of the (up to) five largest / smallest score deltas.
    row[pref + 'score_five_best_mean'] = np.mean(score[-5:])
    row[pref + 'score_five_worst_mean'] = np.mean(score[:5])

    # "last" refers to the final move, so it is read from the unsorted input.
    row[pref + 'stddev_last'] = moves.stddev_delta[-1]
    row[pref + 'stddev_mean'] = np.mean(stddev)
    row[pref + 'stddev50p'] = stddev[q50]

    row[pref + 'score50p'] = score[q50]

    # Mean of the values between the 25% and 75% positions of the sorted data.
    row[pref + 'winrate_midmean'] = np.mean(winrate[q25:q75])
    row[pref + 'score_midmean'] = np.mean(score[q25:q75])


def get_index_dan(rank):
    """Return the index of the first 'k' or 'd' character in ``rank``, or -1 if there is none."""
    return next(
        (pos for pos in range(len(rank)) if rank[pos] in ('k', 'd')),
        -1,
    )


def get_int_from_rank(rank):
    """Convert a rank string such as ``"5k"`` or ``"3d"`` into an integer.

    'd' ranks map to their number (``"1d"`` -> 1); 'k' ranks map to
    ``1 - number`` (``"1k"`` -> 0, ``"5k"`` -> -4).

    Returns:
        The integer rank, or ``None`` when the string has no 'k'/'d' marker,
        starts with 'P', or the part before the marker is not all digits.
    """
    marker_at = get_index_dan(rank)
    if marker_at < 0:
        return None
    if rank[0] == 'P':
        return None
    digits = rank[:marker_at]
    if not digits.isdigit():
        return None

    number = int(digits)
    return -number + 1 if rank[marker_at] == 'k' else number


def get_rank_from_int(x):
    """Format an integer rank as a string: positive values become ``"<x>d"``, the rest ``"<1 - x>k"``."""
    if x > 0:
        number, marker = x, "d"
    else:
        number, marker = -x + 1, "k"
    return str(number) + marker


def int_from_player(player):
    """Encode a colour letter as an integer: 1 for ``'W'``, 0 for anything else."""
    is_white = player == 'W'
    return int(is_white)


def player_from_int(x):
    """Decode the integer colour code: ``'W'`` for 1, ``'B'`` for any other value."""
    if x == 1:
        return 'W'
    return 'B'


def add_meta(row, player='W'):
    """Fill the game-level metadata fields of ``row`` in place.

    Writes ``game_result`` (0 when ``Result`` is ``'?'``, otherwise the integer
    value of ``Result`` with its sign flipped for ``player == 'B'``), ``color``,
    ``rank`` (parsed from ``<player>_rating``) and ``game_length`` (number of
    whitespace-separated tokens in ``W_move`` plus ``B_move``).

    Args:
        row: Mutable mapping (pandas Series or dict) describing one game.
        player: ``'W'`` or ``'B'``; the side the metadata is computed for.
    """
    raw_result = row['Result']
    if raw_result == '?':
        outcome = 0
    else:
        outcome = int(raw_result)
        if player == 'B':
            outcome = -outcome
    row['game_result'] = outcome

    row['color'] = int_from_player(player)
    row['rank'] = get_int_from_rank(row[player + '_rating'])

    white_count = len(row['W_move'].split())
    black_count = len(row['B_move'].split())
    row['game_length'] = white_count + black_count


def convert_to_lists(df):
    """Parse the space-separated per-move metric strings of every row into lists of floats.

    Rows for which any metric cannot be parsed are reported with ``print`` and
    dropped from the returned frame.

    Args:
        df: DataFrame holding ``W_``/``B_`` prefixed ``scoreLead``,
            ``scoreSelfplay``, ``scoreStdev``, ``utility`` and ``winrate`` columns.

    Returns:
        The frame with parsed rows written back and unparseable rows removed.
    """
    metrics = ('scoreLead', 'scoreSelfplay', 'scoreStdev', 'utility', 'winrate')
    columns = [side + '_' + metric for metric in metrics for side in ('W', 'B')]

    for idx, record in tqdm(df.iterrows()):
        try:
            for column in columns:
                record[column] = [float(token) for token in record[column].split()]
            df.loc[idx] = record
        except Exception as err:
            # Best effort: report the problem and discard the offending row.
            print(err)
            df = df.drop(index=[idx])
    return df
    

class MovesInfo:
    """Per-move evaluation deltas of one player within a single game.

    For every metric the delta is ``player value - opponent value`` at the same
    index. For ``player == 'B'`` the opponent series is first shifted right by
    one position (a seed value is prepended), and the winrate, score, utility
    and selfplay deltas are negated afterwards; the stddev delta is not.

    Attributes:
        winrate_delta, score_delta, utility_delta, selfplay_delta,
        stddev_delta: numpy arrays covering the selected window.
        move: all tokens of ``<player>_move`` (not restricted to the window).
        cnt_moves: number of positions in the window.

    Args:
        row: Mapping with ``W_``/``B_`` prefixed metric lists and move strings.
        n_moves: ``None`` for the whole game; otherwise the window starts at
            ``max(moves_len - n_moves - 1, 0)``.
        player: ``'W'`` or ``'B'``.
    """

    def __init__(self, row, n_moves=None, player='W'):
        if player == 'W':
            enemy = 'B'
        else:
            enemy = 'W'
        plays_black = player == 'B'

        if plays_black:
            moves_len = min(len(row['B_winrate']), len(row['W_winrate']) + 1)
        else:
            moves_len = min(len(row['W_winrate']), len(row['B_winrate']))

        start_ind = 0 if n_moves is None else max(moves_len - n_moves - 1, 0)
        end_ind = moves_len

        # Values prepended to the opponent series (empty for 'W').
        suffixes = ('_winrate', '_scoreLead', '_utility', '_scoreStdev', '_scoreSelfplay')
        seeds = {suffix: [] for suffix in suffixes}
        if plays_black:
            if start_ind == 0:
                # Hard-coded values used in place of an opponent entry before the first move.
                seeds = {
                    '_winrate': [0.475],
                    '_scoreLead': [0.],
                    '_utility': [0.],
                    '_scoreStdev': [19. - 0.07666535852999867 * 2],
                    '_scoreSelfplay': [0.],
                }
            else:
                seeds = {suffix: [row[enemy + suffix][start_ind - 1]] for suffix in suffixes}

        def delta(suffix):
            own = np.array(row[player + suffix][start_ind:end_ind])
            other = np.array((seeds[suffix] + row[enemy + suffix])[start_ind:end_ind])
            return own - other

        self.winrate_delta = delta('_winrate')
        self.score_delta = delta('_scoreLead')
        self.utility_delta = delta('_utility')
        self.selfplay_delta = delta('_scoreSelfplay')
        self.stddev_delta = delta('_scoreStdev')

        if plays_black:
            for attribute in ('winrate_delta', 'score_delta', 'utility_delta', 'selfplay_delta'):
                setattr(self, attribute, -getattr(self, attribute))

        self.move = row[player + '_move'].split()
        self.cnt_moves = end_ind - start_ind


def add_all_game_stats(df, player = 'W'):
    """Add whole-game basic and advanced statistics columns to ``df``.

    The statistic columns are created (filled with ``None``) first; then each
    row is processed by ``add_basic_stats`` and ``add_advanced_stats`` and
    written back.

    Args:
        df: DataFrame whose metric columns already hold per-move lists.
        player: ``'W'`` or ``'B'``; the side the deltas are computed for.

    Returns:
        The same DataFrame with the statistics filled in.
    """
    stat_columns = (
        'winrate_mean', 'score_midmean', 'score_mean', 'score_var',
        'winrate_beauty_percent', 'score_beauty_percent', 'utility_mean',
        'score25p', 'score75p', 'winrate25p', 'winrate75p', 'winrate_midmean',
        'score50p', 'selfplay_mean', 'stddev_mean', 'stddev50p', 'stddev_last',
        'score_max', 'score_min', 'score_five_best_mean', 'score_five_worst_mean',
    )
    for column in stat_columns:
        df[column] = None

    for idx, record in tqdm(df.iterrows()):
        # Each helper gets its own MovesInfo so neither sees arrays touched by the other.
        basic_moves = MovesInfo(record, player=player)
        add_basic_stats(record, basic_moves)
        advanced_moves = MovesInfo(record, player=player)
        add_advanced_stats(record, advanced_moves)
        df.loc[idx] = record
    return df


def get_start_of_yose(margin_moves, no_change_count=5):
    '''
    Find how many of the last moves are treated as yose.

    The array with the running number of edge moves is scanned from the end;
    as soon as a run of no_change_count equal values is seen, the scan stops
    and yose is said to start there. If no such run exists, the whole array
    length is returned.
    '''
    total = len(margin_moves)
    streak = 1
    idx = total - 2
    while idx >= 0:
        if margin_moves[idx] == margin_moves[idx + 1]:
            streak += 1
        else:
            streak = 1
        if streak >= no_change_count:
            return total - idx - 1
        idx -= 1
    return total


def reset_basic_stats(df, pref):
    """Create (or reset to ``None``) the basic-statistics columns of ``df`` that start with ``pref``."""
    suffixes = (
        'winrate_mean',
        'winrate_beauty_percent',
        'score_beauty_percent',
        'score_mean',
        'selfplay_mean',
        'score_var',
        'utility_mean',
    )
    for suffix in suffixes:
        df[pref + suffix] = None


def is_marginal_move(move):
    """Return True when either of the first two characters of ``move`` is 'a', 'b', 'r' or 's'."""
    edge_letters = ('a', 'b', 'r', 's')
    return move[0] in edge_letters or move[1] in edge_letters


def count_of_marginal_moves(moves):
    """Return the running (cumulative) count of marginal moves as a float array, one entry per move."""
    flags = np.array([is_marginal_move(move) for move in moves], dtype=float)
    return np.cumsum(flags)


def add_yose_stats(df, player = 'W'):
    """Add basic statistics restricted to the yose window, plus yose_length / yose_start / yose_has.

    NOTE(review): the window is always derived from ``W_move``, even when
    ``player`` is ``'B'`` - confirm that this is intended.

    Args:
        df: DataFrame with move strings and per-move metric lists.
        player: ``'W'`` or ``'B'``; the side whose deltas are summarised.

    Returns:
        The same DataFrame with the ``yose_*`` columns filled in.
    """
    pref = 'yose_'
    reset_basic_stats(df, pref)
    for column in ('yose_length', 'yose_start', 'yose_has'):
        df[column] = None

    for idx, record in tqdm(df.iterrows()):
        white_moves = record['W_move'].split()
        n_moves = get_start_of_yose(count_of_marginal_moves(white_moves), 10)
        add_basic_stats(record, MovesInfo(record, n_moves, player=player), pref)
        record['yose_length'] = n_moves
        record['yose_start'] = len(white_moves) - n_moves
        record['yose_has'] = record['yose_start'] != 0
        df.loc[idx] = record
    return df


def delta_moves(a, b):
    """Manhattan distance between two moves, using the code points of their first two characters."""
    first_gap = ord(a[0]) - ord(b[0])
    second_gap = ord(a[1]) - ord(b[1])
    return abs(first_gap) + abs(second_gap)


def get_distance_of_moves(moves):
    """Return the distances between consecutive moves.

    Args:
        moves: Sequence of move strings.

    Returns:
        A float array of length ``max(len(moves) - 1, 0)`` whose entry ``i`` is
        ``delta_moves(moves[i + 1], moves[i])``.
    """
    # Clamp at zero: for an empty move list ``len(moves) - 1`` is -1 and
    # np.zeros(-1) raises ValueError instead of yielding an empty result.
    ans = np.zeros(max(len(moves) - 1, 0))
    for i in range(1, len(moves)):
        ans[i - 1] = delta_moves(moves[i], moves[i - 1])
    return ans


def get_distance_from_enemy(my_moves, enemy_moves):
    """Return the distance between same-index moves of the two players.

    The result is a float array as long as the shorter of the two sequences.
    """
    paired = zip(my_moves, enemy_moves)
    return np.array([delta_moves(mine, theirs) for mine, theirs in paired], dtype=float)


def add_last_moves_stats(df, n_moves, pref=None, player='W'):
    """Add basic statistics computed over the trailing window of each game.

    Args:
        df: DataFrame with per-move metric lists.
        n_moves: Window size forwarded to ``MovesInfo``.
        pref: Column prefix; defaults to ``"<n_moves>_"``.
        player: ``'W'`` or ``'B'``; the side whose deltas are summarised.

    Returns:
        The same DataFrame with the prefixed statistics columns filled in.
    """
    pref = str(n_moves) + "_" if pref is None else pref
    reset_basic_stats(df, pref)
    for idx, record in tqdm(df.iterrows()):
        window = MovesInfo(record, n_moves, player=player)
        add_basic_stats(record, window, pref)
        df.loc[idx] = record
    return df


def add_dist_stats_to_row(row, player='W'):
    """Write move-distance statistics for one game into ``row`` (in place).

    ``dist_*`` keys describe distances between consecutive moves of ``player``;
    ``dist_from_enemy_*`` keys describe distances between same-index white and
    black moves.

    Raises:
        IndexError: if ``player`` has fewer than two moves (the median lookup
            indexes an empty array).
    """
    own_dist = get_distance_of_moves(row[player + '_move'].split())
    enemy_dist = get_distance_from_enemy(row['W_move'].split(), row['B_move'].split())
    own_dist.sort()

    row['dist_mean'] = np.mean(own_dist)
    row['dist_var'] = np.var(own_dist)
    # Upper median: element at the middle index of the sorted distances.
    row['dist_median'] = own_dist[len(own_dist) // 2]

    share_keys = (
        ('dist_percent_more_than_10', 10),
        ('dist_percent_more_than_5', 5),
        ('dist_percent_more_than_20', 20),
    )
    for key, threshold in share_keys:
        row[key] = np.mean([gap > threshold for gap in own_dist])

    row['dist_from_enemy_mean'] = np.mean(enemy_dist)
    row['dist_from_enemy_var'] = np.var(enemy_dist)


def add_dist_stats(df, player='W'):
    """Add move-distance statistics columns to ``df``.

    Rows whose ``W_move`` value has a length of 20 or less are left with
    ``None`` in the new columns.
    NOTE(review): the length is taken of the raw ``W_move`` value, which looks
    like a string here (characters, not moves) - confirm.

    Args:
        df: DataFrame with ``W_move`` / ``B_move`` columns.
        player: ``'W'`` or ``'B'``; the side whose consecutive-move distances are used.

    Returns:
        The same DataFrame with the ``dist_*`` columns filled where computed.
    """
    dist_columns = (
        'dist_mean', 'dist_var', 'dist_median',
        'dist_percent_more_than_5', 'dist_percent_more_than_10', 'dist_percent_more_than_20',
        'dist_from_enemy_mean', 'dist_from_enemy_var',
    )
    for column in dist_columns:
        df[column] = None

    for idx, record in tqdm(df.iterrows()):
        if not len(record['W_move']) > 20:
            continue
        add_dist_stats_to_row(record, player=player)
        df.loc[idx] = record
    return df


def delete_non_scalar_parameters(df):
    """Drop the rating, move, per-move metric and ``Result`` columns from ``df`` in place.

    Returns:
        The same DataFrame object, for chaining.

    Raises:
        KeyError: if any of the columns to drop is missing.
    """
    per_side_fields = ('rating', 'move', 'scoreLead', 'scoreSelfplay',
                       'scoreStdev', 'utility', 'winrate')
    doomed = [side + '_' + field for field in per_side_fields for side in ('W', 'B')]
    doomed.append('Result')
    df.drop(columns=doomed, inplace=True)
    return df


def add_delta_lists_to_row(row, moves, player='W'):
    """Store the per-move delta sequences of ``moves`` and move-distance sequences in ``row``.

    Args:
        row: Mutable mapping (pandas Series or dict) for one game; modified in place.
        moves: ``MovesInfo``-like object providing the delta arrays and ``move``.
        player: ``'W'`` or ``'B'``; selects whose move string is paired with the opponent's.
    """
    winrate_delta = moves.winrate_delta
    score_delta = moves.score_delta

    row['winrate'] = winrate_delta
    row['score'] = score_delta
    row['winrate_sqr'] = np.array([value ** 2 for value in winrate_delta])
    row['score_sqr'] = np.array([value ** 2 for value in score_delta])
    row['utility'] = moves.utility_delta
    row['selfplay'] = moves.selfplay_delta
    row['stddev'] = moves.stddev_delta

    row['dist_from_prev'] = get_distance_of_moves(moves.move)
    # 2 when the distance to the previous move exceeds 5, otherwise 1.
    row['dist_more_5'] = [int(gap > 5) + 1 for gap in row['dist_from_prev']]

    if player == 'W':
        enemy = 'B'
    else:
        enemy = 'W'
    own_moves = row[player + '_move'].split()
    enemy_moves = row[enemy + '_move'].split()
    row['dist_enemy'] = get_distance_from_enemy(own_moves, enemy_moves)


def kWorst(moves, cnt=5):
    """Return the ``cnt``-th smallest value of ``moves``.

    When ``moves`` has fewer than ``cnt`` elements its largest value is
    returned. The input is left untouched: a sorted copy is used, so per-move
    sequences stored elsewhere keep their original order.

    Args:
        moves: List or numpy array of comparable values.
        cnt: 1-based position in ascending order.

    Raises:
        IndexError: if ``moves`` is empty (or ``cnt`` selects nothing).
    """
    # np.sort for arrays and sorted() for other sequences mirror the ordering
    # of the respective in-place ``.sort()`` methods without mutating the input.
    ordered = np.sort(moves) if isinstance(moves, np.ndarray) else sorted(moves)
    return ordered[:cnt][-1]


def addWorstMoves(df, player='W'):
    """Add ``Worst_score5``, ``Worst_winrate5``, ``Worst_score10`` and ``Worst_winrate10`` columns.

    Each column holds ``kWorst`` of the row's ``score`` / ``winrate`` sequence
    for the given count. ``player`` is accepted but not used.
    """
    combos = [(cnt, kind) for cnt in (5, 10) for kind in ('score', 'winrate')]
    for cnt, kind in combos:
        worst = df.apply(lambda row: kWorst(row[kind], cnt), axis=1)
        df['Worst_' + kind + str(cnt)] = np.array(worst)
    return df


def add_lists_to_df(df, player='W'):
    """Add the per-move sequence columns (deltas and move distances) to every row of ``df``.

    Args:
        df: DataFrame with per-move metric lists and move strings.
        player: ``'W'`` or ``'B'``; the side the sequences are computed for.

    Returns:
        The same DataFrame with the sequence columns filled in.
    """
    sequence_columns = (
        'winrate', 'score', 'utility', 'selfplay', 'stddev',
        'dist_from_prev', 'score_sqr', 'winrate_sqr', 'dist_more_5', 'dist_enemy',
    )
    for column in sequence_columns:
        df[column] = None

    for idx, record in tqdm(df.iterrows()):
        moves = MovesInfo(record, player=player)
        add_delta_lists_to_row(record, moves, player)
        df.loc[idx] = record
    return df


def add_meta_to_df(df, player='W'):
    """Add ``game_length``, ``rank``, ``color`` and ``game_result`` columns to ``df``.

    Rows for which ``add_meta`` raises are reported with ``print`` and dropped.

    Args:
        df: DataFrame of raw games.
        player: ``'W'`` or ``'B'``; the side the metadata is computed for.

    Returns:
        The frame with metadata written back and failing rows removed.
    """
    for column in ('game_length', 'rank', 'color', 'game_result'):
        df[column] = None

    for idx, record in tqdm(df.iterrows()):
        try:
            add_meta(record, player)
            df.loc[idx] = record
        except Exception as err:
            # Best effort: report the problem and discard the offending row.
            print(err)
            df = df.drop(index=[idx])
    return df


def get_feature_df(df, player='W'):
    """Run the full scalar-feature pipeline and return a frame of per-game features.

    Steps: metadata, parsing of per-move metrics, whole-game statistics, yose
    statistics, last-10 and last-20 move statistics, move-distance statistics,
    and finally removal of the non-scalar source columns.

    Args:
        df: DataFrame of raw games.
        player: ``'W'`` or ``'B'``; the side the features are computed for.

    Returns:
        The feature DataFrame.
    """
    df = add_meta_to_df(df, player)
    df = convert_to_lists(df)
    df = add_all_game_stats(df, player)
    df = add_yose_stats(df, player)
    # ``player`` must be passed by keyword: the third positional parameter of
    # add_last_moves_stats is ``pref``. Passing it positionally made both
    # windows write to the same ``'<player>...'`` columns and silently
    # computed the statistics for the default player 'W'.
    for window in (10, 20):
        df = add_last_moves_stats(df, window, player=player)
    df = add_dist_stats(df, player)
    df = delete_non_scalar_parameters(df)
    return df


def get_df_with_lists(df, player='W'):
    """Run the sequence-feature pipeline: metadata, metric parsing, then per-move sequence columns."""
    with_meta = add_meta_to_df(df, player)
    parsed = convert_to_lists(with_meta)
    return add_lists_to_df(parsed, player)


In [23]:
# Mount Google Drive (Colab only) so the CSV exports below can be read.
from google.colab import drive
drive.mount('/content/drive')
# Rows labelled is_bot = 0 (presumably human games): only the first 3000 rows are kept.
katago_hum = pd.read_csv('/content/drive/MyDrive/data_big.csv')[:3000]
# Rows labelled is_bot = 1: the whole file is used.
katago = pd.read_csv('/content/drive/MyDrive/computer_data.csv')
# Binary classification target.
katago_hum['is_bot'] = 0
katago['is_bot'] = 1
#katago.dropna(how = 'any', inplace= True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Stack the bot rows and the human rows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
katago_1 = pd.concat([katago, katago_hum], ignore_index=True)

In [26]:
# Known bot account names; not referenced by any other visible cell.
bots = ['noob_bot_1', 'doge_bot_2', 'kata-bot', 'katago-micro', 'doge_bot_3', 'doge_bot_1', 'amybot-ddk', 'doge_bot_4', '15bTurboLeela', '12bTurboSai', 'kata_noob']
# Build the per-move sequence features once from each colour's point of view.
katago_white = get_df_with_lists(katago_1.copy(), player = 'W')
katago_black = get_df_with_lists(katago_1.copy(), player = 'B')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
katago_all = pd.concat([katago_white, katago_black], ignore_index=True)

12it [00:00, 57.64it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'
'float' object has no attribute 'split'


36it [00:00, 57.49it/s]

'float' object has no attribute 'split'


72it [00:01, 55.49it/s]

'float' object has no attribute 'split'


109it [00:02, 61.47it/s]

'float' object has no attribute 'split'


151it [00:02, 72.37it/s]

'float' object has no attribute 'split'


192it [00:03, 75.05it/s]

'float' object has no attribute 'split'


237it [00:03, 86.28it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


256it [00:03, 87.00it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


320it [00:04, 83.24it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


369it [00:05, 91.42it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


400it [00:05, 89.19it/s]

'float' object has no attribute 'split'


440it [00:06, 88.63it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'
'float' object has no attribute 'split'
'float' object has no attribute 'split'


472it [00:06, 92.07it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


558it [00:08, 44.03it/s]

'float' object has no attribute 'split'


594it [00:09, 21.64it/s]

'float' object has no attribute 'split'


650it [00:12, 27.49it/s]

'float' object has no attribute 'split'


784it [00:14, 92.41it/s]

'float' object has no attribute 'split'


803it [00:14, 66.10it/s]

'float' object has no attribute 'split'


861it [00:15, 59.80it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


877it [00:15, 67.16it/s]

'float' object has no attribute 'split'


954it [00:16, 55.01it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


998it [00:17, 55.49it/s]

'float' object has no attribute 'split'


1052it [00:18, 51.09it/s]

'float' object has no attribute 'split'


1111it [00:19, 86.34it/s]

'float' object has no attribute 'split'


1242it [00:21, 84.29it/s]

'float' object has no attribute 'split'


1279it [00:21, 80.74it/s]

'float' object has no attribute 'split'


1374it [00:22, 97.81it/s]

'float' object has no attribute 'split'


1437it [00:23, 96.93it/s]

'float' object has no attribute 'split'


1468it [00:23, 96.20it/s]

'float' object has no attribute 'split'


1551it [00:24, 93.64it/s]

'float' object has no attribute 'split'


1572it [00:24, 84.42it/s]

'float' object has no attribute 'split'


1621it [00:25, 68.89it/s]

'float' object has no attribute 'split'


1659it [00:26, 79.95it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


1783it [00:27, 87.59it/s]

'float' object has no attribute 'split'


1821it [00:27, 90.72it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


1851it [00:28, 87.59it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


1932it [00:29, 83.27it/s]

'float' object has no attribute 'split'


1962it [00:29, 90.47it/s]

'float' object has no attribute 'split'


2003it [00:30, 84.99it/s]

'float' object has no attribute 'split'


2042it [00:30, 90.63it/s]

'float' object has no attribute 'split'


2072it [00:30, 86.60it/s]

'float' object has no attribute 'split'


2129it [00:31, 76.00it/s]

'float' object has no attribute 'split'


2159it [00:31, 89.48it/s]

'float' object has no attribute 'split'


2249it [00:33, 73.41it/s]

'float' object has no attribute 'split'


2310it [00:33, 81.45it/s]

'float' object has no attribute 'split'


2370it [00:34, 75.79it/s]

'float' object has no attribute 'split'


2406it [00:35, 83.20it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


2495it [00:36, 74.84it/s]

'float' object has no attribute 'split'


2531it [00:36, 83.76it/s]

'float' object has no attribute 'split'


5537it [01:14, 74.82it/s]
5472it [01:10, 78.06it/s]
5472it [01:31, 59.72it/s]
20it [00:00, 96.72it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'
'float' object has no attribute 'split'


39it [00:00, 84.76it/s]

'float' object has no attribute 'split'


85it [00:01, 83.29it/s]

'float' object has no attribute 'split'


112it [00:01, 84.84it/s]

'float' object has no attribute 'split'


152it [00:01, 94.88it/s]

'float' object has no attribute 'split'


192it [00:02, 92.86it/s]

'float' object has no attribute 'split'


235it [00:02, 99.30it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


256it [00:02, 98.22it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


315it [00:03, 91.66it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


365it [00:04, 82.78it/s]

'float' object has no attribute 'split'


395it [00:04, 90.02it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


448it [00:05, 96.78it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'
'float' object has no attribute 'split'
'float' object has no attribute 'split'


481it [00:05, 102.08it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


564it [00:06, 83.20it/s]

'float' object has no attribute 'split'


606it [00:06, 96.72it/s]

'float' object has no attribute 'split'


657it [00:07, 90.24it/s]

'float' object has no attribute 'split'


784it [00:08, 93.51it/s]

'float' object has no attribute 'split'


805it [00:09, 89.78it/s]

'float' object has no attribute 'split'


871it [00:09, 80.14it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'
'float' object has no attribute 'split'


963it [00:10, 92.44it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


1005it [00:11, 89.35it/s]

'float' object has no attribute 'split'


1056it [00:11, 93.48it/s]

'float' object has no attribute 'split'


1110it [00:12, 103.57it/s]

'float' object has no attribute 'split'


1237it [00:13, 92.01it/s]

'float' object has no attribute 'split'


1278it [00:14, 89.78it/s]

'float' object has no attribute 'split'


1374it [00:15, 90.99it/s]

'float' object has no attribute 'split'


1437it [00:16, 91.51it/s]

'float' object has no attribute 'split'


1457it [00:16, 86.87it/s]

'float' object has no attribute 'split'


1555it [00:17, 83.04it/s]

'float' object has no attribute 'split'


1588it [00:18, 86.62it/s]

'float' object has no attribute 'split'


1617it [00:18, 84.38it/s]

'float' object has no attribute 'split'


1648it [00:18, 94.73it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


1782it [00:20, 102.00it/s]

'float' object has no attribute 'split'


1815it [00:20, 98.71it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


1846it [00:20, 88.87it/s]

'float' object has no attribute 'split'


1865it [00:21, 86.20it/s]

'float' object has no attribute 'split'


1931it [00:21, 92.05it/s]

'float' object has no attribute 'split'


1973it [00:22, 85.22it/s]

'float' object has no attribute 'split'


2000it [00:22, 78.83it/s]

'float' object has no attribute 'split'


2050it [00:23, 89.93it/s]

'float' object has no attribute 'split'


2079it [00:23, 87.38it/s]

'float' object has no attribute 'split'


2137it [00:24, 84.24it/s]

'float' object has no attribute 'split'


2147it [00:24, 85.81it/s]

'float' object has no attribute 'split'


2242it [00:25, 82.57it/s]

'float' object has no attribute 'split'


2311it [00:26, 92.89it/s]

'float' object has no attribute 'split'


2373it [00:26, 92.56it/s]

'float' object has no attribute 'split'


2403it [00:27, 89.52it/s]

'float' object has no attribute 'split'
'float' object has no attribute 'split'


2495it [00:28, 93.30it/s]

'float' object has no attribute 'split'


2536it [00:28, 97.71it/s]

'float' object has no attribute 'split'


5537it [01:03, 86.87it/s]
5472it [01:06, 82.60it/s]
5472it [01:31, 60.05it/s]


In [27]:
#katago_all = katago_all[features + ['is_bot']].dropna()
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split (and every metric derived from it)
# reproducible across kernel restarts.
# NOTE(review): katago_all holds a white-view and a black-view row for each
# game, and rows are split independently, so both views of one game can land
# on different sides of the split - consider splitting by game_id.
X_train, X_test, y_train, y_test = train_test_split(
    katago_all, katago_all['is_bot'], test_size=0.2, random_state=42)


In [28]:
# Inspect which columns the sequence-feature pipeline produced.
katago_white.columns

Index(['W_rating', 'B_rating', 'W_nickname', 'B_nickname', 'Result', 'W_move',
       'B_move', 'W_scoreLead', 'B_scoreLead', 'W_scoreSelfplay',
       'B_scoreSelfplay', 'W_scoreStdev', 'B_scoreStdev', 'W_utility',
       'B_utility', 'W_winrate', 'B_winrate', 'game_id', 'error', 'is_bot',
       'W_visits', 'B_visits', 'game_length', 'rank', 'color', 'game_result',
       'winrate', 'score', 'utility', 'selfplay', 'stddev', 'dist_from_prev',
       'score_sqr', 'winrate_sqr', 'dist_more_5', 'dist_enemy'],
      dtype='object')

In [37]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
import pandas as pd


def get_tensor(df, features, sequence_len):
    """Stack the per-row sequences of ``features`` into one float32 tensor.

    Every sequence is first brought to ``sequence_len`` items; the result has
    axes ordered (row, position in sequence, feature).

    Args:
        df: DataFrame whose ``features`` columns hold one sequence per row.
        features: Column names to stack, in the order of the last tensor axis.
        sequence_len: Target length of every sequence.

    Returns:
        A ``tf.Tensor`` of dtype float32.
    """
    def resize_to_length(m, length):
        # Longer sequences keep their first ``length`` items; shorter ones are
        # padded with zeros at the front.
        if len(m) > length:
            return m[:length]
        return [0] * (length - len(m)) + list(m)

    per_feature = [
        [np.array(seq) for seq in df[feature].apply(lambda x: resize_to_length(x, sequence_len))]
        for feature in features
    ]
    # (feature, row, position) -> (row, feature, position) -> (row, position, feature)
    stacked = np.array(per_feature).swapaxes(0, 1).swapaxes(1, 2)
    return tf.convert_to_tensor(stacked, np.float32)


class BotClassificatorRunner:
    """Train and query a two-class sequence classifier over per-move features.

    The default network is Masking -> BatchNormalization -> bidirectional LSTM
    -> 2-unit dense layer -> softmax. Inputs are built with ``get_tensor`` from
    the configured feature columns; targets are one-hot encoded into 2 classes.

    Args:
        model: Pre-built Keras model to wrap; when ``None`` a new network with
            64 LSTM units per direction is created.
        sequence_len: Length every feature sequence is resized to.
        features: Names of the DataFrame columns used as input channels.
    """
    __DEFAULT_FEATURES = ['score', 'winrate', 'utility', 'stddev', 'selfplay', 'dist_from_prev', 'winrate_sqr', 'score_sqr', 'dist_more_5', 'dist_enemy']

    def __create_rnn(self, hidden_units, input_shape):
        """Build and compile the default network for inputs of ``input_shape``."""
        # All layers come from the same namespace as ``keras.Sequential`` so the
        # model never mixes objects from two different Keras installations.
        model = keras.Sequential()
        model.add(keras.layers.Masking(mask_value=0., input_shape=input_shape))
        model.add(keras.layers.BatchNormalization())
        # Dropout was tried and is currently disabled: dropout=0.15, recurrent_dropout=0.15
        model.add(keras.layers.Bidirectional(keras.layers.LSTM(hidden_units)))
        model.add(keras.layers.Dense(units=2, activation="linear"))
        model.add(keras.layers.Activation('softmax'))
        # One-hot targets with a softmax head call for categorical cross-entropy
        # (for two classes it equals the previous binary cross-entropy value);
        # ``metrics`` is passed as a list, as Model.compile documents.
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def __init__(self, model=None, sequence_len=150, features=__DEFAULT_FEATURES):
        self.features = features
        self.sequence_len = sequence_len
        if model is None:
            self.model = self.__create_rnn(64, (sequence_len, len(features)))
        else:
            self.model = model

    @staticmethod
    def __one_hot(labels):
        """Encode integer class labels (0 / 1) as one-hot rows with two columns."""
        return keras.utils.to_categorical(np.array(labels, dtype="int32"), num_classes=2)

    def fit(self, df, target, val_X=None, val_y=None, epochs=10, batch_size=256):
        """Train the wrapped model.

        Args:
            df: Training DataFrame containing the feature columns.
            target: Integer class labels (0 / 1) aligned with ``df``.
            val_X: Optional validation DataFrame; ``None`` disables validation.
            val_y: Labels for ``val_X``; required when ``val_X`` is given.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
        """
        val_data = None
        if val_X is not None:
            val_data = (
                get_tensor(val_X, self.features, self.sequence_len),
                self.__one_hot(val_y),
            )
        X = get_tensor(df, self.features, self.sequence_len)
        y = self.__one_hot(target)
        # Class 1 is weighted slightly higher (1.1) than class 0.
        self.model.fit(X, y, validation_data=val_data, epochs=epochs, batch_size=batch_size,
                       class_weight={0: 1, 1: 1.1})

    def get_probs(self, df):
        """Return the model's softmax outputs, one row of two probabilities per row of ``df``."""
        X = get_tensor(df, self.features, self.sequence_len)
        return self.model.predict(X)

    def predict(self, df):
        """Return the index (0 or 1) of the most probable class for every row of ``df``."""
        y = self.get_probs(df)
        return y.argmax(axis=1)


In [30]:
# Share of rows with is_bot == 1 in the hold-out split.
np.mean(y_test)

0.45043398812243035

In [38]:
# NOTE(review): rnn_features is built here but never passed to the runner
# below, so the model uses BotClassificatorRunner's default feature list.
# Pass features=rnn_features if the extended list was intended - confirm.
rnn_features = ['score', 'winrate', 'utility', 'stddev', 'selfplay', 'dist_from_prev', 'winrate_sqr', 'score_sqr', 'dist_more_5', 'dist_enemy', 'W_scoreLead', 
                'B_scoreLead', 'W_scoreSelfplay',
       'B_scoreSelfplay', 'W_scoreStdev', 'B_scoreStdev', 'W_utility',
       'B_utility', 'W_winrate', 'B_winrate']
model = BotClassificatorRunner(sequence_len=150)

In [39]:
# Train for 25 epochs; the hold-out split doubles as the validation data.
model.fit(X_train, y_train, X_test, y_test, epochs=25, batch_size=32)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [36]:
# Persist the wrapped Keras model to Drive.
# NOTE(review): this cell's execution count (36) is lower than that of the
# cells that define and train the model above (37-39), so the saved file may
# come from an earlier run - re-run after training before relying on it.
model.model.save('/content/drive/MyDrive/has_botv1.h5')

In [None]:
# Predicted class (0 or 1) for every row of katago_all, including rows used for training.
ans = model.predict(katago_all)

In [None]:
# Share of rows predicted as class 1.
np.mean(ans)