# 2: Preprocessing

In [1]:
from utils.preprocess_utils import *

## Complete Preprocessing

In [2]:
#IMPORTANT: For the sake of processing, in the future we will leave more columns

df_train = read(data_file = "./Preprocessed_Data\Train\game_batch_0.npy.gz", column_names_file = "./Preprocessed_Data\Train\column_names.txt")
all_columns = df_train.columns
cols_to_leave = ["human", "elo", "color", "stockfish_score_depth_8", "stockfish_difference_depth_8", "current_move", "event"]
delete_cols = list(set(all_columns) - set(cols_to_leave))
delete_cols += ["bullet", "rapid", "classic", "blitz", "rated"]

In [3]:
delete_cols

['stockfish_difference_depth_3',
 'stockfish_score_depth_16',
 'past_move_8',
 'past_move_5',
 'player',
 'stockfish_difference_depth_1',
 'current_move_str',
 'past_move_2',
 'stockfish_score_depth_12',
 'stockfish_difference_depth_10',
 'stockfish_score_depth_2',
 'stockfish_score_depth_10',
 'stockfish_difference_depth_5',
 'stockfish_score_depth_3',
 'clock',
 'past_move_1',
 'stockfish_score_depth_18',
 'stockfish_difference_depth_12',
 'stockfish_difference_depth_16',
 'stockfish_score_depth_20',
 'stockfish_difference_depth_18',
 'past_move_6',
 'past_move_9',
 'stockfish_score_depth_5',
 'past_move_4',
 'stockfish_score_depth_4',
 'past_move_3',
 'stockfish_difference_depth_2',
 'stockfish_difference_depth_15',
 'past_move_11',
 'past_move_12',
 'stockfish_score_depth_15',
 'stockfish_difference_depth_20',
 'past_move_7',
 'past_move_10',
 'stockfish_score_depth_1',
 'stockfish_difference_depth_4',
 'bullet',
 'rapid',
 'classic',
 'blitz',
 'rated']

In [4]:
cols_change_types = ["human"]
for col in df_train.columns:
    if "stockfish" in col:
        cols_change_types.append(col)
        
new_types = ["int"]
while len(new_types) < len(cols_change_types):
    new_types.append(np.float32)

cols_change_types

['human',
 'stockfish_score_depth_1',
 'stockfish_difference_depth_1',
 'stockfish_score_depth_2',
 'stockfish_difference_depth_2',
 'stockfish_score_depth_3',
 'stockfish_difference_depth_3',
 'stockfish_score_depth_4',
 'stockfish_difference_depth_4',
 'stockfish_score_depth_5',
 'stockfish_difference_depth_5',
 'stockfish_score_depth_8',
 'stockfish_difference_depth_8',
 'stockfish_score_depth_10',
 'stockfish_difference_depth_10',
 'stockfish_score_depth_12',
 'stockfish_difference_depth_12',
 'stockfish_score_depth_15',
 'stockfish_difference_depth_15',
 'stockfish_score_depth_16',
 'stockfish_difference_depth_16',
 'stockfish_score_depth_18',
 'stockfish_difference_depth_18',
 'stockfish_score_depth_20',
 'stockfish_difference_depth_20']

In [5]:
cols_to_standardize = cols_change_types[1:]
cols_to_standardize

['stockfish_score_depth_1',
 'stockfish_difference_depth_1',
 'stockfish_score_depth_2',
 'stockfish_difference_depth_2',
 'stockfish_score_depth_3',
 'stockfish_difference_depth_3',
 'stockfish_score_depth_4',
 'stockfish_difference_depth_4',
 'stockfish_score_depth_5',
 'stockfish_difference_depth_5',
 'stockfish_score_depth_8',
 'stockfish_difference_depth_8',
 'stockfish_score_depth_10',
 'stockfish_difference_depth_10',
 'stockfish_score_depth_12',
 'stockfish_difference_depth_12',
 'stockfish_score_depth_15',
 'stockfish_difference_depth_15',
 'stockfish_score_depth_16',
 'stockfish_difference_depth_16',
 'stockfish_score_depth_18',
 'stockfish_difference_depth_18',
 'stockfish_score_depth_20',
 'stockfish_difference_depth_20']

In [6]:
train_mins_list, train_maxs_list, train_means_list, train_stds_list = preprocess_all(save_name = "Train_Batch", target_directory = "Model_Data\Train", directory = "Preprocessed_Data\\Train", columns_to_change_types = cols_change_types, new_types = new_types, columns_to_map = ["color"], str_vals = ["White"], columns_to_one_hot = ["event"], key_strings = [["bullet", "rapid", "classic", "blitz", "rated"]],
               columns_to_divide = ["elo"], divide_val = [3000], cols_to_normalize = ["clock"], cols_to_standardize = cols_to_standardize, columns_to_drop = delete_cols)

In [7]:
val_mins_list, val_maxs_list, val_means_list, val_stds_list = preprocess_all(save_name = "Val_Batch", target_directory = "Model_Data\Val", directory = "Preprocessed_Data\\Val", columns_to_change_types = cols_change_types, new_types = new_types, columns_to_map = ["color"], str_vals = ["White"], columns_to_one_hot = ["event"], key_strings = [["bullet", "rapid", "classic", "blitz", "rated"]],
               columns_to_divide = ["elo"], divide_val = [3000], cols_to_normalize = ["clock"], cols_to_standardize = cols_to_standardize, columns_to_drop = delete_cols)

In [8]:
test_mins_list, test_maxs_list, test_means_list, test_stds_list = preprocess_all(save_name = "Test_Batch", target_directory = "Model_Data\Test", directory = "Preprocessed_Data\\Test", columns_to_change_types = cols_change_types, new_types = new_types, columns_to_map = ["color"], str_vals = ["White"], columns_to_one_hot = ["event"], key_strings = [["bullet", "rapid", "classic", "blitz", "rated"]],
               columns_to_divide = ["elo"], divide_val = [3000], cols_to_normalize = ["clock"], cols_to_standardize = cols_to_standardize, columns_to_drop = delete_cols)

In [9]:
metadata = [("train_mins_list", train_mins_list),
            ("train_maxs_list", train_maxs_list),
            ("train_means_list", train_means_list),
            ("train_stds_list", train_stds_list),
            ("val_mins_list", val_mins_list),
            ("val_maxs_list", val_maxs_list),
            ("val_means_list", val_means_list),
            ("val_stds_list", val_stds_list),
            ("test_mins_list", test_mins_list),
            ("test_maxs_list", test_maxs_list),
            ("test_means_list", test_means_list),
            ("test_stds_list", test_stds_list)]

for arr_name, meta_array in metadata:
    meta_array = np.array(meta_array)
    file_save = f"./Model_Data\Metadata\{arr_name}.npy"
    np.save(file_save, meta_array)

In [10]:
new_train_df = read(data_file = "./Model_Data\Train\Train_Batch_0.npy.gz", column_names_file = "./Model_Data\Train\column_names.txt")
new_train_df.head()

Unnamed: 0,human,elo,color,stockfish_score_depth_8,stockfish_difference_depth_8,current_move
0,1,0.616,0,-0.161056,0.374793,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."
1,1,0.43,0,2.752745,0.679102,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."
2,1,0.628667,1,0.383251,0.788108,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1..."
3,0,0.373,1,0.89777,-0.697101,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1..."
4,1,0.610333,1,-0.380403,0.420212,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."


In [11]:
new_val_df = read(data_file = "./Model_Data\Val\Val_Batch_0.npy.gz", column_names_file = "./Model_Data\Val\column_names.txt")
new_train_df.head()

Unnamed: 0,human,elo,color,stockfish_score_depth_8,stockfish_difference_depth_8,current_move
0,1,0.616,0,-0.161056,0.374793,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."
1,1,0.43,0,2.752745,0.679102,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."
2,1,0.628667,1,0.383251,0.788108,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1..."
3,0,0.373,1,0.89777,-0.697101,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1..."
4,1,0.610333,1,-0.380403,0.420212,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."


In [12]:
new_val_df = read(data_file = "./Model_Data\Test\Test_Batch_0.npy.gz", column_names_file = "./Model_Data\Test\column_names.txt")
new_train_df.head()

Unnamed: 0,human,elo,color,stockfish_score_depth_8,stockfish_difference_depth_8,current_move
0,1,0.616,0,-0.161056,0.374793,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."
1,1,0.43,0,2.752745,0.679102,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."
2,1,0.628667,1,0.383251,0.788108,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1..."
3,0,0.373,1,0.89777,-0.697101,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [1..."
4,1,0.610333,1,-0.380403,0.420212,"[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0..."
