# Possible ML transformations (data pre-processing) for a Hive game dataset

## Main aspects of this Python Notebook:
1. Importing data from a csv containing info on a single match
2. Classifying the features in different categories:
    - color_player columns ('last_move_played_by', 'current_player_turn', 'result') -> take the values 'White' or 'Black'
    - moves columns -> the ones that give the number of possible moves for every piece
    - neighbor columns -> the ones that give the neighboring piece in every direction for every piece
3. Replacing all the NaN values with the value 0 
4. Scaling the 'number_of_turn' feature
5. Encoding the categorical features related to the neighbor columns into numerical features
6. Encoding the categorical features related to the color_player columns into numerical features

In [86]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

## 1. Importing Data
Let's import the data from a CSV file containing the information on a single Random vs Random match.

In [87]:
# Load the log of a single match into a DataFrame.
# To process every match instead, loop over the files found in '../data/'
# (e.g. with os.listdir) and call pd.read_csv on each of them.
url = '../data/match_1737734701_results.csv'
df = pd.read_csv(filepath_or_buffer=url)

# (rows, columns) of the loaded match
df.shape

(8663, 202)

In [88]:
# Preview the first rows to check that the file was parsed as expected
df.head()

Unnamed: 0,number_of_turn,last_move_played_by,current_player_turn,wQ_moves,wS1_moves,wB1_moves,wG1_moves,wA1_moves,wS2_moves,wB2_moves,...,bG3_ABOVE_neighbor,bA3_RIGHT_neighbor,bA3_UP_RIGHT_neighbor,bA3_UP_LEFT_neighbor,bA3_LEFT_neighbor,bA3_DOWN_LEFT_neighbor,bA3_DOWN_RIGHT_neighbor,bA3_BELOW_neighbor,bA3_ABOVE_neighbor,result
0,1,White,Black,0,0,0,0,0,0,0,...,,,,,,,,,,Black
1,2,Black,White,0,0,3,0,0,0,0,...,,,,,,,,,,Black
2,3,White,Black,2,0,4,0,0,0,0,...,,,,,,,,,,Black
3,4,Black,White,2,0,4,0,0,0,0,...,,,,,,,,,,Black
4,5,White,Black,2,0,3,0,0,0,3,...,,,,,,,,,,Black


## 2. Classifying the features

In [89]:
# Group the columns by the kind of information they hold, so that each group
# can be pre-processed with its own strategy in the following cells.
cols = list(df.columns)

# Columns whose name contains 'moves': possible moves of each piece
moves_cols = [col for col in cols if 'moves' in col]
# Columns whose name contains 'neighbor': the piece adjacent to each piece in a
# given direction (these hold piece labels, not counts)
neighbor_cols = [col for col in cols if 'neighbor' in col]
# Columns whose values are the player colors 'White' / 'Black'
color_player_cols = ['last_move_played_by', 'current_player_turn', 'result']

print(f'Columns ({len(moves_cols)}) containing the number of possible moves for each bug\n', moves_cols, '\n')
# The label below describes what the neighbor columns actually store: one
# neighboring piece per direction, not a number of neighbors.
print(f'Columns ({len(neighbor_cols)}) containing the neighbor in each direction for each bug\n', neighbor_cols, '\n')
print(f'Columns ({len(color_player_cols)}) containing info described by White or Black\n', color_player_cols, '\n')

Columns (22) containing the number of possible moves for each bug
 ['wQ_moves', 'wS1_moves', 'wB1_moves', 'wG1_moves', 'wA1_moves', 'wS2_moves', 'wB2_moves', 'wG2_moves', 'wA2_moves', 'wG3_moves', 'wA3_moves', 'bQ_moves', 'bS1_moves', 'bB1_moves', 'bG1_moves', 'bA1_moves', 'bS2_moves', 'bB2_moves', 'bG2_moves', 'bA2_moves', 'bG3_moves', 'bA3_moves'] 

Columns (176) containing the number of neighbors for each bug
 ['wQ_RIGHT_neighbor', 'wQ_UP_RIGHT_neighbor', 'wQ_UP_LEFT_neighbor', 'wQ_LEFT_neighbor', 'wQ_DOWN_LEFT_neighbor', 'wQ_DOWN_RIGHT_neighbor', 'wQ_BELOW_neighbor', 'wQ_ABOVE_neighbor', 'wS1_RIGHT_neighbor', 'wS1_UP_RIGHT_neighbor', 'wS1_UP_LEFT_neighbor', 'wS1_LEFT_neighbor', 'wS1_DOWN_LEFT_neighbor', 'wS1_DOWN_RIGHT_neighbor', 'wS1_BELOW_neighbor', 'wS1_ABOVE_neighbor', 'wB1_RIGHT_neighbor', 'wB1_UP_RIGHT_neighbor', 'wB1_UP_LEFT_neighbor', 'wB1_LEFT_neighbor', 'wB1_DOWN_LEFT_neighbor', 'wB1_DOWN_RIGHT_neighbor', 'wB1_BELOW_neighbor', 'wB1_ABOVE_neighbor', 'wG1_RIGHT_neighbor', '

In [90]:
# Summary statistics; by default describe() only reports the numeric columns
df.describe()

Unnamed: 0,number_of_turn,wQ_moves,wS1_moves,wB1_moves,wG1_moves,wA1_moves,wS2_moves,wB2_moves,wG2_moves,wA2_moves,...,bS1_moves,bB1_moves,bG1_moves,bA1_moves,bS2_moves,bB2_moves,bG2_moves,bA2_moves,bG3_moves,bA3_moves
count,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,...,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0
mean,4332.0,1.96283,1.862172,4.588826,2.407249,34.383008,2.025626,4.733926,2.528685,31.927854,...,1.827658,4.581438,2.435761,33.935819,1.927854,4.656816,2.389703,32.836546,2.428027,32.930971
std,2500.937024,0.938666,0.958588,1.054364,0.825955,10.982707,1.102374,1.092136,0.802086,13.842177,...,1.081086,1.024537,0.864792,11.656281,1.003682,1.086998,0.929789,12.992008,0.884124,13.160075
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2166.5,2.0,2.0,4.0,2.0,35.0,2.0,4.0,2.0,33.0,...,2.0,4.0,2.0,34.0,2.0,4.0,2.0,34.0,2.0,34.0
50%,4332.0,2.0,2.0,4.0,2.0,37.0,2.0,5.0,3.0,37.0,...,2.0,5.0,2.0,37.0,2.0,5.0,2.0,37.0,2.0,37.0
75%,6497.5,2.0,2.0,6.0,3.0,40.0,2.0,6.0,3.0,40.0,...,2.0,5.0,3.0,40.0,2.0,6.0,3.0,40.0,3.0,40.0
max,8663.0,4.0,6.0,6.0,5.0,48.0,6.0,6.0,5.0,47.0,...,6.0,6.0,6.0,47.0,6.0,6.0,6.0,48.0,6.0,47.0


## 3. Replacing NaN values

In [91]:
# Replace every NaN with 0. The piece codes assigned to the neighbor columns
# later in the notebook are all non-zero, so 0 does not clash with any piece.
# The result is rebound instead of using inplace=True, which keeps the
# statement chainable and avoids mutating the frame behind other references.
df = df.fillna(0)

## 4. Scaling 'number_of_turn'

It is more useful to know which phase of the match (start, middle, end) a board state belongs to than to know its exact turn number.

In [92]:

# Rescale number_of_turn to the [0, 1] range with a MinMaxScaler: the lowest
# turn number in this frame maps to 0 and the highest to 1, so the feature
# expresses progress through the match rather than an absolute turn count.
scaler = MinMaxScaler()
# fit_transform expects a 2-D input, hence the double brackets
df['number_of_turn'] = scaler.fit_transform(df[['number_of_turn']])
df.describe()


Unnamed: 0,number_of_turn,wQ_moves,wS1_moves,wB1_moves,wG1_moves,wA1_moves,wS2_moves,wB2_moves,wG2_moves,wA2_moves,...,bS1_moves,bB1_moves,bG1_moves,bA1_moves,bS2_moves,bB2_moves,bG2_moves,bA2_moves,bG3_moves,bA3_moves
count,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,...,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0
mean,0.5,1.96283,1.862172,4.588826,2.407249,34.383008,2.025626,4.733926,2.528685,31.927854,...,1.827658,4.581438,2.435761,33.935819,1.927854,4.656816,2.389703,32.836546,2.428027,32.930971
std,0.288725,0.938666,0.958588,1.054364,0.825955,10.982707,1.102374,1.092136,0.802086,13.842177,...,1.081086,1.024537,0.864792,11.656281,1.003682,1.086998,0.929789,12.992008,0.884124,13.160075
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.25,2.0,2.0,4.0,2.0,35.0,2.0,4.0,2.0,33.0,...,2.0,4.0,2.0,34.0,2.0,4.0,2.0,34.0,2.0,34.0
50%,0.5,2.0,2.0,4.0,2.0,37.0,2.0,5.0,3.0,37.0,...,2.0,5.0,2.0,37.0,2.0,5.0,2.0,37.0,2.0,37.0
75%,0.75,2.0,2.0,6.0,3.0,40.0,2.0,6.0,3.0,40.0,...,2.0,5.0,3.0,40.0,2.0,6.0,3.0,40.0,3.0,40.0
max,1.0,4.0,6.0,6.0,5.0,48.0,6.0,6.0,5.0,47.0,...,6.0,6.0,6.0,47.0,6.0,6.0,6.0,48.0,6.0,47.0


## 5. Encoding neighbor columns

In [93]:
# Create a dictionary to encode all the possible pieces in the game
# Integer code for every piece label that can appear in a neighbor column.
# Pieces of the same kind share one code whatever their copy number; white
# pieces get the positive code and black pieces the mirrored negative one.
pieces_dict = {
    f'{color}{kind}{copy}': sign * code
    for color, sign in (('w', 1), ('b', -1))
    for code, (kind, copies) in enumerate(
        (
            ('Q', ('',)),
            ('A', ('1', '2', '3')),
            ('G', ('1', '2', '3')),
            ('B', ('1', '2')),
            ('S', ('1', '2')),
            ('M', ('',)),
            ('L', ('',)),
            ('P', ('',)),
        ),
        start=1,
    )
    for copy in copies
}

In [94]:
# For every neighbor column, replace the piece labels with the corresponding
# integer codes from pieces_dict.
# replace() on an object column only yields a numeric dtype through an implicit
# downcast that recent pandas releases deprecate; infer_objects() makes that
# conversion explicit, so the encoded columns stay numeric and keep appearing
# in describe(). A column that still holds labels missing from pieces_dict
# remains of object dtype.
for col in neighbor_cols:
    df[col] = df[col].replace(pieces_dict).infer_objects()

df.describe()

Unnamed: 0,number_of_turn,wQ_moves,wS1_moves,wB1_moves,wG1_moves,wA1_moves,wS2_moves,wB2_moves,wG2_moves,wA2_moves,...,bG3_BELOW_neighbor,bG3_ABOVE_neighbor,bA3_RIGHT_neighbor,bA3_UP_RIGHT_neighbor,bA3_UP_LEFT_neighbor,bA3_LEFT_neighbor,bA3_DOWN_LEFT_neighbor,bA3_DOWN_RIGHT_neighbor,bA3_BELOW_neighbor,bA3_ABOVE_neighbor
count,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,...,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0
mean,0.5,1.96283,1.862172,4.588826,2.407249,34.383008,2.025626,4.733926,2.528685,31.927854,...,-2.898072,-2.898072,0.138058,0.104121,-0.195544,0.138058,0.104121,-0.195544,-1.859402,-1.859402
std,0.288725,0.938666,0.958588,1.054364,0.825955,10.982707,1.102374,1.092136,0.802086,13.842177,...,0.8476,0.8476,2.276867,1.770029,2.310436,2.276867,1.770029,2.310436,0.977745,0.977745
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-4.0,-4.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-4.0,-4.0
25%,0.25,2.0,2.0,4.0,2.0,35.0,2.0,4.0,2.0,33.0,...,-3.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0
50%,0.5,2.0,2.0,4.0,2.0,37.0,2.0,5.0,3.0,37.0,...,-3.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0
75%,0.75,2.0,2.0,6.0,3.0,40.0,2.0,6.0,3.0,40.0,...,-3.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0
max,1.0,4.0,6.0,6.0,5.0,48.0,6.0,6.0,5.0,47.0,...,4.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0


## 6. Encoding color_player columns

In [95]:
# Numeric code for each player color: 1 for White, -1 for Black
color_player_dict = dict(White=1, Black=-1)

In [96]:
# For every color_player column, replace 'White' / 'Black' with the
# corresponding value in color_player_dict.
# replace() on an object column only yields a numeric dtype through an implicit
# downcast that recent pandas releases deprecate; infer_objects() makes that
# conversion explicit, so the encoded columns stay numeric and keep appearing
# in describe().
for col in color_player_cols:
    df[col] = df[col].replace(color_player_dict).infer_objects()

df.describe()

Unnamed: 0,number_of_turn,last_move_played_by,current_player_turn,wQ_moves,wS1_moves,wB1_moves,wG1_moves,wA1_moves,wS2_moves,wB2_moves,...,bG3_ABOVE_neighbor,bA3_RIGHT_neighbor,bA3_UP_RIGHT_neighbor,bA3_UP_LEFT_neighbor,bA3_LEFT_neighbor,bA3_DOWN_LEFT_neighbor,bA3_DOWN_RIGHT_neighbor,bA3_BELOW_neighbor,bA3_ABOVE_neighbor,result
count,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,...,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0,8663.0
mean,0.5,0.000115,-0.000115,1.96283,1.862172,4.588826,2.407249,34.383008,2.025626,4.733926,...,-2.898072,0.138058,0.104121,-0.195544,0.138058,0.104121,-0.195544,-1.859402,-1.859402,-1.0
std,0.288725,1.000058,1.000058,0.938666,0.958588,1.054364,0.825955,10.982707,1.102374,1.092136,...,0.8476,2.276867,1.770029,2.310436,2.276867,1.770029,2.310436,0.977745,0.977745,0.0
min,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-4.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-4.0,-4.0,-1.0
25%,0.25,-1.0,-1.0,2.0,2.0,4.0,2.0,35.0,2.0,4.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0,-1.0
50%,0.5,1.0,-1.0,2.0,2.0,4.0,2.0,37.0,2.0,5.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0,-1.0
75%,0.75,1.0,1.0,2.0,2.0,6.0,3.0,40.0,2.0,6.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0,-1.0
max,1.0,1.0,1.0,4.0,6.0,6.0,5.0,48.0,6.0,6.0,...,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,-1.0
