<a href="https://colab.research.google.com/github/Gyanma/Chess_Opening_Analysis/blob/main/doc/Chess_Analysis_Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries and dataset

In [None]:
import pandas as pd


In [None]:
# Load the games dataset from "games.csv" (path relative to the working directory)
# into the DataFrame `data`, which the rest of the notebook transforms step by step.
data = pd.read_csv("games.csv")

Check for missing data

In [None]:
# Count the missing entries per column, then tally how many columns share each count
# (a single "0" row means that no column has any missing value).
data.isna().sum().value_counts()

0    16
dtype: int64

Drop non-relevant columns (game timestamps)

In [None]:
# The two timestamp columns are not used by the analysis; remove them.
timestamp_columns = ["created_at", "last_move_at"]
data = data.drop(timestamp_columns, axis="columns")

Remove duplicates

In [None]:
# Flag every row that repeats an earlier row in full, report how many there are,
# then keep only the first occurrence of each row.
is_repeated_row = data.duplicated()
print("Number of duplicates:", is_repeated_row.sum())

data = data.drop_duplicates()

Number of duplicates: 945


Get statistical measures of the numeric features

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() selects by default.
data.describe()

Unnamed: 0,turns,white_rating,black_rating,opening_ply
count,19113.0,19113.0,19113.0,19113.0
mean,60.513839,1597.300005,1590.045519,4.81578
std,33.488264,290.02274,290.441694,2.798283
min,1.0,784.0,789.0,1.0
25%,37.0,1401.0,1394.0,3.0
50%,55.0,1567.0,1563.0,4.0
75%,79.0,1792.0,1785.0,6.0
max,349.0,2700.0,2723.0,28.0


In [None]:
# Count how many times each distinct full row occurs in `data`.
data.value_counts()

Get statistical measures for the two parts of the `increment_code` feature (the values before and after the `+`)

In [None]:
# Explore the two numeric parts of "increment_code" (strings shaped like "10+0").
#
# The parts are computed into their own frame and joined onto a NEW frame `df`.
# `data` itself never receives the temporary A/B columns (a plain `df = data`
# would only alias the same object), so nothing has to be dropped afterwards and
# the cell can be re-run or interrupted without leaving stray columns behind.
increment_parts = data['increment_code'].str.split('+', expand=True).astype(int)
# A: part before '+', B: part after '+'. Raises if the split did not yield
# exactly two columns, i.e. if some value is not of the form "<int>+<int>".
increment_parts.columns = ['A', 'B']
df = data.join(increment_parts)

# For each part: summary statistics, the single most frequent value with its
# count, and the number of distinct values.
for part in ('A', 'B'):
    print("Feature:", part)
    print(df[part].describe())
    print(df[part].value_counts().head(1))
    print(df[part].nunique())

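Get the five most frequent values (the first one being the mode) and the number of unique values of each feature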

In [None]:
# For every feature, list its five most frequent values (with their counts)
# followed by the number of distinct values it takes.
for feature_name, feature_values in data.items():
    print("Feature:", feature_name)
    print(feature_values.value_counts().head(5))
    print("Number of unique values:", feature_values.nunique())
    print()

Feature: id
TZJHLljE    1
ojkMBE49    1
JTfFY9HD    1
0wkPVyCT    1
9oQQUmDI    1
Name: id, dtype: int64
Number of unique values: 19113

Feature: rated
True     15467
False     3646
Name: rated, dtype: int64
Number of unique values: 2

Feature: turns
45    294
53    292
41    289
39    287
57    285
Name: turns, dtype: int64
Number of unique values: 211

Feature: victory_status
resign       10695
mate          5974
outoftime     1598
draw           846
Name: victory_status, dtype: int64
Number of unique values: 4

Feature: winner
white    9545
black    8680
draw      888
Name: winner, dtype: int64
Number of unique values: 3

Feature: increment_code
10+0     7356
15+0     1258
15+15     821
5+5       723
5+8       678
Name: increment_code, dtype: int64
Number of unique values: 400

Feature: white_id
ssf7             48
bleda            48
hassan1365416    44
khelil           41
1240100948       38
Name: white_id, dtype: int64
Number of unique values: 9438

Feature: white_rating
1500    

Truncate "moves" to the opening: keep only the first `opening_ply` entries of each game's move list

In [None]:
def _opening_moves(row):
    """Return the first `opening_ply` whitespace-separated moves of a game row, space-joined."""
    played = row['moves'].split()
    return ' '.join(played[:row['opening_ply']])


# Replace each game's full move list by its opening part only.
data['moves'] = data.apply(_opening_moves, axis=1)

Drop non-relevant columns (identifiers and game-outcome details)

In [None]:
# Columns that are not kept for the final dataset.
unused_columns = ["turns", "id", "victory_status", "white_id", "black_id"]
data = data.drop(unused_columns, axis="columns")

Create different variables from "increment_code"

In [None]:
# Split "increment_code" at the '+' into two integer columns, then discard the
# original string column.
time_control = data['increment_code'].str.split('+', expand=True).astype(int)
data[['starting_minutes', 'additional_seconds']] = time_control
data = data.drop(columns=["increment_code"])

Reorganize columns

In [None]:
# Final column layout, with the target column ("winner") last.
custom_order = [
    'rated',
    'starting_minutes',
    'additional_seconds',
    'white_rating',
    'black_rating',
    'moves',
    'opening_eco',
    'opening_name',
    'opening_ply',
    'winner',
]
data = data[custom_order]

Remove undesired characters from the values in preparation for modeling with Weka

In [None]:
# Sanitize "opening_name" so its values contain no whitespace or special characters.
# The steps run in this order on purpose: characters are removed only after
# whitespace has become underscores, and the final step collapses any underscore
# runs that the removals leave behind.
data['opening_name'] = (
    data['opening_name']
    .str.replace(r'\s+', '_', regex=True)     # whitespace runs -> underscore
    .str.replace(r':', '_', regex=True)       # colons -> underscore
    # Drop apostrophes, pipes and hashes. The pipe sits inside a character class
    # because a bare r'|' is an alternation of two empty patterns: it matches only
    # empty strings and would leave every literal '|' in place.
    .str.replace(r"['|#]", '', regex=True)
    .str.replace(r'_{2,}', '_', regex=True)   # collapse repeated underscores
)

# "moves": join the individual moves with single underscores instead of spaces.
data['moves'] = (
    data['moves']
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r'_{2,}', '_', regex=True)
)


Write the dataframe to a CSV file

In [None]:
# Save the prepared dataset to "modified_games.csv"; index=False keeps the
# DataFrame index out of the file.
data.to_csv("modified_games.csv", index = False)

Remove drawn instances

In [None]:
# Keep only the games whose "winner" value is not 'draw'.
data = data[data['winner'] != 'draw']