In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import chess
import seaborn as sns
import time
import random


# sys.path.append('/../scripts/')
# from pgn_to_df import pgn_to_df
# DataFrame previews: truncate long frames to 10 rows, show up to 100 columns.
pd.set_option("display.max_rows", 10)
pd.set_option("display.max_columns", 100)

# Plot styling: seaborn's defaults are applied first, then matplotlib's
# 'ggplot' style sheet is applied on top of them.
sns.set()
plt.style.use("ggplot")

In [3]:
# Load the raw games table. The path is relative, so "games.csv" must sit in
# the kernel's current working directory; the recorded run of this cell failed
# with FileNotFoundError because it did not.
df = pd.read_csv("games.csv")
df

FileNotFoundError: [Errno 2] No such file or directory: 'games.csv'

In [None]:
# Remember the raw column index, then build the working frame `df_`: an
# independent copy of `df` without 'id', 'created_at' and 'last_move_at'.
columns = df.columns
df_ = (
    df.copy()
    .drop(columns=['id', 'created_at', 'last_move_at'])
)
df_

```
Column names:
id
rated
created_at
last_move_at
turns
victory_status
winner
increment_code
white_id
white_rating
black_id
black_rating
moves
opening_eco
opening_name
opening_ply
```

In [None]:
# Games per player id, counting appearances as white and as black together.
pd.concat([df_[side] for side in ('white_id', 'black_id')]).value_counts()

In [None]:
# Sorting ascending and taking the tail keeps the 20 largest counts,
# ordered from smallest to largest.
games_per_player = pd.concat([df_['white_id'], df_['black_id']]).value_counts(ascending=True)
games_per_player.tail(20).plot(
    kind='barh',
    figsize=(8, 8),
    title="Most Frequent Players",
    xlabel='Number of games',
)
plt.show()

In [None]:
# Number of games per opening name, most frequent first.
df_['opening_name'].value_counts()

In [None]:
# keep when grouping by elo
# NOTE(review): the draft below refers to 'rating_category' and 'opening_names';
# neither is among the dataset columns listed in this notebook (the opening
# column is 'opening_name') — fix the names before enabling it.
# df.groupby(['rating_category'])['opening_names'].value_counts()

# Number of games per 'opening_eco' value, most frequent first.
df_['opening_eco'].value_counts()

In [None]:
# First check whether the two players' ratings are within a similar range:
# count the games whose absolute rating gap is at most 1000 and those above it.

rating_diff = df_["white_rating"].sub(df_["black_rating"])
rating_range_within = rating_diff.abs().le(1000)
rating_range_out = rating_diff.abs().gt(1000)
print(
    "out of {} games, {} games are within a range of 1,000 and {} games are not".format(
        len(df_),
        rating_range_within.sum(),
        rating_range_out.sum(),
    )
)

In [None]:
# Rating class edges and their labels (11 labels for 12 edges).
# The last edge is open-ended (np.inf) so that ratings above 2700 are still
# labelled "GM"; with a finite top edge of 2700 pd.cut would return NaN for
# them. With pd.cut's default right=True every bin includes its upper edge
# (e.g. exactly 1200 -> "E"), and include_lowest=True makes the first bin
# include 0 as well.
rating_bin = [0, 1000, 1200, 1400, 1600, 1800, 2000, 2200, 2300, 2400, 2500, np.inf]
labels = ["Novices", "E", "D", "C", "B", "A", "CM", "NM", "FM", "IM", "GM"]

# Rating class of the white player for every game.
df_['white_category'] = pd.cut(
    x=df_['white_rating'],
    bins=rating_bin,
    labels=labels,
    include_lowest=True,
)


In [None]:
# Rating class of the black player, using the same edges and labels that
# were defined for the white player.
df_['black_category'] = pd.cut(
    df_['black_rating'],
    bins=rating_bin,
    labels=labels,
    include_lowest=True,
)
df_.head()

In [None]:
# Pool the white and black rating classes and count the games per class;
# the bars are ordered by count, not by class rank.
games_per_category = pd.concat([df_['white_category'], df_['black_category']]).value_counts(ascending=True)
games_per_category.tail(20).plot(
    kind='barh',
    figsize=(8, 8),
    title="LiChess Elo rated players",
    xlabel='Number of games',
    ylabel='Elo categories',
)
plt.show()

```
What does this mean?
Well to give you some context:
    2700+	sometimes informally called "super grandmasters"
2500–2700	most Grandmasters (GM)
2400–2500	most International Masters (IM) and some Grandmasters (GM)
2300–2400	most FIDE Masters (FM) and some International Masters (IM)
2200–2300	FIDE Candidate Masters (CM), most national masters (NM)
2000–2200	Candidate masters (CM)
1800–2000	Class A, category 1
1600–1800	Class B, category 2
1400–1600	Class C, category 3
1200–1400	Class D, category 4
1000–1200	Class E, category 5
Below 1000	Novices

*Note: taken from https://en.wikipedia.org/wiki/Chess_rating_system

People who play on LiChess most often are rated between 1200 and 2000 Elo, which suggests that mostly above-average players play chess there.
```

Now on to other things

In [None]:
# Signed rating gap per game: positive when white is the higher-rated
# player, negative when black is.
df_["elo_diff"] = df_['white_rating'].sub(df_['black_rating'])
df_.head()

In [None]:
# Distribution of the signed rating gap over all games.
ax = df_["elo_diff"].plot.hist(
    bins=20,
    title="Difference between Rating",
    xlabel='Elo difference',
)
plt.show()

In [None]:
# Side-by-side view of both ratings, their difference and the game result.
df_.loc[:, ['white_rating', 'black_rating', 'elo_diff', 'winner']]

In [None]:
# Keep the games won by the higher-rated side: white out-rates black
# (elo_diff > 0) and wins, or black out-rates white (elo_diff < 0) and wins.
# NOTE(review): an "upset" normally means the LOWER-rated player won, which is
# the opposite of this filter — confirm whether the name `df_upsets` or the
# condition reflects the intent.
white_favourite_won = (df_['elo_diff'] > 0) & (df_['winner'] == 'white')
black_favourite_won = (df_['elo_diff'] < 0) & (df_['winner'] == 'black')

df_upsets = df_.loc[white_favourite_won | black_favourite_won].reset_index(drop=True).copy()

df_upsets[['white_rating', 'black_rating', 'elo_diff', 'winner']]

In [None]:
# Distribution of the signed rating gap for the games kept in df_upsets.
ax = df_upsets["elo_diff"].plot.hist(
    bins=20,
    title="Difference between Rating for upsets",
    xlabel='Elo difference',
)
plt.show()

In [None]:
# Overlay two histograms of the rating gap on a single set of axes: every
# game in df_upsets, and the subset in which the higher of the two ratings
# is above 1600.
has_player_over_1600 = df_upsets[['white_rating', 'black_rating']].max(axis=1) > 1600

ax = df_upsets["elo_diff"].plot(kind="hist", bins=20, label="all games")
df_upsets.loc[has_player_over_1600, "elo_diff"].plot(
    kind="hist", bins=20, color='blue', alpha=0.2, ax=ax, label="a player rated +1600"
)

# Both series share the axes, so the figure gets one title and a legend to
# tell them apart. Labels are set once, before the figure is shown.
ax.set_title("Difference between Rating for upsets (all vs. +1600)")
ax.set_xlabel("Elo difference")
ax.legend()
plt.show()

#different upsets

Looking at the matches: for time's sake, we will only look at the upset matches, since we already have that data prepared.

In [None]:
# Move lists of the df_upsets games that did not end on time and in which
# white is in rating class 'C'. Rows failing either condition become NaN in
# `where` and are removed by `dropna`, so the index keeps gaps.
cond = df_upsets['victory_status'].ne('outoftime')
cond1 = df_upsets['white_category'].eq('C')
match_history = df_upsets['moves'].where(cond & cond1).dropna()
match_history

In [None]:
# Replay one game from match_history on a fresh board, printing the position
# after every move.
board = chess.Board()
print(board)

# match_history is a filtered Series whose index has gaps, so a label lookup
# such as match_history[1] raises KeyError whenever row 1 was filtered out.
# Take the first remaining game by position instead.
test_moves = match_history.iloc[0].split()
for move in test_moves:
    board.push_san(move)
    print(board)
    print()

# Final expression of the cell: displays the position after the last move.
board

The output does not look appealing, and only one board is rendered: the final position at the end of the move list.

In [None]:
#so instead, after hours of searching and reading, used someone's code
# https://jupyter.brynmawr.edu/services/public/dblank/CS371%20Cognitive%20Science/2016-Fall/Programming%20a%20Chess%20Player.ipynb
import play_chess as pc

In [None]:
# Run one game between two of play_chess's `random_player` players.
# NOTE(review): presumably this renders a board that updates in place, as the
# notebook text describes — confirm against the play_chess module.
pc.play_game(pc.random_player, pc.random_player)

Now we have a chess board that updates with each move without having to create a whole new board.
With this we can visualize matches, from how the games started to how they finished.
### But before that, let us go back to our Player, "taranga"

In [None]:
# Boolean masks marking the games in which "taranga" played white / black.
white = df_['white_id'].eq("taranga")
black = df_['black_id'].eq("taranga")
# Print the True/False tallies of both masks, separated by a single space.
print(white.value_counts(), black.value_counts())