# Imports

In [None]:
# import standard
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# magic lines
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#import my scripts
import sys, os
# Make the project's ../scripts directory importable from this notebook.
module_path = os.path.abspath(os.path.join('..'))
scripts_path = os.path.join(module_path, "scripts")
# Guard on scripts_path itself: whether the parent directory is already on
# sys.path says nothing about whether the scripts directory is.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)
   
from data import get_data
from preprocessing import preprocess, add_new_columns

In [None]:
# Load the "all_games" table through the project's get_data helper and pass it
# through the project's preprocess step.
# NOTE(review): `df` is reassigned from a CSV a few cells further down, so on a
# top-to-bottom run this result is overwritten — confirm this cell is still needed.
raw_data = get_data(df_name="all_games", descriptor="all_cols")
df = preprocess(raw_data)

In [None]:
# Location of the games CSV inside ../data.
fname = "all_games_2023-01_to_2024-12_all_new_cols.csv"
path = os.path.join("..", "data", fname)
# File size in decimal megabytes (bytes / 1e6); the trailing comment keeps the
# MiB divisor at hand as an alternative.
os.path.getsize(path) / 1_000_000 #1_048_576

In [None]:
# Read the CSV at `path` into `df`; this replaces any `df` built earlier.
# Alternative (smaller) input, kept for reference:
# path = os.path.join("..", "data", "sample_raw_data.csv")
df = pd.read_csv(path)

In [None]:
# Show the first row as a Series to inspect the column names and example values.
df.iloc[0]

### Exclude slower time formats

In [None]:
# Keep only the rows whose time_class is 'bullet' and report how many were dropped.
n_games_pre = df.shape[0]
is_bullet = df["time_class"].isin(['bullet'])
df = df.loc[is_bullet]
n_games_post = df.shape[0]

print(f"excluded {n_games_pre - n_games_post} games.")

# Analysis

## General game end status
- how often do you resign?
- how often do you lose on time vs how often do your opponents?

-> pie chart

### mvp new columns

In [None]:
# Row position used to spot-check the helper defined in this cell.
test = 53
display(df.iloc[test])

def get_user_result(row):
    """
        Summarise one game from the user's side of the board.

        To be applied to df (``df.apply(get_user_result, axis=1)``) to
        generate new columns.

        Parameters
        ----------
        row : pd.Series or dict-like
            One game. Keys read: "user_colour", "white_result",
            "black_result" and the opponent's "<colour>_rating".

        Returns
        -------
        pd.Series
            Entries "user_result", "opp_result" and "opp_rating", in that order.

        Raises
        ------
        ValueError
            If row["user_colour"] is neither "white" nor "black"; previously
            such rows silently received black's result for both players.
    """
    opponent_of = {"white": "black", "black": "white"}

    user_colour = row["user_colour"]
    if user_colour not in opponent_of:
        raise ValueError(f"unexpected user_colour: {user_colour!r}")
    opp_colour = opponent_of[user_colour]

    data_out = {
        "user_result" : row[f"{user_colour}_result"],
        "opp_result"  : row[f"{opp_colour}_result"],
        # the user's own rating is deliberately not emitted here
        "opp_rating" : row[f"{opp_colour}_rating"]
    }
    return pd.Series(data_out)
    
# Spot-check the helper on the row displayed at the top of this cell.
get_user_result(df.iloc[test])

## load clean data

In [None]:
import pickle
# Load the cleaned sample frame from ../data/sample_clean.pkl.
clean_filename = "sample_clean.pkl"
clean_path = os.path.join("..", "data", clean_filename)
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# produced by this project.
with open (clean_path, "rb") as file:
    clean_df = pickle.load(file)
# NOTE(review): this displays the whole object; consider .head() if it is large.
clean_df

### game end status

In [None]:
# game_results = df.apply(get_user_result, axis=1, result_type="expand")
# game_results

In [None]:
# df = pd.concat([df,game_results], axis=1)
# df

In [None]:
# Count how often each user_result value occurs.
user_results = clean_df["user_result"].value_counts()
user_results

In [None]:
# Pie chart with one slice per user_result value.
plt.pie(user_results.values, labels=user_results.index);

In [None]:
# Of the games you won, how did you beat the opponent?
user_won = clean_df["user_result"] == 'win'
won_games = clean_df.loc[user_won]
won_games.head(3)

In [None]:
# How the opponent's side ended in the games the user won, shown as a pie chart.
won_results = won_games["opp_result"].value_counts()
plt.pie(won_results.values, labels=won_results.index);

## Rating over time


In [None]:
# Independent two-column copy (date, user_rating) for the rating-over-time plot.
rating_df = clean_df.loc[:, ["date", "user_rating"]].copy()
rating_df

In [None]:
# Parse the date column into datetimes.
rating_df = rating_df.assign(date=pd.to_datetime(rating_df["date"]))

In [None]:
# User rating over time; rotate the date tick labels so they do not overlap.
sns.lineplot(
    x="date",
    y="user_rating",
    data=rating_df,
    errorbar=None,
).tick_params(axis="x", rotation=45)

## results vs opponents by rating range
For every 50-point band of opponent rating, what's your win/lose/draw split? Your 'true' rating is roughly the band where you win about 50% of the games.

In [None]:
# Independent copy of the columns needed for the results-by-opponent-rating analysis.
cols = ["user_rating", "user_result", "opp_result", "opp_rating"]
opp_rating_df = clean_df.loc[:, cols].copy()
opp_rating_df

#### investigate different game end statuses

In [None]:
# Every distinct end status seen on either side of the board.
game_endings = set(opp_rating_df["user_result"].unique()) | set(opp_rating_df["opp_result"].unique())
display(game_endings)

In [None]:
# Collapse every detailed end status into one of three outcomes: lose / draw / win.
game_endings_mapping = {
    **dict.fromkeys(['abandoned', 'checkmated', 'resigned', 'timeout'], "lose"),
    **dict.fromkeys(
        ['agreed', 'insufficient', 'repetition', 'stalemate', 'timevsinsufficient'],
        "draw",
    ),
    'win': "win",
}
game_endings_mapping

In [None]:
# Show the games in which either side ended with the given status.
keyword = 'timevsinsufficient'
opp_rating_df.loc[
    (opp_rating_df["user_result"] == keyword)
    | (opp_rating_df["opp_result"] == keyword)
]

In [None]:
# Add the three-way outcome (win / draw / lose) derived from the detailed user_result.
opp_rating_df = opp_rating_df.assign(
    user_result_map=opp_rating_df["user_result"].map(game_endings_mapping)
)
opp_rating_df

In [None]:
bin_width = 50
# Take the bin edges from the frame that is actually being binned
# (opp_rating_df), not from the separately loaded `df`.
# int(...) keeps range() working when the rating column has a float dtype.
min_bin = int(opp_rating_df["opp_rating"].min() // bin_width * bin_width) # round down to a multiple of bin_width
max_bin = int(opp_rating_df["opp_rating"].max() // bin_width * bin_width + bin_width) # first multiple strictly above the max

#printable
# list(range(min_bin, max_bin+1, bin_width))

# right=False gives left-closed bins [edge, edge + bin_width): a rating equal
# to the lowest edge is kept (right-closed bins would turn it into NaN), and
# each bin is correctly identified by its left edge, which is how the bins are
# labelled further down. The top edge is strictly above the maximum rating, so
# nothing falls outside.
bin_vals,bins_out = pd.cut(x=opp_rating_df["opp_rating"],
       bins =range(min_bin, max_bin+1, bin_width),
        right=False,
        retbins=True)
display(bins_out)

opp_rating_df.loc[:,"opp_rating_range"] = bin_vals#.values

In [None]:
# Share of win/draw/lose per opponent rating range, as a quick stacked bar chart.
(
    opp_rating_df
    .groupby(by="opp_rating_range", observed=False)["user_result_map"]
    .value_counts(normalize=True)
    .unstack()
    .plot(kind="barh", stacked=True)
)

In [None]:
# One row per opponent rating range, one column per outcome, values are shares;
# columns are put in a fixed win / draw / lose order.
results_by_rating = (
    opp_rating_df
    .groupby(by="opp_rating_range", observed=False)["user_result_map"]
    .value_counts(normalize=True)
    .unstack()
    .reindex(columns=["win", "draw", "lose"])
)
results_by_rating.head(3)

In [None]:
#https://matplotlib.org/stable/gallery/lines_bars_and_markers/bar_stacked.html
fig, ax = plt.subplots()

# Plot a relabelled copy (left bin edges as tick labels) instead of overwriting
# results_by_rating.index: the in-place overwrite dropped the index name that
# later cells look up, and made this cell behave differently when re-run.
results_by_rating.set_axis(bins_out[:-1], axis=0).plot(
                       kind="barh",
                       stacked=True,
                       ax=ax,
                       color = ["green", "orange", "red"],
                       ylabel="opponent rating range",
                       xlabel="share of games");
# ax.set_yticks(bins_out[:-1])
# https://www.geeksforgeeks.org/create-a-stacked-bar-plot-in-matplotlib/
#show hist on margin?
# Build the output path here: the notebook-level data-directory variable is only
# assigned in a cell near the end, so relying on it fails on a fresh kernel.
fig.savefig(fname=os.path.join("..", "data", "test_fig.jpg"))

using markdown to display the image:
![filler txt](../data/test_fig.jpg)

In [None]:
# Scratch: attach the left bin edges as an extra column.
# NOTE(review): this mutates results_by_rating, so cells that plot every column
# of it will also pick up 'test' if they are re-run afterwards.
results_by_rating['test'] = bins_out[:-1]
results_by_rating

In [None]:
# Scratch: number of rows (one per rating range).
len(results_by_rating.reset_index())

In [None]:
# Name the index before resetting it: when the index has been replaced by a
# plain array it has no name, reset_index() then creates a column called
# 'index', and the 'opp_rating_range' lookup raises KeyError.
x = results_by_rating.rename_axis("opp_rating_range").reset_index()['opp_rating_range']#.values
x

In [None]:
# Scratch: show the current state of the table.
results_by_rating

In [None]:
# Hand-built stacked horizontal bars: one bar per rating bin, segments win | draw | lose.
ratings = bins_out[:-1]
# x = results_by_rating.reset_index(level=0)['opp_rating_range']
# Outcome/bin combinations that never occurred are NaN after unstack(). Treat
# them as a share of 0: a NaN width would propagate into `left=` below and
# make every later segment of that bar disappear.
ywin  = list(results_by_rating['win'].fillna(0).values)
ydraw = list(results_by_rating['draw'].fillna(0).values)
ylose = list(results_by_rating['lose'].fillna(0).values)

barheight = 20
plt.barh(y=ratings, width=ywin, height=barheight)
# Each segment starts where the previous ones end.
plt.barh(y=ratings, width=ydraw, left=ywin, height=barheight)
plt.barh(y=ratings, width=ylose, left=np.array(ywin)+np.array(ydraw), height=barheight)

## MVP plan
- load raw data from a csv
- add only columns required
    - user colour
    - user result
    - opp result
    - user rating
    - opp rating
- display pie chart of how games end
- display bar chart with performance against different ratings


## mvp preproc

- work from local file (2024?)
    - sample_raw.csv 
- minimum clean (pipe!)
    - date to datetime 
- minimum new cols (pipe!)
    - user colour
    - user/opp rating
    - user/opp result 
- save to local file
    - sample_preproc.csv 

## Create sample raw data from the year 2023

to be used in mvp preproc

In [None]:
from data import get_all_games_list

# Collect the raw games through the project's get_all_games_list helper.
# NOTE(review): presumably this downloads data and may be slow — confirm, and
# consider guarding it with a check for the cached pickle.
raw_list = get_all_games_list()

In [None]:
# Destination for the raw sample: ../data/sample_raw.pkl.
path_to_data = os.path.join("..", "data")
raw_pickle = "sample_raw.pkl"
pkl_path = os.path.join(path_to_data, raw_pickle)
pkl_path

In [None]:
import pickle

In [None]:
# Write raw_list to pkl_path as a pickle ("wb" overwrites any existing file).
with open (pkl_path, "wb") as file:
    pickle.dump(raw_list, file)

In [None]:
# List the data directory through the shell to confirm the file was written.
# NOTE(review): `ls` is not available on every platform; os.listdir(path_to_data) is portable.
! ls {path_to_data}