#Chess API presentation
##1. Null Hypothesis: Choice of opening has no effect on the probability of winning a game
Alternative Hypothesis: Choice of opening has a positive impact on the probability of winning a game
Alternative Hypothesis: Choice of opening has a negative impact on the probability of winning a game



##2. Null Hypothesis: Players' draw rates do not differ between Blitz games and Bullet games
Alternative Hypothesis: Players have a higher draw rate in Blitz games than in Bullet games
Alternative Hypothesis: Players have a lower draw rate in Blitz games than in Bullet games

##Google Colab Link:
https://colab.research.google.com/drive/1jqFPBZ3Go8z5Z4PfFsaTIwkz0NOuRqnc?usp=sharing









In [None]:
##1. Null Hypothesis: Choice of opening has no effect on the probability of winning a game
#Alternative Hypothesis: Choice of opening has a positive impact on the probability of winning a game
#Alternative Hypothesis: Choice of opening has a negative impact on the probability of winning a game


import requests          # For API requests
import pandas as pd      # For DataFrames
import numpy as np       # For numerical operations
import matplotlib.pyplot as plt  # For plotting
from scipy.stats import ttest_ind  # For t-tests


# HTTP headers passed to every requests.get call in this cell (sets the User-Agent).
HEADERS = {"User-Agent": "MyChessApp/1.0"}
def get_leaderboard_users(leaderboard_key, timeout=30):
    """Return the usernames listed under one Chess.com leaderboard.

    Args:
        leaderboard_key: Key of the wanted leaderboard inside the JSON
            payload, e.g. ``'live_blitz'``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list holding the ``'username'`` of every entry stored under
        ``leaderboard_key``; an empty list when the key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://api.chess.com/pub/leaderboards"
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    board = resp.json().get(leaderboard_key, [])
    return [entry['username'] for entry in board]

def get_titled_players(title, timeout=30):
    """Return the usernames of all players holding the given title.

    Args:
        title: Title abbreviation inserted into the endpoint URL,
            e.g. ``'GM'``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The value stored under ``'players'`` in the JSON payload, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"https://api.chess.com/pub/titled/{title}"
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get('players', [])

def get_player_archives(username, timeout=30):
    """Return the monthly game-archive URLs of a player.

    Args:
        username: Player name inserted into the endpoint URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The value stored under ``"archives"`` in the JSON payload, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"https://api.chess.com/pub/player/{username}/games/archives"
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get("archives", [])

def fetch_games_from_archive(archive_url, timeout=30):
    """Download one archive and flatten it to one row per player per game.

    Args:
        archive_url: URL of a single archive, as returned by
            ``get_player_archives``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A flat list of dicts with the keys ``"username"``, ``"color"``,
        ``"result"`` and ``"time_class"``. Every usable game contributes
        two rows: one for the white side and one for the black side.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    resp = requests.get(archive_url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    games = resp.json().get("games", [])

    required_fields = ("white", "black", "time_class")
    game_list = []
    for game in games:
        # Skip records that lack either side or the time class.
        if not all(field in game for field in required_fields):
            continue
        for color in ("white", "black"):
            side = game[color]
            game_list.append({
                "username": side.get("username"),
                "color": color,
                "result": side.get("result"),
                "time_class": game["time_class"],
            })
    return game_list

# Collect usernames
usernames = set()

# Leaderboard users
leaderboard_keys = ['live_blitz', 'live_bullet']
for key in leaderboard_keys:
    try:
        lb_users = get_leaderboard_users(key)
        usernames.update(lb_users)
        print(f"Added {len(lb_users)} usernames from leaderboard '{key}'.")
    except Exception as e:
        # Best effort: a failing leaderboard must not stop the collection.
        print(f"Failed to get leaderboard '{key}': {e}")

# Titled players
titles = ['GM', 'IM', 'FM']
for title in titles:
    try:
        titled_users = get_titled_players(title)
        usernames.update(titled_users)
        print(f"Added {len(titled_users)} usernames with title '{title}'.")
    except Exception as e:
        print(f"Failed to get titled players '{title}': {e}")

# Cap the sample at 200 players.
# NOTE(review): a set of strings has no stable iteration order between
# interpreter runs, so the 200 players kept here can differ from run to run.
usernames = list(usernames)[:200]
print(f"Total usernames used: {len(usernames)}")

# Get the games of the last 3 monthly archives of each player
# NOTE(review): every game yields a row for both sides, so a game between two
# sampled players is presumably downloaded once per player and counted twice;
# consider de-duplicating by a game identifier.
all_games = []
for i, username in enumerate(usernames):
    try:
        archives = get_player_archives(username)
        for archive_url in archives[-3:]:
            games = fetch_games_from_archive(archive_url)
            all_games.extend(games)
        if (i+1) % 20 == 0:
            print(f"Processed {i+1} users, total games so far: {len(all_games)}")
    except Exception as e:
        print(f"Failed for {username}: {e}")

# An empty list would build a DataFrame without a 'time_class' column and the
# dropna below would fail with an unhelpful KeyError, so stop with a clear error.
if not all_games:
    raise RuntimeError("No games were downloaded; cannot build the dataset.")

# Dataframe and clean data
df = pd.DataFrame(all_games)

# Drop rows missing time_class
df = df.dropna(subset=["time_class"])

# Encode wins: 1=win, 0=loss/etc.
df["win"] = np.where(df["result"] == "win", 1, 0)

# Encode color: 1=white, 0=black
df["color"] = np.where(df["color"] == "white", 1, 0)

print("Cleaned dataset shape:", df.shape)
print(df.head())

# Calculate Win Rates & White Advantage
# Only time controls with at least `min_games` rows are analysed.
min_games = 50
valid_time_controls = df['time_class'].value_counts()[lambda x: x >= min_games].index
df_filtered = df[df['time_class'].isin(valid_time_controls)]

# With nothing left there is no bar to plot and no group to test.
if df_filtered.empty:
    raise RuntimeError(
        f"No time control has at least {min_games} games; cannot compute win rates."
    )

# Rows: time control; columns: 0 (black) and 1 (white); values: mean of `win`.
win_rates = df_filtered.groupby(["time_class", "color"])["win"].mean().unstack()
win_rates = win_rates.reindex(columns=[0,1])
win_rates["white_advantage"] = win_rates[1] - win_rates[0]

print("\nWin rates and White advantage by time control:")
print(win_rates)

# Plot White Advantage
plt.figure(figsize=(8,5))
win_rates["white_advantage"].plot(kind="bar", color="skyblue")
plt.title("White Advantage by Time Control")
plt.ylabel("Win Rate Difference (White - Black)")
plt.xlabel("Time Control")
plt.xticks(rotation=0)
plt.show()

# Check number of games per color
color_counts = df_filtered.groupby(['time_class', 'color']).size().unstack(fill_value=0)
print("\nWhite vs Black counts per time control:")
print(color_counts)

# T-test: White vs Black Win Rate (equal_var=False, i.e. variances are not pooled)
print("\nT-test results for White vs Black by time control:")
for tc in df_filtered['time_class'].unique():
    data_tc = df_filtered[df_filtered['time_class'] == tc]
    white_wins = data_tc[data_tc['color'] == 1]['win']
    black_wins = data_tc[data_tc['color'] == 0]['win']

    # Only perform t-test if both colors have at least 2 games
    if len(white_wins) > 1 and len(black_wins) > 1:
        t_stat, p_val = ttest_ind(white_wins, black_wins, equal_var=False)
        print(f"{tc}: t-statistic = {t_stat:.3f}, p-value = {p_val:.4f}")
    else:
        print(f"{tc}: Not enough games for a t-test.")


In [None]:
# For comparison: the raw downloaded rows next to the cleaned rows.
#
# This cell works on its own frames built from `all_games`. Re-running the
# cleaning on the already-cleaned `df` would compare the 0/1 color codes with
# the string "white" and silently turn every color into 0.

# Show original dataset
raw_df = pd.DataFrame(all_games)
print("Original dataset:")
print(raw_df.head())   # Shows the first 5 rows

# Clean a copy of the data, leaving `df` untouched
cleaned_df = raw_df.dropna(subset=["time_class"]).copy()
cleaned_df["win"] = np.where(cleaned_df["result"] == "win", 1, 0)
cleaned_df["color"] = np.where(cleaned_df["color"] == "white", 1, 0)

# Show cleaned dataset
print("\nCleaned dataset:")
print(cleaned_df.head())   # Shows first 5 rows after cleaning


In [None]:
# Bar chart showing white advantage by time control with significance
import matplotlib.pyplot as plt

# White advantage per time control (hard-coded values; the original comment
# names win_rates["white_advantage"] as their source)
white_advantage = {
    "blitz": 0.044649,
    "bullet": 0.054247,
    "rapid": 0.059497,
    "daily": 0.106383
}

# t-test p-values (hard-coded)
p_values = {
    "blitz": 0.0000,
    "bullet": 0.0000,
    "rapid": 0.0120,
    "daily": 0.0182
}
# Colors: blue if significant (p < 0.05), grey if not
colors = ['skyblue' if p_values[tc] < 0.05 else 'lightgray' for tc in white_advantage]

plt.figure(figsize=(8,5))
bars = plt.bar(white_advantage.keys(), white_advantage.values(), color=colors)
plt.axhline(0, color='black', linewidth=0.8)
plt.title("White Advantage by Time Control (with Significance)")
plt.ylabel("Win Rate Difference (White - Black)")
plt.xlabel("Time Control")

# Annotating bars with advantage and p-value
for i, (tc, adv) in enumerate(white_advantage.items()):
    # Show advantage value just above a positive bar / below a negative one
    plt.text(i, adv + 0.004 if adv >= 0 else adv - 0.02,
             f"{adv:.3f}", ha='center', va='bottom' if adv >= 0 else 'top', fontweight='bold')

    if p_values[tc] < 0.05:
        plt.text(i, adv + 0.015, "Significant", ha='center', color='green', fontweight='bold', fontsize=9)

    # Show p-value below each bar
    plt.text(i, -0.045, f"p = {p_values[tc]:.4f}", ha='center', fontsize=9, color='black')

# Derive the upper limit from the data so the tallest bar and its labels stay
# inside the axes; a fixed 0.09 cut off the 0.106 "daily" bar.
y_top = max(max(white_advantage.values()), 0) + 0.03
plt.ylim(-0.05, y_top)
plt.show()


In [None]:
# Result for the 1st research question: Based on the results of the two-proportion Z-test, there is a statistically
# significant difference between the win rates of White and Black players across all time controls analysed. For every
# format — blitz, bullet, rapid, and daily — the p-values were below the 0.05 significance level, leading us to reject the
# null hypothesis (H₀) that both colors have equal chances of winning.

# This indicates that playing as White provides a measurable advantage in chess games,
# regardless of time control. The magnitude of this advantage varies somewhat between formats,
# but overall, the data supports the long-standing observation that White tends to win more often than Black.

In [None]:
from scipy.stats import norm

# Two-proportion z-test (pooled) of the White vs Black win rate, one row per
# time control. Reads `win_rates` (columns 0=black, 1=white) and
# `color_counts` (same layout) computed in the earlier analysis cell.
results = []
for tc in win_rates.index:
    n_white = color_counts.loc[tc, 1]
    n_black = color_counts.loc[tc, 0]
    p_white = win_rates.loc[tc, 1]
    p_black = win_rates.loc[tc, 0]

    # Pooled proportion
    p_pool = (p_white * n_white + p_black * n_black) / (n_white + n_black)

    # Standard error
    se = (p_pool * (1 - p_pool) * (1/n_white + 1/n_black))**0.5

    # z-statistic
    z = (p_white - p_black) / se

    # Two-sided p-value. The survival function is used instead of
    # 1 - cdf because the subtraction rounds to exactly 0 for large |z|,
    # while sf still returns the (tiny) tail probability.
    p_val = 2 * norm.sf(abs(z))

    results.append({
        "time_class": tc,
        "z_statistic": z,
        "p_value": p_val
    })

# Convert to Dataframe
z_test_df = pd.DataFrame(results)
print(z_test_df)


In [None]:
##2. Null Hypothesis: Players' draw rates do not differ between Blitz games and Bullet games
#Alternative Hypothesis: Players have a higher draw rate in Blitz games than in Bullet games
#Alternative Hypothesis: Players have a lower draw rate in Blitz games than in Bullet games


#Import Libraries
import time       # Provides time-related functions, measuring execution time or creating delays
from scipy import stats  # Access to t-test, z-test, distributions
import seaborn as sns    # visualization library for creating attractive statistical plots
import requests   # For making requests to the Chess.com API
import pandas as pd      # For Dataframe handling
import numpy as np       # For numerical operations
import matplotlib.pyplot as plt  # For plotting results
from scipy.stats import ttest_ind  # For t-tests


# HTTP headers passed to every requests.get call in this cell (sets the User-Agent).
HEADERS = {"User-Agent": "MyChessApp/1.0"}
# Upper bound on how many collected usernames are kept for the stats download.
MAX_PLAYERS_FETCH = 1000
# Pause, in seconds, passed to time.sleep after each player's stats request.
REQUEST_DELAY = 0.3
# Define functions to get usernames and player stats:
def get_leaderboard_users(leaderboard_key, timeout=30):
    """Return the usernames listed under one Chess.com leaderboard.

    Args:
        leaderboard_key: Key of the wanted leaderboard inside the JSON
            payload, e.g. ``'live_blitz'``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list holding the ``'username'`` of every entry stored under
        ``leaderboard_key``; an empty list when the key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://api.chess.com/pub/leaderboards"
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    board = resp.json().get(leaderboard_key, [])
    return [entry['username'] for entry in board]
def get_titled_players(title, timeout=30):
    """Return the usernames of all players holding the given title.

    Args:
        title: Title abbreviation inserted into the endpoint URL,
            e.g. ``'GM'``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The value stored under ``'players'`` in the JSON payload, or an
        empty list when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"https://api.chess.com/pub/titled/{title}"
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return resp.json().get('players', [])
def fetch_player_stats(username, timeout=30):
    """Download the stats document of one player.

    Args:
        username: Player name inserted into the endpoint URL.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload of the stats endpoint.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"https://api.chess.com/pub/player/{username}/stats"
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return resp.json()
usernames = set()
# Leaderboards used as one source of usernames.
# NOTE(review): get_leaderboard_users returns [] for a key that is missing
# from the payload, so confirm that 'daily_blitz' and 'daily_bullet' exist;
# otherwise they silently add nothing.
leaderboard_keys = ['live_blitz', 'live_bullet', 'daily_blitz', 'daily_bullet']
for key in leaderboard_keys:
    try:
        lb_users = get_leaderboard_users(key)
        usernames.update(lb_users)
        print(f"Added {len(lb_users)} usernames from leaderboard '{key}'")
    except Exception as e:
        # Best effort: a failing leaderboard must not stop the collection.
        print(f"Failed to get leaderboard '{key}': {e}")

# Add titled players as a second source of usernames
titles = ['GM', 'IM', 'FM', 'WGM', 'WIM', 'WFM']
for title in titles:
    try:
        titled_users = get_titled_players(title)
        usernames.update(titled_users)
        print(f"Added {len(titled_users)} players with titles '{title}'")
    except Exception as e:
        print(f"Failed to get titled players '{title}': {e}")

# NOTE(review): a set of strings has no stable iteration order between
# interpreter runs, so the players kept by this slice can differ run to run.
usernames = list(usernames)[:MAX_PLAYERS_FETCH]
print(f"Total players got: {len(usernames)}")

players_data = []
for idx, username in enumerate(usernames, 1):
    try:
        player_stats = fetch_player_stats(username)

        # Missing sections fall back to empty dicts so the counts default to 0.
        blitz = player_stats.get("chess_blitz", {}).get("record", {})
        bullet = player_stats.get("chess_bullet", {}).get("record", {})
        # Total games per mode = wins + losses + draws
        blitz_total = blitz.get("win", 0) + blitz.get("loss", 0) + blitz.get("draw", 0)
        bullet_total = bullet.get("win", 0) + bullet.get("loss", 0) + bullet.get("draw", 0)
        # Number of drawn games per mode
        blitz_draws = blitz.get("draw", 0)
        bullet_draws = bullet.get("draw", 0)

        players_data.append({
            "username": username,
            "blitz_games_played": blitz_total,
            "blitz_draws": blitz_draws,
            "bullet_games_played": bullet_total,
            "bullet_draws": bullet_draws,
        })

        if idx % 50 == 0:
            print(f"{idx}")
    except Exception as e:
        print(f"Failed to fetch stats for {username}: {e}")
    # Pause after every request, successful or not.
    time.sleep(REQUEST_DELAY)

# Build the DataFrame and compute per-player draw rates.
# The columns are named explicitly so that an empty `players_data` (every
# request failed) still yields the expected columns and reaches the
# "Not enough players" branch below instead of failing with a KeyError.
player_columns = [
    "username",
    "blitz_games_played",
    "blitz_draws",
    "bullet_games_played",
    "bullet_draws",
]
df = pd.DataFrame(players_data, columns=player_columns)
# A draw rate is NaN for a player without any game in that mode.
df["draw_rate_blitz"] = np.where(
    df["blitz_games_played"] > 0,
    df["blitz_draws"] / df["blitz_games_played"],
    np.nan,
)
df["draw_rate_bullet"] = np.where(
    df["bullet_games_played"] > 0,
    df["bullet_draws"] / df["bullet_games_played"],
    np.nan,
)
# Calculate draw difference between blitz and bullet
df["draw_diff"] = df["draw_rate_blitz"] - df["draw_rate_bullet"]

print(f"Total players got: {len(df)}")
print(f"Players with Blitz games: {(df['blitz_games_played'] > 0).sum()}")
print(f"Players with Bullet games: {(df['bullet_games_played'] > 0).sum()}")

# Keep only players that have a draw rate in both modes
df_clean = df.dropna(subset=["draw_rate_blitz", "draw_rate_bullet"]).copy()
print(f"Players with both Blitz and Bullet data: {len(df_clean)}")

if len(df_clean) < 10:
    print("Not enough players with both Blitz and Bullet data to perform t-test.")
else:
    # Paired t-test: both rates belong to the same player
    t_stat, p_value = stats.ttest_rel(df_clean["draw_rate_blitz"], df_clean["draw_rate_bullet"])
    mean_blitz = df_clean["draw_rate_blitz"].mean()
    mean_bullet = df_clean["draw_rate_bullet"].mean()


    print(f"Draw Rate Comparison for {len(df_clean)} Players")

    print(df_clean[["username", "draw_rate_blitz", "draw_rate_bullet", "draw_diff"]])

    # Saved so the plotting cells can reload the results
    output_file = "chess_com_draw_rate_results.csv"
    df_clean.to_csv(output_file, index=False)
    print(f"\n Full results saved to '{output_file}'")

    print(f"Mean Blitz draw rate:  {mean_blitz:.4f}")
    print(f"Mean Bullet draw rate: {mean_bullet:.4f}")
    print(f"t-statistic: {t_stat:.4f}")
    print(f"p-value: {p_value:.10f}")
    # Decision at the 0.05 significance level
    if p_value < 0.05:
        print("Reject the null hypothesis")
        if mean_blitz > mean_bullet:
            print("Players have a significantly HIGHER draw rate in Blitz games than in Bullet games.")
        else:
            print("Players have a significantly LOWER draw rate in Blitz games than in Bullet games.")
    else:
        print(" Fail to reject the null hypothesis")
        print("No significant difference in draw rates between Blitz and Bullet games.")


In [None]:
# Scatterplot for blitz and bullet draw rates of players collected
results_path = "chess_com_draw_rate_results.csv"
try:
    df_clean = pd.read_csv(results_path)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
    # Stop this cell with an explanation instead of exiting silently; only
    # read/parse failures are caught so unrelated errors still surface.
    raise SystemExit(
        f"Cannot draw the scatterplot: failed to read '{results_path}' ({err})"
    ) from err
plt.figure(figsize=(8, 6))
sns.scatterplot(x='draw_rate_bullet', y='draw_rate_blitz', data=df_clean, alpha=0.5)
plt.title('Blitz vs. Bullet Draw Rates')
plt.xlabel('Bullet Draw Rate')
plt.ylabel('Blitz Draw Rate')
# Both axes are limited to the 0-0.5 range.
plt.xlim(0, 0.5)
plt.ylim(0, 0.5)
plt.grid(True, alpha=0.5)
plt.show()

In [None]:
# Box plot comparing the distribution of draw rates between the 2 game modes
draw_rate_columns = ['draw_rate_blitz', 'draw_rate_bullet']
# Long format: one row per (mode, draw rate) pair, as expected by seaborn.
plot_data = pd.melt(df_clean[draw_rate_columns], var_name='Box', value_name='Draw Rate')
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='Box', y='Draw Rate', data=plot_data, ax=ax)
ax.set_title('Distribution of Draw Rates in Blitz vs. Bullet Games')
ax.set_ylabel('Draw Rate')
# The y-axis is limited to the 0-0.3 range.
ax.set_ylim(0, 0.3)
ax.grid(axis='y', alpha=0.5)
plt.show()

In [None]:
# Result for the 2nd research question: The second null hypothesis proposed that there is no difference
# in draw rates between Blitz and Bullet games. The paired t-test produced a statistic of 12.3649 with a
# p-value reported as 0.0000000 (below the displayed precision), which is far below the 0.05 threshold.
# Therefore, we also reject the null hypothesis and conclude that there is a significant difference in draw
# rates between Blitz and Bullet gamemodes. The positive statistic (Blitz minus Bullet) indicates that the
# draw rate is higher in Blitz.
