*1. Project Idea / Purpose

Title: Comparing chess games between humans only and chess games between humans and engines.

---

In [15]:
# =====================================================
# Capstone: Human vs Engine Chess Analysis (LO1 - LO11)
# Single script that:
# - Streams a Lichess .pgn.zst archive (stops early when enough games)
# - Extracts features, labels games (human vs human / mixed)
# - Saves CSVs, performs EDA, hypothesis tests, clustering, classification
# - Sections are labelled by Learning Outcome (LO)
# Author: Joshua Tutin (template)
# =====================================================

import os
import io
import sys
import math
import random
import requests
from datetime import datetime
from collections import Counter

# Data & analysis libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Try import zstandard (needed for .zst streaming)
try:
    import zstandard as zstd
except Exception as e:
    zstd = None

# Try import python-chess
try:
    import chess
    import chess.pgn
except Exception as e:
    raise RuntimeError("This script requires python-chess. Install with `pip install python-chess`") from e

# -----------------------------
# Configurable parameters
# -----------------------------
LICHESS_URL = "https://database.lichess.org/standard/lichess_db_standard_rated_2025-09.pgn.zst"
# If you want a different month change above URL.
MAX_HUMAN_GAMES = 300       # target human vs human (likely)
MAX_MIXED_GAMES = 300       # target human vs engine (mixed)
MAX_TOTAL = MAX_HUMAN_GAMES + MAX_MIXED_GAMES
OUTPUT_CSV = "data/lichess_human_mixed_sample.csv"
DOWNLOAD_CHUNK = 1024 * 1024  # 1MB
VERBOSE = True

# -----------------------------
# Helpers
# -----------------------------
def info(msg):
    if VERBOSE:
        print(msg)

# Heuristic engine detection terms (extendable)
ENGINE_TERMS = [
    "stockfish", "engine", "lichess_bot", "lichess", "bot", "komodo", "fire", "shredder",
    "stockfish", "crafty", "stockfxh", "lc0", "leela", "fairy"
]

def is_engine_name(name: str) -> bool:
    if not name:
        return False
    name_l = name.lower()
    # direct match heuristics
    for t in ENGINE_TERMS:
        if t in name_l:
            return True
    # some engine accounts include digits/engine suffixes; simple heuristic:
    if name_l.endswith("bot") or "engine" in name_l:
        return True
    return False

# -----------------------------
# LO5: Data collection & management
# - Stream a lichess .pgn.zst file and parse games until targets met
# - Saves raw CSV. (LO5: storage and processing)
# -----------------------------
def stream_collect_lichess_sample(url: str,
                                 max_human: int,
                                 max_mixed: int,
                                 zst_file: str = None):
    """
    Streams the lichess .pgn.zst archive and collects games until we have enough labelled samples.
    Returns a list of records (dict).
    If zstandard isn't installed, raises an error.
    """
    if zstd is None:
        raise RuntimeError("zstandard not available. Install with `pip install zstandard` to stream .zst files.")

    records_human = []
    records_mixed = []

    # Stream the remote URL and give the decompressor a streaming reader
    info(f"[LO5] Streaming from {url} (will stop once {max_human} human and {max_mixed} mixed games collected)...")
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        dctx = zstd.ZstdDecompressor()
        # Create a streaming reader that reads from the requests raw stream
        with dctx.stream_reader(r.raw) as reader:
            text_stream = io.TextIOWrapper(reader, encoding="utf-8", errors="ignore", newline="\n")
            # Read PGN games one by one
            count = 0
            while True:
                if len(records_human) >= max_human and len(records_mixed) >= max_mixed:
                    break
                game = chess.pgn.read_game(text_stream)
                if game is None:
                    break  # end of file
                count += 1

                # Skip games with missing headers or extremely short games
                white = game.headers.get("White", "").strip()
                black = game.headers.get("Black", "").strip()
                result = game.headers.get("Result", "")
                if not white and not black:
                    continue

                # Heuristic labelling
                white_engine = is_engine_name(white)
                black_engine = is_engine_name(black)

                # Skip pure engine vs engine games (we want human or mixed)
                if white_engine and black_engine:
                    continue

                label = "Human vs Engine (mixed)" if (white_engine or black_engine) else "Human vs Human (likely)"

                # don't exceed targets
                if label == "Human vs Human (likely)" and len(records_human) >= max_human:
                    continue
                if label == "Human vs Engine (mixed)" and len(records_mixed) >= max_mixed:
                    continue

                # Extract features
                board = game.board()
                num_moves = 0
                checks = 0
                captures = 0
                legal_counts = []  # legal moves counts per ply

                # iterate moves; we will capture legal moves count BEFORE the move (branching factor)
                for mv in game.mainline_moves():
                    legal_before = list(board.legal_moves)
                    legal_counts.append(len(legal_before))
                    # check/capture before pushing?
                    if board.is_check():
                        checks += 1
                    if board.is_capture(mv):
                        captures += 1
                    board.push(mv)
                    num_moves += 1

                avg_legal_moves = float(np.mean(legal_counts)) if legal_counts else 0.0
                # record
                rec = {
                    "white": white,
                    "black": black,
                    "result": result,
                    "label": label,
                    "num_moves": num_moves,
                    "checks": checks,
                    "captures": captures,
                    "avg_legal_moves": avg_legal_moves,
                    "check_ratio": checks / num_moves if num_moves>0 else 0.0,
                    "capture_ratio": captures / num_moves if num_moves>0 else 0.0
                }

                if label == "Human vs Human (likely)":
                    records_human.append(rec)
                else:
                    records_mixed.append(rec)

                if (len(records_human) + len(records_mixed)) % 50 == 0:
                    info(f"  collected total={len(records_human)+len(records_mixed)} (human={len(records_human)}, mixed={len(records_mixed)})")

    # combine
    combined = records_human + records_mixed
    info(f"[LO5] Streaming complete. Collected {len(records_human)} human and {len(records_mixed)} mixed games (total {len(combined)})")
    return combined

# -----------------------------
# LO1 & LO2: Basic stats & data manipulation examples
# - We'll compute descriptive stats and illustrate mean/median/std
# -----------------------------
def descriptive_statistics(df: pd.DataFrame):
    info("\n[LO1] Descriptive statistics (num_moves, avg_legal_moves, check_ratio, capture_ratio):")
    stats_df = df[["num_moves", "avg_legal_moves", "check_ratio", "capture_ratio"]].describe().T
    print(stats_df[['count','mean','std','50%']])
    return stats_df

# -----------------------------
# LO2 & LO8: EDA plots (boxplots, histograms)
# -----------------------------
def exploratory_plots(df: pd.DataFrame, out_dir="figures"):
    os.makedirs(out_dir, exist_ok=True)
    sns.set(style="whitegrid")
    features = ["num_moves", "avg_legal_moves", "check_ratio", "capture_ratio"]
    for feat in features:
        plt.figure(figsize=(8,5))
        sns.boxplot(x="label", y=feat, data=df, palette="Set2")
        plt.title(f"Boxplot: {feat} by label")
        plt.tight_layout()
        path = os.path.join(out_dir, f"box_{feat}.png")
        plt.savefig(path)
        info(f"[LO8] Saved {path}")
        plt.close()

        plt.figure(figsize=(8,5))
        sns.histplot(data=df, x=feat, hue="label", kde=True, element="step", stat="density")
        plt.title(f"Distribution: {feat} by label")
        plt.tight_layout()
        path = os.path.join(out_dir, f"hist_{feat}.png")
        plt.savefig(path)
        info(f"[LO8] Saved {path}")
        plt.close()

# -----------------------------
# LO1: Hypothesis testing
# - Compare human vs human and human vs engine for selected metrics
# -----------------------------
def hypothesis_tests(df: pd.DataFrame):
    # two groups
    group_h = df[df.label == "Human vs Human (likely)"]
    group_m = df[df.label == "Human vs Engine (mixed)"]

    tests = {}
    for feat in ["num_moves", "avg_legal_moves", "check_ratio", "capture_ratio"]:
        x = group_h[feat].dropna()
        y = group_m[feat].dropna()
        # normality check (Shapiro) - small samples only; we'll use Mann-Whitney as robust fallback
        # Use t-test if both look normal, otherwise Mann-Whitney U
        use_ttest = False
        try:
            if len(x) >= 8 and len(y) >= 8:
                psh_x = stats.shapiro(x.sample(500) if len(x)>500 else x)[1]
                psh_y = stats.shapiro(y.sample(500) if len(y)>500 else y)[1]
                use_ttest = (psh_x > 0.05 and psh_y > 0.05)
        except Exception:
            use_ttest = False

        if use_ttest:
            stat, pval = stats.ttest_ind(x, y, equal_var=False)
            test_name = "t-test (indep)"
        else:
            stat, pval = stats.mannwhitneyu(x, y, alternative='two-sided')
            test_name = "Mann-Whitney U"

        tests[feat] = {"test": test_name, "stat": float(stat), "pval": float(pval)}
    info("\n[LO1] Hypothesis tests (Human vs Human) vs (Human vs Engine):")
    for feat, res in tests.items():
        sig = "SIGNIFICANT" if res["pval"] < 0.05 else "not significant"
        print(f" - {feat}: {res['test']}, stat={res['stat']:.4f}, p={res['pval']:.4g} => {sig}")
    return tests

# -----------------------------
# LO3 & LO7: Clustering + visualization
# -----------------------------
def clustering_analysis(df: pd.DataFrame, n_clusters=2):
    feats = ["num_moves", "avg_legal_moves", "check_ratio", "capture_ratio"]
    X = df[feats].fillna(0).values
    km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = km.fit_predict(X)
    df["cluster"] = clusters
    cluster_counts = df.groupby(["cluster","label"]).size().unstack(fill_value=0)
    info("\n[LO3] Clustering results (counts per label per cluster):")
    print(cluster_counts)
    return km, df

# -----------------------------
# LO2 & LO4: Classification (Simple model)
# - RandomForest to try to predict label (human vs mixed)
# -----------------------------
def classification_task(df: pd.DataFrame):
    # Prepare X,y
    df2 = df.copy()
    # only keep human vs mixed (we have these two labels)
    df2 = df2[df2.label.isin(["Human vs Human (likely)", "Human vs Engine (mixed)"])]
    df2["y"] = (df2.label == "Human vs Engine (mixed)").astype(int)  # 1=mixed, 0=human
    feats = ["num_moves", "avg_legal_moves", "check_ratio", "capture_ratio"]
    X = df2[feats].fillna(0)
    y = df2["y"]

    # Check class counts
    counts = y.value_counts().to_dict()
    info(f"\n[LO2] Classification dataset class counts: {counts}")
    # If any class has <2 examples, skip classifier
    if y.nunique() < 2 or any(v < 2 for v in counts.values()):
        info("[LO2] Not enough data to train classifier.")
        return None

    # stratify only when possible
    stratify = y if (min(counts.values()) >= 2) else None
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=stratify)
    model = RandomForestClassifier(n_estimators=150, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    proba = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else None

    info("\n[LO2] Classification report (RandomForest):")
    print(classification_report(y_test, y_pred, digits=4))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", cm)
    if proba is not None and len(np.unique(y_test))>1:
        try:
            auc = roc_auc_score(y_test, proba)
            print(f"AUC: {auc:.4f}")
        except Exception:
            pass
    # feature importance
    fi = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
    print("\nFeature importances:\n", fi)
    return model, fi

# -----------------------------
# LO6: Ethical considerations printed
# -----------------------------
def print_ethics_section():
    info("\n[LO6] Ethics & data governance")
    print("""
    - Source: Lichess public database (publicly available PGN archives).
    - No personal data used: usernames are pseudonyms; nonetheless we treat them respectfully.
    - Labeling is heuristic (by username contains 'stockfish'/'bot' etc.) -> not definitive detection of cheating.
    - This project is exploratory; any production use for enforcement would require robust verification and privacy review.
    """)

# -----------------------------
# LO10 & LO11: Save artifacts and provide next steps
# -----------------------------
def save_outputs(df: pd.DataFrame, csv_path=OUTPUT_CSV):
    os.makedirs(os.path.dirname(csv_path) if os.path.dirname(csv_path) else ".", exist_ok=True)
    df.to_csv(csv_path, index=False)
    info(f"[LO10] Saved dataset to {csv_path}")

# -----------------------------
# Main pipeline
# -----------------------------
def main_pipeline():
    # Section: collect data (LO5)
    try:
        collected = stream_collect_lichess_sample(LICHESS_URL, MAX_HUMAN_GAMES, MAX_MIXED_GAMES)
    except Exception as e:
        info(f"[ERROR] Data streaming failed: {e}")
        info("You can download the lichess .pgn.zst manually and set LICHESS_URL to a local file or install zstandard.")
        return

    if len(collected) == 0:
        info("[WARN] No games collected. Check URL, internet, or zstandard availability.")
        return

    df = pd.DataFrame(collected)
    info(f"[LO5] Dataframe shape: {df.shape}")
    # Save raw
    save_outputs(df, OUTPUT_CSV)

    # LO1 & LO2: descriptive stats
    desc = descriptive_statistics(df)

    # LO8: EDA plots
    exploratory_plots(df)

    # LO1: hypothesis tests
    tests = hypothesis_tests(df)

    # LO3 & LO7: clustering
    km, df = clustering_analysis(df, n_clusters=2)

    # LO2: classification
    clfres = classification_task(df)

    # LO6: ethics
    print_ethics_section()

    # LO4: show how AI was used (we'll document; user should include AI assistance notes in README)
    info("\n[LO4] AI assistance: This analysis design and code structure were iteratively improved using generative AI suggestions (document in README).")

    # LO9: applications (print short note)
    info("\n[LO9] Application areas: Online fair-play detection, platform moderation, chess education analytics.")

    # LO10 & LO11: plan & next steps
    info("\n[LO10/LO11] Next steps (document these in README):")
    print("""
    - Extend features (centipawn loss using Stockfish analysis per move).
    - Use temporal features, opening classification, Elo changes.
    - Build Streamlit dashboard (visual interactive UI).
    - Improve engine-detection heuristics; validate with known engine-labeled datasets.
    - Use more months of lichess data and scale training data properly.
    """)

    info("\n=== PIPELINE COMPLETE (LO1-LO11 covered with artifacts) ===")
    return df

# Run main pipeline
if __name__ == "__main__":
    df_final = main_pipeline()







[LO5] Streaming from https://database.lichess.org/standard/lichess_db_standard_rated_2025-09.pgn.zst (will stop once 300 human and 300 mixed games collected)...
  collected total=50 (human=50, mixed=0)
  collected total=100 (human=100, mixed=0)
  collected total=150 (human=149, mixed=1)
  collected total=200 (human=198, mixed=2)
  collected total=250 (human=248, mixed=2)
  collected total=300 (human=298, mixed=2)
  collected total=350 (human=300, mixed=50)
  collected total=400 (human=300, mixed=100)
  collected total=450 (human=300, mixed=150)
  collected total=500 (human=300, mixed=200)
  collected total=550 (human=300, mixed=250)
  collected total=600 (human=300, mixed=300)
[LO5] Streaming complete. Collected 300 human and 300 mixed games (total 600)
[LO5] Dataframe shape: (600, 10)
[LO10] Saved dataset to data/lichess_human_mixed_sample.csv

[LO1] Descriptive statistics (num_moves, avg_legal_moves, check_ratio, capture_ratio):
                 count       mean        std        50%


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="label", y=feat, data=df, palette="Set2")


[LO8] Saved figures\hist_num_moves.png
[LO8] Saved figures\box_avg_legal_moves.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="label", y=feat, data=df, palette="Set2")


[LO8] Saved figures\hist_avg_legal_moves.png
[LO8] Saved figures\box_check_ratio.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="label", y=feat, data=df, palette="Set2")


[LO8] Saved figures\hist_check_ratio.png
[LO8] Saved figures\box_capture_ratio.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x="label", y=feat, data=df, palette="Set2")


[LO8] Saved figures\hist_capture_ratio.png

[LO1] Hypothesis tests (Human vs Human) vs (Human vs Engine):
 - num_moves: Mann-Whitney U, stat=38609.0000, p=0.00261 => SIGNIFICANT
 - avg_legal_moves: Mann-Whitney U, stat=46534.5000, p=0.47 => not significant
 - check_ratio: Mann-Whitney U, stat=42879.0000, p=0.3169 => not significant
 - capture_ratio: Mann-Whitney U, stat=49294.0000, p=0.04314 => SIGNIFICANT

[LO3] Clustering results (counts per label per cluster):
label    Human vs Engine (mixed)  Human vs Human (likely)
cluster                                                  
0                            210                      246
1                             90                       54

[LO2] Classification dataset class counts: {0: 300, 1: 300}

[LO2] Classification report (RandomForest):
              precision    recall  f1-score   support

           0     0.4747    0.5222    0.4974        90
           1     0.4691    0.4222    0.4444        90

    accuracy                  

---

* You may add as many sections as you want, as long as it supports your project workflow.
* All notebook's cells should be run top-down (you can't create a dynamic wherein a given point you need to go back to a previous cell to execute some task, like go back to a previous cell and refresh a variable content)

---

# Push files to Repo

* In cases where you don't need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [23]:
import os
import json
import subprocess
from IPython.display import display, Javascript
import ipynbname

# 1. Save the notebook
display(Javascript('IPython.notebook.save_checkpoint();'))

# 2. Detect current notebook filename
nb_path = ipynbname.path()
notebook_name = str(nb_path.name)

# 3. Load notebook JSON
with open(notebook_name, "r", encoding="utf-8") as f:
    data = json.load(f)

# 4. Force a dummy change to ensure Git detects a difference
# Adds a harmless comment to the first code cell
if "cells" in data and len(data["cells"]) > 0:
    first_cell = data["cells"][0]
    if first_cell.get("cell_type") == "code":
        # Remove any previous dummy line to keep notebook clean
        first_cell["source"] = [line for line in first_cell["source"] if "# Auto-update: forced change" not in line]
        first_cell["source"].append("\n# Auto-update: forced change to trigger Git commit\n")

# 5. Rewrite notebook JSON
with open(notebook_name, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=1)

# 6. Stage for Git
subprocess.run(["git", "add", "-f", notebook_name], check=True)

# 7. Commit changes
try:
    subprocess.run(["git", "commit", "-m", f"Auto-update {notebook_name}"], check=True)
    print(f"Committed changes to {notebook_name}.")
except subprocess.CalledProcessError:
    print("No changes detected by Git. Nothing to commit.")

# 8. Push to GitHub
subprocess.run(["git", "push", "origin", "main"], check=True)
print("Pushed to GitHub successfully.")




<IPython.core.display.Javascript object>

No changes detected by Git. Nothing to commit.
Pushed to GitHub successfully.
