### Analysis Notebook

__Title__: Dota 2 Dataset Analysis\
__Subject__: IKT110 - Artificial Intelligence Architecture\
__Authors__: Cornelius Brandt, Maximilian Eckstein, Mohammad Itani\
__Date__: November 2025\
__Version__: 1.0

This notebook contains the analysis of the Dota 2 matches dataset. The analysis is structured around several questions regarding hero picks, win rates, side advantages, and game durations.

In [1]:
import json
from collections import Counter, defaultdict
from itertools import combinations

import sys
from pathlib import Path
import numpy as np

# Project root, given relative to the current working directory.
PROJECT_ROOT = Path("../")

# Make <PROJECT_ROOT>/src importable. This has to run before the `heroes`
# import below; the membership check avoids appending a duplicate entry when
# the cell is re-run.
SRC_PATH = PROJECT_ROOT / "src"
if str(SRC_PATH) not in sys.path:
    sys.path.append(str(SRC_PATH))

# Project-local helper (presumably located in src/ — TODO confirm).
from heroes import hero_name

# Input files: the JSONL match dataset and the saved model parameters.
DATA_PATH = PROJECT_ROOT / "data" / "analysis_dataset.jsonl"
THETA_PATH = PROJECT_ROOT / "data" / "theta_model.npz"
# Hero ids are assumed to run from 1 to NUM_HEROES — TODO confirm against the heroes module.
NUM_HEROES = 136

In [2]:
def iter_matches(limit=None, path=None):
    """
    Streams matches from analysis_dataset.jsonl, one parsed JSON object per line.

    limit: if not None, stops after 'limit' matches; a limit <= 0 yields nothing.
    path: optional JSONL file to read instead of DATA_PATH (default: DATA_PATH).

    Blank / whitespace-only lines are skipped and do not count towards the limit.
    """
    # Check the limit before yielding anything, otherwise limit=0 would still
    # produce one match.
    if limit is not None and limit <= 0:
        return

    # Resolve the default lazily so the function always reads the current DATA_PATH.
    source = DATA_PATH if path is None else path

    count = 0
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(source, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)
            count += 1
            if limit is not None and count >= limit:
                break

In [3]:
# Preview the first 6 matches to sanity-check the fields used by the analysis.

# Fields printed for each previewed match, in display order.
PREVIEW_FIELDS = (
    "match_id",
    "radiant_win",
    "duration",
    "game_mode",
    "lobby_type",
    "heroes_radiant",
    "heroes_dire",
)

for i, m in enumerate(iter_matches(limit=6)):
    print(f"Match {i}:")
    for field in PREVIEW_FIELDS:
        print(f"  {field}:", m[field])
    print()

Match 0:
  match_id: 5607724594
  radiant_win: False
  duration: 2407
  game_mode: 3
  lobby_type: 7
  heroes_radiant: [36, 27, 41, 31, 98]
  heroes_dire: [35, 103, 16, 67, 9]

Match 1:
  match_id: 5647502064
  radiant_win: True
  duration: 1929
  game_mode: 22
  lobby_type: 7
  heroes_radiant: [14, 40, 97, 10, 18]
  heroes_dire: [100, 67, 28, 63, 75]

Match 2:
  match_id: 5670735120
  radiant_win: False
  duration: 1946
  game_mode: 22
  lobby_type: 7
  heroes_radiant: [106, 22, 85, 91, 49]
  heroes_dire: [74, 14, 63, 41, 107]

Match 3:
  match_id: 5657129776
  radiant_win: True
  duration: 1547
  game_mode: 22
  lobby_type: 7
  heroes_radiant: [62, 104, 84, 70, 27]
  heroes_dire: [7, 106, 68, 129, 44]

Match 4:
  match_id: 5619512449
  radiant_win: False
  duration: 1729
  game_mode: 3
  lobby_type: 7
  heroes_radiant: [44, 79, 60, 22, 112]
  heroes_dire: [104, 47, 87, 9, 67]

Match 5:
  match_id: 5607707456
  radiant_win: True
  duration: 2093
  game_mode: 22
  lobby_type: 7
  heroe

#### Question 1: *"What games did I exclude/include for analysis?"*

In [4]:
# Tally game modes and lobby types and collect every match duration in one
# pass over the dataset.
game_modes, lobby_types = Counter(), Counter()
durations = []

for match in iter_matches():
    game_modes.update((match["game_mode"],))
    lobby_types.update((match["lobby_type"],))
    durations.append(match["duration"])

# Every match contributes exactly one game-mode count, so the sum is the
# number of matches.
n_matches = sum(game_modes.values())
min_duration = min(durations)
max_duration = max(durations)

print("Game modes (after filter):", game_modes)
print("Lobby types (after filter):", lobby_types)
print("Number of matches:", n_matches)
print("Min. duration:", min_duration, "Max:", max_duration)

Game modes (after filter): Counter({22: 1652497, 3: 234464, 2: 76})
Lobby types (after filter): Counter({7: 1887037})
Number of matches: 1887037
Min. duration: 600 Max: 3869


#### Question 2: *"What hero is the most picked?"*

In [5]:
# Count how often each hero id appears across both teams of every match.
pick_counts = Counter()

for match in iter_matches():
    for side_key in ("heroes_radiant", "heroes_dire"):
        pick_counts.update(match[side_key])

print("Top 10 most picked heroes:")
top_picks = pick_counts.most_common(10)
for hero_id, cnt in top_picks:
    print(f"Hero: {hero_name(hero_id)}, Count: {cnt:,}")

Top 10 most picked heroes:
Hero: Pudge, Count: 463,747
Hero: Windrunner, Count: 441,630
Hero: Juggernaut, Count: 428,344
Hero: Invoker, Count: 401,865
Hero: Ogre Magi, Count: 378,114
Hero: Lion, Count: 373,756
Hero: Phantom Assassin, Count: 361,449
Hero: Faceless Void, Count: 361,040
Hero: Rubick, Count: 357,511
Hero: Antimage, Count: 328,744


#### Question 3: *"What hero has the highest win rate?"*

In [6]:
# Per-hero appearance and win tallies.
games_per_hero = Counter()
wins_per_hero = Counter()

for match in iter_matches():
    radiant_team = match["heroes_radiant"]
    dire_team = match["heroes_dire"]

    # Every hero in the match played one game ...
    games_per_hero.update(radiant_team + dire_team)
    # ... but only heroes on the winning side are credited with a win.
    wins_per_hero.update(radiant_team if match["radiant_win"] else dire_team)

# Win rate per hero; heroes with fewer than 50 games are left out
# (optional minimum-games threshold).
hero_winrates = {
    hid: wins_per_hero[hid] / games
    for hid, games in games_per_hero.items()
    if games >= 50
}

# Ten highest win rates, best first.
ranked_winrates = sorted(hero_winrates.items(), key=lambda item: item[1], reverse=True)
top = ranked_winrates[:10]
print("Top 10 heroes by winrate (min 50 games):")
for hid, wr in top:
    print(f"Hero: {hero_name(hid)}, Winrate: {wr:.3f}")

Top 10 heroes by winrate (min 50 games):
Hero: Broodmother, Winrate: 0.643
Hero: Underlord, Winrate: 0.601
Hero: Drow Ranger, Winrate: 0.590
Hero: Clinkz, Winrate: 0.587
Hero: Lycan, Winrate: 0.569
Hero: Bloodseeker, Winrate: 0.564
Hero: Visage, Winrate: 0.563
Hero: Meepo, Winrate: 0.563
Hero: Ogre Magi, Winrate: 0.553
Hero: Vengeful Spirit, Winrate: 0.551


#### Question 4: *"Is there an advantage to playing Dire or Radiant?"* and Question 4a: *"What hero is most affected by the side?"*

In [7]:
# Overall side win counts and per-hero, per-side game/win counts are gathered
# in a single pass, so the dataset is streamed and parsed only once.
total = 0
radiant_wins = 0
rad_games = Counter()
rad_wins = Counter()
dire_games = Counter()
dire_wins = Counter()

for m in iter_matches():
    radiant_won = m["radiant_win"]
    rad = m["heroes_radiant"]
    dire = m["heroes_dire"]

    total += 1
    rad_games.update(rad)
    dire_games.update(dire)
    if radiant_won:
        radiant_wins += 1
        rad_wins.update(rad)
    else:
        dire_wins.update(dire)

if total == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    raise ValueError("No matches found; cannot compute side winrates.")

wr_radiant = radiant_wins / total
wr_dire = 1.0 - wr_radiant

print("Overall winrates by side:")
print("Radiant winrate:", f"{wr_radiant:.3f}")
print("Dire winrate:", f"{wr_dire:.3f}")

# Hero most affected by side.
# NOTE: the difference below is not adjusted for the overall side gap
# (wr_radiant - wr_dire), so that gap is included in every hero's value.
side_diff = {}  # hero_id -> |WR_Radiant - WR_Dire|
for hid in range(1, NUM_HEROES+1):
    rg = rad_games[hid]
    dg = dire_games[hid]
    if rg < 50 or dg < 50:  # minimum games threshold (per side)
        continue
    wr_rad = rad_wins[hid] / rg
    wr_dir = dire_wins[hid] / dg
    side_diff[hid] = abs(wr_rad - wr_dir)

top_side_affected = sorted(side_diff.items(), key=lambda t: t[1], reverse=True)[:10]
print("\nHeroes most affected by side:")
for hid, diff in top_side_affected:
    print(f"Hero: {hero_name(hid)}, Difference: {diff:.3f}")

Overall winrates by side:
Radiant winrate: 0.575
Dire winrate: 0.425

Heroes most affected by side:
Hero: Sniper, Difference: 0.177
Hero: Pudge, Difference: 0.172
Hero: Drow Ranger, Difference: 0.164
Hero: Zeus, Difference: 0.164
Hero: Viper, Difference: 0.162
Hero: Arc Warden, Difference: 0.161
Hero: Medusa, Difference: 0.161
Hero: Venomancer, Difference: 0.161
Hero: Skywrath Mage, Difference: 0.161
Hero: Spectre, Difference: 0.160


#### Question 5: *"What hero has the highest impact on the game? (Define impact yourself)."*

In [8]:
# Load the saved parameter vector. NpzFile keeps the archive open, so use it
# as a context manager; the array itself is read into memory before closing.
with np.load(THETA_PATH) as data:   # NpzFile-Object
    theta = data["theta"]           # theta-array
theta_no_bias = theta[1:]           # drop bias-term

# === Find strongest absolute impact ===
# "Impact" is defined here as the largest absolute weight, regardless of sign.
idx = int(np.argmax(np.abs(theta_no_bias)))  # position in the bias-free vector
impact_value = theta_no_bias[idx]

# === Map index -> hero_id ===
# Assumes theta_no_bias[k] belongs to hero id k + 1 — TODO confirm against the
# code that produced theta_model.npz.
hero_id = idx + 1   # because idx = hero_id - 1

# Find hero name from ID
name = hero_name(hero_id)

# === Display results ===
print("=== Highest Impact Hero ===")
print(f"Hero ID                  : {hero_id}")
print(f"Hero Name                : {name}")
print(f"Theta Value (Impact)     : {impact_value:.6f}")

=== Highest Impact Hero ===
Hero ID                  : 108
Hero Name                : Underlord
Theta Value (Impact)     : 0.374044


#### Questions 6 and 7: *"What hero has the longest/shortest games?"*

In [9]:
# Accumulate total duration and number of games for every hero in a match.
duration_sum = Counter()
duration_count = Counter()

for match in iter_matches():
    seconds = match["duration"]
    for hid in (*match["heroes_radiant"], *match["heroes_dire"]):
        duration_sum[hid] += seconds
        duration_count[hid] += 1

# Mean duration per hero; heroes with fewer than 50 games are skipped.
avg_duration = {
    hid: duration_sum[hid] / cnt
    for hid, cnt in duration_count.items()
    if cnt >= 50
}

# Longest: ten highest averages, in descending order.
longest = sorted(avg_duration.items(), key=lambda item: item[1], reverse=True)[:10]
# Shortest: ten lowest averages, in ascending order.
shortest = sorted(avg_duration.items(), key=lambda item: item[1])[:10]

rankings = (
    ("Heroes with longest average games:", longest),
    ("\nHeroes with shortest average games:", shortest),
)
for heading, ranking in rankings:
    print(heading)
    for hid, secs in ranking:
        print(f"Hero: {hero_name(hid)}, Average Duration: {secs/60:.2f} minutes")

Heroes with longest average games:
Hero: Tinker, Average Duration: 33.49 minutes
Hero: Zeus, Average Duration: 33.21 minutes
Hero: Spectre, Average Duration: 33.12 minutes
Hero: Sniper, Average Duration: 32.96 minutes
Hero: Ancient Apparition, Average Duration: 32.96 minutes
Hero: Razor, Average Duration: 32.79 minutes
Hero: Earthshaker, Average Duration: 32.72 minutes
Hero: Viper, Average Duration: 32.69 minutes
Hero: Sand King, Average Duration: 32.68 minutes
Hero: Windrunner, Average Duration: 32.66 minutes

Heroes with shortest average games:
Hero: Broodmother, Average Duration: 29.10 minutes
Hero: Meepo, Average Duration: 29.98 minutes
Hero: Lycan, Average Duration: 30.43 minutes
Hero: Chen, Average Duration: 30.47 minutes
Hero: Huskar, Average Duration: 30.50 minutes
Hero: Lone Druid, Average Duration: 30.71 minutes
Hero: Io, Average Duration: 30.88 minutes
Hero: Visage, Average Duration: 31.01 minutes
Hero: Beastmaster, Average Duration: 31.08 minutes
Hero: Templar Assassin, Ave

#### Question 8: *"What pair of heroes are the best together?"*

In [10]:
# Individual hero stats and same-team pair stats are collected in one pass
# over the dataset instead of two.
hero_games = Counter()
hero_wins = Counter()
pair_games = Counter()
pair_wins = Counter()

for m in iter_matches():
    radiant = m["heroes_radiant"]
    dire = m["heroes_dire"]
    radiant_win = m["radiant_win"]

    # Radiant is processed before Dire so that counter insertion order, and
    # with it tie-breaking in the stable sort below, stays deterministic.
    for team, team_won in ((radiant, bool(radiant_win)), (dire, not radiant_win)):
        hero_games.update(team)
        if team_won:
            hero_wins.update(team)

        for h1, h2 in combinations(team, 2):
            # normalize order so (a, b) and (b, a) share one key
            pair = (h1, h2) if h1 <= h2 else (h2, h1)
            pair_games[pair] += 1
            if team_won:
                pair_wins[pair] += 1

# Only consider heroes with enough games
hero_wr = {
    h: hero_wins[h] / hero_games[h]
    for h in hero_games
    if hero_games[h] >= 50
}


# Synergy = pair win rate minus the mean of the two individual win rates.
synergy_list = []

for (h1, h2), games in pair_games.items():
    if games < 50:  # avoid noise
        continue
    if h1 not in hero_wr or h2 not in hero_wr:
        continue

    wr_pair = pair_wins[(h1, h2)] / games
    expected_wr = (hero_wr[h1] + hero_wr[h2]) / 2
    synergy = wr_pair - expected_wr

    synergy_list.append(((h1, h2), games, wr_pair, synergy))

synergy_sorted = sorted(synergy_list, key=lambda x: x[3], reverse=True)[:3]

print("\nTop hero pairs by synergy (overperformance):")
for (h1, h2), games, wr_pair, syn in synergy_sorted:
    print(
        f"{hero_name(h1)} + {hero_name(h2)} — "
        f"WR together: {wr_pair:.3f}, "
        f"Games: {games}, "
        f"Synergy: {syn:.3f}"
    )



Top hero pairs by synergy (overperformance):
Lycan + Elder Titan — WR together: 0.735, Games: 185, Synergy: 0.207
Broodmother + Lycan — WR together: 0.765, Games: 310, Synergy: 0.158
Huskar + Lycan — WR together: 0.714, Games: 322, Synergy: 0.155


#### Question 9: *"What hero is hardest countered by another hero?"*

In [11]:
# Individual hero stats and cross-team matchup stats are collected in one
# pass over the dataset instead of two.
hero_games = Counter()
hero_wins = Counter()

# matchup_games[(A,B)] = times A faced B
# matchup_wins[(A,B)] = times A won vs B
matchup_games = Counter()
matchup_wins = Counter()

for m in iter_matches():
    radiant = m["heroes_radiant"]
    dire = m["heroes_dire"]
    radiant_win = m["radiant_win"]

    # First A on Radiant facing B on Dire, then A on Dire facing B on Radiant.
    # This order fixes the insertion order of matchup_games and therefore the
    # order of counter_list, which later cells rely on for stable-sort ties.
    sides = (
        (radiant, dire, bool(radiant_win)),
        (dire, radiant, not radiant_win),
    )
    for team, opponents, team_won in sides:
        hero_games.update(team)
        if team_won:
            hero_wins.update(team)

        for A in team:
            for B in opponents:
                matchup_games[(A,B)] += 1
                if team_won:
                    matchup_wins[(A,B)] += 1

# Only heroes with enough games
hero_wr = {
    h: hero_wins[h] / hero_games[h]
    for h in hero_games
    if hero_games[h] >= 50
}


# counter_list rows: (A, B, games, WR of A vs B, counter score).
# Counter score = A's overall win rate minus A's win rate when facing B,
# so a large positive value means B suppresses A.
counter_list = []

for (A, B), games in matchup_games.items():
    if games < 50:
        continue
    if A not in hero_wr:
        continue

    wr_A_vs_B = matchup_wins[(A,B)] / games
    expected = hero_wr[A]  # how hero A usually performs
    counter_score = expected - wr_A_vs_B

    counter_list.append((A, B, games, wr_A_vs_B, counter_score))

counter_sorted = sorted(counter_list, key=lambda x: x[4], reverse=True)[:3]

print("\nTop 3 strongest hero counters:")
for A, B, games, wrAB, score in counter_sorted:
    print(
        f"{hero_name(B)} counters {hero_name(A)} - "
        f"WR({hero_name(A)} vs {hero_name(B)}): {wrAB:.3f}, "
        f"Games: {games}, "
        f"CounterScore: {score:.3f}"
    )



Top 3 strongest hero counters:
Broodmother counters Viper - WR(Viper vs Broodmother): 0.270, Games: 1890, CounterScore: 0.270
Broodmother counters Clinkz - WR(Clinkz vs Broodmother): 0.329, Games: 2864, CounterScore: 0.258
Broodmother counters Arc Warden - WR(Arc Warden vs Broodmother): 0.281, Games: 2179, CounterScore: 0.258


#### Question 10: *"What hero is the best if it is not countered by its TOP 5 counters (the 'if not countered, it will win' type of hero)?"*

In [12]:
# Counters already computed: relies on hero_wr and counter_list from the
# Question 9 cell.

# Group the counter scores by countered hero once, instead of rescanning the
# whole counter_list for every hero.
scores_by_hero = defaultdict(list)
for A1, B, games, wrAB, counter_score in counter_list:
    scores_by_hero[A1].append((B, counter_score))

top_counters = {}  # hero A -> list of (up to) 5 heroes that counter A
for A in hero_wr:
    counters_sorted = sorted(scores_by_hero.get(A, []), key=lambda x: x[1], reverse=True)[:5]
    top_counters[A] = [B for B, s in counters_sorted]

# Initialize stats
safe_games = Counter()       # games without any top-5 counter on the enemy team
safe_wins = Counter()
countered_games = Counter()  # games with at least one top-5 counter on the enemy team
countered_wins = Counter()

for m in iter_matches():
    radiant = set(m["heroes_radiant"])
    dire = set(m["heroes_dire"])
    radiant_win = m["radiant_win"]

    # Heroes on Radiant first, then heroes on Dire.
    sides = (
        (radiant, dire, bool(radiant_win)),
        (dire, radiant, not radiant_win),
    )
    for team, enemies, team_won in sides:
        for A in team:
            if A not in top_counters:
                continue

            # An empty counter list is disjoint from any enemy team, so heroes
            # without known counters are always counted as "safe".
            if enemies.isdisjoint(top_counters[A]):
                safe_games[A] += 1
                if team_won:
                    safe_wins[A] += 1
            else:  # at least one counter present
                countered_games[A] += 1
                if team_won:
                    countered_wins[A] += 1

# Compute safe and countered winrates
comparison_list = []

for A in safe_games:
    # The ranking below is by the non-countered win rate, so that sample has
    # to be large enough; otherwise a hero with a handful of lucky "safe"
    # games could top the list.
    if safe_games[A] < 50:
        continue

    safe_wr = safe_wins[A] / safe_games[A]
    countered_wr = countered_wins[A] / countered_games[A] if countered_games[A] > 0 else None

    comparison_list.append((A, safe_games[A], safe_wr, countered_games[A], countered_wr))

# Sort by safe winrate
comparison_sorted = sorted(comparison_list, key=lambda x: x[2], reverse=True)[:3]

# Print
print("\nBest Heroes if not countered:")
for A, sg, swr, cg, cwr in comparison_sorted:
    # A hero never seen against its counters has no countered win rate;
    # formatting None with ':.3f' would raise a TypeError.
    cwr_text = f"{cwr:.3f}" if cwr is not None else "n/a"
    print(f"{hero_name(A)}: Not Countered WR: {swr:.3f} ({sg} games), Countered WR: {cwr_text} ({cg} games)")



Best Heroes if not countered:
Broodmother: Not Countered WR: 0.659 (42042 games), Countered WR: 0.533 (6267 games)
Underlord: Not Countered WR: 0.632 (137704 games), Countered WR: 0.510 (46079 games)
Clinkz: Not Countered WR: 0.606 (144984 games), Countered WR: 0.433 (17784 games)


#### Question 11: *"Give 2 heroes that a team can safely first pick."*

In [13]:
# Relies on hero_wr and counter_list from the Question 9 cell.

# Step 1: Compute a safety score per hero.
# Group the counter scores by countered hero once, instead of rescanning the
# whole counter_list for every hero.
counter_scores = defaultdict(list)
for A1, B, games, wrAB, score in counter_list:
    counter_scores[A1].append(score)

first_pick_scores = []

for A in hero_wr:
    overall_wr = hero_wr[A]

    # The five largest counter scores, i.e. the five matchups that hurt A most.
    top5 = sorted(counter_scores.get(A, []), reverse=True)[:5]
    if top5:
        avg_counter_score = sum(top5) / len(top5)
    else:
        avg_counter_score = 0  # hero has no strong counters

    # Overall win rate, penalized by how hard the worst counters hit.
    safety_score = overall_wr - avg_counter_score

    first_pick_scores.append((A, safety_score, overall_wr, avg_counter_score))

# Step 2: Sort by safety score descending
first_pick_sorted = sorted(first_pick_scores, key=lambda x: x[1], reverse=True)[:2]

print("\nTop 2 heroes to safely first pick:")
for A, safety_score, overall_wr, avg_counter in first_pick_sorted:
    print(f"{hero_name(A)} — Safety Score: {safety_score:.3f}, WR: {overall_wr:.3f}, Avg Counter: {avg_counter:.3f}")



Top 2 heroes to safely first pick:
Broodmother — Safety Score: 0.524, WR: 0.643, Avg Counter: 0.119
Underlord — Safety Score: 0.497, WR: 0.601, Avg Counter: 0.104


#### Question 12: *"How can Molde Dotaklubb use the webpage to improve?"*

In [14]:
print("This question will be answered in the report.")

This question will be answered in the report.
