In [22]:
import random
from collections import Counter
import pandas as pd

In [2]:
# Toy word bank: main() scores each of these words as a guess and also draws
# the sampled answers from this same list.
WORD_LIST = [
    "PRINT", "POINT", "WATER", "WASTE", "PONNY",
    "POWER", "SNACK", "SLEEP", "SLICE", "FANCY",
]

In [3]:
N_SAMPLES = 1000          # number of games sampled per starting word
RANDOM_SEED = 42          # change this to get a different random outcome

In [4]:
def positional_probabilities(words):
    """Return five dicts, one per letter position.

    The dict for position i maps each letter seen at that position to the
    fraction of `words` that have the letter there.
    """
    tallies = [Counter() for _ in range(5)]
    for word in words:
        # enumerate("PRINT") yields (0, 'P'), (1, 'R'), (2, 'I'), (3, 'N'), (4, 'T')
        for slot, letter in enumerate(word):
            tallies[slot][letter] += 1
    word_count = len(words)
    probabilities = []
    for tally in tallies:
        probabilities.append({letter: hits / word_count for letter, hits in tally.items()})
    return probabilities

In [5]:
def word_score(word, pos_probs):
    """Add up, for every letter of `word`, its probability at that position.

    Letters missing from a position's table contribute 0.
    """
    per_letter = (pos_probs[slot].get(letter, 0) for slot, letter in enumerate(word))
    return sum(per_letter)

In [6]:
def feedback(guess, answer):
    """Return the Wordle feedback for `guess` against `answer`.

    The result is a 5-character string: 'G' for a letter in the right spot,
    'Y' for a letter that occurs elsewhere in the answer, 'B' otherwise.
    Each answer letter can be matched by at most one guess letter.
    """
    marks = ['B'] * 5
    spare = list(answer)

    # Pass 1 (green): exact matches use up their answer letter.
    for pos, pair in enumerate(zip(guess, answer)):
        if pair[0] == pair[1]:
            marks[pos] = 'G'
            spare[pos] = None

    # Pass 2 (yellow): each remaining guess letter claims the first unused
    # occurrence of that letter in the answer, if there is one.
    for pos, letter in enumerate(guess):
        if marks[pos] != 'B':
            continue
        try:
            hit = spare.index(letter)
        except ValueError:
            continue
        marks[pos] = 'Y'
        spare[hit] = None

    return ''.join(marks)

In [7]:
def main():
    """Tabulate the positional score of every word against sampled feedback.

    Seeds the global `random` generator with RANDOM_SEED, then for each word
    in WORD_LIST draws N_SAMPLES answers from WORD_LIST and averages the
    number of green and yellow tiles. Prints the table sorted by score
    (descending) followed by the correlation between score and average hits.
    """
    random.seed(RANDOM_SEED)
    pos_probs = positional_probabilities(WORD_LIST)

    records = []
    for guess in WORD_LIST:
        greens = 0
        yellows = 0
        for _ in range(N_SAMPLES):
            pattern = feedback(guess, random.choice(WORD_LIST))
            greens += pattern.count('G')
            yellows += pattern.count('Y')
        mean_green = greens / N_SAMPLES
        mean_yellow = yellows / N_SAMPLES
        record = {
            "word": guess,
            "score_sum": round(word_score(guess, pos_probs), 3),
            "avg_green": round(mean_green, 3),
            "avg_yellow": round(mean_yellow, 3),
            "avg_total": round(mean_green + mean_yellow, 3),
        }
        records.append(record)

    table = pd.DataFrame(records)
    table = table.sort_values("score_sum", ascending=False)
    table = table.reset_index(drop=True)
    corr = table["score_sum"].corr(table["avg_total"])

    print(table.to_string(index=False))
    print(f"\nCorrelation between score_sum and avg_total: {corr:.3f}")


if __name__ == "__main__":
    main()

  word  score_sum  avg_green  avg_yellow  avg_total
 POINT        1.5      1.546       0.484      2.030
 PONNY        1.4      1.365       0.194      1.559
 PRINT        1.3      1.298       0.670      1.968
 POWER        1.3      1.305       0.610      1.915
 SLICE        1.3      1.304       0.393      1.697
 WATER        1.1      1.074       0.790      1.864
 FANCY        1.1      1.080       0.429      1.509
 SLEEP        1.0      1.083       0.687      1.770
 WASTE        0.9      0.906       1.114      2.020
 SNACK        0.9      0.888       0.809      1.697

Correlation between score_sum and avg_total: 0.094


In [8]:
import random
from collections import Counter
import pandas as pd

In [9]:
# Word bank: main() uses it for the guesses, the sampled answers, and the
# probability tables.
WORD_LIST = [
    "PRINT", "POINT", "WATER", "WASTE", "PONNY",
    "POWER", "SNACK", "SLEEP", "SLICE", "FANCY",
]
N_SAMPLES   = 1000     # number of games sampled per starting word
RANDOM_SEED = 42

In [10]:
def positional_probabilities(words):
    """Return 5 dicts; the i-th maps letter -> probability at position i."""
    per_slot = [Counter() for _ in range(5)]
    # Flatten every word into (position, letter) pairs and tally them.
    for slot, letter in (hit for word in words for hit in enumerate(word)):
        per_slot[slot][letter] += 1
    word_count = len(words)
    tables = []
    for tally in per_slot:
        tables.append({letter: seen / word_count for letter, seen in tally.items()})
    return tables

In [11]:
def overall_probabilities(words):
    """Return one dict: letter -> probability of appearing at any position.

    The denominator is 5 letter slots per word.
    """
    letter_counts = Counter()
    for word in words:
        letter_counts.update(word)
    slot_total = len(words) * 5
    return {letter: seen / slot_total for letter, seen in letter_counts.items()}

In [12]:
def word_score(word, pos_probs, any_probs):
    """Sum the positional and overall probability of each letter (10 terms for 5 letters)."""
    total = 0.0
    for slot in range(len(word)):
        letter = word[slot]
        positional = pos_probs[slot].get(letter, 0)
        overall = any_probs.get(letter, 0)
        total += positional + overall
    return total

In [13]:
def feedback(guess, answer):
    """Wordle feedback string of 'G'/'Y'/'B', honouring the repeated-letter rule."""
    marks = ['B'] * 5
    pool = list(answer)

    # Green pass: exact matches, each one taking its answer letter out of play.
    for idx, (g, a) in enumerate(zip(guess, answer)):
        if g != a:
            continue
        marks[idx] = 'G'
        pool[idx] = None

    # Yellow pass: a non-green letter scores only while an unused copy remains.
    for idx, g in enumerate(guess):
        if marks[idx] == 'B' and g in pool:
            marks[idx] = 'Y'
            pool.remove(g)
    return ''.join(marks)

In [14]:
def main():
    """Compare the 10-term probability score with sampled feedback counts.

    For every word in WORD_LIST, N_SAMPLES answers are drawn from WORD_LIST
    with the global `random` generator (seeded with RANDOM_SEED). Prints the
    table sorted by score (descending) and the score/avg_total correlation.
    """
    random.seed(RANDOM_SEED)

    pos_probs = positional_probabilities(WORD_LIST)
    any_probs = overall_probabilities(WORD_LIST)

    rows = []
    for guess in WORD_LIST:
        score = word_score(guess, pos_probs, any_probs)

        # One feedback pattern per sampled game.
        patterns = [
            feedback(guess, random.choice(WORD_LIST))
            for _ in range(N_SAMPLES)
        ]
        avg_g = sum(p.count('G') for p in patterns) / N_SAMPLES
        avg_y = sum(p.count('Y') for p in patterns) / N_SAMPLES

        row = {
            "word": guess,
            "score_10prob": round(score, 3),
            "avg_green":   round(avg_g, 3),
            "avg_yellow":  round(avg_y, 3),
            "avg_total":   round(avg_g + avg_y, 3),
        }
        rows.append(row)

    table = pd.DataFrame(rows)
    table = table.sort_values("score_10prob", ascending=False).reset_index(drop=True)

    # Correlation: score vs. average number of coloured tiles.
    corr = table["score_10prob"].corr(table["avg_total"])

    print(table.to_string(index=False))
    print(f"\nCorrelation between 10-prob score and avg_total: {corr:.3f}")


if __name__ == "__main__":
    main()

  word  score_10prob  avg_green  avg_yellow  avg_total
 POINT          1.92      1.546       0.484      2.030
 PONNY          1.84      1.365       0.194      1.559
 PRINT          1.72      1.298       0.670      1.968
 POWER          1.70      1.305       0.610      1.915
 SLICE          1.66      1.304       0.393      1.697
 WATER          1.50      1.074       0.790      1.864
 SLEEP          1.46      1.083       0.687      1.770
 FANCY          1.42      1.080       0.429      1.509
 WASTE          1.32      0.906       1.114      2.020
 SNACK          1.26      0.888       0.809      1.697

Correlation between 10-prob score and avg_total: 0.177


# new probability

In [None]:
"""
wordle_sim_10prob.py
--------------------------------------------------------
1. 對 WORD_LIST 中每個五字母單詞計算「10 項機率分數」
     score_10prob = Σ_i [ P_pos(i,c_i) + P_any(c_i) ]
2. 針對每個單詞作為起始詞，隨機抽 N_GAMES 局：
     • 第一步固定用此起始詞
     • 之後永遠選目前候選集的第一個單詞當猜測
     • 記錄完成答案所需步數
3. 輸出表格 (score_10prob, avg_guesses) 與相關係數
--------------------------------------------------------
"""

In [1]:
import random
from collections import Counter
import pandas as pd

In [2]:
# Word bank: main() passes it to simulate() as the answer pool and candidate
# bank, and tries each word as the starting guess.
WORD_LIST = [
    "PRINT", "POINT", "WATER", "WASTE", "PONNY",
    "POWER", "SNACK", "SLEEP", "SLICE", "FANCY",
]

In [3]:
N_GAMES   = 1000      # number of games simulated per starting word
RANDOM_SEED = 42      # change this for a different random sequence

In [4]:
# ---------- 機率計算 -----------------------------------------------------------
def positional_probabilities(words):
    """Return a list of length 5; each entry maps letter -> probability at that position."""
    slot_counts = [Counter() for _ in range(5)]
    for entry in words:
        for slot in range(len(entry)):
            slot_counts[slot][entry[slot]] += 1
    n_words = len(words)
    return [
        dict((letter, count / n_words) for letter, count in tally.items())
        for tally in slot_counts
    ]

In [5]:
def overall_probabilities(words):
    """Return a single dict: letter -> probability of appearing at any position."""
    letter_counts = Counter(letter for word in words for letter in word)
    slots = 5 * len(words)  # five letter slots per word
    shares = {}
    for letter, seen in letter_counts.items():
        shares[letter] = seen / slots
    return shares

In [6]:
def word_score_10prob(word, pos_probs, any_probs):
    """Sum, per letter, its positional probability plus its overall probability (10 terms)."""
    def letter_term(slot, letter):
        # Letters unknown to either table contribute 0 for that table.
        return pos_probs[slot].get(letter, 0) + any_probs.get(letter, 0)

    return sum(letter_term(slot, letter) for slot, letter in enumerate(word))

In [7]:
# ---------- Wordle 相關 --------------------------------------------------------
def feedback(guess, answer):
    """Wordle feedback string made of 'G'/'Y'/'B', honouring the repeated-letter rule."""
    marks = ['B'] * 5
    unmatched = list(answer)

    # Green pass: blank out every answer letter that is matched in place.
    for pos, (g, a) in enumerate(zip(guess, answer)):
        if g == a:
            marks[pos] = 'G'
            unmatched[pos] = None

    # Yellow pass: count the answer letters still available and spend one
    # per matching non-green guess letter, left to right.
    stock = Counter(ch for ch in unmatched if ch is not None)
    for pos, g in enumerate(guess):
        if marks[pos] == 'B' and stock[g] > 0:
            marks[pos] = 'Y'
            stock[g] -= 1
    return ''.join(marks)

In [8]:
def filter_candidates(candidates, guess, fb):
    """Keep the words that would have produced feedback `fb` for `guess`."""
    survivors = []
    for word in candidates:
        if feedback(guess, word) == fb:
            survivors.append(word)
    return survivors

In [9]:
def guesses_needed(answer, start, bank):
    """Return how many guesses it takes to reach `answer`.

    The first guess is always `start`; every later guess is the first word
    of the candidate list that is still consistent with all feedback so far.
    """
    if start == answer:
        return 1

    pool = bank.copy()
    guess, steps = start, 1
    while guess != answer:
        pool = filter_candidates(pool, guess, feedback(guess, answer))
        guess = pool[0]
        steps += 1
    return steps

In [10]:
def simulate(start_word, words, n_games):
    """Play `n_games` random games opening with `start_word`; return the mean guess count.

    A fresh generator seeded with RANDOM_SEED is created on every call, so
    calls with the same `words` and `n_games` draw the same answer sequence.
    """
    rng = random.Random(RANDOM_SEED)
    step_counts = (
        guesses_needed(rng.choice(words), start_word, words)
        for _ in range(n_games)
    )
    return sum(step_counts) / n_games

In [11]:
def main():
    """Print, for every starting word, its 10-term score and its simulated average guess count.

    The table is sorted by average guesses (ascending) and is followed by
    the correlation between the two columns.
    """
    random.seed(RANDOM_SEED)

    pos_probs = positional_probabilities(WORD_LIST)
    any_probs = overall_probabilities(WORD_LIST)

    rows = [
        {
            "word": w,
            "score_10prob": round(word_score_10prob(w, pos_probs, any_probs), 4),
            "avg_guesses": round(simulate(w, WORD_LIST, N_GAMES), 4),
        }
        for w in WORD_LIST
    ]

    table = pd.DataFrame(rows)
    table = table.sort_values("avg_guesses").reset_index(drop=True)
    corr = table["score_10prob"].corr(table["avg_guesses"])  # correlation coefficient

    print("Results over", N_GAMES, "games per starting word\n")
    print(table.to_string(index=False))
    print(f"\nCorrelation (score_10prob vs avg_guesses): {corr:.3f}")


if __name__ == "__main__":
    main()

Results over 1000 games per starting word

  word  score_10prob  avg_guesses
 WATER          1.50        1.915
 WASTE          1.32        1.992
 PRINT          1.72        2.004
 SLICE          1.66        2.087
 POINT          1.92        2.103
 PONNY          1.84        2.106
 SLEEP          1.46        2.116
 SNACK          1.26        2.119
 POWER          1.70        2.126
 FANCY          1.42        2.234

Correlation (score_10prob vs avg_guesses): 0.026


# ExpRem

In [23]:
from collections import Counter
import math, statistics
import numpy as np

In [24]:
def feedback(guess, answer):
    """Return the Wordle pattern of `guess` against `answer` as a 5-character string.

    '2' = same letter in the same spot (green), '1' = letter available
    elsewhere in the answer (yellow), '0' = no match (gray). Both words are
    upper-cased before comparison.
    """
    guess = guess.upper()
    answer = answer.upper()
    marks = ["0"] * 5        # every cell starts gray
    spare = Counter()        # answer letters not consumed by a green match

    # Pass 1: mark greens; answer letters at non-matching spots go to `spare`.
    for pos, (g, a) in enumerate(zip(guess, answer)):
        if g == a:
            marks[pos] = "2"
        else:
            spare[a] += 1

    # Pass 2: mark yellows left to right while copies remain, so repeated
    # letters are handed out first come, first served.
    for pos, g in enumerate(guess):
        if marks[pos] != "0" or spare[g] <= 0:
            continue
        marks[pos] = "1"
        spare[g] -= 1
    return "".join(marks)

In [25]:
#找到 Sw,p 
def alive_after(guess, pattern, cand):
    """Return the candidates for which `guess` would yield exactly `pattern` (the subset S_{w,p})."""
    survivors = []
    for word in cand:
        if feedback(guess, word) == pattern:
            survivors.append(word)
    return survivors
#逐一重算 feedback(guess, w)，留下 pattern 相同的字──這就是定義裡的子集合 Sw,p

In [4]:
#期望剩餘量E[|Sw,p|]
#def exprem(guess, cand):
#    return sum(len(alive_after(guess, feedback(guess,a), cand)) for a in cand)/len(cand)
def exprem(guess, cand):
    """Expected number of candidates left after playing `guess`: E[|S_{w,p}|].

    For every possible answer a in `cand`, take the size of the subset that
    shares a's feedback pattern, add the sizes up and divide by len(cand).
    A smaller value means the guess eliminates more words on average.

    Each pattern's subset is measured only once and then reused, because all
    answers with the same pattern share the same subset. The returned value
    is the same as scanning the subset again for every answer.

    Raises ZeroDivisionError when `cand` is empty.
    """
    subset_size = {}
    total = 0
    for answer in cand:                       # sum over a in S
        pat = feedback(guess, answer)         # p(a)
        if pat not in subset_size:            # measure each pattern once
            subset_size[pat] = len(alive_after(guess, pat, cand))
        total += subset_size[pat]             # |S_{w,p(a)}|
    return total / len(cand)                  # divide by |S|
#按公式做「加總再除以候選個數」；結果越小代表平均砍掉更多字。

In [26]:
def exprem(guess, cand):
    """
    ExpRem with one feedback call per candidate:
    ExpRem = sum(cnt^2) / N, where cnt is how often each pattern occurs
    and N is the number of candidates.
    """
    tally = Counter()
    for ans in cand:
        tally[feedback(guess, ans)] += 1
    squared_total = sum(size ** 2 for size in tally.values())
    return squared_total / len(cand)

In [42]:
#最壞殘存量 max_p|Sw,p|
#def worstrem(guess, cand):
#    sizes={}
#    for a in cand:
#        p=feedback(guess,a)
#        if p not in sizes:
#            sizes[p]=len(alive_after(guess,p,cand))
#    return max(sizes.values())
def worstrem(guess, cand):
    """Worst-case number of candidates left after `guess`: max over patterns of |S_{w,p}|.

    Returns 0 when `cand` is empty.
    """
    subset_size = {}
    for answer in cand:
        pat = feedback(guess, answer)
        if pat in subset_size:        # each pattern is measured only once
            continue
        subset_size[pat] = len(alive_after(guess, pat, cand))
    return max(subset_size.values(), default=0)
#直接拉 1 次就把這個 pattern 的大小存進 sizes，避免重複計算。

In [None]:
def worstrem(guess: str, cand: list[str]) -> int:
    """
    WorstRem with one feedback call per candidate: the largest number of
    candidates that share a single feedback pattern for `guess`.

    Returns 0 when `cand` is empty, like the scan-based definition of
    worstrem earlier in this notebook, instead of raising ValueError.
    """
    pattern_cnt = Counter(feedback(guess, ans) for ans in cand)
    return max(pattern_cnt.values(), default=0)

In [27]:
def next_guess(cand):
    """Greedy choice: the candidate with the smallest ExpRem over `cand` (first one wins ties)."""
    def expected_left(word):
        return exprem(word, cand)

    return min(cand, key=expected_left)
#把「目前還活著的字」逐一算 ExpRem，挑出最小值。這是最簡單的貪婪策略。
'''
def next_guess(cand):                # 取當前候選中 WorstRem 最小者
    best=min(cand, key=lambda w: worstrem(w, cand))
    return best
#把「目前還活著的字」逐一算 WorstRem，挑出最小值。這是最簡單的貪婪策略。
'''

'\ndef next_guess(cand):                # 取當前候選中 WorstRem 最小者\n    best=min(cand, key=lambda w: worstrem(w, cand))\n    return best\n#把「目前還活著的字」逐一算 WorstRem，挑出最小值。這是最簡單的貪婪策略。\n'

In [28]:
#完整模擬一局 Wordle
#def play(answer, start, full):
#    cand=full.copy(); guess=start; steps=1
#    while True:
#        pat=feedback(guess, answer)
#        if pat=="22222": return steps
#        cand = alive_after(guess, pat, cand)
#        guess = next_guess(cand)
#        steps+=1
def play(answer, start, full):
    """Simulate one game and return the number of guesses used.

    The game opens with `start`; after each miss the candidate list is
    narrowed to the words consistent with the feedback and next_guess()
    picks the following guess from it.
    """
    cand = full.copy()                 # candidates start as the whole word bank
    guess = start                      # opening word
    steps = 1                          # guesses used so far
    pat = feedback(guess, answer)
    while pat != "22222":              # anything but all green: keep going
        cand = alive_after(guess, pat, cand)   # shrink the candidate set
        guess = next_guess(cand)               # choose the next guess
        steps += 1
        pat = feedback(guess, answer)
    return steps
#核心迴圈：feedback → 篩選 cand → 選新 guess 持續到猜中。
#每輪都挑當前 ExpRem 最小的字

In [7]:
# Small hand-picked word set; every candidate also serves as a starting word.
CAND = [
    "PRINT", "PRIME", "PRISM", "POINT", "PRAYS", "POWER",
    "WATER", "WASTE", "SNACK", "SLEEP", "SLICE", "SNAKE",
]
STARTS = list(CAND)  # separate list object holding the same twelve words

In [29]:
def load_wordlist(path, *, length=5, to_upper=True, max_words=None):
    """
    Read a text file (one word per line) and return the words as a list.

    - path       : file path
    - length     : keep only words of exactly this length (default 5)
    - to_upper   : upper-case the words (matches feedback()); otherwise they
                   are lower-cased
    - max_words  : optional cap; stop after this many words have been kept
                   (None or 0 means no cap)
    """
    normalise = str.upper if to_upper else str.lower
    kept = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            token = raw.strip()
            # Keep only purely alphabetic tokens of the requested length
            # (drops entries containing '-' or apostrophes).
            if len(token) == length and token.isalpha():
                kept.append(normalise(token))
                if max_words and len(kept) >= max_words:
                    break
    return kept

In [30]:
# Load the candidate list from the word file; the same words double as starting words.
CAND = load_wordlist("valid-wordle-words.txt")
print("候選字數：", len(CAND))
STARTS = list(CAND)  # copy the loaded list instead of parsing the file a second time
#STARTS=["ADIEU","AUDIO","ALERT","ARISE","ARIES","CANOE","CRANE","CRATE","EARNT","EARNS","GRACE","IRATE","LATER","LEAST","LEARN","MAGIC","ORATE","PANEL","PEARS","PLANT","POINT","PRINT","PRISM","RAISE","RANTS","REACT","SANER","SALET","SATER","SAUCE","SIREN","SLANT","SLATE","SLICE","SNAKE","SOARE","SOLAR","STARE","STEAM","TABLE","TARES","TEARS","TIRES","TONER","TRACE","TRAIN","TRICE","VOTER","WATER","YEAST"]
#print(CAND)

候選字數： 14855


In [32]:
# Fixed 50-word sample; every candidate also serves as a starting word.
# NOTE(review): this cell reassigns CAND and STARTS, which the cell above it
# loads from the word file. Confirm which of the two is meant to feed the
# cells below on a top-to-bottom run.
CAND = [
    "PRINT", "PRIME", "PRISM", "POINT", "PRAYS", "POWER", "WATER", "WASTE", "SNACK", "SLEEP",
    "SLICE", "SNAKE", "CRANE", "SLATE", "TRACE", "CRISP", "GRACE", "PLANT", "GRANT", "PRIDE",
    "TRICE", "IDEAL", "APPLE", "BERRY", "CHASE", "DELTA", "EARTH", "FAITH", "GIANT", "HAPPY",
    "INDEX", "JUDGE", "KNIFE", "LEMON", "MAGIC", "NOVEL", "OCEAN", "PEACH", "QUIET", "RIVER",
    "SOLAR", "TIGER", "UNITY", "VIVID", "WHALE", "XENON", "YEAST", "ZEBRA", "CABIN", "DANCE",
]
STARTS = list(CAND)  # separate list object holding the same fifty words

In [31]:
# ExpRem of every starting word, measured against the whole candidate list.
exp   = {w: exprem(w, CAND) for w in STARTS}
# NOTE(review): the average-steps simulation below is disabled, yet the
# correlation cell later in this section still reads avg_n.
#avg_n = {w: statistics.mean(play(ans, w, CAND) for ans in CAND) for w in STARTS}

In [34]:
print("ExpRem:", exp)
# avg_n is not computed in this section (its assignment is commented out in
# the cell above), so printing it raises NameError on a fresh kernel.
# Re-enable this line together with that assignment.
#print("Avg:", avg_n)

ExpRem: {'AAHED': 120.598, 'AALII': 140.386, 'AAPAS': 184.424, 'AARGH': 161.212, 'AARTI': 94.848, 'ABACA': 249.936, 'ABACI': 145.12, 'ABACK': 225.29, 'ABACS': 135.62, 'ABAFT': 208.676, 'ABAHT': 194.43, 'ABAKA': 279.378, 'ABAMP': 205.958, 'ABAND': 133.788, 'ABASE': 80.916, 'ABASH': 157.122, 'ABASK': 158.466, 'ABATE': 98.32, 'ABAYA': 259.52, 'ABBAS': 174.476, 'ABBED': 210.534, 'ABBES': 151.846, 'ABBEY': 226.402, 'ABBOT': 264.038, 'ABCEE': 217.744, 'ABEAM': 114.026, 'ABEAR': 90.558, 'ABEAT': 103.408, 'ABEER': 165.074, 'ABELE': 164.032, 'ABENG': 139.948, 'ABERS': 88.704, 'ABETS': 106.262, 'ABEYS': 123.972, 'ABHOR': 182.106, 'ABIDE': 115.614, 'ABIES': 80.588, 'ABIUS': 140.972, 'ABJAD': 222.832, 'ABJUD': 332.806, 'ABLED': 123.292, 'ABLER': 97.946, 'ABLES': 88.29, 'ABLET': 109.268, 'ABLOW': 215.49, 'ABMHO': 249.05, 'ABNET': 117.402, 'ABODE': 129.146, 'ABOHM': 244.552, 'ABOIL': 124.434, 'ABOMA': 147.544, 'ABOON': 245.866, 'ABORD': 145.858, 'ABORE': 100.636, 'ABORN': 127.896, 'ABORT': 145.272, 

In [32]:
import pandas as pd
# Table of ExpRem per starting word, indexed by word and sorted ascending
# (smallest ExpRem first). The AvgSteps column is disabled.
df = (
    pd.DataFrame({
        "Word":   list(exp.keys()),
        "ExpRem": [exp[w]  for w in exp],
        #"AvgSteps": [avg_n[w] for w in exp]
    })
    .set_index("Word")
    .sort_values("ExpRem")
)


In [36]:
# First 1000 words
# Lift the pandas row limit so print() shows every row of df.
pd.set_option("display.max_rows", None) 
print(df)

        ExpRem  AvgSteps
Word                    
AREAS   55.850     3.860
ARISE   58.358     3.877
ARLES   59.456     3.872
ANSAE   60.006     3.912
ARIAS   60.486     3.880
ARIEL   61.594     3.842
AREAL   61.728     3.845
ALOES   63.304     3.921
AROSE   63.336     3.965
ALANE   64.060     3.880
ALIAS   64.214     3.867
ARILS   65.246     3.888
ANEAR   65.802     3.857
ARENA   66.342     3.819
AISLE   67.174     3.895
ALOSE   67.852     3.933
ANISE   67.992     3.864
ARNAS   68.132     3.862
ALATE   68.430     3.897
ALIEN   68.760     3.807
AESIR   70.000     3.921
ALANS   70.272     3.851
ALINE   71.102     3.812
AEROS   71.434     3.974
AREAD   71.968     3.851
ANILE   72.448     3.803
ARAME   72.578     3.880
ANLAS   72.926     3.868
ANTAE   73.304     3.888
ARNIS   74.126     3.867
ARETS   74.252     3.853
ALDEA   74.276     3.893
AMIES   74.474     3.880
ANOAS   75.040     3.950
ASTER   75.052     3.906
ALMES   75.628     3.884
ALERT   75.814     3.872
ARTEL   75.910     3.871


In [33]:
# Lift the pandas row limit so print() shows every row of df.
pd.set_option("display.max_rows", None) 
print(df)

            ExpRem
Word              
LARES   341.470818
RALES   342.878156
NARES   351.065567
RANES   351.254729
REAIS   356.167418
SOARE   356.243891
TARES   357.661461
AEROS   363.075126
SERAI   364.756580
RATES   365.713026
SERIA   367.233457
SANER   368.183036
ARLES   378.097274
SATER   379.291484
LANES   384.675261
RAISE   387.541501
TALES   388.832918
ALOES   390.314305
SAINE   390.968630
REALS   391.009828
LAERS   391.261865
SERAL   395.390979
TERAS   396.166611
LEARS   396.239179
SALET   400.431572
EARLS   400.500370
REANS   403.171121
AEONS   405.146348
NEARS   406.613598
LORES   406.863346
TOEAS   408.539010
NATES   408.573881
ROLES   410.207809
MARES   411.165735
STOAE   411.793672
LASER   413.468798
DARES   413.879569
CARES   414.587883
PARES   415.318681
AURES   415.772265
EARNS   415.845237
STRAE   416.714978
TEARS   417.000539
RAMES   419.338068
SOREL   420.714036
RONES   423.345338
TASER   424.044631
RACES   424.439111
RAPES   425.981488
SARED   426.126489
TORES   427.

In [11]:
# Pearson correlation between ExpRem (x) and average steps (y), computed by hand.
# NOTE(review): avg_n is not assigned in this section as saved (its assignment
# is commented out) — confirm it is defined before running this cell.
words = list(exp.keys())
x = [exp[w] for w in words]   # ExpRem
y = [avg_n[w]     for w in words]   # average number of steps

xm, ym = sum(x)/len(x), sum(y)/len(y)
num = sum((xi - xm)*(yi - ym) for xi, yi in zip(x, y))
den = math.sqrt(sum((xi - xm)**2 for xi in x) *
                sum((yi - ym)**2 for yi in y))
r = num / den
print(r)
# first 500 words: 0.87886623150916
# first 1000 words: 0.8634138410899195

0.9383148632568364


# WorstRem

In [1]:
from collections import Counter
import math, statistics
import numpy as np

In [2]:
def feedback(guess, answer):
    """Return the Wordle pattern of `guess` against `answer` as a 5-character string.

    '2' = green (same letter, same spot), '1' = yellow (letter still
    available elsewhere in the answer), '0' = gray. Both words are
    upper-cased before comparison.
    """
    guess, answer = guess.upper(), answer.upper()
    pairs = list(zip(guess, answer))
    marks = ["0"] * 5                      # every cell starts gray

    # Pass 1: greens.
    for pos, (g, a) in enumerate(pairs):
        if g == a:
            marks[pos] = "2"

    # Answer letters at the non-green spots are the stock for yellows.
    stock = Counter(a for g, a in pairs if g != a)

    # Pass 2: yellows, left to right, each one using up one copy from the
    # stock so repeated letters are served first come, first served.
    for pos, g in enumerate(guess):
        if marks[pos] == "0" and stock[g] > 0:
            marks[pos] = "1"
            stock[g] -= 1
    return "".join(marks)

In [3]:
#找到 Sw,p 
def alive_after(guess, pattern, cand):
    """Return the subset S_{w,p}: candidates whose feedback for `guess` equals `pattern`."""
    return list(filter(lambda word: feedback(guess, word) == pattern, cand))
#逐一重算 feedback(guess, w)，留下 pattern 相同的字──這就是定義裡的子集合 Sw,p

In [85]:
#期望剩餘量E[|Sw,p|]
#def exprem(guess, cand):
#    return sum(len(alive_after(guess, feedback(guess,a), cand)) for a in cand)/len(cand)
def exprem(guess, cand):
    """Expected number of candidates left after playing `guess`: E[|S_{w,p}|].

    Every answer whose pattern occurs cnt times contributes cnt to the sum,
    so the sum over all answers equals sum(cnt^2). Counting patterns once
    gives the same value as rescanning the candidates for each answer, with
    a single feedback call per candidate. A smaller value means the guess
    eliminates more words on average.

    Raises ZeroDivisionError when `cand` is empty.
    """
    pattern_cnt = Counter(feedback(guess, answer) for answer in cand)
    return sum(cnt * cnt for cnt in pattern_cnt.values()) / len(cand)
#按公式做「加總再除以候選個數」；結果越小代表平均砍掉更多字。

In [37]:
#最壞殘存量 max_p|Sw,p|
#def worstrem(guess, cand):
#    sizes={}
#    for a in cand:
#        p=feedback(guess,a)
#        if p not in sizes:
#            sizes[p]=len(alive_after(guess,p,cand))
#    return max(sizes.values())
def worstrem(guess, cand):
    """Worst-case number of candidates left after `guess`: max over patterns of |S_{w,p}|.

    Returns 0 when `cand` is empty.
    """
    largest = 0
    measured = set()
    for answer in cand:
        pat = feedback(guess, answer)
        if pat not in measured:                  # each pattern is measured only once
            measured.add(pat)
            largest = max(largest, len(alive_after(guess, pat, cand)))
    return largest
#直接拉 1 次就把這個 pattern 的大小存進 sizes，避免重複計算。

In [4]:
def worstrem(guess, cand):
    """
    WorstRem with one feedback call per candidate: the largest number of
    candidates that share a single feedback pattern for `guess`.

    Returns 0 when `cand` is empty, like the scan-based definition of
    worstrem earlier in this notebook, instead of raising ValueError.
    """
    pattern_cnt = Counter(feedback(guess, ans) for ans in cand)
    return max(pattern_cnt.values(), default=0)

In [5]:
'''
def next_guess(cand):                # 取當前候選中 ExpRem 最小者
    best=min(cand, key=lambda w: exprem(w, cand))
    return best
#把「目前還活著的字」逐一算 ExpRem，挑出最小值。這是最簡單的貪婪策略。
'''
def next_guess(cand):
    """Greedy choice: the candidate with the smallest WorstRem over `cand` (first one wins ties)."""
    return min(cand, key=lambda word: worstrem(word, cand))
#把「目前還活著的字」逐一算 WorstRem，挑出最小值。這是最簡單的貪婪策略。


In [6]:
#完整模擬一局 Wordle
#def play(answer, start, full):
#    cand=full.copy(); guess=start; steps=1
#    while True:
#        pat=feedback(guess, answer)
#        if pat=="22222": return steps
#        cand = alive_after(guess, pat, cand)
#        guess = next_guess(cand)
#        steps+=1
def play(answer, start, full):
    """Simulate one game and return the number of guesses used.

    Opens with `start`; after every miss the candidates are narrowed to the
    words consistent with the feedback and next_guess() picks the next word.
    """
    # Candidates start as the whole word bank; `start` is guess number 1.
    cand, guess, steps = full.copy(), start, 1
    while (pat := feedback(guess, answer)) != "22222":   # not all green yet
        cand = alive_after(guess, pat, cand)             # shrink the candidate set
        guess = next_guess(cand)                         # choose the next guess
        steps += 1
    return steps
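# Core loop: feedback -> filter cand -> pick a new guess, repeated until solved.
# In this section next_guess picks the word with the smallest WorstRem each round.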

In [18]:
# Small hand-picked word set; every candidate also serves as a starting word.
CAND = [
    "PRINT", "PRIME", "PRISM", "POINT", "PRAYS", "POWER",
    "WATER", "WASTE", "SNACK", "SLEEP", "SLICE", "SNAKE",
]
STARTS = list(CAND)  # separate list object holding the same twelve words

In [89]:
# Fixed 50-word sample; every candidate also serves as a starting word.
# NOTE(review): a later cell in this section reassigns CAND and STARTS from
# the word file, so on a top-to-bottom run these lists are replaced.
CAND = [
    "PRINT", "PRIME", "PRISM", "POINT", "PRAYS", "POWER", "WATER", "WASTE", "SNACK", "SLEEP",
    "SLICE", "SNAKE", "CRANE", "SLATE", "TRACE", "CRISP", "GRACE", "PLANT", "GRANT", "PRIDE",
    "TRICE", "IDEAL", "APPLE", "BERRY", "CHASE", "DELTA", "EARTH", "FAITH", "GIANT", "HAPPY",
    "INDEX", "JUDGE", "KNIFE", "LEMON", "MAGIC", "NOVEL", "OCEAN", "PEACH", "QUIET", "RIVER",
    "SOLAR", "TIGER", "UNITY", "VIVID", "WHALE", "XENON", "YEAST", "ZEBRA", "CABIN", "DANCE",
]
STARTS = list(CAND)  # separate list object holding the same fifty words

In [7]:
def load_wordlist(path, *, length=5, to_upper=True, max_words=None):
    """
    Read a text file (one word per line) and return the words as a list.

    - path       : file path
    - length     : keep only words of exactly this length (default 5)
    - to_upper   : upper-case the words (matches feedback()); otherwise they
                   are lower-cased
    - max_words  : optional cap; stop after this many words have been kept
                   (None or 0 means no cap)
    """
    words = []
    with open(path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        for w in stripped_lines:
            # Skip wrong-length entries and anything containing non-letters
            # such as '-' or apostrophes.
            if len(w) != length or not w.isalpha():
                continue
            words.append(w.upper() if to_upper else w.lower())
            if max_words and len(words) >= max_words:
                break
    return words

In [8]:
# Load the candidate list from the word file; the same words double as starting words.
CAND = load_wordlist("valid-wordle-words.txt")
print("候選字數：", len(CAND))
STARTS = list(CAND)  # copy the loaded list instead of parsing the file a second time

候選字數： 14855


In [None]:
# WorstRem of every starting word, measured against the whole candidate list.
worst   = {w: worstrem(w, CAND) for w in STARTS}
# Average steps per starting word: plays one full game for every
# (starting word, answer) pair, so the cost grows with len(STARTS) * len(CAND) games.
avg_n = {w: statistics.mean(play(ans, w, CAND) for ans in CAND) for w in STARTS}

In [10]:
# Dump both result dicts in full (one entry per starting word).
print("WorstRem:", worst)
print("Avg:", avg_n)

WorstRem: {'AAHED': 243, 'AALII': 289, 'AAPAS': 383, 'AARGH': 340, 'AARTI': 213, 'ABACA': 466, 'ABACI': 307, 'ABACK': 433, 'ABACS': 315, 'ABAFT': 416, 'ABAHT': 401, 'ABAKA': 500, 'ABAMP': 413, 'ABAND': 316, 'ABASE': 194, 'ABASH': 337, 'ABASK': 341, 'ABATE': 225, 'ABAYA': 481, 'ABBAS': 367, 'ABBED': 412, 'ABBES': 319, 'ABBEY': 424, 'ABBOT': 485, 'ABCEE': 430, 'ABEAM': 244, 'ABEAR': 223, 'ABEAT': 225, 'ABEER': 361, 'ABELE': 358, 'ABENG': 309, 'ABERS': 226, 'ABETS': 249, 'ABEYS': 272, 'ABHOR': 378, 'ABIDE': 274, 'ABIES': 198, 'ABIUS': 325, 'ABJAD': 432, 'ABJUD': 556, 'ABLED': 296, 'ABLER': 249, 'ABLES': 226, 'ABLET': 265, 'ABLOW': 424, 'ABMHO': 463, 'ABNET': 280, 'ABODE': 295, 'ABOHM': 463, 'ABOIL': 294, 'ABOMA': 318, 'ABOON': 465, 'ABORD': 333, 'ABORE': 253, 'ABORN': 298, 'ABORT': 337, 'ABOUT': 416, 'ABOVE': 336, 'ABRAM': 312, 'ABRAY': 336, 'ABRIM': 315, 'ABRIN': 275, 'ABRIS': 239, 'ABSEY': 272, 'ABSIT': 293, 'ABUNA': 345, 'ABUNE': 308, 'ABURA': 326, 'ABURN': 342, 'ABUSE': 271, 'ABUTS': 

In [11]:
import pandas as pd
# Table of WorstRem and average steps per starting word, indexed by word and
# sorted by WorstRem ascending (smallest worst case first).
df = (
    pd.DataFrame({
        "Word":   list(worst.keys()),
        "WorstRem": [worst[w]  for w in worst],
        "AvgSteps": [avg_n[w] for w in worst]
    })
    .set_index("Word")
    .sort_values("WorstRem")
)

In [12]:
# Show the WorstRem / AvgSteps table with the current pandas display settings.
print(df)

       WorstRem  AvgSteps
Word                     
AESIR       152     3.951
ARISE       152     3.924
ANSAE       154     3.933
AREAS       159     3.899
AISLE       162     3.923
...         ...       ...
APPUY       617     4.388
AZYGY       647     4.449
ABUZZ       658     4.518
BAFFY       659     4.505
BABBY       687     4.542

[1000 rows x 2 columns]


In [12]:
# Lift the pandas row limit so print() shows every row of df.
pd.set_option("display.max_rows", None) 
print(df)

       WorstRem
Word           
SERIA       862
SERAI       865
PASEO       878
SAINE       951
RAISE       952
RANES       959
SANER       959
REANS       959
NARES       959
SNARE       959
NEARS       959
KAIES       974
REAIS       975
SOARE       988
LAERS      1001
LEARS      1001
REALS      1001
ARLES      1001
SERAL      1001
EARLS      1001
LARES      1001
RALES      1001
OAVES      1003
NATES      1008
AESIR      1010
NEALS      1010
LEANS      1010
SLANE      1010
ARISE      1010
LANES      1010
AROSE      1020
AEROS      1020
SATER      1022
STRAE      1022
RATES      1022
TARES      1022
TERAS      1022
STOAE      1023
URSAE      1030
OASES      1033
PAISE      1037
NAIOS      1040
SAYER      1045
SAICE      1045
TALES      1059
SETAL      1059
SALET      1059
SOLEI      1060
STARE      1061
TEARS      1061
MAISE      1063
ANISE      1068
TOEAS      1072
STEAR      1078
ARETS      1078
IDEAS      1080
LASER      1082
AEONS      1082
AISLE      1083
ALOSE      1086
ALOES   

In [13]:
# First 1000 words
# Lift the pandas row limit so print() shows every row of df.
pd.set_option("display.max_rows", None) 
print(df)

       WorstRem  AvgSteps
Word                     
AESIR       152     3.951
ARISE       152     3.924
ANSAE       154     3.933
AREAS       159     3.899
AISLE       162     3.923
AREAL       167     3.869
ALANE       170     3.916
AEROS       171     4.001
AROSE       171     4.000
ARLES       172     3.898
ARIEL       173     3.872
ALATE       174     3.908
ARILS       175     3.936
ARIAS       176     3.917
ALOES       176     3.955
ALOSE       176     3.977
ANISE       176     3.907
AEONS       177     3.995
ANEAR       177     3.896
ARENA       177     3.859
ANTAE       180     3.914
ARNAS       187     3.892
ALIAS       188     3.919
AECIA       190     4.053
AMIES       193     3.917
AREAD       193     3.907
ALIEN       194     3.842
ABASE       194     4.014
ANILE       194     3.840
ALINE       194     3.843
ASIDE       196     3.966
ALDEA       196     3.938
AIDES       196     3.956
ARAME       196     3.922
AURAE       196     3.969
ABIES       198     4.039
AMATE       

In [14]:
# Pearson correlation between WorstRem (x) and average steps (y), computed by hand.
words = list(worst.keys())
x = [worst[w] for w in words]   # WorstRem
y = [avg_n[w]     for w in words]   # average number of steps

xm, ym = sum(x)/len(x), sum(y)/len(y)
num = sum((xi - xm)*(yi - ym) for xi, yi in zip(x, y))
den = math.sqrt(sum((xi - xm)**2 for xi in x) *
                sum((yi - ym)**2 for yi in y))
r = num / den
print(r)
# 50 random words: 0.8780093862373516
# first 1000 words: 0.8237241171280653

0.8237241171280653
