In [6]:
def getWordCount(file_path):
    """
    This function is used to take in a file with a word and its relative word count
    seperated by a space, with each word on its own line
    """
    wordCounts = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split()
            word = parts[0]
            amount = int(parts[1])
            wordCounts[word] = amount
    return wordCounts

In [8]:
def getTotalWords(wordCounts):
    """
    Returns sum of total word count(not unique)
    """
    return sum(wordCounts.values())

In [10]:
def priorProbs(wordCounts, totalCount):
    """
    For each word, calculates the percentage of that word compared to the total words
    """
    priorProbabilities = {}
    for word, count in wordCounts.items():
        priorProbabilities[word] = count/totalCount
    return priorProbabilities

In [16]:
def wordAmounts(priorProbas):
    """
    Takes in all word probabilities and returns the 15 most/least common words
    """
    wordSorted = sorted(priorProbas.items(), key=lambda x: x[1], reverse = True)
    for word,prob in wordSorted[:15]:
        print(f"{word} : {prob:.4f}")
    print("\n")
    for word,prob in wordSorted[-15:]:
        print(f"{word} : {prob:.6f}")


In [18]:
file_path = "word_counts_05-1.txt"
wordCounts = getWordCount(file_path)
totalCount=getTotalWords(wordCounts)
priorProbas = priorProbs(wordCounts, totalCount)
wordAmounts(priorProbas)

THREE : 0.0356
SEVEN : 0.0233
EIGHT : 0.0216
WOULD : 0.0209
ABOUT : 0.0205
THEIR : 0.0190
WHICH : 0.0185
AFTER : 0.0144
FIRST : 0.0143
FIFTY : 0.0139
OTHER : 0.0138
FORTY : 0.0124
YEARS : 0.0116
THERE : 0.0113
SIXTY : 0.0095


YIGAL : 0.000001
CCAIR : 0.000001
CLEFT : 0.000001
FABRI : 0.000001
FOAMY : 0.000001
NIAID : 0.000001
PAXON : 0.000001
SERNA : 0.000001
TOCOR : 0.000001
YALOM : 0.000001
BOSAK : 0.000001
CAIXA : 0.000001
MAPCO : 0.000001
OTTIS : 0.000001
TROUP : 0.000001


In [20]:
def probOfLetterGivenW(letter, word):
    """
    Helper function to help sum up instances of letters in words
    """
    for i in range(5):  
        if word[i] == letter:
            return 1
    return 0

In [22]:
def probOfEvidGivenW(evidence, word):
    """
    Helper function to organize and seperate incorrect/correct guesses based on guessed letters
    """
    # Extract the correctly guessed letters and their positions
    correct_guesses = evidence["correct"]
    incorrect_guesses = evidence["incorrect"]
    
    # Check if the correct letters are in the right positions
    for pos, letter in correct_guesses.items():
        if word[pos] != letter:
            return 0
    
    # Check if the incorrect letters don't appear in the unguessed positions
    for i in range(5):
        if i not in correct_guesses and word[i] in incorrect_guesses:
            return 0
    
    return 1

In [24]:
def probOfW(word, wordCounts, totalCount):
    """
    Helper function to return percent of word to total word count
    """
    return wordCounts[word] / totalCount

In [26]:
def predictiveProb(letter, evidence, wordCounts, totalCount):
    """
    Given a letter and position, function outputs the probability of all possible outcomes
    """
    numerator = 0
    denominator = 0
    
    for word in wordCounts:
        # Calculate the posterior probability for each word
        prob_E_given_W = probOfEvidGivenW(evidence, word)
        prob_W = probOfW(word, wordCounts, totalCount)
        posterior = prob_E_given_W * prob_W
        
        # Update denominator
        denominator += posterior
        
        # Check if the letter is in any position and update numerator
        if probOfLetterGivenW(letter, word):
            numerator += posterior
    
    # Return the predictive probability
    return numerator / denominator if denominator != 0 else 0


In [28]:
def findBestNextGuess(unguessed_letters, evidence, wordCounts, totalCount):
    """
    Returns next letter with highest probability and the word with the highest probability of being correct
    """
    best_letter = None
    best_prob = 0
    
    for letter in unguessed_letters:
        prob = predictiveProb(letter, evidence, wordCounts, totalCount)
        if prob > best_prob:
            best_prob = prob
            best_letter = letter
    
    return best_letter, best_prob


In [34]:
"""
Here, given A _ _ _ S and an incorrect guess of I we find all letters that are not 
guessed and find the best probability of the next letter that can be in this word.
"""
evidence = {
    "correct": {0:'A', 4:'S'},
    "incorrect": ['I']
}
unguessedLetters = [chr(i) for i in range(65, 91) if chr(i) not in list(evidence["correct"].values()) + evidence["incorrect"]]

bestLetter, bestProb = findBestNextGuess(unguessedLetters, evidence, wordCounts, totalCount)
print(f"Best next guess: {bestLetter} with probability {bestProb}")

Best next guess: R with probability 0.785262785660611


In [38]:
"""
Here, given _ _ O _ _ and an incorrect guess of A,E,M,N,T we find all letters that are not 
guessed and find the best probability of the next letter that can be in this word.
"""
evidence = {
    "correct": {2:'O'},
    "incorrect": ['A','E','M','N','T']
}
##used chatgpt here because I got hardstuck trying to think of a way to represent all letters for guessing

unguessedLetters = [chr(i) for i in range(65, 91) if chr(i) not in list(evidence["correct"].values()) + evidence["incorrect"]]

bestLetter, bestProb = findBestNextGuess(unguessedLetters, evidence, wordCounts, totalCount)
print(f"Best next guess: {bestLetter} with probability {bestProb}")


Best next guess: R with probability 0.5898774885145486


In [42]:
"""
Here, given _ U _ _ _ and an incorrect guess of A,E,I,O,S we find all letters that are not 
guessed and find the best probability of the next letter that can be in this word.
"""
evidence = {
    "correct": {1:'U'},
    "incorrect": ['A','E','I','O','S']
}

# Create a list of unguessed letters
unguessedLetters = [chr(i) for i in range(65, 91) if chr(i) not in list(evidence["correct"].values()) + evidence["incorrect"]]
##used chatgpt here because I got hardstuck trying to think of a way to represent all letters for guessing

best_letter, best_prob = findBestNextGuess(unguessedLetters, evidence, wordCounts, totalCount)
print(f"Best next guess: {best_letter} with probability {best_prob}")


Best next guess: Y with probability 0.6190119760479037
