In [20]:
import string
import random
import numpy as np
import pandas as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive')

# Load the word list from the file
try:
    with open(r'/content/drive/MyDrive/Hangman/Hangman Word List.txt') as f:
        word_list = f.read().splitlines()
except FileNotFoundError:
    print("Word list file not found. Please check the file path.")
    exit(1)

if not word_list:
    print("Word list is empty. Cannot proceed with the game.")
    exit(1)

# Split the word list into training and testing sets
train_words, test_words = train_test_split(word_list, test_size=0.2, random_state=42)

# Function of ngrams
def ngrams(text, n):
    """
    Generate n-grams from the given text.

    Args:
        text (str): The input text.
        n (int): The order of the n-grams.

    Returns:
        list: A list of n-grams.
    """
    ngrams = []
    for i in range(len(text) - n + 1):
        ngrams.append(text[i:i+n])
    return ngrams

# Function to train the n-gram model
def train_ngram_model(word_list, n=3):
    """
    Train an n-gram model on the given word list.

    Args:
        word_list (list): A list of words to train the model on.
        n (int): The order of the n-gram model (default: 3).

    Returns:
        dict: A dictionary representing the n-gram model.
    """
    model = {}
    for word in word_list:
        word = '<' + word + '>'  # Add start and end tokens
        grams = ngrams(word, n)
        for gram in grams:
            key = gram[:-1]
            next_char = gram[-1]
            if key in model:
                model[key][next_char] += 1
            else:
                model[key] = Counter({next_char: 1})
    return model

# Function to generate the machine's guess using the n-gram model, word similarity, and most used letters
def generate_guess(ngram_model, word_state, used_letters, same_length_words, most_used_letters):
    """
    Generate the machine's guess based on various strategies.

    Args:
        ngram_model (dict): The n-gram model.
        word_state (list): The current state of the word.
        used_letters (set): The set of letters already used.
        same_length_words (list): A list of words with the same length as the target word.
        most_used_letters (list): A list of the most commonly used letters.

    Returns:
        str: The machine's guess.
    """
    # Prioritize guessing vowels in the initial guesses
    vowels = set('aeiou')
    if len(used_letters) < 3:
        for vowel in vowels:
            if vowel not in used_letters:
                return vowel

    # Use the n-gram model
    context = '<' + ''.join(word_state)
    if len(context) >= 3:
        context = context[-3:]
        if context in ngram_model:
            next_char_counter = ngram_model[context]
            for char, _ in next_char_counter.most_common():
                if char not in used_letters:
                    return char

    # Use word similarity with same-length words
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 3))
    word_vectors = vectorizer.fit_transform(same_length_words)
    query_vector = vectorizer.transform([''.join(word_state)])
    similarities = cosine_similarity(query_vector, word_vectors)[0]
    sorted_indices = similarities.argsort()[::-1]

    for idx in sorted_indices:
        word = same_length_words[idx]
        for char in word:
            if char not in used_letters and char not in word_state:
                return char

    # Prioritize guessing the most used letters
    for char in most_used_letters:
        if char not in used_letters and char not in word_state:
            return char

    # If all else fails, choose a random unused letter
    return random.choice([char for char in string.ascii_lowercase if char not in used_letters])

# Function to play the game
def play_game(word_list, ngram_model, most_used_letters, num_guesses=10):
    """
    Play a single game of Hangman.

    Args:
        word_list (list): A list of words to choose the target word from.
        ngram_model (dict): The n-gram model.
        most_used_letters (list): A list of the most commonly used letters.
        num_guesses (int): The number of guesses allowed (default: 10).

    Returns:
        bool: True if the machine guessed the word correctly, False otherwise.
    """
    word = random.choice(word_list)
    word_length = len(word)
    same_length_words = [w for w in word_list if len(w) == word_length]
    used_letters = set()
    remaining_guesses = num_guesses
    word_state = ['_'] * word_length

    while remaining_guesses > 0:
        guess = generate_guess(ngram_model, word_state, used_letters, same_length_words, most_used_letters)
        used_letters.add(guess)

        # Update the word state based on the guess
        for i, char in enumerate(word):
            if char == guess:
                word_state[i] = char

        if guess not in word:
            remaining_guesses -= 1
            print(f"Oops! The letter '{guess}' is not in the word. Remaining guesses: {remaining_guesses}")
        else:
            print(f"Good guess! The word is now: {''.join(word_state)}")

        if '_' not in word_state:
            print(f"Congratulations! The machine guessed the word: {''.join(word_state)}")
            return True

    print(f"Sorry, the machine ran out of guesses. The word was: {word}")
    return False

def main():
    # Train the n-gram model on the training set
    ngram_model = train_ngram_model(train_words)

    # Get the most used letters from the training set
    letter_counts = Counter(''.join(train_words))
    most_used_letters = [char for char, count in letter_counts.most_common()]

    # Play the game and evaluate the accuracy
    num_correct = 0
    num_games = 100
    num_guesses = 10  # Number of guesses allowed

    for _ in range(num_games):
        if play_game(test_words, ngram_model, most_used_letters, num_guesses):
            num_correct += 1

    accuracy = num_correct / num_games
    print(f"Accuracy: {accuracy:.2f}")

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Good guess! The word is now: ___e____
Good guess! The word is now: ___e__a_
Good guess! The word is now: _u_e__a_
Oops! The letter 'n' is not in the word. Remaining guesses: 9
Good guess! The word is now: _u_e__as
Oops! The letter 'g' is not in the word. Remaining guesses: 8
Oops! The letter 't' is not in the word. Remaining guesses: 7
Oops! The letter 'r' is not in the word. Remaining guesses: 6
Oops! The letter 'd' is not in the word. Remaining guesses: 5
Good guess! The word is now: _uce__as
Good guess! The word is now: buce__as
Good guess! The word is now: bucellas
Congratulations! The machine guessed the word: bucellas
Good guess! The word is now: ___e_________e_e___
Good guess! The word is now: ___e___a_____e_e___
Oops! The letter 'u' is not in the word. Remaining guesses: 9
Good guess! The word is now: ___e___a_c___e_e___
Oops! The letter 'o' is not in