mask function with permutations

In [1]:
import random
import itertools

def mask_word(word, mask_prob=0.5):
    masked_entries = []

    for num_to_mask in range(1, min(6, len(word))):  # Mask 1 to 5 letters
        for _ in range(num_to_mask):
            masked_word = list(word)
            target_labels = []

            # Randomly decide whether to mask a letter
            indices_to_mask = random.sample(range(len(word)), num_to_mask)
            for i in indices_to_mask:
                target_labels.append(masked_word[i])
                masked_word[i] = "_"

            masked_entries.append(("".join(masked_word), target_labels))

    return masked_entries

def preprocess_word(word):
    return [ord(letter) - ord('a') + 1 if letter != '_' else 0 for letter in word]  # Use 0 for underscore

def generate_permutations(word):
    underscores_indices = [i for i, char in enumerate(word) if char == '_']
    num_permutations = random.randint(1, len(underscores_indices))

    # Generate all possible combinations of underscores in the masked word
    permutations_with_underscores = list(itertools.combinations(underscores_indices, num_permutations))

    # Replace underscores with the corresponding letters in each permutation
    result_permutations = []
    for combination in permutations_with_underscores:
        new_word = list(word)
        for index in combination:
            new_word[index] = '_'
        result_permutations.append("".join(new_word))

    return result_permutations

def process_words_from_file(file_path):
    masked_entries_dict = {}

    with open(file_path, 'r') as file:
        words = [line.strip() for line in file]

    for word in words:
        masked_entries = mask_word(word)
        for masked_word, target_labels in masked_entries:
            word_id = len(masked_entries_dict)
            masked_entries_dict[word_id] = {'original_word': word, 'masked_word': masked_word, 'target_labels': target_labels}

    return masked_entries_dict

# Example usage:
# file_path =  "/content/words_250000_train.txt"  # Replace with the path to your text file containing words
# masked_entries_dict = process_words_from_file(file_path)



Mask function without permutation


In [None]:
# import random

# def mask_word(word, mask_prob=0.5):
#     masked_entries = []

#     for num_to_mask in range(1, min(6, len(word))):  # Mask 1 to 5 letters
#         # Randomly decide whether to mask a letter
#         if random.random() < mask_prob:
#             masked_word = list(word)
#             target_labels = []

#             # Randomly decide which letters to mask
#             indices_to_mask = random.sample(range(len(word)), num_to_mask)
#             for i in indices_to_mask:
#                 target_labels.append(masked_word[i])
#                 masked_word[i] = "_"

#             masked_entries.append(("".join(masked_word), target_labels))

#     return masked_entries

# # Example usage:
# # word = "example"
# # masked_entries = mask_word(word)
# # print(masked_entries)


read fromm file

In [2]:
def read_data_from_file(file_path):
    """
    Read data from a text file and store it as a list.

    Parameters:
    - file_path (str): The path to the text file.

    Returns:
    - data_list (list): A list containing the data read from the file.
    """
    data_list = []

    try:
        with open(file_path, 'r') as file:
            # Read each line from the file and append it to the list
            for line in file:
                data_list.append(line.strip())
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error reading file: {e}")

    return data_list

# Example usage:
# file_path = "/path/to/your/file.txt"  # Replace with the actual path to your text file
# data_list = read_data_from_file(file_path)

# # Now 'data_list' contains the data read from the file
# print(data_list)


model - 5 layer

evaluate on words



In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding

# Define the function to read data from a text file
def read_data_from_file(file_path):
    data_list = []
    try:
        with open(file_path, 'r') as file:
            for line in file:
                data_list.append(line.strip())
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error reading file: {e}")
    return data_list

# Function to create training data
def create_training_data(words):
    X, y = [], []
    unique_chars = set(''.join(words))
    unique_chars.add('_')  # Add underscore to the set
    char_to_int = {char: i for i, char in enumerate(unique_chars)}
    int_to_char = {i: char for char, i in char_to_int.items()}

    for word in words:
        for i in range(len(word)):
            X.append(word[:i] + '_' + word[i+1:])
            y.append(word[i])
    return X, y, char_to_int, int_to_char




def evaluate_model(model, words, char_to_int, X_train_numeric):
    X_eval, y_eval, _, _ = create_training_data(words)

    # Convert characters to numerical values, handling characters not in char_to_int
    X_eval_numeric = [[char_to_int[char] if char in char_to_int else 0 for char in word] for word in X_eval]
    y_eval_numeric = [char_to_int[char] if char in char_to_int else 0 for char in y_eval]

    # Pad sequences to ensure consistent length
    X_eval_numeric = pad_sequences(X_eval_numeric, maxlen=X_train_numeric.shape[1], padding='post')
    y_eval_numeric = np.array(y_eval_numeric)

    # Reshape input for the model
    X_eval_numeric = np.reshape(X_eval_numeric, (X_eval_numeric.shape[0], X_eval_numeric.shape[1], 1))

    # Evaluate the model on masked words
    _, accuracy = model.evaluate(X_eval_numeric, y_eval_numeric)
    print(f"Model Accuracy on Eval Words: {accuracy * 100:.2f}%")



In [4]:
# Load data from the file
file_path = "/content/words_250000_train.txt"
words = read_data_from_file(file_path)

# Create training data from the sample words
X_train, y_train, char_to_int, int_to_char = create_training_data(words)

# Modify the training data creation to use masked words
X_train = [word[:i] + '_' + word[i + 1:] for word in words for i in range(len(word))]
y_train = [word[i] for word in words for i in range(len(word))]

# Continue with the rest of the code
X_train_numeric = pad_sequences([[char_to_int[char] for char in entry] for entry in X_train], padding='post')
y_train_numeric = np.array([char_to_int[char] for char in y_train])
X_train_numeric = np.reshape(X_train_numeric, (X_train_numeric.shape[0], X_train_numeric.shape[1], 1))

# Build and train the model
model = Sequential()
model.add(Embedding(input_dim=len(char_to_int), output_dim=50, input_length=X_train_numeric.shape[1]))
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(char_to_int), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.001), metrics=['accuracy'])

# Train the model
history = model.fit(X_train_numeric, y_train_numeric, epochs=10, batch_size=26)

# Evaluate the model on masked words
evaluate_model(model, words, char_to_int, X_train_numeric)




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model Accuracy on Eval Words: 59.47%


In [None]:
#version 2 guess

In [None]:
# def guess(model, char_to_int, current_word):
#     # Convert the current_word to numerical values
#     current_word_numeric = [char_to_int[char] if char in char_to_int else 0 for char in current_word]

#     # Pad the sequence
#     current_word_padded = pad_sequences([current_word_numeric], maxlen=model.input_shape[1], padding='post')

#     # Reshape input for the model
#     current_word_padded = np.reshape(current_word_padded, (current_word_padded.shape[0], current_word_padded.shape[1], 1))

#     # Make a prediction using the model
#     prediction = model.predict(current_word_padded)

#     # Get the index of the predicted letter with the highest probability
#     predicted_index = np.argmax(prediction)

#     # Convert the index back to the character using int_to_char dictionary
#     guessed_letter = int_to_char[predicted_index]

#     return guessed_letter



In [6]:
# Save the model
from keras.models import load_model

# Save the model
model.save("hungman_lstm.h5")
model.save("C:/Users/Kaarvin/Downloads/hungman_lstm.h5")

# Load the saved model
loaded_model = load_model("hungman_lstm.h5")

# Example usage of guess function
# current_word = 'boo_'  # Replace with the current masked word
# guessed_letter = guess(loaded_model, char_to_int, current_word)
# print(f"Guessed Letter: {guessed_letter}")


# # Example usage of guess function
# current_word = 'boo_'  # Replace with the current masked word
# guessed_letter = guess(loaded_model, char_to_int, current_word)
# print(f"Guessed Letter: {guessed_letter}")


2 params version

In [7]:
# Define global variables for the model, char_to_int, and int_to_char
global_model = None
global_char_to_int = None
global_int_to_char = None

# Modify the guess function
def guess(current_word, prev_guess):
    global global_model
    global global_char_to_int
    global global_int_to_char

    # Combine the current word and previous guess
    input_sequence = current_word.replace('_', prev_guess)

    # Convert the input sequence to numerical values
    input_sequence_numeric = [global_char_to_int[char] if char in global_char_to_int else 0 for char in input_sequence]

    # Pad the sequence
    input_sequence_padded = pad_sequences([input_sequence_numeric], maxlen=global_model.input_shape[1], padding='post')

    # Reshape input for the model
    input_sequence_padded = np.reshape(input_sequence_padded, (input_sequence_padded.shape[0], input_sequence_padded.shape[1], 1))

    # Make a prediction using the global model
    prediction = global_model.predict(input_sequence_padded)

    # Get the index of the predicted letter with the highest probability
    predicted_index = np.argmax(prediction)

    # Convert the index back to the character using global_int_to_char dictionary
    guessed_letter = global_int_to_char[predicted_index]

    return guessed_letter


In [14]:
# Set global variables
global_model = loaded_model
global_char_to_int = char_to_int
global_int_to_char = int_to_char

# Example usage of guess function with previous guess 'a'
current_word = 'a__le'  # Replace with the current masked word
prev_guess = 'p'  # Replace with the previous guess
guessed_letter = guess(current_word, prev_guess)
print(f"Guessed Letter: {guessed_letter}")


Guessed Letter: e
