In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
import nltk

In [4]:
# Download NLTK tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Load cleaned dataset
cl_food_data = pd.read_csv("cleaned_food_text.csv")


In [6]:
# Reduce dataset size for faster training
cl_food_data = cl_food_data.sample(n=25000, random_state=42)  # Adjust sample size as needed

In [7]:
# Extract text data
sentences = cl_food_data["tokenized_text"].dropna().tolist()

In [8]:
# Tokenizer setup
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1


In [9]:
# Define max sequence length
MAX_SEQ_LENGTH = 50  # Limit sequences to 50 tokens to prevent large memory usage

In [10]:
# Convert text to sequences in batches
def generate_training_data(sentences, tokenizer, max_seq_length):
    for sentence in sentences:
        token_list = tokenizer.texts_to_sequences([sentence])[0]
        for i in range(1, len(token_list)):
            seq = token_list[:i+1]
            if len(seq) <= max_seq_length:
                yield seq

In [11]:
# Create input sequences dynamically
input_sequences = list(generate_training_data(sentences, tokenizer, MAX_SEQ_LENGTH))

In [12]:
# Padding sequences for equal length
input_sequences = pad_sequences(input_sequences, maxlen=MAX_SEQ_LENGTH, padding="pre")

In [13]:
# Splitting data into inputs (X) and labels (y)
X, y = input_sequences[:, :-1], input_sequences[:, -1]  # Keep labels as integers (no one-hot encoding)

In [14]:
# Define LSTM Model (Use sparse_categorical_crossentropy to avoid large one-hot encoding)
model = Sequential([
    Embedding(input_dim=total_words, output_dim=64, input_length=MAX_SEQ_LENGTH-1),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dense(64, activation="relu"),
    Dense(total_words, activation="softmax")
])



In [15]:
# Compile Model (Use sparse_categorical_crossentropy)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


In [18]:
# Train Model in Batches
model.fit(X, y, epochs=1, batch_size=8, verbose=1)

[1m   200/147816[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:48:59[0m 44ms/step - accuracy: 0.0271 - loss: 6.4995

KeyboardInterrupt: 

In [30]:
# Save trained model
model.save("lstm_next_word_model.h5")



In [31]:
# Function to predict next word
def predict_next_word(seed_text, tokenizer, max_seq_length):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq_length-1, padding="pre")
    predicted_probs = model.predict(token_list, verbose=0)
    predicted_index = np.argmax(predicted_probs)

    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            return word
    return ""

In [32]:
# Test model
print("\nTesting next-word prediction:")



Testing next-word prediction:


In [1]:
print(f"Input: 'Cake'")

Input: 'Cake'


In [2]:
print(f"Prediction: {predict_next_word('Cake', tokenizer, MAX_SEQ_LENGTH)}")

NameError: name 'predict_next_word' is not defined