<a href="https://colab.research.google.com/github/NINJAHATTORI004/programmingwithdataanalysis/blob/main/tensorflow3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical

# Sample text corpus: one sentence per line; blank first/last lines come from the triple quotes
text = """
The quick brown fox jumps over the lazy dog.
A quick brown fox jumps over the lazy dog in the park.
The quick brown fox is very fast and jumps high.
Dogs and foxes are animals that we often compare.
The lazy dog sleeps all day in the sunny park.
"""

# Tokenization: fit a word-level tokenizer on the whole corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# word_index numbers words from 1, so index 0 is never assigned to a word and stays
# free for padding. The "+ 1" accounts for that slot; no OOV token is configured here.
total_words = len(tokenizer.word_index) + 1

print(f"Total words: {total_words}")
print(f"Word index: {tokenizer.word_index}")

# Create input sequences: for every non-blank line, emit each prefix of its
# token list that has at least two tokens (context + the word to predict).
input_sequences = []
non_blank_lines = (row for row in text.split('\n') if row.strip())
for sentence in non_blank_lines:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))


Total words: 29
Word index: {'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumps': 5, 'lazy': 6, 'dog': 7, 'over': 8, 'in': 9, 'park': 10, 'and': 11, 'a': 12, 'is': 13, 'very': 14, 'fast': 15, 'high': 16, 'dogs': 17, 'foxes': 18, 'are': 19, 'animals': 20, 'that': 21, 'we': 22, 'often': 23, 'compare': 24, 'sleeps': 25, 'all': 26, 'day': 27, 'sunny': 28}


In [8]:
# Left-pad every n-gram with zeros so all rows share the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [9]:
# Predictors: every column except the last; label: the last column, one-hot encoded
X = input_sequences[:, :-1]
y = to_categorical(input_sequences[:, -1], num_classes=total_words)



In [10]:
# Show the first three (context -> next word) training pairs
print("\nSample input sequences:")
for seq, one_hot in zip(X[:3], y[:3]):
    target = np.argmax(one_hot)  # recover the word index from the one-hot row
    print(f"Input: {seq} -> Target: {target} ({tokenizer.index_word[target]})")


Sample input sequences:
Input: [0 0 0 0 0 0 0 0 0 0 1] -> Target: 2 (quick)
Input: [0 0 0 0 0 0 0 0 0 1 2] -> Target: 3 (brown)
Input: [0 0 0 0 0 0 0 0 1 2 3] -> Target: 4 (fox)


In [11]:
# Model architecture
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable from run to run (hardware nondeterminism aside).
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input shape explicitly. Embedding's `input_length` argument is
    # deprecated in Keras 3 and leaves the model unbuilt, so model.summary()
    # cannot report output shapes or parameter counts.
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    # Map each word index to a 10-dimensional dense vector
    Embedding(total_words, 10),
    # Keep only the final hidden state of the 50-unit recurrent layer
    SimpleRNN(50, return_sequences=False),
    # One softmax output per vocabulary index
    Dense(total_words, activation='softmax')
])



In [12]:
# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [13]:
# Print a summary of the model's layers
model.summary()

In [14]:
# Train the model on the full n-gram set; `history` holds what fit() returns
history = model.fit(
    x=X,
    y=y,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.0252 - loss: 3.3662
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.1306 - loss: 3.3387
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.1706 - loss: 3.3143
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.1706 - loss: 3.2968
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.1498 - loss: 3.2714
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1602 - loss: 3.2436
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1410 - loss: 3.2134
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.1410 - loss: 3.1850
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [15]:
# Prediction function
def predict_next_word(seed_text, model, tokenizer, max_sequence_len, num_words=3):
    """Extend ``seed_text`` by greedily predicting the next word, repeatedly.

    Args:
        seed_text: Text to continue.
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            scores per input row, indexed by word index.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        max_sequence_len: Length of the padded training sequences *including*
            the target word; the model input is therefore one token shorter.
        num_words: How many words to append (defaults to 3).

    Returns:
        ``seed_text`` followed by up to ``num_words`` predicted words, separated
        by single spaces. Generation stops early if the top-scoring index has
        no entry in ``tokenizer.index_word`` (e.g. the padding index 0).
    """
    for _ in range(num_words):
        # Re-encode the whole running text so each prediction sees the new words
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        # Single-row batch, so the flat argmax is the word index
        predicted_index = int(np.argmax(predicted_probs))
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None:
            # Index 0 is the padding slot and maps to no word; stop here
            # instead of raising KeyError.
            break
        seed_text += " " + predicted_word
    return seed_text

In [16]:
# Seed phrases to extend with the trained model
test_texts = [
    "The quick brown",
    "A lazy dog",
    "The fox jumps"
]

print("\nPredictions:")
for seed in test_texts:
    completed = predict_next_word(seed, model, tokenizer, max_sequence_len)
    print(f"{seed} -> {completed}")


Predictions:
The quick brown -> The quick brown fox jumps over
A lazy dog -> A lazy dog sleeps all day
The fox jumps -> The fox jumps fox all over
