In [None]:
!pip install tensorflow



In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense,Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
#Generating some example of sequential data
import pandas as pd
df=pd.read_csv("/content/earth.txt",sep=".")
df

Unnamed: 0,Earth is the third planet from the Sun and the only astronomical object known to harbor life,"This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water","Almost all of Earth's water is contained in its global ocean, covering 70",8% of Earth's crust,The remaining 29,"2% of Earth's crust is land, most of which is located in the form of continental landmasses within one hemisphere, Earth's land hemisphere","Most of Earth's land is somewhat humid and covered by vegetation, while large sheets of ice at Earth's polar deserts retain more water than Earth's groundwater, lakes, rivers and atmospheric water combined","Earth's crust consists of slowly moving tectonic plates, which interact to produce mountain ranges, volcanoes, and earthquakes",Earth has a liquid outer core that generates a magnetosphere capable of deflecting most of the destructive solar winds and cosmic radiation,Unnamed: 9


In [6]:

#tokenizing the words
tokenizer=Tokenizer()
tokenizer.fit_on_texts(df)
total_words=len(tokenizer.word_index) + 1
print(total_words)

98


In [7]:

# Creating input sequences and their corresponding next words
input_sequences = []
for sentence in df:
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokenized_sentence)):
        n_gram_sequence = tokenized_sentence[:i+1]
        input_sequences.append(n_gram_sequence)
input_sequences

[[7, 4],
 [7, 4, 2],
 [7, 4, 2, 21],
 [7, 4, 2, 21, 22],
 [7, 4, 2, 21, 22, 23],
 [7, 4, 2, 21, 22, 23, 2],
 [7, 4, 2, 21, 22, 23, 2, 24],
 [7, 4, 2, 21, 22, 23, 2, 24, 5],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28],
 [7, 4, 2, 21, 22, 23, 2, 24, 5, 2, 13, 25, 26, 27, 14, 28, 29],
 [30, 4],
 [30, 4, 31],
 [30, 4, 31, 15],
 [30, 4, 31, 15, 7],
 [30, 4, 31, 15, 7, 32],
 [30, 4, 31, 15, 7, 32, 8],
 [30, 4, 31, 15, 7, 32, 8, 6],
 [30, 4, 31, 15, 7, 32, 8, 6, 33],
 [30, 4, 31, 15, 7, 32, 8, 6, 33, 2],
 [30, 4, 31, 15, 7, 32, 8, 6, 33, 2, 13],
 [30, 4, 31, 15, 7, 32, 8, 6, 33, 2, 13, 16],
 [30, 4, 31, 15, 7, 32, 8, 6, 33, 2, 13, 16, 9],
 [30, 4, 31, 15, 7, 32, 8, 6, 33, 2, 13, 16, 9, 2],
 [30, 4, 3

In [8]:
# Padding sequences for consistent input size
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length,padding='pre')

In [9]:
input_sequences

array([[ 0,  0,  0, ...,  0,  7,  4],
       [ 0,  0,  0, ...,  7,  4,  2],
       [ 0,  0,  0, ...,  4,  2, 21],
       ...,
       [ 0,  0,  0, ..., 93,  5, 94],
       [ 0,  0,  0, ...,  5, 94, 95],
       [ 0,  0,  0, ...,  0, 96, 97]], dtype=int32)

In [10]:
# Padding sequences for consistent input size
max_sequence_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length,padding='pre')

In [11]:
# Creating input and output data
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

In [12]:
# Building a simple RNN model
model = Sequential()
model.add(Embedding(input_dim=total_words, output_dim=50, input_length=max_sequence_length-1))
model.add(SimpleRNN(100, return_sequences=True))
model.add(SimpleRNN(100))
model.add(Dense(total_words, activation='softmax'))

In [13]:
#compile the model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

#train the model
model.fit(X,y,epochs=50,verbose=2)

Epoch 1/50
5/5 - 3s - loss: 4.6360 - accuracy: 0.0070 - 3s/epoch - 565ms/step
Epoch 2/50
5/5 - 0s - loss: 4.4260 - accuracy: 0.0839 - 130ms/epoch - 26ms/step
Epoch 3/50
5/5 - 0s - loss: 4.2394 - accuracy: 0.1469 - 175ms/epoch - 35ms/step
Epoch 4/50
5/5 - 0s - loss: 4.0750 - accuracy: 0.1888 - 165ms/epoch - 33ms/step
Epoch 5/50
5/5 - 0s - loss: 3.9295 - accuracy: 0.1958 - 164ms/epoch - 33ms/step
Epoch 6/50
5/5 - 0s - loss: 3.7914 - accuracy: 0.3147 - 167ms/epoch - 33ms/step
Epoch 7/50
5/5 - 0s - loss: 3.6182 - accuracy: 0.3357 - 181ms/epoch - 36ms/step
Epoch 8/50
5/5 - 0s - loss: 3.4674 - accuracy: 0.3846 - 156ms/epoch - 31ms/step
Epoch 9/50
5/5 - 0s - loss: 3.3033 - accuracy: 0.4406 - 155ms/epoch - 31ms/step
Epoch 10/50
5/5 - 0s - loss: 3.1508 - accuracy: 0.4406 - 177ms/epoch - 35ms/step
Epoch 11/50
5/5 - 0s - loss: 2.9818 - accuracy: 0.5385 - 168ms/epoch - 34ms/step
Epoch 12/50
5/5 - 0s - loss: 2.8383 - accuracy: 0.5455 - 170ms/epoch - 34ms/step
Epoch 13/50
5/5 - 0s - loss: 2.6913 - a

<keras.src.callbacks.History at 0x7de31c79a830>

In [15]:
# Generating text using the trained model
seed_text = input("Enter the starting word: ")
next_words = int(input("Enter how many words to predict: "))

for _ in range(next_words):
    tokenized_seed = tokenizer.texts_to_sequences([seed_text])[0]
    tokenized_seed = pad_sequences([tokenized_seed], maxlen=max_sequence_length-1, padding='pre')
    predicted_word_index = np.argmax(model.predict(tokenized_seed), axis=-1)
    predicted_word = tokenizer.index_word[predicted_word_index[0]]
    seed_text += " " + predicted_word
print(seed_text)

Enter the starting word: third
Enter how many words to predict: 7
third of earth's crust is land most and
