In [1]:
import numpy as np

In [2]:
# Example corpus for RNN language modeling (focused on smartphones).
# Five short English sentences; the cells below tokenize each sentence
# separately and train the next-word model on its prefixes.
corpus = [
    "The iPhone 14 has a powerful A16 chip",
    "Samsung Galaxy phones have great displays",
    "I love the camera quality of Google Pixel",
    "OnePlus offers fast charging and smooth performance",
    "Battery life is very important in modern smartphones"
]

In [3]:
# Alternative corpus: ten lowercase sentences about data science.
# NOTE(review): no visible cell reads `corpus2` (the tokenizer is fitted on
# `corpus` only) — confirm whether it is still needed.
corpus2 = [
    "data science is an interdisciplinary field",
    "machine learning is a part of data science",
    "deep learning uses neural networks",
    "python is widely used in data science",
    "statistics is important for data analysis",
    "data visualization helps understand patterns",
    "models learn from historical data",
    "feature engineering improves model performance",
    "big data requires scalable systems",
    "artificial intelligence powers modern applications"
]

In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
# Tokenization: fit a Keras Tokenizer on the corpus to get word -> id mappings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one extra slot, because word ids start at 1
total_words = 1 + len(tokenizer.word_index)
total_words

37

In [6]:
# Inspect the fitted word -> integer id mapping
tokenizer.word_index

{'the': 1,
 'iphone': 2,
 '14': 3,
 'has': 4,
 'a': 5,
 'powerful': 6,
 'a16': 7,
 'chip': 8,
 'samsung': 9,
 'galaxy': 10,
 'phones': 11,
 'have': 12,
 'great': 13,
 'displays': 14,
 'i': 15,
 'love': 16,
 'camera': 17,
 'quality': 18,
 'of': 19,
 'google': 20,
 'pixel': 21,
 'oneplus': 22,
 'offers': 23,
 'fast': 24,
 'charging': 25,
 'and': 26,
 'smooth': 27,
 'performance': 28,
 'battery': 29,
 'life': 30,
 'is': 31,
 'very': 32,
 'important': 33,
 'in': 34,
 'modern': 35,
 'smartphones': 36}

In [7]:
# Generate input sequences: for every sentence, collect each prefix of its
# token ids that is at least two tokens long (context tokens + the next word)
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [8]:
# Show the token-id prefix sequences generated from the corpus
input_sequences

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8],
 [9, 10],
 [9, 10, 11],
 [9, 10, 11, 12],
 [9, 10, 11, 12, 13],
 [9, 10, 11, 12, 13, 14],
 [15, 16],
 [15, 16, 1],
 [15, 16, 1, 17],
 [15, 16, 1, 17, 18],
 [15, 16, 1, 17, 18, 19],
 [15, 16, 1, 17, 18, 19, 20],
 [15, 16, 1, 17, 18, 19, 20, 21],
 [22, 23],
 [22, 23, 24],
 [22, 23, 24, 25],
 [22, 23, 24, 25, 26],
 [22, 23, 24, 25, 26, 27],
 [22, 23, 24, 25, 26, 27, 28],
 [29, 30],
 [29, 30, 31],
 [29, 30, 31, 32],
 [29, 30, 31, 32, 33],
 [29, 30, 31, 32, 33, 34],
 [29, 30, 31, 32, 33, 34, 35],
 [29, 30, 31, 32, 33, 34, 35, 36]]

In [9]:
# Padding sequences: left-pad every prefix with zeros up to the length of the
# longest one, so all rows share the same width
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_seq_len,
    padding='pre',
)
input_sequences

array([[ 0,  0,  0,  0,  0,  0,  1,  2],
       [ 0,  0,  0,  0,  0,  1,  2,  3],
       [ 0,  0,  0,  0,  1,  2,  3,  4],
       [ 0,  0,  0,  1,  2,  3,  4,  5],
       [ 0,  0,  1,  2,  3,  4,  5,  6],
       [ 0,  1,  2,  3,  4,  5,  6,  7],
       [ 1,  2,  3,  4,  5,  6,  7,  8],
       [ 0,  0,  0,  0,  0,  0,  9, 10],
       [ 0,  0,  0,  0,  0,  9, 10, 11],
       [ 0,  0,  0,  0,  9, 10, 11, 12],
       [ 0,  0,  0,  9, 10, 11, 12, 13],
       [ 0,  0,  9, 10, 11, 12, 13, 14],
       [ 0,  0,  0,  0,  0,  0, 15, 16],
       [ 0,  0,  0,  0,  0, 15, 16,  1],
       [ 0,  0,  0,  0, 15, 16,  1, 17],
       [ 0,  0,  0, 15, 16,  1, 17, 18],
       [ 0,  0, 15, 16,  1, 17, 18, 19],
       [ 0, 15, 16,  1, 17, 18, 19, 20],
       [15, 16,  1, 17, 18, 19, 20, 21],
       [ 0,  0,  0,  0,  0,  0, 22, 23],
       [ 0,  0,  0,  0,  0, 22, 23, 24],
       [ 0,  0,  0,  0, 22, 23, 24, 25],
       [ 0,  0,  0, 22, 23, 24, 25, 26],
       [ 0,  0, 22, 23, 24, 25, 26, 27],
       [ 0, 22, 

In [10]:
# Split into input and label: every column except the last one is the model
# input (the context tokens); the last column is used as the label.
X = input_sequences[:, :-1]
X

array([[ 0,  0,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  1,  2],
       [ 0,  0,  0,  0,  1,  2,  3],
       [ 0,  0,  0,  1,  2,  3,  4],
       [ 0,  0,  1,  2,  3,  4,  5],
       [ 0,  1,  2,  3,  4,  5,  6],
       [ 1,  2,  3,  4,  5,  6,  7],
       [ 0,  0,  0,  0,  0,  0,  9],
       [ 0,  0,  0,  0,  0,  9, 10],
       [ 0,  0,  0,  0,  9, 10, 11],
       [ 0,  0,  0,  9, 10, 11, 12],
       [ 0,  0,  9, 10, 11, 12, 13],
       [ 0,  0,  0,  0,  0,  0, 15],
       [ 0,  0,  0,  0,  0, 15, 16],
       [ 0,  0,  0,  0, 15, 16,  1],
       [ 0,  0,  0, 15, 16,  1, 17],
       [ 0,  0, 15, 16,  1, 17, 18],
       [ 0, 15, 16,  1, 17, 18, 19],
       [15, 16,  1, 17, 18, 19, 20],
       [ 0,  0,  0,  0,  0,  0, 22],
       [ 0,  0,  0,  0,  0, 22, 23],
       [ 0,  0,  0,  0, 22, 23, 24],
       [ 0,  0,  0, 22, 23, 24, 25],
       [ 0,  0, 22, 23, 24, 25, 26],
       [ 0, 22, 23, 24, 25, 26, 27],
       [ 0,  0,  0,  0,  0,  0, 29],
       [ 0,  0,  0,  0,  0, 29, 30],
 

In [11]:
# Label: the final token id of each padded sequence (the word to predict)
y = input_sequences[:, -1]
y

array([ 2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 14, 16,  1, 17, 18, 19,
       20, 21, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36])

In [12]:
# One-hot encode the labels (one column per vocabulary id, including slot 0).
# The encoding is derived from `input_sequences` rather than from `y` itself so
# that re-running this cell is idempotent: feeding an already one-hot `y` back
# into `to_categorical` would produce an array of the wrong shape.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)
y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [13]:
# Define the next-word model: Embedding -> LSTM -> softmax over the vocabulary
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input

# Seed Python, NumPy and the backend RNGs so weight initialisation (and hence
# training) is repeatable when this cell and the training cell are re-run.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = Sequential()
# The sequence length is declared with an explicit Input layer; the Embedding
# `input_length` argument is deprecated in Keras 3.
model.add(Input(shape=(max_seq_len - 1,)))
model.add(Embedding(input_dim=total_words, output_dim=10))
# To compare against a vanilla RNN, swap this layer for SimpleRNN(units=128).
model.add(LSTM(100))
# One output unit per vocabulary id (index 0, the padding id, included)
model.add(Dense(total_words, activation='softmax'))



In [14]:
# Compile the model: categorical cross-entropy against the one-hot labels,
# Adam optimizer, and accuracy reported alongside the loss
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Train the model on the full set of prefixes for 300 epochs,
# printing per-epoch loss and accuracy
model.fit(
    x=X,
    y=y,
    epochs=300,
    verbose=1,
)

Epoch 1/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 3s 3s/step - accuracy: 0.0000e+00 - loss: 3.6107
Epoch 2/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 139ms/step - accuracy: 0.0625 - loss: 3.6089
Epoch 3/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 81ms/step - accuracy: 0.0312 - loss: 3.6071
Epoch 4/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 175ms/step - accuracy: 0.0312 - loss: 3.6052
Epoch 5/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 76ms/step - accuracy: 0.0312 - loss: 3.6032
Epoch 6/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 152ms/step - accuracy: 0.0312 - loss: 3.6011
Epoch 7/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 164ms/step - accuracy: 0.0312 - loss: 3.5988
Epoch 8/300
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 159ms/step - accuracy: 0.0312 - loss: 3.5964
Epoch 9/300
1/1 ━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x28302442450>

In [16]:
# Function to predict next word using RNN
import numpy as np
def predict_next_word_rnn(seed_text: str) -> str:
    """Return the word the trained model considers most likely to follow `seed_text`.

    The seed is converted to token ids with the notebook-level `tokenizer`,
    left-padded (or truncated) to the model's input width of
    `max_seq_len - 1`, and passed through the notebook-level `model`.

    Args:
        seed_text: Text whose continuation should be predicted.

    Returns:
        The vocabulary word with the highest predicted probability.
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([token_list], maxlen=max_seq_len - 1, padding='pre')
    probabilities = np.asarray(model.predict(padded, verbose=0))[0]
    # Index 0 is the padding id and has no entry in `tokenizer.index_word`, so
    # it is excluded from the argmax; otherwise a prediction of 0 would raise
    # KeyError. The +1 maps the position in the slice back to the word id.
    best_word_id = int(np.argmax(probabilities[1:])) + 1
    return tokenizer.index_word[best_word_id]

In [17]:
# Predict the next word for a seed phrase. The seed is the opening of the last
# corpus sentence, whose next word in the training data is "important".
seed_text_rnn = "Battery life is very"
predicted_word_rnn = predict_next_word_rnn(seed_text_rnn)
predicted_word_rnn

'important'