In [1]:
import os
import pickle

import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
# Fail fast if the corpus file is missing or unreadable. The context manager
# closes the handle immediately, so no open file is left behind; the file is
# opened again where its contents are actually read.
with open('MalayalamWords.txt', 'r', encoding="utf8") as file:
    pass

In [3]:
# Read every line of the corpus. The context manager closes the file even if
# reading raises.
with open("MalayalamWords.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Join the lines into one string with a single call; joining inside a loop over
# the lines would rebuild the same string once per line.
data = ' '.join(lines)

# Delete line breaks, the byte-order mark and curly quotes. These characters are
# removed outright (not replaced by a space); the ' ' separator inserted by the
# join above is what keeps words from adjacent lines apart.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')

# Collapse every run of whitespace to a single space.
data = ' '.join(data.split())
data[:500]

'മൃദുവും, വെളുത്ത നിറത്തിലുള്ളതും, തിളക്കമേറിയതുമായ ഒരു ലോഹമാണ് വെള്ളി അഥവാ രജതം . ആവർത്തനപ്പട്ടികയിൽ സംക്രമണമൂലകങ്ങളുടെ കൂട്ടത്തിലാണ് ഇതിന്റെ സ്ഥാനം. വെള്ളിയുടെ ആറ്റോമിക സംഖ്യ 47 ആണ്. പ്രതീകം: Ag. എല്ലാ ലോഹങ്ങളിലും വച്ച് ഏറ്റവും കൂടുതൽ താപ വൈദ്യുത ചാലകത പ്രകടിപ്പിക്കുന്നത് വെള്ളിയാണ്. പ്രകൃതിയിൽ ഇത് ധാതു രൂപത്തിലും അല്ലാതെ സ്വതന്ത്രമായും ഇത് കാണപ്പെടുന്നു. നാണയങ്ങൾ, ആഭരണങ്ങൾ, കരണ്ടികൾ, പാത്രങ്ങൾ, കണ്ണാടികൾ എന്നിവയുടെ നിർമ്മാണത്തിനും ഛായഗ്രഹണമേഖലയിലും വെള്ളി ഉപയോഗിക്കുന്നു. വെള്ളി വളരെ ലോലമായ ഒരു'

In [4]:
# Length of the cleaned corpus string, in characters.
len(data)

41886

In [5]:
# Build the word -> integer-index vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the fitted tokenizer so the prediction code can reuse the same
# vocabulary. The context manager flushes and closes the file before any later
# cell tries to read it back.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Encode the whole corpus as a single list of word indices and preview it.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[610, 611, 612, 613, 1, 264, 16, 96, 614, 615, 616, 617, 97, 265, 33]

In [6]:
# Number of word indices in the encoded corpus.
len(sequence_data)

4399

In [7]:
# Number of output classes: one more than the number of distinct words, so the
# largest word index is still a valid class id (word indices appear to start
# at 1 — TODO confirm against the Tokenizer documentation).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2971


In [8]:
# Slide a four-token window over the encoded corpus, advancing one token at a
# time. Each window is three consecutive tokens followed by the token after them.
sequences = [
    sequence_data[start:start + 4]
    for start in range(len(sequence_data) - 3)
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  4396


array([[610, 611, 612, 613],
       [611, 612, 613,   1],
       [612, 613,   1, 264],
       [613,   1, 264,  16],
       [  1, 264,  16,  96],
       [264,  16,  96, 614],
       [ 16,  96, 614, 615],
       [ 96, 614, 615, 616],
       [614, 615, 616, 617],
       [615, 616, 617,  97]])

In [9]:
# Split each four-token window into its inputs (the first three tokens) and
# its target (the fourth token).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [10]:
# Preview the first ten input rows and the first ten target word indices.
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[610 611 612]
 [611 612 613]
 [612 613   1]
 [613   1 264]
 [  1 264  16]
 [264  16  96]
 [ 16  96 614]
 [ 96 614 615]
 [614 615 616]
 [615 616 617]]
Response:  [613   1 264  16  96 614 615 616 617  97]


In [11]:
# One-hot encode the target word indices, one column per vocabulary class.
# `y` is overwritten in place, so guard on its rank: a 1-D array still holds raw
# word indices, while anything else has already been encoded. Without the guard,
# re-running this cell would try to encode the one-hot matrix a second time.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
# Next-word classifier: a 10-dimensional embedding over three-token inputs,
# two stacked 1000-unit LSTMs (the first returns the full sequence so the
# second can consume it), a 1000-unit ReLU layer, and a softmax over the
# vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=3),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [13]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             29710     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 2971)              2973971   
                                                                 
Total params: 16,052,681
Trainable params: 16,052,681
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Write the model to next_words.h5 whenever the training loss improves on the
# best value seen so far (ModelCheckpoint is imported with the other
# dependencies at the top of the notebook).
checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

# Keep the returned History object so the per-epoch losses can be inspected
# later instead of being discarded.
history = model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])

Epoch 1/70
Epoch 1: loss improved from inf to 8.00013, saving model to next_words.h5
Epoch 2/70
Epoch 2: loss improved from 8.00013 to 7.87571, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 7.87571 to 7.68394, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 7.68394 to 7.47197, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 7.47197 to 7.22837, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 7.22837 to 6.88524, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 6.88524 to 6.37903, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 6.37903 to 5.60857, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 5.60857 to 4.79658, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 4.79658 to 4.09355, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 4.09355 to 3.46233, saving model to next_words.h5
Epoch 12/70
Epoch 12:

<keras.callbacks.History at 0x25ac03bb880>

In [15]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('next_words.h5')

# NOTE: unpickling can execute arbitrary code, so only load a token.pkl that
# was produced by this notebook. The context manager closes the file once the
# tokenizer has been read.
with open('token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def Predict_Next_Words(model, tokenizer, text):
    """Predict the word most likely to follow ``text``, print it and return it.

    Args:
        model: Object whose ``predict`` method accepts a 2-D array of word
            indices and returns an array of scores.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and the
            index -> word mapping ``index_word``.
        text: The context, in whatever form ``tokenizer.texts_to_sequences``
            accepts for a single text.

    Returns:
        The predicted word, or ``""`` when the best-scoring index has no entry
        in the tokenizer's vocabulary.

    Raises:
        ValueError: If the tokenizer produces no word indices for ``text``,
            leaving nothing to feed the model.
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))
    if sequence.size == 0:
        # Fail with a clear message instead of passing an empty array to the model.
        raise ValueError("None of the input words are in the tokenizer vocabulary")

    # Index of the highest score across the whole prediction array.
    preds = int(np.argmax(model.predict(sequence)))

    # Direct index -> word lookup instead of scanning word_index for a match.
    predicted_word = tokenizer.index_word.get(preds, "")

    print(predicted_word)
    return predicted_word

In [None]:
# Interactive loop: read a line, predict the word that follows its last three
# words, and repeat until the user enters "0".
while True:
    text = input("Enter your line: ")

    # Ignore surrounding whitespace when checking for the exit sentinel.
    if text.strip() == "0":
        print("Execution completed.....")
        break

    try:
        # split() with no argument discards leading, trailing and repeated
        # whitespace, so no empty-string "words" take up a slot in the
        # three-word context.
        words = text.split()[-3:]
        print(words)

        Predict_Next_Words(model, tokenizer, words)

    except Exception as e:
        # Report the problem and keep prompting rather than ending the session.
        print("Error occurred: ", e)

Enter your line:  രസതന്ത്രവുമായി ബന്ധപ്പെട്ട ഈ


['രസതന്ത്രവുമായി', 'ബന്ധപ്പെട്ട', 'ഈ']
ലേഖനം
