# **Import Libraries**

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
from gensim.models import Word2Vec

import tensorflow as tf

import string
import pandas as pd
import numpy as np

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Embedding, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from keras.preprocessing.text import Tokenizer

# **Read Text File**

In [2]:
poem_file = open('C:/Users/Ghost/Desktop/gits/Nepali_Poem_Generator/datasets/poem.txt','r')
poem = poem_file.read()

# **Read Word2Vec Model**

In [3]:
w2vec_model= Word2Vec.load("C:/Users/Ghost/Desktop/gits/Nepali_Poem_Generator/w2vec/langauge_models/nepaliW2V_5Million.model")

# **Preprocessing**

## Read numbers

In [4]:
nepali_num_file=open("C:/Users/Ghost/Desktop/gits/Nepali_Poem_Generator/preprocess/numbers.txt","r",encoding="utf-8")
nepali_num=nepali_num_file.read()
nepali_num=nepali_num.split(",")

## Split

In [5]:
poem_corpus = poem.split("\n")
print(poem_corpus[:5])

['नछाडी जानोस् हे मेरा प्राण ! अकेली मलाई,', 'मनको वनमा ननिभ्ने गरी विरह जलाई !', 'ननिभ्ने गरी विरह जलाई,', 'लोचनका तारा ! हे मेर प्यारा ! यो जोति  बिलाए !', 'के भनूँ? भन्ने म केही थिइन  विष नै पिलाए !']


## Apply Denoising

In [6]:
def remove_puncutations_and_noise(sentences):
    punctuations_and_noise = ['।', ',', ';', '?', ' !',' ! ' '!', '—', '-', '.',"’","‘","'","”",'\u200d']
    processed_sentences = []
    for sentence in sentences:
        for punct in punctuations_and_noise:
            sentence = sentence.replace(punct,'')
        processed_sentences.append(sentence)

    return processed_sentences

In [7]:
processed_poem_corpus = remove_puncutations_and_noise(poem_corpus)
print(processed_poem_corpus[:5])

['नछाडी जानोस् हे मेरा प्राण अकेली मलाई', 'मनको वनमा ननिभ्ने गरी विरह जलाई', 'ननिभ्ने गरी विरह जलाई', 'लोचनका तारा हे मेर प्यारा यो जोति  बिलाए', 'के भनूँ भन्ने म केही थिइन  विष नै पिलाए']


# **Make Corpus Ready to Fit**

In [8]:
import re
import snowballstemmer
mainlist = list()
class Main_Data_list:
    def __init__(self, dataset):
        self.dataset = dataset
        self.noise_list = ['\n','\ufeff','0','1','2','3','4','5','6','7','8','9','०','१','२','३','४','५','६','७','८','९','१०','।', ',', ';', '?', ' !', "”",' ! ' '!', '—', '-', '.',"’","‘","'",'\u200d']
        self.mainlist = []

        self.stemmer = snowballstemmer.NepaliStemmer()

    def simple_tokenizer(self,text) -> list:

        line = re.sub('[।]',"", text)

        devanagari_range = r'[\u0900-\u097F\\]'
        def getDevanagariCharCount(token):
            return len(list(filter(lambda char: re.match(devanagari_range, char), (char for char in token))))
        def isDevanagari(token):
            return True if getDevanagariCharCount(token) >= len(token)/2 else False

        tokens = list(filter(lambda t: isDevanagari(t), line.split(" ")))
        return tokens

    def get(self):
        for i,line in enumerate(self.dataset[0:2000000]):

            wordsList = self.simple_tokenizer(line)
            words1 = [w for w in wordsList if not w in self.noise_list]
            words1.append('')
            words = []
            for word in words1:
              words.append([word.replace(noise,'') for noise in self.noise_list][0])
              self.mainlist.append(words)

            if i % 100000 == 0:
                print(f"DONE FOR {i/100000} LAKHS LINES")
        return self.mainlist

final = Main_Data_list(processed_poem_corpus)
mainlist = final.get()

DONE FOR 0.0 LAKHS LINES


# **Fit Word2Vec Model**

In [9]:
w2vec_model.build_vocab(mainlist, update=True)
w2vec_model.train(mainlist, total_examples=w2vec_model.corpus_count, epochs=w2vec_model.epochs)

(612473, 780460)

In [10]:
trained_weights = w2vec_model.wv.vectors
vocab_size, embedding_size = trained_weights.shape
vocab_size, embedding_size

(293154, 200)

# **Create Train dataset**

In [11]:
input_sequences = []
embedding_list = []
for line in processed_poem_corpus:
  embedding_list.append([list(w2vec_model.wv[word]) for word in line.split()])

max_sequence_len = max([len(x) for x in embedding_list])

for embeddings in embedding_list:
  for i in range(1, len(embeddings)):
    embedding_seq = embeddings[: i+1]
    padded_sequence = np.zeros((max_sequence_len, embedding_size), dtype=np.float_)

    for index,_ in enumerate(embedding_seq):
      insert_index = max_sequence_len - len(embedding_seq) + index
      padded_sequence[insert_index] = embedding_seq[index]

    input_sequences.append(padded_sequence)

input_sequences = np.array(input_sequences, dtype=np.object_)
predictors, label = input_sequences[:, :-1],input_sequences[:, -1]


In [12]:
predictors = np.asarray(predictors).astype(np.float32)
label = np.asarray(label).astype(np.float32)

In [18]:
print(processed_poem_corpus[0])
similar_words = w2vec_model.wv.similar_by_vector( np.array(predictors[1][-2]), topn=1)

similar_words = w2vec_model.wv.similar_by_vector( np.array(predictors[1][-1]), topn=1)

similar_words

नछाडी जानोस् हे मेरा प्राण अकेली मलाई


[('जानोस्', 0.9999999403953552)]

In [15]:
predictors[0]

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.21607383, -0.04870717,  0.05236128, ...,  0.08642674,
        -0.10928017,  0.08665729]], dtype=float32)

In [19]:
# # predictors = tf.convert_to_tensor(predictors, dtype=tf.float32)
# # labels = tf.convert_to_tensor(label, dtype=tf.float32)
# predictors[0]

# **Build LSTM models**

In [20]:
input_dim = (max_sequence_len - 1, embedding_size)
model = Sequential()

# LSTM layer with return_sequences=True for sequence generation
model.add(LSTM(150, input_shape=input_dim, return_sequences=True))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(embedding_size, activation='linear',  # Adjusted the activation here
                kernel_regularizer=regularizers.l2(0.01)))
model.compile(loss='mse',  # Changed the loss function to mean squared error
              optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 10, 150)           210600    
                                                                 
 bidirectional (Bidirection  (None, 10, 300)           361200    
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 10, 300)           0         
                                                                 
 lstm_2 (LSTM)               (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 200)               20200     
                                                                 
Total params: 752400 (2.87 MB)
Trainable params: 752400 (2.87 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [16]:
# model = Sequential()
# model.add(Embedding(vocab_size + 1, 100, input_length=max_sequence_len - 1))
# model.add(Bidirectional(LSTM(150, return_sequences=True)))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
# model.add(Dense(emdedding_size, activation='linear',  # Adjusted the activation here
#                 kernel_regularizer=regularizers.l2(0.01)))
# model.compile(loss='mse',  # Changed the loss function to mean squared error
#               optimizer='adam', metrics=['accuracy'])
# print(model.summary())

In [21]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=100, restore_best_weights=True)

history = model.fit(predictors, label, epochs=500, batch_size= 1 , verbose=1, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/500
Epoch 2/500

KeyboardInterrupt: 

In [None]:
model.save("lstm_w2vec.h5")

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       list([0.21927229, -0.056478217, 0.060117915, -0.037925318, -0.13777573, -0.057317194, -0.07726518, 0.07834977, -0.007228161, -0.18599378, -0.16449532, -0.0002100825, 0.064958744, 0.10180828, 0.003921977, -0.06316593, 0.017740157, 0.065559484, 0.11922026, -0.039086472, 0.074046224, -0.036231853, 0.01534304, 0.04818608, 0.006743838, -0.25357458, 0.13440284, -0.016839337, -0.00096636376, 0.1255406, 0.060458858, -0.01191231, 0.02501321, 0.031469624, -0.021567931, -0.18737431, -0.051946122, 0.06698385, 0.039512694, 0.04450356, 0.06749839, 0.09726132, -0.100591086, 0.025240183, -0.10920636, -0.028276535, -0.16780567, -0.046132654, 0.044618323, -0.06237182, -0.10258083, 0.10996795, -0.1794217, 0.02424799, -0.010753302, 0.04090991, -0.031017533, 0.022069093, 0.06129545, -0.06201478, -0.06716515, -0.062729836, -0.0024193488, -0.009916899, -0.060892146, -0.039804492, -0.011431253, -0.16640712, -0.15059566, -0.14573205, -0.1517604, 0.14601587,