# **Import Libraries**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from gensim.models import Word2Vec

import tensorflow as tf

import string
import pandas as pd
import numpy as np

from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, Embedding, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

from keras.preprocessing.text import Tokenizer

# **Read Text File**

In [3]:
poem_file = open('/content/drive/MyDrive/poem/datasets/poem.txt','r')
poem = poem_file.read()

# **Read Word2Vec Model**

In [4]:
w2vec_model= Word2Vec.load("/content/drive/MyDrive/language_models/nepaliW2V_5Million.model")

# **Preprocessing**

## Read numbers

In [5]:
nepali_num_file=open("/content/drive/MyDrive/poem/preprocess/numbers.txt","r",encoding="utf-8")
nepali_num=nepali_num_file.read()
nepali_num=nepali_num.split(",")

## Split

In [6]:
poem_corpus = poem.split("\n")
print(poem_corpus[:5])

['नछाडी जानोस् हे मेरा प्राण ! अकेली मलाई,', 'मनको वनमा ननिभ्ने गरी विरह जलाई !', 'ननिभ्ने गरी विरह जलाई,', 'लोचनका तारा ! हे मेर प्यारा ! यो जोति  बिलाए !', 'के भनूँ? भन्ने म केही थिइन  विष नै पिलाए !']


## Apply Denoising

In [7]:
def remove_puncutations_and_noise(sentences):
    punctuations_and_noise = ['।', ',', ';', '?', ' !',' ! ' '!', '—', '-', '.',"’","‘","'","”",'\u200d']
    processed_sentences = []
    for sentence in sentences:
        for punct in punctuations_and_noise:
            sentence = sentence.replace(punct,'')
        processed_sentences.append(sentence)

    return processed_sentences

In [8]:
processed_poem_corpus = remove_puncutations_and_noise(poem_corpus)
print(processed_poem_corpus[:5])

['नछाडी जानोस् हे मेरा प्राण अकेली मलाई', 'मनको वनमा ननिभ्ने गरी विरह जलाई', 'ननिभ्ने गरी विरह जलाई', 'लोचनका तारा हे मेर प्यारा यो जोति  बिलाए', 'के भनूँ भन्ने म केही थिइन  विष नै पिलाए']


# **Make Corpus Ready to Fit**

In [9]:
import re
import snowballstemmer
mainlist = list()
class Main_Data_list:
    def __init__(self, dataset):
        self.dataset = dataset
        self.noise_list = ['\n','\ufeff','0','1','2','3','4','5','6','7','8','9','०','१','२','३','४','५','६','७','८','९','१०','।', ',', ';', '?', ' !', "”",' ! ' '!', '—', '-', '.',"’","‘","'",'\u200d']
        self.mainlist = []

        self.stemmer = snowballstemmer.NepaliStemmer()

    def simple_tokenizer(self,text) -> list:

        line = re.sub('[।]',"", text)

        devanagari_range = r'[\u0900-\u097F\\]'
        def getDevanagariCharCount(token):
            return len(list(filter(lambda char: re.match(devanagari_range, char), (char for char in token))))
        def isDevanagari(token):
            return True if getDevanagariCharCount(token) >= len(token)/2 else False

        tokens = list(filter(lambda t: isDevanagari(t), line.split(" ")))
        return tokens

    def get(self):
        for i,line in enumerate(self.dataset[0:2000000]):

            wordsList = self.simple_tokenizer(line)
            words1 = [w for w in wordsList if not w in self.noise_list]
            words1.append('')
            words = []
            for word in words1:
              words.append([word.replace(noise,'') for noise in self.noise_list][0])
              self.mainlist.append(words)

            if i % 100000 == 0:
                print(f"DONE FOR {i/100000} LAKHS LINES")
        return self.mainlist

final = Main_Data_list(processed_poem_corpus)
mainlist = final.get()

DONE FOR 0.0 LAKHS LINES


# **Fit Word2Vec Model**

In [10]:
w2vec_model.build_vocab(mainlist, update=True)
w2vec_model.train(mainlist, total_examples=w2vec_model.corpus_count, epochs=w2vec_model.epochs)



(612288, 780460)

In [11]:
trained_weights = w2vec_model.wv.vectors
vocab_size, emdedding_size = trained_weights.shape
vocab_size, emdedding_size

(293154, 200)

# **Create Train dataset**

In [40]:
import torch

In [53]:
input_sequences = []
embedding_list = []
for line in processed_poem_corpus:
  embedding_list.append([w2vec_model.wv[word].astype(np.float32) for word in line.split()])

max_sequence_len = max([len(x) for x in embedding_list])

for embeddings in embedding_list:
  for i in range(1, len(embeddings)):
    embedding_seq = embeddings[: i+1]
    padded_sequence = [torch.zeros(1)] * max_sequence_len

    for index,_ in enumerate(embedding_seq):
      insert_index = max_sequence_len - len(embedding_seq) + index
      padded_sequence[insert_index] = embedding_seq[index]

    input_sequences.append(padded_sequence)

input_sequences = np.array(input_sequences)
predictors, label = input_sequences[:, :-1],input_sequences[:, -1]


  input_sequences = np.array(input_sequences)
  input_sequences = np.array(input_sequences)


In [54]:
print(processed_poem_corpus[1])
similar_words = w2vec_model.wv.similar_by_vector( np.array(predictors[6][-1]), topn=1)
similar_words

मनको वनमा ननिभ्ने गरी विरह जलाई


[('मनको', 1.0)]

In [55]:
# predictors = tf.convert_to_tensor(predictors, dtype=tf.float32)
# labels = tf.convert_to_tensor(label, dtype=tf.float32)
predictors[0]

array([tensor([0.]), tensor([0.]), tensor([0.]), tensor([0.]),
       tensor([0.]), tensor([0.]), tensor([0.]), tensor([0.]),
       tensor([0.]),
       array([ 2.21316218e-01, -6.84821904e-02,  6.40273392e-02, -4.64315936e-02,
              -1.26503855e-01, -5.76133840e-02, -8.57502818e-02,  8.41274783e-02,
               4.53876704e-03, -1.67515144e-01, -1.61734328e-01, -8.42714589e-03,
               5.61235882e-02,  1.12102836e-01, -1.84491219e-03, -4.68855798e-02,
               8.88644904e-03,  7.69452080e-02,  9.80778784e-02, -1.45451277e-02,
               6.41386956e-02, -4.01973054e-02, -8.15697946e-03,  5.98928407e-02,
               1.29324328e-02, -2.61404157e-01,  1.61195174e-01, -1.89326387e-02,
              -4.45985189e-03,  9.77275446e-02,  6.17768727e-02, -1.34935202e-02,
               1.58869587e-02,  1.79507267e-02, -2.71560202e-05, -2.08216831e-01,
              -2.52208710e-02,  8.00456181e-02,  4.35174964e-02,  4.54146713e-02,
               6.95537627e-02,  1

# **Build LSTM models**

In [56]:
model = Sequential()
model.add(Embedding(vocab_size + 1, 100, input_length=max_sequence_len - 1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(emdedding_size, activation='linear',  # Adjusted the activation here
                kernel_regularizer=regularizers.l2(0.01)))
model.compile(loss='mse',  # Changed the loss function to mean squared error
              optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 10, 100)           29315500  
                                                                 
 bidirectional_5 (Bidirecti  (None, 10, 300)           301200    
 onal)                                                           
                                                                 
 dropout_5 (Dropout)         (None, 10, 300)           0         
                                                                 
 lstm_11 (LSTM)              (None, 100)               160400    
                                                                 
 dense_5 (Dense)             (None, 200)               20200     
                                                                 
Total params: 29797300 (113.67 MB)
Trainable params: 29797300 (113.67 MB)
Non-trainable params: 0 (0.00 Byte)
__________

In [58]:
early_stopping = EarlyStopping(monitor='val_accuracy', patience=100, restore_best_weights=True)

history = model.fit(predictors, label, epochs=500, verbose=1, validation_split=0.2, callbacks=[early_stopping])

ValueError: ignored