In [None]:
import datetime
import os
import re
import string

import gensim.downloader as api
import nltk
import numpy as np
import pandas
import tensorflow as tf
from matplotlib import pyplot as plt
from tensorboard.plugins import projector
from tensorflow.data.experimental import AUTOTUNE
from tensorflow.keras.layers import SimpleRNN,LSTM,Dense,GRU,Bidirectional,Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing import text_dataset_from_directory
%reload_ext tensorboad

In [None]:
# Load the pretrained word vectors through gensim's downloader API.
WORD2VEC_MODEL='word2vec-google-news-300'
wv=api.load(WORD2VEC_MODEL)

In [None]:
# Display how many keys the loaded vectors contain.
len(wv.index_to_key)

In [None]:
def preprocess_sentences(input_data):
    '''
    Task: Preprocess sentences or standardize the sentences
    Input: raw reviews (a string or string tensor accepted by tf.strings ops)
    output: standardized reviews (lower-cased, HTML tags removed,
            punctuation replaced by spaces, runs of spaces collapsed)
    '''
    # Every step consumes the result of the previous step so that no
    # transformation is silently discarded.
    lowered=tf.strings.lower(input_data)
    # Drop HTML tags such as "<br />".
    without_tags=tf.strings.regex_replace(lowered,"<[^>]+>","")
    # Replace each punctuation character with a space.
    without_punctuation=tf.strings.regex_replace(
        without_tags,"[%s]"%re.escape(string.punctuation)," ")
    # Collapse the runs of spaces left behind by the previous step.
    return tf.strings.regex_replace(without_punctuation," {2,}"," ")

In [None]:
def word_index(word,vocab_size=30000,key_to_index=None):
    '''
    Task: Map a word to its integer id in the word-vector vocabulary
    Input: word         - the token to look up
           vocab_size   - ids greater than or equal to this value are treated
                          as out of vocabulary (default 30000)
           key_to_index - mapping from token to id; defaults to the
                          notebook-level wv.key_to_index
    output: the id of the word, or 0 when the word is unknown or its id is
            not below vocab_size
    '''
    if key_to_index is None:
        key_to_index=wv.key_to_index

    # Try the word as given, then with its first letter upper-cased. An empty
    # string has no first letter, so only the exact form is tried.
    candidates=[word]
    if word:
        candidates.append(word[0].upper()+word[1:])

    for candidate in candidates:
        if candidate in key_to_index:
            out=key_to_index[candidate]
            # The first spelling that exists decides the result, even when
            # its id falls outside the kept vocabulary.
            return out if out<vocab_size else 0
    # NOTE(review): 0 doubles as the "unknown" id and as the id of whatever
    # token the vectors store at index 0 - confirm this collision is acceptable.
    return 0

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    '''
    Task: Serve batches of tokenized reviews read from two directories
    Input: train_pos       - directory holding the positive reviews
           train_neg       - directory holding the negative reviews
           vocab_size      - token ids not below this value are mapped to 0
           sequence_length - every review is truncated / zero-padded to this
                             many token ids
           batch_size      - number of positive/negative file pairs per batch
           shuffle         - when True, reshuffle the file lists at the end of
                             every epoch
    output: each batch is (X, y) where X is an int32 tensor of shape
            (2*batch_size, sequence_length) and y holds label 1 for positive
            and 0 for negative reviews, interleaved
    '''

    def __init__(self,train_pos,train_neg,vocab_size,sequence_length,batch_size,shuffle=False):
        super().__init__()
        self.train_pos = train_pos
        self.train_neg = train_neg
        self.batch_size = batch_size
        # os.listdir gives no ordering guarantee; sort for reproducible batches.
        self.train_pos_list = sorted(os.listdir(train_pos))
        self.train_neg_list = sorted(os.listdir(train_neg))
        self.vocab_size = vocab_size
        self.sequence_length = sequence_length
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        # Each batch consumes batch_size files from BOTH lists, so the shorter
        # list bounds the number of full batches.
        pairs=min(len(self.train_pos_list),len(self.train_neg_list))
        return pairs//self.batch_size

    def __getitem__(self,idx):
        X,y=self.__data_generation(idx)
        return X,y

    def on_epoch_end(self):
        # Called by Keras after every epoch; a no-op unless shuffle=True.
        if self.shuffle:
            np.random.shuffle(self.train_pos_list)
            np.random.shuffle(self.train_neg_list)

    def __encode_review(self,path):
        # Read one review file and return sequence_length int32 token ids.
        with open(path,encoding="utf-8") as f:
            text=f.read()
        tokens=tf.strings.split(preprocess_sentences(text)).numpy()
        # Start from zeros so short (or empty) reviews are already padded.
        ids=np.zeros(self.sequence_length,dtype=np.int32)
        for k,token in enumerate(tokens[:self.sequence_length]):
            index=word_index(token.decode("utf-8",errors="replace"))
            # Keep ids inside the range the caller asked for.
            ids[k]=index if index<self.vocab_size else 0
        return ids

    def __data_generation(self,idx):
        # Build batch idx by pairing the j-th positive and j-th negative file.
        X=[]
        y=[]

        start=idx*self.batch_size
        for j in range(start,start+self.batch_size):
            pos_path=os.path.join(self.train_pos,self.train_pos_list[j])
            X.append(self.__encode_review(pos_path))
            y.append(1)

            neg_path=os.path.join(self.train_neg,self.train_neg_list[j])
            X.append(self.__encode_review(neg_path))
            y.append(0)
        return tf.constant(np.stack(X)),tf.constant(y)

In [None]:
# Directories holding the positive / negative review files for training.
train_pos='...'
train_neg='...'

# Directories holding the positive / negative review files for validation.
val_pos='...'
val_neg='...'

# Hyper-parameters shared by the data generators and the model.
BATCH_SIZE=32
LR=1E-4
VOCAB_SIZE=30000
SEQUENCE_LENGTH=250
# Number of training epochs read by model.fit.
# NOTE(review): 10 is a placeholder value - adjust to the intended schedule.
EPOCH=10

In [None]:
# Generator over the training directories.
train_gen=DataGenerator(train_pos,train_neg,VOCAB_SIZE,SEQUENCE_LENGTH,BATCH_SIZE)
# The validation generator gets its own name so it does not replace train_gen.
val_gen=DataGenerator(val_pos,val_neg,VOCAB_SIZE,SEQUENCE_LENGTH,BATCH_SIZE)

In [None]:
# Row i of the matrix is the vector wv[i], for the first VOCAB_SIZE ids.
embedding_matrix=np.array([wv[row] for row in range(VOCAB_SIZE)])
print(embedding_matrix.shape)

In [None]:
# Model input: SEQUENCE_LENGTH values per sample.
inputs=tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,))

# Frozen embedding layer whose weights are initialised from embedding_matrix.
pretrained_init=tf.keras.initializers.Constant(embedding_matrix)
embedding=tf.keras.layers.Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=300,
    embeddings_initializer=pretrained_init,
    trainable=False,
)

In [None]:

# Embedding -> two Conv1D/MaxPooling1D stages -> global max pooling -> one sigmoid unit.
model=tf.keras.models.Sequential()
for layer in (
    inputs,
    embedding,
    tf.keras.layers.Conv1D(256,kernel_size=2,activation='relu'),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Conv1D(128,kernel_size=2,activation='relu'),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(1,activation='sigmoid'),
):
    model.add(layer)
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output of the model.
# `learning_rate` is the supported keyword; the `lr` alias is deprecated.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    metrics=['accuracy'])

In [None]:
# Where checkpoints and TensorBoard logs are written.
checkpoint_filepath='...'
log_dir='...'

# Weights-only checkpoint, written only when the monitored loss reaches a new minimum.
callback=tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# TensorBoard logging into log_dir, with histograms computed every epoch.
t_callback=tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [None]:
# Train on train_gen for EPOCH epochs with the checkpoint and TensorBoard callbacks.
training_callbacks=[callback,t_callback]
history=model.fit(
    train_gen,
    epochs=EPOCH,
    callbacks=training_callbacks,
    verbose=1,
)

In [None]:
# Export the vocabulary labels and the embedding weights into log_dir so the
# TensorBoard Embedding Projector can display them.
os.makedirs(log_dir,exist_ok=True)

# One label line per embedding row, in row order. The file name has to match
# the metadata_path set on the projector config below.
with open(os.path.join(log_dir,'metadata.tsv'),"w",encoding="utf-8") as f:
    for i in range(VOCAB_SIZE):
        f.write("{} {}\n".format(i,wv.index_to_key[i]))

# model.layers[0] is expected to be the embedding layer of the Sequential
# model; its first weight array is the embedding matrix.
embedding_weights=tf.Variable(model.layers[0].get_weights()[0])
checkpoint=tf.train.Checkpoint(embedding=embedding_weights)
checkpoint.save(os.path.join(log_dir,"embedding.ckpt"))

config=projector.ProjectorConfig()
# Named embedding_config so the Keras layer bound to `embedding` elsewhere in
# the notebook is not overwritten when this cell runs.
embedding_config=config.embeddings.add()
# Key under which tf.train.Checkpoint stores the `embedding` attribute saved above.
embedding_config.tensor_name="embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding_config.metadata_path='metadata.tsv'
projector.visualize_embeddings(log_dir,config)

In [None]:
%tensorboard --log_dir logs/imdb/fit/