# Importing the required libraries
### We import the libraries needed to create word embeddings

In [2]:
import io
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization



# Defining DataSet 
The dataset is stored as one sentence per file.
Set the path below so that it points to your copy of the dataset.

In [3]:
# Path to the dataset folder, relative to this notebook.
dataset = '../../abp_news_hindi_data'

# Re-derive the dataset directory from its parent folder and list its entries.
dataset_dir = os.path.join(
    os.path.dirname(dataset),
    'abp_news_hindi_data',
)
os.listdir(dataset_dir)

['train']

# Defining The training dataset 
### Provide the path to the dataset and take a look at the `train` directory; its `pos` folder contains the data files

In [4]:
# Path of the training directory; list its entries to see which sub-folders it holds.
train_dir = '../../abp_news_hindi_data/train/'
os.listdir(train_dir)

['pos']

### Use the train directory to create both train and validation datasets with a split of 20% for validation

In [6]:
batch_size = 8  # Number of samples processed before the model is updated.
seed = 123  # Shared by both calls below so they shuffle and split identically.

# Both subsets are read from `train_dir` (defined in the previous cell) instead
# of repeating the path literal: 80% of the files go to training and the
# remaining 20% to validation.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

Found 140690 files belonging to 1 classes.
Using 112552 files for training.
Found 140690 files belonging to 1 classes.
Using 28138 files for validation.


.cache() keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training our model. If our dataset is too large to fit into memory, we can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.

.prefetch() overlaps data preprocessing and model execution while training.

In [5]:
# Sentinel that lets tf.data pick the prefetch buffer size itself.
AUTOTUNE = tf.data.AUTOTUNE

# Cache each dataset, then prefetch upcoming batches.
# NOTE: this cell rebinds train_ds/val_ds to wrapped versions of themselves,
# so re-running it without re-creating the datasets wraps them again.
train_ds = train_ds.cache()
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache()
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)

The Embedding layer can be understood as a lookup table that maps from integer indices (which stand for specific words) to dense vectors (their embeddings). The dimensionality (or width) of the embedding is a parameter we can experiment with to see what works well for our problem.

In [6]:
# Demo lookup table: 10000 word indices, each mapped to a 20-dimensional vector.
embedding_layer = Embedding(input_dim=10000, output_dim=20)

When we create an Embedding layer, the weights for the embedding are randomly initialized (just like any other layer). During training, they are gradually adjusted via backpropagation. Once trained, the learned word embeddings will roughly encode similarities between words (as they were learned for the specific problem our model is trained on).


In [7]:
# If we pass integers to an embedding layer, the result replaces each integer
# with the corresponding vector from the embedding table.
result = embedding_layer(tf.constant([1, 2, 3]))

result.numpy()

array([[ 0.04836544, -0.00398915, -0.00302079, -0.01337272,  0.04914557,
        -0.00063807,  0.03805279,  0.02288983,  0.02547536,  0.03475584,
        -0.03471668, -0.01824844, -0.03994088, -0.03596375,  0.00344055,
         0.00052395,  0.01369634,  0.04545544,  0.00927707,  0.03058562],
       [ 0.00970787, -0.01242999, -0.02833358, -0.00133804,  0.00713963,
        -0.0256909 ,  0.000185  ,  0.02302809, -0.034005  , -0.02717756,
         0.01178741, -0.01895106, -0.00683886,  0.02950421,  0.0277385 ,
         0.02109872, -0.04545908, -0.03058007,  0.02133251, -0.02800335],
       [-0.03360381,  0.02973726,  0.00599277,  0.0071152 ,  0.03482497,
        -0.03113912,  0.03065679,  0.03601806,  0.03774877,  0.03456653,
        -0.01595533, -0.01671573,  0.03010172, -0.0072564 , -0.04933568,
        -0.01949699,  0.00853875, -0.01646788,  0.01688248,  0.01374885]],
      dtype=float32)

In [8]:
# For a 2-D input (2 rows of 3 indices) the output gains a trailing embedding
# axis; `.shape` shows the resulting dimensions.
result = embedding_layer(tf.constant([[0, 1, 2], [3, 4, 5]]))
result.shape

TensorShape([2, 3, 20])

In [9]:
# Custom standardization: lowercase, strip HTML break tags '<br />', drop punctuation.
def custom_standardization(input_data):
  """Normalize raw text before it is split into tokens.

  Lowercases the input, replaces every HTML break tag '<br />' with a single
  space, and then deletes every character listed in `string.punctuation`.

  Args:
    input_data: The text to clean; passed straight to `tf.strings` ops.

  Returns:
    The result of the final `tf.strings.regex_replace` call, i.e. the
    lowercased text without break tags or ASCII punctuation.
  """
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  # NOTE(review): string.punctuation only contains ASCII characters, so
  # non-ASCII punctuation (e.g. the danda '।') is left in place -- confirm
  # that this is intended for this dataset.
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), '')


# Upper bound on the vocabulary and the fixed number of tokens per example.
vocab_size = 50000
sequence_length = 100

# The text vectorization layer normalizes, splits, and maps strings to
# integers, using the custom standardization function defined in this
# notebook. A fixed output length is set because the samples do not all have
# the same length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

# Build the vocabulary from the training text only: drop the labels first,
# then let the layer adapt to the remaining strings.
text_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(text_ds)

In [10]:
# Width of each learned word vector; this is a parameter we can experiment with.
embedding_dim = 100

model = Sequential([
  # Turns each raw string into a fixed-length sequence of integer word indices.
  vectorize_layer,
  # The Embedding layer takes the integer-encoded vocabulary and looks up the
  # embedding vector for each word-index. These vectors are learned as the
  # model trains. The vectors add a dimension to the output array.
  # The resulting dimensions are: (batch, sequence, embedding).
  Embedding(vocab_size, embedding_dim, name="embedding"),
  # The GlobalAveragePooling1D layer returns a fixed-length output vector for
  # each example by averaging over the sequence dimension. This allows the
  # model to handle input of variable length, in the simplest way possible.
  GlobalAveragePooling1D(),
  Dense(10, activation='relu'),
  # Single output unit with no activation, so the model emits raw logits.
  Dense(1)
])

In [11]:
# We will use TensorBoard to visualize metrics including loss and accuracy.
# The callback is configured with "logs" as its log directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# Compile and train the model using the Adam optimizer and BinaryCrossentropy loss.




In [12]:
# from_logits=True because the final Dense(1) layer has no activation and
# therefore outputs raw logits rather than probabilities.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Train for a single epoch, evaluating on the validation split and sending
# logs to the TensorBoard callback.
model.fit(
    train_ds,
    epochs=1,
    callbacks=[tensorboard_callback],
    validation_data=val_ds,
)



<keras.callbacks.History at 0x7f8e94244190>

In [14]:
# Print the layer-by-layer overview of the model (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 100)          5000000   
                                                                 
 global_average_pooling1d (G  (None, 100)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 10)                1010      
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,001,021
Trainable params: 5,001,021
Non-

In [15]:
# Visualize the model metrics in TensorBoard: load the notebook extension,
# then launch it against the "logs" directory.
#docs_infra: no_execute
%load_ext tensorboard
%tensorboard --logdir logs

# Retrieve the trained word embeddings and save them to disk

In [16]:
# Retrieve the word embeddings learned during training. They are the first
# (index 0) weight array of the layer named 'embedding' in the model.
# The weights matrix is of shape (vocab_size, embedding_dimension).
weights = model.get_layer('embedding').get_weights()[0]
# Vocabulary of the vectorization layer; position i is used below as the
# row index into `weights`.
vocab = vectorize_layer.get_vocabulary()

In [17]:
# Copy the vectors and their words to TSV files so the word embeddings can be
# visualized. Each line of vectors.tsv holds one tab-separated vector, and the
# line with the same number in metadata.tsv holds the matching word.
# The `with` statement guarantees both files are closed even if a write fails.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    # Skip missing tokens and tokens that start with a space or a tab
    # (presumably because leading whitespace would corrupt the TSV rows).
    if word is None or word.startswith((' ', '\t')):
      continue
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")


# These files will then be used in comparer.ipynb