MODEL CREATION

In [5]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda
from tensorflow.keras.models import Model
import numpy as np


# Function to create Siamese model
def create_siamese_model(vocab_size, embedding_dim, lstm_units, mask_zero=False):
    """Build a Siamese LSTM network that scores the similarity of two sequences.

    Both inputs are passed through the same Embedding -> LSTM encoder, so the
    two branches share all of their weights. The two sentence encodings are
    compared with cosine similarity, and a single sigmoid unit maps that
    similarity to a score in (0, 1).

    Args:
        vocab_size: Number of rows in the embedding table (``input_dim`` of
            the ``Embedding`` layer); every token index must be smaller.
        embedding_dim: Width of each token embedding.
        lstm_units: Number of LSTM units, i.e. the width of a sentence encoding.
        mask_zero: When True, the embedding layer masks index 0 so the LSTM
            skips those timesteps. Only enable this if index 0 is used purely
            for padding. Defaults to False, which keeps the previous behaviour
            (every timestep is consumed).

    Returns:
        An uncompiled ``Model`` that takes ``[sentence1, sentence2]`` (two
        batches of index sequences) and outputs one sigmoid score per pair.
    """
    # Encoder shared by both branches: token indices -> embeddings -> last LSTM output.
    encoder_input = Input(shape=(None,))
    token_embeddings = Embedding(vocab_size, embedding_dim, mask_zero=mask_zero)(encoder_input)
    sentence_encoding = LSTM(lstm_units)(token_embeddings)
    shared_lstm_model = Model(inputs=encoder_input, outputs=sentence_encoding)

    # Two input sentences, each of arbitrary length.
    input_sentence1 = Input(shape=(None,))
    input_sentence2 = Input(shape=(None,))

    # Calling the same sub-model twice is what ties the weights together.
    embedded_sentence1 = shared_lstm_model(input_sentence1)
    embedded_sentence2 = shared_lstm_model(input_sentence2)

    # Cosine similarity of the two encodings. The built-in Dot layer with
    # normalize=True L2-normalises along the dot axis first, which replaces the
    # former Lambda wrapper around a nested Python function (Lambda layers that
    # close over local functions are awkward to save and reload).
    similarity = tf.keras.layers.Dot(axes=-1, normalize=True)(
        [embedded_sentence1, embedded_sentence2]
    )

    # Final output layer: learn a scale/offset on the similarity and squash it.
    output_layer = Dense(1, activation='sigmoid')(similarity)

    return Model(inputs=[input_sentence1, input_sentence2], outputs=output_layer)



DATA PREPROCESSING

In [None]:


# Example data for training: each entry is a (sentence1, sentence2) pair.
sentence_pairs = [
    ("I love coding with Python.", "Python programming is my favorite."),
    ("Machine learning is fascinating.", "I enjoy studying ML algorithms."),
    ("The sky is blue.", "Grass is green."),
    ("Reading books is relaxing.", "I like to read novels."),
]

# Build the vocabulary from BOTH sentences of every pair. Tokens are produced
# by plain whitespace splitting, so punctuation stays attached ("Python." and
# "Python" are different tokens). Collecting only the second sentence would
# make the index lookup fail on words that appear only in the first one.
vocab = set()
for sent1, sent2 in sentence_pairs:
    vocab.update(sent1.split())
    vocab.update(sent2.split())

# Word indices start at 1 so that 0 stays free for the padding value used when
# sequences are padded. Sorting makes the mapping identical on every run
# (set iteration order is not stable across interpreter sessions).
word_to_index = {word: idx for idx, word in enumerate(sorted(vocab), start=1)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Longest sentence (in tokens) across both sides of every pair.
max_sequence_length = max(len(sent.split()) for pair in sentence_pairs for sent in pair)

# Convert sentences to sequences of word indices
def convert_to_sequences(sentence_pairs, word_to_index, max_sequence_length):
    """Encode sentence pairs as two fixed-width integer arrays.

    Each sentence is split on whitespace, every token is looked up in
    ``word_to_index``, and the resulting index list is right-padded with 0
    (or truncated) to exactly ``max_sequence_length`` entries.

    Args:
        sentence_pairs: Iterable of ``(sentence1, sentence2)`` string pairs.
        word_to_index: Mapping from token to integer index.
        max_sequence_length: Width of every output row; must be >= 0.

    Returns:
        A tuple ``(sentence1_sequences, sentence2_sequences)`` of integer
        NumPy arrays, each of shape ``(number_of_pairs, max_sequence_length)``.

    Raises:
        KeyError: If a token is missing from ``word_to_index``.
        ValueError: If ``max_sequence_length`` is negative.
    """
    if max_sequence_length < 0:
        raise ValueError("max_sequence_length must be non-negative")

    def encode(sentence):
        # Look up every token first so unknown words always raise KeyError,
        # even when they sit beyond the truncation point.
        indices = [word_to_index[word] for word in sentence.split()]
        # Truncate over-long sentences; without this the rows would be ragged.
        indices = indices[:max_sequence_length]
        return indices + [0] * (max_sequence_length - len(indices))

    sentence1_sequences = []
    sentence2_sequences = []
    for sent1, sent2 in sentence_pairs:
        sentence1_sequences.append(encode(sent1))
        sentence2_sequences.append(encode(sent2))

    # The explicit dtype and reshape keep the result 2-D and integer-typed even
    # when there are no pairs at all.
    shape = (len(sentence1_sequences), max_sequence_length)
    return (
        np.array(sentence1_sequences, dtype=int).reshape(shape),
        np.array(sentence2_sequences, dtype=int).reshape(shape),
    )

# Encode the first and second sentence of every pair as index arrays.
sentence1_sequences, sentence2_sequences = convert_to_sequences(
    sentence_pairs=sentence_pairs,
    word_to_index=word_to_index,
    max_sequence_length=max_sequence_length,
)

MODEL TRAINING

In [None]:
# Model hyperparameters.
# The embedding table gets one more row than there are vocabulary entries
# (presumably to leave room for the padding index — confirm against the
# index mapping built during preprocessing).
vocab_size = len(vocab) + 1
embedding_dim = 50
lstm_units = 64

# Build the Siamese network and set it up for binary classification.
siamese_model = create_siamese_model(vocab_size, embedding_dim, lstm_units)
siamese_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the example pairs; the target array holds one 0/1 value per pair.
# NOTE(review): the fourth pair is labelled 0 although both of its sentences
# are about reading — confirm the labels are as intended.
siamese_model.fit(
    x=[sentence1_sequences, sentence2_sequences],
    y=np.array([1, 1, 0, 0]),
    epochs=10,
    batch_size=2,
)

# At this point the Siamese model has been fitted and can score new sentence pairs.
