In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split

# Function to load data from a space-separated .txt file
def read_space_separated_txt(file_path):
    """Read a UTF-8 text file and split each line into space-separated tokens.

    Every line is stripped of surrounding whitespace and split on the space
    character. Empty tokens, which appear when a line contains consecutive
    spaces or is blank, are discarded so they never reach the vocabulary as
    a bogus "" word. A blank line therefore yields an empty list; it is kept
    (not skipped) so that entry i of the result always corresponds to line i
    of the file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of non-empty tokens found on that line.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate over the file object directly so the raw lines are not all
        # held in memory at once before being tokenized.
        return [
            [token for token in line.strip().split(' ') if token]
            for line in file
        ]

# Input files, one per language (replace with your actual file paths)
file_path_english = 'English_Chapter.txt'
file_path_french = 'French_Chapter (2).txt'

# Load both files; each result holds one list of tokens per line of the file
english_data = read_space_separated_txt(file_path_english)
french_data = read_space_separated_txt(file_path_french)

# Tokenize and pad the input sequences
vocab_size = 10000
max_sequence_length = 20

# The loader returns every line as a list of raw tokens. When the Keras
# Tokenizer is handed a list it only lower-cases the elements and skips its
# punctuation filtering, whereas the plain strings encoded at prediction time
# are filtered. That would map "fox," (training) and "fox" (prediction) to
# different ids. Re-joining the tokens into strings makes both paths go
# through the same preprocessing.
english_texts = [' '.join(tokens) for tokens in english_data]
french_texts = [' '.join(tokens) for tokens in french_data]

# A single tokenizer is fitted on both languages so they share one id space
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(english_texts + french_texts)

english_sequences = tokenizer.texts_to_sequences(english_texts)
french_sequences = tokenizer.texts_to_sequences(french_texts)

# Pad/truncate every sequence to exactly max_sequence_length token ids
english_sequences = tf.keras.preprocessing.sequence.pad_sequences(english_sequences, maxlen=max_sequence_length)
french_sequences = tf.keras.preprocessing.sequence.pad_sequences(french_sequences, maxlen=max_sequence_length)

# Build (English, French) sentence pairs for the Siamese network.
#
# Both inputs of a training sample must be *different* sentences: feeding the
# same array to both branches makes the encoded difference zero for every
# sample, so the output collapses to a constant and nothing can be learned.
#
# NOTE(review): this assumes line i of the English file is the translation of
# line i of the French file (a line-aligned parallel corpus) -- TODO confirm.
num_pairs = min(len(english_sequences), len(french_sequences))
if num_pairs < 2:
    raise ValueError(
        "Need at least two aligned English/French lines to build "
        "similar and dissimilar training pairs."
    )

english_aligned = english_sequences[:num_pairs]
french_aligned = french_sequences[:num_pairs]

# Dissimilar pairs: shift the French side by a non-zero offset so that every
# English sentence is matched with the French sentence of a different line.
french_mismatched = np.roll(french_aligned, shift=num_pairs // 2, axis=0)

# Labels (1 for similar, 0 for dissimilar)
labels = np.concatenate([np.ones(num_pairs), np.zeros(num_pairs)])

# First half: aligned (similar) pairs; second half: shifted (dissimilar) pairs
X1_combined = np.concatenate([english_aligned, english_aligned])
X2_combined = np.concatenate([french_aligned, french_mismatched])

# Split the combined data into training and validation sets
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1_combined, X2_combined, labels, test_size=0.2, random_state=42
)

# Siamese network: both inputs go through the same embedding and the same
# LSTM, so the two branches share all of their weights.
embedding_dim = 50
lstm_units = 50

# One input per side of the pair, each a fixed-length sequence of token ids
input_1 = Input(shape=(max_sequence_length,))
input_2 = Input(shape=(max_sequence_length,))

# Shared embedding applied to both inputs
embedding_layer = Embedding(vocab_size, embedding_dim)
embedded_1 = embedding_layer(input_1)
embedded_2 = embedding_layer(input_2)

# Shared LSTM encoder applied to both embedded sequences
shared_lstm = LSTM(lstm_units)
encoded_1 = shared_lstm(embedded_1)
encoded_2 = shared_lstm(embedded_2)

# Element-wise absolute difference of the two encodings (the per-dimension
# terms of a Manhattan distance; they are not summed here)
distance = Lambda(lambda pair: K.abs(pair[0] - pair[1]))([encoded_1, encoded_2])

# A single sigmoid unit turns the difference vector into a score in (0, 1)
prediction = Dense(1, activation='sigmoid')(distance)

# Assemble and compile the two-input model
siamese_model = Model(inputs=[input_1, input_2], outputs=prediction)
siamese_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit the Siamese model on the training pairs, reporting metrics on the
# validation pairs after every epoch
siamese_model.fit(
    [X1_train, X2_train],
    y_train,
    validation_data=([X1_val, X2_val], y_val),
    epochs=10,
    batch_size=16,
)

# Score one unseen sentence pair with the trained model
# Replace the following with your actual test data
test_text_english = ["The quick brown fox"]
test_text_french = ["Le renard brun rapide"]

# Encode each sentence with the fitted tokenizer and pad it to the length
# the model's inputs expect
test_seq_english = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_text_english),
    maxlen=max_sequence_length,
)
test_seq_french = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(test_text_french),
    maxlen=max_sequence_length,
)

# Show the encoded inputs
print("Test Sequence (English):", test_seq_english)
print("Test Sequence (French):", test_seq_french)

# Run the pair through the model and report the single output score
prediction = siamese_model.predict([test_seq_english, test_seq_french])
print("Predicted similarity:", prediction[0][0])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Sequence (English): [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
Test Sequence (French): [[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 17]]
Predicted similarity: 0.49845064
