# Imports

In [None]:
import tensorflow as tf

from tensorflow.keras.layers import Input, Dot, LSTM, Bidirectional, Dense, Embedding, Lambda
from tensorflow.keras import backend, Model

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

# Hyperparameters

In [None]:
max_length = 512  # length every token sequence is padded to; also the model's input length
BATCH_SIZE = 64  # NOTE(review): not referenced by any cell visible in this notebook
EPOCHS=20  # NOTE(review): not referenced below; the training cell passes epochs=100
max_features = 15000  # num_words cap handed to the Tokenizer
embedding_dim = 512  # NOTE(review): reassigned to 300 in the model-definition cell before it is used
LEARNING_RATE = 0.01  # NOTE(review): not referenced below; the optimizer is built with learning_rate=1e-4
DROPOUT = 0.1  # NOTE(review): not referenced below; the model uses dropout=0.3 and Dropout(0.5)

# Data loading and pre-processing

In [None]:
# Data preprocessing functions
def clean_text(text):
  """
  Normalises one raw text before tokenisation.

  The text is lower-cased and every character that is neither alphanumeric
  nor whitespace (punctuation, symbols) is dropped. Stop words are NOT
  removed.

  Args:
    text: The raw value. Any str (including str subclasses such as
      numpy.str_) is cleaned; anything else (None, NaN, numbers) is
      treated as missing.

  Returns:
    The cleaned string, or "" when `text` is not a string.
  """
  # isinstance (rather than an exact type comparison) so that str subclasses
  # are cleaned instead of being silently replaced by an empty string.
  if not isinstance(text, str):
    return ""
  return ''.join(ch for ch in text.lower() if ch.isalnum() or ch.isspace())

def preprocess_data(texts, tokenizer):
  """
  Cleans, tokenizes and pads a collection of texts.

  A new Tokenizer (capped at `max_features` words) is created and fitted on
  `texts` only when no tokenizer is passed in. A tokenizer that IS passed in
  is used as-is and is not fitted again: fitting again rebuilds its
  `word_index`, so the same word could get a different id than in sequences
  encoded earlier, and the vocabulary of unseen (validation/test) text would
  leak into it.

  Args:
    texts: Iterable of raw texts. Each entry goes through clean_text, so
      non-string entries become "".
    tokenizer: An already-fitted Tokenizer to reuse, or None to create one
      and fit it on `texts`.

  Returns:
    A tuple (padded_sequences, tokenizer): the integer sequences post-padded
    to `max_length`, and the tokenizer that produced them (so it can be
    reused to encode unseen text).
  """
  cleaned_texts = [clean_text(text) for text in texts]  # Clean text
  if not tokenizer:
    # Fit exactly once, at creation time; a reused tokenizer keeps its vocabulary.
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(cleaned_texts)
  sequences = tokenizer.texts_to_sequences(cleaned_texts)  # Convert text to sequences
  padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')  # Pad sequences

  return padded_sequences, tokenizer



In [None]:
# Load the three splits; rows with a missing value in any column are dropped.
train_df = pd.read_csv("train.csv").dropna()
val_df = pd.read_csv("dev.csv").dropna()
test_df = pd.read_csv("AV_trial.csv").dropna()

train_texts1, train_texts2, train_labels = train_df["text_1"].tolist(), train_df["text_2"].tolist(), train_df["label"].tolist()  # Load training data (pair of texts and labels)
val_texts1, val_texts2, val_labels = val_df["text_1"].tolist(), val_df["text_2"].tolist(), val_df["label"].tolist()  # Load validation data
test_texts1, test_texts2, test_labels = test_df["text_1"].tolist(), test_df["text_2"].tolist(), test_df["label"].tolist()  # Load test data

# Encode every split in ONE preprocess_data call so the tokenizer is fitted
# exactly once (on all texts, for vocabulary coverage). Fitting the tokenizer
# again between splits renumbers word_index, which would leave the splits
# encoded with different word ids from each other and from the embedding
# matrix that is later built from tokenizer.word_index.
all_texts = train_texts1 + train_texts2 + val_texts1 + val_texts2 + test_texts1 + test_texts2
all_data, tokenizer = preprocess_data(all_texts, None)

# Slice the encoded matrix back into the six text columns, in the same order
# in which they were concatenated above.
split_sizes = [len(train_texts1), len(train_texts2),
               len(val_texts1), len(val_texts2),
               len(test_texts1), len(test_texts2)]
split_ends = np.cumsum(split_sizes)
assert split_ends[-1] == len(all_data), "encoded rows do not match the number of input texts"
(train_data1, train_data2,
 val_data1, val_data2,
 test_data1, test_data2) = np.split(all_data, split_ends[:-1])

# Kept as aliases of the single shared tokenizer for any cell that still refers to them.
val_tokenizer = test_tokenizer = tokenizer

# Word ids start at 1 (0 is the padding value), hence the +1.
vocab_size = len(tokenizer.word_index) + 1



# Model Definition

In [None]:
from gensim.models import KeyedVectors
from keras import regularizers
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import ModelCheckpoint
# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding NumPy alone
# does not make weight initialisation, dropout or batch shuffling repeatable,
# because Keras does not draw those from NumPy's global RNG.
tf.keras.utils.set_random_seed(42)
embedding_dim = 300  # overrides the value from the hyper-parameter cell; must match the Word2Vec vector size

# Defining input shapes
input_shape = (max_length,)

# Loading pre-trained Word2Vec model
word2vec_model = KeyedVectors.load_word2vec_format('/content/GoogleNews-vectors-negative300.bin.gz', binary=True)
word_index= tokenizer.word_index

# Creating an embedding matrix: row i holds the Word2Vec vector of the word
# with id i. Row 0 (padding) and rows of words missing from Word2Vec stay zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    if word in word2vec_model:
        embedding_matrix[i] = word2vec_model[word]

# Frozen embedding layer initialised from the matrix above.
# NOTE(review): mask_zero is not set, so the post-padded zeros are fed to the
# GRU like real tokens - consider masking if short texts underperform.
embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False)

# Defining a shared GRU layer (the same weights encode both texts)
gru_layer = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, dropout=0.3))

# Define input layers for two texts
input_text1 = tf.keras.layers.Input(shape=input_shape, name='input_text1')
input_text2 = tf.keras.layers.Input(shape=input_shape, name='input_text2')

# Applying shared embedding and GRU layers to both inputs
embedded_text1 = embedding_layer(input_text1)
embedded_text2 = embedding_layer(input_text2)
gru_output1 = gru_layer(embedded_text1)
gru_output2 = gru_layer(embedded_text2)

# Concatenating GRU outputs
concatenated_output = tf.keras.layers.Concatenate()([gru_output1, gru_output2])

# Applying Batch Normalization
bn_layer = tf.keras.layers.BatchNormalization()
bn_output = bn_layer(concatenated_output)

# Adding Dense layers for classification with L2 regularization
x = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01))(bn_output)
x = tf.keras.layers.Dropout(0.5)(x)  # Adding dropout layer
output = tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)

# Creating the Siamese GRU model
siamese_model = tf.keras.Model(inputs=[input_text1, input_text2], outputs=output)

# Keep only the weights of the epoch with the best validation accuracy.
model_checkpoint = ModelCheckpoint(filepath='best_model_B.h5', monitor='val_accuracy', save_best_only=True)


# Compiling the model
# NOTE(review): the learning rate is hard-coded here; LEARNING_RATE from the
# hyper-parameter cell is not used.
siamese_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

# Display model summary
siamese_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_text1 (InputLayer)    [(None, 512)]                0         []                            
                                                                                                  
 input_text2 (InputLayer)    [(None, 512)]                0         []                            
                                                                                                  
 embedding_5 (Embedding)     (None, 512, 300)             4226280   ['input_text1[0][0]',         
                                                          0          'input_text2[0][0]']         
                                                                                                  
 bidirectional_9 (Bidirecti  (None, 64)                   64128     ['embedding_5[0][0]',   

# Model Training

In [None]:
# Pair up the two text inputs of each split in the order the model declares
# them (input_text1, input_text2), and turn the label lists into arrays.
train_inputs = [np.array(train_data1), np.array(train_data2)]
train_targets = np.array(train_labels)
val_inputs = [np.array(val_data1), np.array(val_data2)]
val_targets = np.array(val_labels)

# Train for 100 epochs, evaluating on the validation split after each one;
# the checkpoint callback decides when the model file is written.
siamese_model.fit(
    train_inputs,
    train_targets,
    epochs=100,
    validation_data=(val_inputs, val_targets),
    callbacks=[model_checkpoint],
)


Epoch 1/100

  saving_api.save_model(


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x7a3ebdcb9990>

In [None]:
# Validation accuracy noted for each experiment variant (method, accuracy)
table_data = [
    ["Dropout Only", "0.5038"],
    ["Adding Regularization and Batch Normalization", "0.5189"],
    ["Adding Word2Vec", "0.5440"]
]

# Inline style shared by every header and body cell
cell_style = "border: 2px solid #FF0A9D; padding: 10px; color: #1F4DBF;"

# Header row followed by one <tr> per entry of table_data
header_row = "<tr>" + "".join(
    f"<th style='{cell_style}'>{title}</th>" for title in ("Method", "Val Accuracy")
) + "</tr>"
body_rows = "".join(
    "<tr>" + "".join(f"<td style='{cell_style}'>{value}</td>" for value in record) + "</tr>"
    for record in table_data
)
html_table = f"<table style='border-collapse: collapse;'>{header_row}{body_rows}</table>"

# Render the table as the cell's rich output
from IPython.display import HTML
HTML(html_table)


Method,Val Accuracy
Dropout Only,0.5038
Adding Regularization and Batch Normalization,0.5189
Adding Word2Vec,0.544
