In [16]:
# Importing needed libraries
!pip install tensorflow
!pip install scikit-learn
!pip install pandas
!pip install numpy
!pip install pickle

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D, Dense, Concatenate, Lambda
from tensorflow.keras.models import Model, Sequential
import tensorflow.keras.backend as K

from sklearn.metrics import accuracy_score, f1_score

import os
import random
import pandas as pd
import re
import numpy as np

In [17]:
# Setting s seed for recreatability
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [18]:
# Define a function for text cleaning
def clean_text(text):
    """Clean text: normalize spaces, remove misplaced punctuation, fix contractions."""
    text = str(text).strip().lower()

    # Fix spaces around punctuation (keep punctuation but standardize spacing)
    text = re.sub(r'\s+([?.!,"])', r'\1', text)  # Removes spaces before punctuation
    text = re.sub(r'([?.!,"])', r'\1 ', text)  # Ensures one space after punctuation

    # Normalize quotes (remove extra surrounding quotes)
    text = re.sub(r'^"|"$', '', text)

    # Handle common contractions
    text = re.sub(r"\bd'you\b", "do you", text)
    text = re.sub(r"\b'cause\b", "because", text)
    text = re.sub(r"\bi'm\b", "i am", text)
    text = re.sub(r"\bain't\b", "is not", text)

    return text

In [19]:
# WARNING! in order to successfully load the dataset, all csv files have to be in the same directory as ipynb file
# Load training dataset
df_train = pd.read_csv("train.csv", quotechar='"', delimiter=",", encoding="utf-8")
df_train.columns = ["premise", "hypothesis", "label"]  # Ensure correct column names
df_train.dropna(inplace=True)  # Remove missing values

# Apply text cleaning for training dataset
df_train["premise"] = df_train["premise"].apply(clean_text)
df_train["hypothesis"] = df_train["hypothesis"].apply(clean_text)

df_train["label"] = df_train["label"].astype(int)
X_premise_train = df_train['premise'].values
X_hypothesis_train = df_train['hypothesis'].values

In [20]:
# Load validation dataset
df_val = pd.read_csv("dev.csv", quotechar='"', delimiter=",", encoding="utf-8")
df_val.columns = ["premise", "hypothesis", "label"]  # Ensure correct column names
df_val.dropna(inplace=True)  # Remove missing values

# Apply text cleaning for validation dataset
df_val["premise"] = df_val["premise"].apply(clean_text)
df_val["hypothesis"] = df_val["hypothesis"].apply(clean_text)

df_val["label"] = df_val["label"].astype(int)

X_premise_val = df_val['premise'].values
X_hypothesis_val = df_val['hypothesis'].values

In [21]:
# Define constants needed for preprocessing
max_seq_len = 25

# Transform the dataset into the needed format
y_train = df_train['label'].values
y_val = df_val['label'].values

vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=None,
    output_mode='int',
    output_sequence_length=max_seq_len)

# Adapt the vectorizer to the data
combined_data = pd.concat([df_train['premise'], df_train['hypothesis']])
vectorizer.adapt(combined_data)

# Vectorize input data
X_premise_train = vectorizer(X_premise_train)
X_hypothesis_train = vectorizer(X_hypothesis_train)

X_premise_val = vectorizer(X_premise_val)
X_hypothesis_val = vectorizer(X_hypothesis_val)

In [22]:
# Defining model's constants
embedding_size = 100
lstm_units = 300
hidden_layer_size = 512

# Defining Tensorflow Inputs for sentences
premise_input = Input(shape=(max_seq_len,))
hypothesis_input = Input(shape=(max_seq_len,))

# Defining an Embedding layer based on GLoVe
embedding_layer = Embedding(input_dim=len(vectorizer.get_vocabulary()),
                            output_dim=embedding_size,
                            trainable=True)

# Defining function that will be encoding sentences using Embedding layer above and two BiLSTMs
@tf.keras.utils.register_keras_serializable(package="Custom", name="encode_sentence")
def encode_sentence(input_text):
    x = embedding_layer(input_text)
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    return x

@tf.keras.utils.register_keras_serializable(package="Custom", name="combine_vectors")
def combine_vectors(inputs):
    # inputs is a list of two tensors: [premise_encoded, hypothesis_encoded]
    premise, hypothesis = inputs
    return tf.concat([premise, hypothesis, tf.abs(premise - hypothesis), premise * hypothesis], axis=1)

# Encoding both premise and hypothesis
premise_encoded = encode_sentence(premise_input)
hypothesis_encoded = encode_sentence(hypothesis_input)

joined_vector = Lambda(combine_vectors,
                      output_shape=(2400,))([premise_encoded, hypothesis_encoded])

# Final Dense layer
hidden_layer = Dense(hidden_layer_size, activation='relu')(joined_vector)
output = Dense(1, activation='sigmoid')(hidden_layer)
model = Model(inputs=[premise_input, hypothesis_input], outputs=output)

In [23]:
# Build and compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [24]:
model.fit([X_premise_train, X_hypothesis_train], y_train, validation_data=([X_premise_val, X_hypothesis_val], y_val), epochs=2, batch_size=128)

Epoch 1/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 82ms/step - accuracy: 0.5843 - loss: 0.6626 - val_accuracy: 0.6593 - val_loss: 0.6068
Epoch 2/2
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 77ms/step - accuracy: 0.7456 - loss: 0.5118 - val_accuracy: 0.6318 - val_loss: 0.6494


<keras.src.callbacks.history.History at 0x79c805c77950>

In [25]:
# Load testing dataset
df_test = pd.read_csv("NLI_trial.csv", quotechar='"', delimiter=",", encoding="utf-8")
df_test.columns = ["premise", "hypothesis", "label"]  # Ensure correct column names
df_test.dropna(inplace=True)  # Remove missing values

# Apply text cleaning for testing dataset
df_test["premise"] = df_test["premise"].apply(clean_text)
df_test["hypothesis"] = df_test["hypothesis"].apply(clean_text)

df_test["label"] = df_test["label"].astype(int)

In [26]:
X_hypothesis_test = vectorizer(df_test['hypothesis'].values)
X_premise_test = vectorizer(df_test['premise'].values)
y_test = df_test['label'].values

# Produce predicted probabilities
y_pred = model.predict([X_premise_test, X_hypothesis_test])
avg = np.mean(y_pred)
y_pred = (y_pred > avg).astype(int)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 671ms/step
Test Accuracy: 0.8000
Test F1 Score: 0.8000


In [27]:
y_pred = y_pred.flatten()
df_predictions = pd.DataFrame({'prediction': y_pred})

df_predictions.to_csv('predictions.csv', index=False)

In [28]:
# Saving the vectorizer and model
model.save("bilstm_model.keras")

import pickle
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

loaded_model = tf.keras.models.load_model("bilstm_model.keras")

X_premise_loaded = vectorizer(df_test['premise'].values)
X_hypothesis_loaded = vectorizer(df_test['hypothesis'].values)

# Produce predictions
y_pred = loaded_model.predict([X_premise_loaded, X_hypothesis_loaded])
avg = np.mean(y_pred)
y_pred = (y_pred > avg).astype(int)

# Print model's metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1 Score: {f1:.4f}")



[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1s[0m 1s/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 701ms/step
Test Accuracy: 0.7800
Test F1 Score: 0.7660
