In [1]:
# Importing needed libraries
!pip install tensorflow
!pip install scikit-learn
!pip install pandas
!pip install numpy
!pip install pickle

import tensorflow as tf

from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D, Dense, Concatenate, Lambda
from tensorflow.keras.models import Model, Sequential
import tensorflow.keras.backend as K

from sklearn.metrics import accuracy_score, f1_score

import pandas as pd
import re
import numpy as np



In [2]:
# Function for cleaning text
def clean_text(text):
    """Normalize a raw sentence before it is vectorized.

    Steps, in order: cast to ``str``, trim and lower-case; remove whitespace
    in front of ``? . ! , "`` and leave exactly one space after each of them;
    drop a double quote at the very start and/or end of the text; expand a few
    informal contractions (``d'you``, ``'cause``, ``i'm``, ``ain't``).

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.

    Returns:
        The cleaned, lower-cased string without leading/trailing whitespace.
    """
    text = str(text).strip().lower()

    # Standardize spacing around punctuation while keeping the punctuation.
    text = re.sub(r'\s+([?.!,"])', r'\1', text)  # no whitespace before a mark
    # Consume whitespace that already follows the mark so the result carries
    # exactly one space instead of one extra space on top of the existing ones.
    text = re.sub(r'([?.!,"])\s*', r'\1 ', text)
    # The substitution above leaves a space after a sentence-final mark.
    text = text.strip()

    # Remove a quote wrapping the sentence. This runs after the strip so that a
    # closing quote really is the last character; the space next to each
    # removed quote goes with it.
    text = re.sub(r'^"\s*|\s*"$', '', text)

    # Expand common contractions.
    text = re.sub(r"\bd'you\b", "do you", text)
    # No leading \b here: there is no word boundary between a space (or the
    # start of the string) and an apostrophe, so r"\b'cause" never matched a
    # word-initial 'cause.
    text = re.sub(r"'cause\b", "because", text)
    text = re.sub(r"\bi'm\b", "i am", text)
    text = re.sub(r"\bain't\b", "is not", text)

    return text

In [3]:
# Restore the pickled vectorizer object from disk
import pickle

with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [4]:
# Define model structure
# Hyper-parameters used by the layers built in this cell
# Length of each input sequence (see the Input shapes below).
# NOTE(review): assumes the pickled vectorizer emits sequences of this length - TODO confirm
max_seq_len = 25
# Output dimension of the Embedding layer
embedding_size = 100
# Units of each LSTM that is wrapped in a Bidirectional layer
lstm_units = 300
# Width of the hidden Dense layer in front of the output unit
hidden_layer_size = 512

# Keras inputs: one sequence of max_seq_len entries per sentence
premise_input = Input(shape=(max_seq_len,))
hypothesis_input = Input(shape=(max_seq_len,))


# Trainable Embedding layer sized to the vectorizer's vocabulary.
# NOTE(review): the original comment called this "based on GLoVe", but no
# pretrained weights are loaded in this cell - confirm where they come from.
embedding_layer = Embedding(input_dim=len(vectorizer.get_vocabulary()),
                            output_dim=embedding_size,
                            trainable=True)

# Defining function that will be encoding sentences using Embedding layer above and two BiLSTMs
@tf.keras.utils.register_keras_serializable(package="Custom", name="encode_sentence")
def encode_sentence(input_text):
    """Encode a batch of input sequences into one fixed-size vector each.

    The input goes through the module-level ``embedding_layer`` and then two
    stacked bidirectional LSTMs that return per-step outputs; those outputs
    are reduced with global max pooling.

    Note: the Bidirectional/LSTM layers are created inside this function, so
    every call builds its own pair of recurrent layers; only
    ``embedding_layer`` is reused between calls.
    """
    encoded = embedding_layer(input_text)
    # Two BiLSTM layers applied one after the other
    for _ in range(2):
        recurrent = Bidirectional(LSTM(lstm_units, return_sequences=True))
        encoded = recurrent(encoded)
    return GlobalMaxPooling1D()(encoded)

@tf.keras.utils.register_keras_serializable(package="Custom", name="combine_vectors")
def combine_vectors(inputs):
    """Build the pair representation from two sentence encodings.

    ``inputs`` must unpack into exactly two tensors (premise, hypothesis).
    The result concatenates along axis 1: both tensors, their absolute
    element-wise difference and their element-wise product.
    """
    premise_vec, hypothesis_vec = inputs
    abs_difference = tf.abs(premise_vec - hypothesis_vec)
    product = premise_vec * hypothesis_vec
    return tf.concat([premise_vec, hypothesis_vec, abs_difference, product], axis=1)

# Encode both sentences; each encode_sentence call adds its own BiLSTM stack
premise_encoded = encode_sentence(premise_input)
hypothesis_encoded = encode_sentence(hypothesis_input)

# combine_vectors concatenates four tensors (premise, hypothesis, |difference|,
# product). Each has width 2 * lstm_units because Bidirectional concatenates the
# forward and backward LSTM outputs by default, so the joined width is derived
# from lstm_units instead of being hard-coded (2400 for lstm_units = 300).
joined_vector = Lambda(combine_vectors,
                       output_shape=(4 * 2 * lstm_units,))([premise_encoded, hypothesis_encoded])

# Classifier head: one hidden ReLU layer followed by a single sigmoid unit
hidden_layer = Dense(hidden_layer_size, activation='relu')(joined_vector)
output = Dense(1, activation='sigmoid')(hidden_layer)
model = Model(inputs=[premise_input, hypothesis_input], outputs=output)

In [7]:
# Function to produce a csv file with predictions from a csv file with testing data
# Load testing dataset
def get_predictions(file_path, model_path="bilstm_model.keras",
                    output_path="predictions.csv", threshold=None):
    """Predict a binary label for every premise/hypothesis pair in a CSV file.

    Reads the CSV, cleans both text columns with ``clean_text``, vectorizes
    them with the module-level ``vectorizer``, scores the pairs with the saved
    Keras model and writes a one-column CSV named ``prediction``.

    Args:
        file_path: CSV file with exactly two columns, premise then hypothesis
            (the columns are renamed by position).
        model_path: Saved Keras model to load.
        output_path: Where the predictions CSV is written.
        threshold: Score above which a pair is labelled 1. ``None`` (default)
            keeps the original behavior of using the mean predicted score of
            the current file as the cut-off.

    Returns:
        None. The predictions are written to ``output_path``.
    """
    df = pd.read_csv(file_path, quotechar='"', delimiter=",", encoding="utf-8")
    df.columns = ["premise", "hypothesis"]  # positional rename; needs exactly two columns

    # Keep every input row so that row i of the output is the prediction for
    # row i of the input. Dropping rows with missing text (the previous
    # behavior) silently shortened and misaligned the predictions file.
    df = df.fillna("")

    # Apply the same text cleaning to both columns
    df["premise"] = df["premise"].apply(clean_text)
    df["hypothesis"] = df["hypothesis"].apply(clean_text)

    # Load the saved model
    loaded_model = tf.keras.models.load_model(model_path)

    # Vectorize the cleaned text
    X_premise = vectorizer(df["premise"].values)
    X_hypothesis = vectorizer(df["hypothesis"].values)

    # Score the pairs and binarize the scores
    scores = loaded_model.predict([X_premise, X_hypothesis])
    # NOTE(review): with the mean as cut-off the labels depend on the other
    # rows in the file, and a single-row file always yields 0 because a value
    # is never greater than its own mean. Pass threshold=0.5 (or another fixed
    # value) for a file-independent decision - confirm which one is intended.
    cutoff = np.mean(scores) if threshold is None else threshold
    labels = (scores > cutoff).astype(int).flatten()

    print("Predictions successfully produced")

    # Save the predictions into a csv file
    pd.DataFrame({'prediction': labels}).to_csv(output_path, index=False)

In [8]:
# Score the pairs in test.csv; get_predictions writes the labels to predictions.csv
get_predictions("test.csv")

[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 323ms/step
Predictions successfully produced
