In [1]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors  # Assuming you're using Word2Vec embeddings
import re
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
import pickle
from tensorflow.keras.models import load_model

In [2]:
stops = set(stopwords.words('english'))

In [3]:
def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

### Checking the functionality of the loaded embedding layer

In [4]:
# Load vocabulary and inverse vocabulary from saved files
with open(r'products_weights_vocabs/vocabulary.pkl', 'rb') as f:
    vocabulary = pickle.load(f)

with open(r'products_weights_vocabs/inverse_vocabulary.pkl', 'rb') as f:
    inverse_vocabulary = pickle.load(f)

In [5]:
vocabulary["bad"]

409

In [6]:
len(vocabulary)

58563

In [7]:
Q = ["Why Is It Important To Get Life Insurance?", "What Makes Life Insurance Important?"]

In [8]:
Q_tokenized = [text_to_word_list(question) for question in Q]

encoded_questions = []

# Encode each question based on the vocabulary
for question_tokens in Q_tokenized:
    encoded_question = []
    for token in question_tokens:
        if token in vocabulary:
            encoded_question.append(vocabulary[token])
        elif token in stops:
            continue
        else:
            encoded_question.append(0)  # Handle unknown words with '<unk>' index
    encoded_questions.append(encoded_question)

In [9]:
Q_tokenized

[['why', 'is', 'it', 'important', 'to', 'get', 'life', 'insurance'],
 ['what', 'makes', 'life', 'insurance', 'important']]

In [10]:
print("Encoded Questions:")
for question, encoded in zip(Q, encoded_questions):
    print(f"Question: {question}")
    print(f"Encoded: {encoded}")
    print()

Encoded Questions:
Question: Why Is It Important To Get Life Insurance?
Encoded: [422, 264, 490, 2667]

Question: What Makes Life Insurance Important?
Encoded: [253, 490, 2667, 422]



In [20]:
encoded_question_1 = encoded_questions[0]
encoded_question_2 = encoded_questions[1]

### Recreate model Architecture

In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adadelta

In [13]:
TRAIN_CSV = r"products_weights_vocabs/train.csv"
TEST_CSV = r"products_weights_vocabs/test.csv"
embeddings = np.load('products_weights_vocabs/embeddings.npy')

In [14]:
embeddings.shape

(58564, 300)

In [15]:
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 25
dropout_rate = 0.2

In [16]:
def exponent_neg_manhattan_distance(left, right):
    return tf.math.exp(-tf.math.reduce_sum(tf.math.abs(left - right), axis=1, keepdims=True))

In [26]:
# Create dummy data
max_seq_length = 101  # Adjust based on your training
input_left = Input(shape=(max_seq_length,))
input_right = Input(shape=(max_seq_length,))

left_output = tf.keras.layers.Dense(10)(input_left)
right_output = tf.keras.layers.Dense(10)(input_right)

distance_layer = Lambda(lambda x: exponent_neg_manhattan_distance(x[0], x[1]))([left_output, right_output])

test_model = Model(inputs=[input_left, input_right], outputs=distance_layer)

dummy_data_left = tf.constant([[0] * max_seq_length], dtype=tf.float32)
dummy_data_right = tf.constant([[0] * max_seq_length], dtype=tf.float32)

test_output = test_model.predict([dummy_data_left, dummy_data_right])
print(test_output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[[1.]]


In [31]:
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

embedding_dim = embeddings.shape[1]
vocab_size = len(vocabulary)

embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_seq_length, trainable=False)

encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

shared_lstm = LSTM(n_hidden, dropout=dropout_rate, recurrent_dropout=dropout_rate)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

malstm_distance = Lambda(lambda x: exponent_neg_manhattan_distance(x[0], x[1]))([left_output, right_output])

malstm = Model([left_input, right_input], [malstm_distance])



In [33]:
malstm.load_weights("malstm_weights.weights.h5")

ValueError: A total of 1 objects could not be loaded. Example error message for object <Embedding name=embedding, built=True>:

The shape of the target variable and the shape of the target value in `variable.assign(value)` must match. variable.shape=(58563, 300), Received: value.shape=(49558, 300). Target variable: <KerasVariable shape=(58563, 300), dtype=float32, path=embedding/embeddings>

List of objects that could not be loaded:
[<Embedding name=embedding, built=True>]

In [18]:
import keras
keras.config.enable_unsafe_deserialization() # Custom func dodesnt work without deseralization

In [22]:
model.summary()

In [23]:
print(padded_question_1.shape)
print(padded_question_2.shape)

(1, 101)
(1, 101)


In [None]:
predictions = model.predict([padded_question_1, padded_question_2])
print(predictions)