LSTM + Pytorch For User Stories Classification

1.1 Preprocess the data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### The first part is training part

In [2]:
# load dataset
import pandas as pd
df = pd.read_csv('train.csv')
premise_data = df['premise'].tolist()
hypothesis_data = df['hypothesis'].tolist()
label_data = df['label'].tolist()

In [3]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [4]:
import re

# data clean
paired_data = list(zip(premise_data, hypothesis_data, label_data))

duplicates = set()
unique_paired_data = []
for pair in paired_data:
    if (pair in duplicates) or (pair[0] == pair[1]) or len(pair[0]) == 0 or len(pair[1]) == 0:
        continue
    else:
        duplicates.add(pair)
        unique_paired_data.append(pair)

premise_data, hypothesis_data, label_data = zip(*unique_paired_data)

premise_data_clean_garbled = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in premise_data]
hypothesis_data_clean_garbled = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in hypothesis_data]

cleaned_premise_data = [' '.join(re.sub(r'\b\w*www\w*\b', '', text).split()) for text in premise_data_clean_garbled]
cleaned_hypothesis_data = [' '.join(re.sub(r'\b\w*www\w*\b', '', text).split()) for text in hypothesis_data_clean_garbled]

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# preprocessing
preprocessed_premise_data = [word_tokenize(text) for text in cleaned_premise_data]
filtered_premise_data = [[word.lower() for word in premise if word.lower() not in stop_words] for premise in preprocessed_premise_data]

preprocessed_hypothesis_data = [word_tokenize(text) for text in cleaned_hypothesis_data]
filtered_hypothesis_data = [[word.lower() for word in hypothesis if word.lower() not in stop_words] for hypothesis in preprocessed_hypothesis_data]

lemmatized_premise_data = [[lemmatizer.lemmatize(word) for word in premise] for premise in filtered_premise_data]
lemmatized_hypothesis_data = [[lemmatizer.lemmatize(word) for word in hypothesis] for hypothesis in filtered_hypothesis_data]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
all_data = filtered_premise_data + filtered_hypothesis_data
# all_data = lemmatized_premise_data + lemmatized_hypothesis_data

vocab_set = set(word for text in all_data for word in text)

vocab = list(vocab_set)

In [7]:
from gensim.models import Word2Vec
from tensorflow.keras.layers import Embedding
import numpy as np

# load Word2Vec
model = Word2Vec(filtered_premise_data+filtered_hypothesis_data, vector_size=200, window=5, min_count=5)
# model = Word2Vec(lemmatized_premise_data+lemmatized_hypothesis_data, vector_size=64, window=5, min_count=5)

from gensim.models import KeyedVectors

word_vectors = model.wv

vocab_size = len(vocab)
vector_size = word_vectors.vector_size

embedding_matrix = np.zeros((vocab_size, vector_size))

for i, word in enumerate(vocab):
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

In [8]:
# load Word2Vec word vectors
premise_vectors = []
for sentence in filtered_premise_data:
# for sentence in lemmatized_premise_data:
    sentence_vectors = []
    for word in sentence:
        try:
            sentence_vectors.append(word_vectors.key_to_index[word])
        except KeyError:
            sentence_vectors.append(vector_size)
    premise_vectors.append(sentence_vectors)

hypothesis_vectors = []
for sentence in filtered_hypothesis_data:
# for sentence in lemmatized_hypothesis_data:
    sentence_vectors = []
    for word in sentence:
        try:
            sentence_vectors.append(word_vectors.key_to_index[word])
        except KeyError:
            sentence_vectors.append(vector_size)
    hypothesis_vectors.append(sentence_vectors)

label_data_list = list(label_data)

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# divide data into train set and validation set
p_train, p_val, h_train, h_val, label_train, label_val = train_test_split(
    premise_vectors, hypothesis_vectors, label_data_list, test_size=0.2, random_state=42)


In [10]:
# find max length for padding
maxlen1 = int(np.max([len(text) for text in premise_vectors]))
maxlen2 = int(np.max([len(text) for text in hypothesis_vectors]))
maxlen1 = np.max([maxlen1, maxlen2])
maxlen2 = maxlen1


In [11]:
# padding
irregular_array = np.array(p_val, dtype=object)
padded_p_val = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen1)
irregular_array = np.array(h_val, dtype=object)
padded_h_val = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen2)
padded_label_val = np.array(label_val)


irregular_array = np.array(p_train, dtype=object)
padded_p_train = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen1)
irregular_array = np.array(h_train, dtype=object)
padded_h_train = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen2)
padded_label_train = np.array(label_train)

In [12]:
from tensorflow.keras.layers import Input, Embedding, Bidirectional, Flatten, Dot, Dropout, LSTM, Activation, TimeDistributed, Dense, Subtract, Lambda, Multiply, Concatenate, GlobalMaxPooling1D
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.layers import Layer
import tensorflow.python.keras.backend as K
from tensorflow.keras.regularizers import l2

# del model_NLI

embedding_DROPOUT = 0.2
DROPOUT = 0.2
L2 = 0.001
lstm_size = 64

K.clear_session()

# input layer
input1 = Input(shape=(maxlen1, ))
input2 = Input(shape=(maxlen2, ))

# embedding layer
embedding = Embedding(input_dim=vocab_size, output_dim = vector_size, embeddings_regularizer=l2(L2))

lstm_output1 = embedding(input1)
lstm_output1 = Dropout(embedding_DROPOUT)(lstm_output1)
lstm_output2 = embedding(input2)
lstm_output2 = Dropout(embedding_DROPOUT)(lstm_output2)

# BiLSTM
lstm = Bidirectional(LSTM(lstm_size, return_sequences=True, kernel_regularizer=l2(L2)))

lstm_output1 = lstm(lstm_output1)
lstm_output1 = Dropout(DROPOUT)(lstm_output1)
lstm_output2 = lstm(lstm_output2)
lstm_output2 = Dropout(DROPOUT)(lstm_output2)

# attention weights
attention = Dot(axes=-1)([lstm_output1, lstm_output2])
weight_att_1 = Lambda(lambda x: tf.keras.activations.softmax(x, axis=1))(attention)
weight_att_2 = Lambda(lambda x: tf.keras.activations.softmax(x, axis=2))(attention)
aligned_1 = Dot(axes=(1, 1))([weight_att_1, lstm_output2])
aligned_2 = Dot(axes=(2, 1))([weight_att_2, lstm_output1])

# features concatenation
feature_1 = Concatenate()([lstm_output1, aligned_1, Multiply()([lstm_output1, aligned_1]), Lambda(lambda x: tf.abs(x))(Subtract()([lstm_output1, aligned_1]))])
feature_2 = Concatenate()([lstm_output2, aligned_2, Multiply()([lstm_output2, aligned_2]), Lambda(lambda x: tf.abs(x))(Subtract()([lstm_output2, aligned_2]))])

# BiLSTM
lstm_2 = Bidirectional(LSTM(lstm_size*2, return_sequences=True, kernel_regularizer=l2(L2)))

lstm2_output1 = lstm_2(feature_1)
lstm2_output1 = Dropout(DROPOUT)(lstm2_output1)
lstm2_output2 = lstm_2(feature_2)
lstm2_output2 = Dropout(DROPOUT)(lstm2_output2)

# pooling
premise_avg = Lambda(lambda x: tf.reduce_mean(x, axis=1))(lstm2_output1)
hypothesis_avg = Lambda(lambda x: tf.reduce_mean(x, axis=1))(lstm2_output2)
premise_max = Lambda(lambda x: tf.reduce_max(x, axis=1))(lstm2_output1)
hypothesis_max = Lambda(lambda x: tf.reduce_max(x, axis=1))(lstm2_output2)

final_feature = Concatenate()([premise_avg, premise_max, hypothesis_avg, hypothesis_max])

# dense
dense1 = Dense(32, activation='tanh', kernel_regularizer=l2(L2))(final_feature)
dense2 = Dense(1, activation='sigmoid', kernel_regularizer=l2(L2))(dense1)

# output
model_NLI = Model(inputs=[input1, input2], outputs=dense2)

# compile
model_NLI.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# whether use the word vectors of word2vec
# embedding.set_weights([embedding_matrix])
# embedding.set_weights([embedding_matrix])
# print("use weights from word2vec")

model_NLI.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 119)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 119)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 119, 200)             7692400   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 119, 200)             0         ['embedding[0][0]']       

In [13]:
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# train the model
history = model_NLI.fit([padded_p_train, padded_h_train], padded_label_train, batch_size=64, epochs=10, validation_data = ([padded_p_val, padded_h_val], padded_label_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 6: early stopping


In [14]:
model_NLI.save("model.h5")

  saving_api.save_model(


### The second section is the demo part which produces the prediction result

In [15]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df_2 = pd.read_csv('test.csv', na_values="n/a")
premise_data_dev = df_2['premise'].tolist()
hypothesis_data_dev = df_2['hypothesis'].tolist()
# label_data_dev = df_2['label'].tolist()

# data clean
paired_data_dev = list(zip(premise_data_dev, hypothesis_data_dev))

duplicates = set()
unique_paired_data_dev = []
for pair in paired_data_dev:
#     print(pair)
    if (pair in duplicates) or (pair[0] == pair[1]) or pd.isna(pair[0]) or pd.isna(pair[1]):
        continue
    else:
        duplicates.add(pair)
        unique_paired_data_dev.append(pair)

premise_data_dev, hypothesis_data_dev = zip(*unique_paired_data_dev)

premise_data_clean_garbled_dev = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in premise_data_dev]
hypothesis_data_clean_garbled_dev = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in hypothesis_data_dev]

cleaned_premise_data_dev = [' '.join(re.sub(r'\b\w*www\w*\b', '', text).split()) for text in premise_data_clean_garbled_dev]
cleaned_hypothesis_data_dev = [' '.join(re.sub(r'\b\w*www\w*\b', '', text).split()) for text in hypothesis_data_clean_garbled_dev]

preprocessed_premise_data_dev = [word_tokenize(text) for text in cleaned_premise_data_dev]
filtered_premise_data_dev = [[word.lower() for word in premise if word.lower() not in stop_words] for premise in preprocessed_premise_data_dev]

preprocessed_hypothesis_data_dev = [word_tokenize(text) for text in cleaned_hypothesis_data_dev]
filtered_hypothesis_data_dev = [[word.lower() for word in hypothesis if word.lower() not in stop_words] for hypothesis in preprocessed_hypothesis_data_dev]

lemmatized_premise_data_dev = [[lemmatizer.lemmatize(word) for word in premise] for premise in filtered_premise_data_dev]
lemmatized_hypothesis_dat_dev = [[lemmatizer.lemmatize(word) for word in hypothesis] for hypothesis in filtered_hypothesis_data_dev]

premise_vectors_dev = []
for sentence in filtered_premise_data_dev:
    sentence_vectors_dev = []
    for word in sentence:
        try:
            sentence_vectors_dev.append(word_vectors.key_to_index[word])
        except KeyError:
            sentence_vectors_dev.append(vector_size)
    premise_vectors_dev.append(sentence_vectors_dev)

hypothesis_vectors_dev = []
for sentence in filtered_hypothesis_data_dev:
    sentence_vectors_dev = []
    for word in sentence:
        try:
            sentence_vectors_dev.append(word_vectors.key_to_index[word])
        except KeyError:
            sentence_vectors_dev.append(vector_size)
    hypothesis_vectors_dev.append(sentence_vectors_dev)

irregular_array = np.array(premise_vectors_dev, dtype=object)
padded_p_test = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen1)
irregular_array = np.array(hypothesis_vectors_dev, dtype=object)
padded_h_test = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen2)
# padded_label_test = np.array(label_data_dev)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
from tensorflow.keras.models import load_model

model_NLI = load_model('model.h5')
predictions_test = model_NLI.predict([padded_p_test, padded_h_test])
binary_predictions = np.where(predictions_test >= 0.5, 1, 0)
df = pd.DataFrame(binary_predictions, columns=['prediction'])

df.to_csv('Group_36_B.csv', index=False)



### The next part is developemtn part for evaluation according to accuracy

In [17]:
from sklearn.metrics import f1_score
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
df_2 = pd.read_csv('dev.csv', na_values="n/a")
premise_data_dev = df_2['premise'].tolist()
hypothesis_data_dev = df_2['hypothesis'].tolist()
label_data_dev = df_2['label'].tolist()

# data clean
paired_data_dev = list(zip(premise_data_dev, hypothesis_data_dev, label_data_dev))

duplicates = set()
unique_paired_data_dev = []
for pair in paired_data_dev:
#     print(pair)
    if (pair in duplicates) or (pair[0] == pair[1]) or pd.isna(pair[0]) or pd.isna(pair[1]):
        continue
    else:
        duplicates.add(pair)
        unique_paired_data_dev.append(pair)

premise_data_dev, hypothesis_data_dev, label_data_dev = zip(*unique_paired_data_dev)

premise_data_clean_garbled_dev = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in premise_data_dev]
hypothesis_data_clean_garbled_dev = [re.sub(r'[^a-zA-Z0-9\s]', '', text) for text in hypothesis_data_dev]

cleaned_premise_data_dev = [' '.join(re.sub(r'\b\w*www\w*\b', '', text).split()) for text in premise_data_clean_garbled_dev]
cleaned_hypothesis_data_dev = [' '.join(re.sub(r'\b\w*www\w*\b', '', text).split()) for text in hypothesis_data_clean_garbled_dev]

preprocessed_premise_data_dev = [word_tokenize(text) for text in cleaned_premise_data_dev]
filtered_premise_data_dev = [[word.lower() for word in premise if word.lower() not in stop_words] for premise in preprocessed_premise_data_dev]

preprocessed_hypothesis_data_dev = [word_tokenize(text) for text in cleaned_hypothesis_data_dev]
filtered_hypothesis_data_dev = [[word.lower() for word in hypothesis if word.lower() not in stop_words] for hypothesis in preprocessed_hypothesis_data_dev]

lemmatized_premise_data_dev = [[lemmatizer.lemmatize(word) for word in premise] for premise in filtered_premise_data_dev]
lemmatized_hypothesis_dat_dev = [[lemmatizer.lemmatize(word) for word in hypothesis] for hypothesis in filtered_hypothesis_data_dev]

premise_vectors_dev = []
for sentence in filtered_premise_data_dev:
    sentence_vectors_dev = []
    for word in sentence:
        try:
            sentence_vectors_dev.append(word_vectors.key_to_index[word])
        except KeyError:
            sentence_vectors_dev.append(vector_size)
    premise_vectors_dev.append(sentence_vectors_dev)

hypothesis_vectors_dev = []
for sentence in filtered_hypothesis_data_dev:
    sentence_vectors_dev = []
    for word in sentence:
        try:
            sentence_vectors_dev.append(word_vectors.key_to_index[word])
        except KeyError:
            sentence_vectors_dev.append(vector_size)
    hypothesis_vectors_dev.append(sentence_vectors_dev)

irregular_array = np.array(premise_vectors_dev, dtype=object)
padded_p_test = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen1)
irregular_array = np.array(hypothesis_vectors_dev, dtype=object)
padded_h_test = pad_sequences(irregular_array, padding='post', dtype='float32', maxlen=maxlen2)
padded_label_test = np.array(label_data_dev)


predictions = model_NLI.predict([padded_p_test, padded_h_test])

threshold = 0.5
binary_predictions = np.where(predictions >= threshold, 1, 0)


f1 = f1_score(label_data_dev, binary_predictions, average='weighted')
accuracy = np.mean(binary_predictions.squeeze() == label_data_dev)
print("Accuracy: ", accuracy)
print("f1: ", f1)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy:  0.64651715431457
f1:  0.6461957659088589
