BiLSTM + Keras (TensorFlow) for Natural Language Inference (NLI) Classification

1.1 Preprocess the data

In [1]:
from nltk.corpus import stopwords
from nltk import word_tokenize
import torch


#read dataset
import pandas as pd
# Load the training CSV, then unpack each column of interest into a plain
# Python list (one entry per row, in file order).
df = pd.read_csv('training_data/NLI/train.csv')
premise_data, hypothesis_data, label_data = (
    df[column].tolist() for column in ('premise', 'hypothesis', 'label')
)

In [2]:
import re

# Data cleaning
#   1. Drop rows with no usable signal: exact duplicate
#      (premise, hypothesis, label) triples, pairs whose premise equals the
#      hypothesis, and rows where either text is empty or missing.
#   2. Strip every character that is not an ASCII letter, digit or whitespace.
#   3. Remove any remaining token that contains "www" and collapse runs of
#      whitespace into single spaces.


def _is_blank(text):
    """Return True when `text` is anything other than a non-empty string.

    pandas parses empty CSV fields as NaN (a float) by default, so a bare
    `len(text) == 0` test would raise TypeError instead of filtering the row.
    """
    return not isinstance(text, str) or len(text) == 0


def _clean_garbled(text):
    """Delete every character other than ASCII letters, digits and whitespace."""
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


def _strip_www_tokens(text):
    """Remove tokens containing 'www' and normalise whitespace to single spaces."""
    return ' '.join(re.sub(r'\b\w*www\w*\b', '', text).split())


paired_data = list(zip(premise_data, hypothesis_data, label_data))

duplicates = set()  # triples already kept; gives O(1) duplicate detection
unique_paired_data = []
for pair in paired_data:
    # Check for missing/empty text first so the comparisons below only ever
    # see real strings.
    if _is_blank(pair[0]) or _is_blank(pair[1]):
        continue
    if pair[0] == pair[1] or pair in duplicates:
        continue
    duplicates.add(pair)
    unique_paired_data.append(pair)

if not unique_paired_data:
    # zip(*[]) would fail with an opaque "not enough values to unpack" error.
    raise ValueError('No usable (premise, hypothesis, label) rows left after cleaning')

premise_data, hypothesis_data, label_data = zip(*unique_paired_data)

premise_data_clean_garbled = [_clean_garbled(text) for text in premise_data]
hypothesis_data_clean_garbled = [_clean_garbled(text) for text in hypothesis_data]

cleaned_premise_data = [_strip_www_tokens(text) for text in premise_data_clean_garbled]
cleaned_hypothesis_data = [_strip_www_tokens(text) for text in hypothesis_data_clean_garbled]

In [3]:
import nltk
# One-time setup on a fresh machine (kept disabled):
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _lowercase_without_stopwords(tokens):
    """Lower-case every token and keep only those not present in `stop_words`."""
    lowered = (token.lower() for token in tokens)
    return [token for token in lowered if token not in stop_words]


# Tokenise each cleaned sentence, then normalise case and drop stop words.
preprocessed_premise_data = list(map(word_tokenize, cleaned_premise_data))
filtered_premise_data = [
    _lowercase_without_stopwords(tokens) for tokens in preprocessed_premise_data
]

preprocessed_hypothesis_data = list(map(word_tokenize, cleaned_hypothesis_data))
filtered_hypothesis_data = [
    _lowercase_without_stopwords(tokens) for tokens in preprocessed_hypothesis_data
]


In [4]:
from gensim.models import Word2Vec
from tensorflow.keras.layers import Embedding
import numpy as np

# Train a Word2Vec model from scratch on the combined premise + hypothesis
# token lists (this is NOT a pre-trained model). With min_count=5, rare words
# are left out of the vocabulary and fall back to a zero vector below.
model = Word2Vec(filtered_premise_data+filtered_hypothesis_data, vector_size=300, window=5, min_count=5)
word_vectors = model.wv


def _embed_sentences(sentences, keyed_vectors, dim):
    """Map tokenised sentences to lists of per-token embedding vectors.

    Args:
        sentences: iterable of token lists.
        keyed_vectors: trained word vectors supporting `in` and `[]` lookup.
        dim: embedding size, used for the out-of-vocabulary fallback.

    Returns:
        One entry per sentence; each entry is a list holding one `dim`-long
        list of floats per token. Out-of-vocabulary tokens become all zeros.
    """
    embedded = []
    for sentence in sentences:
        embedded.append([
            # A fresh zero list per OOV token, so entries never alias each other.
            keyed_vectors[word].tolist() if word in keyed_vectors else [0.0] * dim
            for word in sentence
        ])
    return embedded


premise_vectors = _embed_sentences(filtered_premise_data, word_vectors, model.vector_size)
hypothesis_vectors = _embed_sentences(filtered_hypothesis_data, word_vectors, model.vector_size)


# # Apply the word vectors to the embedding layer
# vocab_size = len(word_vectors.key_to_index) + 1
# embedding_dim = model.vector_size
# embedding_matrix = np.zeros((vocab_size, embedding_dim))
# for word, i in word_vectors.key_to_index.items():
#     embedding_vector = word_vectors[word]
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [5]:
# Sanity check: a single word embedding should have `vector_size` (300) components.
word_vectors["list"].shape

(300,)

In [6]:
# from keras.initializers import Constant
# # Create the embedding layer
# embedding_layer = Embedding(vocab_size, embedding_dim, embeddings_initializer=Constant(embedding_matrix), trainable=False)

In [7]:
# Longest token count on each side; used later as the padding length.
maxlen1 = np.max(list(map(len, filtered_premise_data)))
maxlen2 = np.max(list(map(len, filtered_hypothesis_data)))

In [105]:
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Subtract, Lambda, Multiply, Concatenate, GlobalMaxPooling1D
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.layers import Layer
import tensorflow.python.keras.backend as K
from tensorflow.keras.regularizers import l2

# Siamese sentence-pair classifier. Each input is a variable-length sequence
# of pre-computed word vectors, i.e. shape (timesteps, model.vector_size).
# NOTE(review): zero-padded timesteps are not masked before the LSTM —
# confirm this is intended.
input1 = Input(shape=(None, model.vector_size))
input2 = Input(shape=(None, model.vector_size))

# One BiLSTM instance encodes both sentences, so its weights are shared.
lstm = Bidirectional(LSTM(64, return_sequences=True))

# Run both inputs through the shared encoder.
lstm_output1 = lstm(input1)
lstm_output2 = lstm(input2)

# Max-pool along the time-step axis to get one fixed-size vector per sentence.
lstm_output1 = GlobalMaxPooling1D()(lstm_output1)
lstm_output2 = GlobalMaxPooling1D()(lstm_output2)

# Interaction features: element-wise product and absolute difference.
multiplied_features = Multiply()([lstm_output1, lstm_output2])

subtracted_features = Subtract()([lstm_output1, lstm_output2])
# Uses the public tf.abs op rather than the private
# tensorflow.python.keras backend alias.
absolute_difference = Lambda(lambda x: tf.abs(x), output_shape=lambda x: x)(subtracted_features)
print(absolute_difference.shape)

final_feature = Concatenate()([lstm_output1, lstm_output2, multiplied_features, absolute_difference])

# Fully connected head: 128 -> 32 -> 1. The 32-unit layer must consume the
# 128-unit layer's output; feeding it `final_feature` again would leave the
# 128-unit layer disconnected from the model.
dense1 = Dense(128, activation='relu')(final_feature)
dense1 = Dense(32, activation='relu')(dense1)
dense2 = Dense(1, activation='sigmoid')(dense1)

# Build the model.
model_NLI = Model(inputs=[input1, input2], outputs=dense2)

# Compile for binary classification.
model_NLI.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model structure.
model_NLI.summary()

(None, 128)


In [98]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def _pad_vectors(sequences, maxlen):
    """Zero-pad a list of per-token vector sequences at the end, up to `maxlen` steps.

    `pad_sequences` accepts the nested Python lists directly, so no
    intermediate object array is needed. Returns a float32 array.
    """
    return pad_sequences(sequences, padding='post', dtype='float32', maxlen=maxlen)


# Hold out the last 20% of the examples for validation.
validation_size = int(len(premise_vectors) * 0.2)
# Slice with an explicit boundary: negative slicing breaks when
# validation_size == 0 ([-0:] is the whole list and [:-0] is empty).
split_index = len(premise_vectors) - validation_size

p_val = premise_vectors[split_index:]
h_val = hypothesis_vectors[split_index:]
label_val = label_data[split_index:]

p_train = premise_vectors[:split_index]
h_train = hypothesis_vectors[:split_index]
label_train = label_data[:split_index]

# Premises are padded to maxlen1 and hypotheses to maxlen2.
padded_p_val = _pad_vectors(p_val, maxlen1)
padded_h_val = _pad_vectors(h_val, maxlen2)
padded_label_val = np.array(label_val)

padded_p_train = _pad_vectors(p_train, maxlen1)
padded_h_train = _pad_vectors(h_train, maxlen2)
padded_label_train = np.array(label_train)


In [93]:
# padded_label_train = padded_label_train[:, np.newaxis]

In [95]:
# padded_label_val = padded_label_val[:, np.newaxis]

In [99]:
# Train for 40 epochs (batch size 64), scoring the held-out split after each epoch.
model_NLI.fit(
    x=[padded_p_train, padded_h_train],
    y=padded_label_train,
    batch_size=64,
    epochs=40,
    validation_data=([padded_p_val, padded_h_val], padded_label_val),
)

Epoch 1/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 66ms/step - accuracy: 0.5481 - loss: 0.6836 - val_accuracy: 0.5758 - val_loss: 0.6705
Epoch 2/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 59ms/step - accuracy: 0.5895 - loss: 0.6619 - val_accuracy: 0.5915 - val_loss: 0.6604
Epoch 3/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 60ms/step - accuracy: 0.6011 - loss: 0.6538 - val_accuracy: 0.5975 - val_loss: 0.6530
Epoch 4/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 60ms/step - accuracy: 0.6074 - loss: 0.6446 - val_accuracy: 0.6075 - val_loss: 0.6499
Epoch 5/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 61ms/step - accuracy: 0.6190 - loss: 0.6371 - val_accuracy: 0.6165 - val_loss: 0.6440
Epoch 6/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 60ms/step - accuracy: 0.6227 - loss: 0.6310 - val_accuracy: 0.6054 - val_loss: 0.6477
Epoch 7/40
[1m3

<keras.src.callbacks.history.History at 0x24ebb34dfd0>

In [None]:
# Same fit call as the previous training cell.
# NOTE(review): Keras `fit` appears to resume from the model's current weights,
# so running this after an earlier fit extends training rather than restarting
# it — confirm this is intended.
training_pairs = [padded_p_train, padded_h_train]
validation_set = ([padded_p_val, padded_h_val], padded_label_val)
model_NLI.fit(
    training_pairs,
    padded_label_train,
    batch_size=64,
    epochs=40,
    validation_data=validation_set,
)

Epoch 1/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 183ms/step - accuracy: 0.6643 - loss: 0.6016 - val_accuracy: 0.6222 - val_loss: 0.6488
Epoch 2/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 156ms/step - accuracy: 0.6757 - loss: 0.5898 - val_accuracy: 0.6155 - val_loss: 0.6562
Epoch 3/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 161ms/step - accuracy: 0.6751 - loss: 0.5896 - val_accuracy: 0.6135 - val_loss: 0.6520
Epoch 4/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 160ms/step - accuracy: 0.6882 - loss: 0.5755 - val_accuracy: 0.6159 - val_loss: 0.6517
Epoch 5/40
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 161ms/step - accuracy: 0.6929 - loss: 0.5764 - val_accuracy: 0.6257 - val_loss: 0.6483
Epoch 6/40
[1m316/337[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m3s[0m 144ms/step - accuracy: 0.6972 - loss: 0.5723