BiLSTM + Keras (TensorFlow) for NLI Premise/Hypothesis Classification

1.1 Preprocess the data

In [1]:
from nltk.corpus import stopwords
from nltk import word_tokenize
import torch


#read dataset
import pandas as pd
# Each column is pulled out as a plain Python list so the cleaning step can
# zip the three of them row by row.
df = pd.read_csv('training_data/NLI/train.csv')
premise_data, hypothesis_data, label_data = (
    df[column].tolist() for column in ('premise', 'hypothesis', 'label')
)

In [2]:
import re

# Data cleaning
#   1. Drop unusable rows: repeated (premise, hypothesis, label) triples, rows
#      whose premise equals the hypothesis, and rows where either sentence is
#      empty or missing.
#   2. Remove every character that is not an ASCII letter, digit or whitespace.
#   3. Remove tokens containing "www" and collapse runs of whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_WWW_TOKEN_RE = re.compile(r'\b\w*www\w*\b')


def _is_blank(text):
    """Return True when `text` is not a non-empty string.

    pandas represents empty CSV cells as float NaN, on which `len()` raises
    TypeError; treating any non-string value as blank lets such rows be
    skipped instead of crashing the cell.
    """
    return not isinstance(text, str) or len(text) == 0


def _strip_symbols(text):
    """Delete every character other than ASCII letters, digits and whitespace."""
    return _NON_ALNUM_RE.sub('', text)


def _drop_www_tokens(text):
    """Remove tokens containing 'www' and normalise whitespace to single spaces."""
    return ' '.join(_WWW_TOKEN_RE.sub('', text).split())


paired_data = list(zip(premise_data, hypothesis_data, label_data))

duplicates = set()  # triples that have already been kept
unique_paired_data = []
for pair in paired_data:
    premise, hypothesis = pair[0], pair[1]
    if _is_blank(premise) or _is_blank(hypothesis):
        continue
    if premise == hypothesis or pair in duplicates:
        continue
    duplicates.add(pair)
    unique_paired_data.append(pair)

# zip(*[]) cannot be unpacked into three names; fail with a clear message instead.
if not unique_paired_data:
    raise ValueError('No usable premise/hypothesis pairs remain after cleaning.')

premise_data, hypothesis_data, label_data = zip(*unique_paired_data)

premise_data_clean_garbled = [_strip_symbols(text) for text in premise_data]
hypothesis_data_clean_garbled = [_strip_symbols(text) for text in hypothesis_data]

cleaned_premise_data = [_drop_www_tokens(text) for text in premise_data_clean_garbled]
cleaned_hypothesis_data = [_drop_www_tokens(text) for text in hypothesis_data_clean_garbled]

In [3]:
import nltk
# nltk.download('stopwords')
# Held in a set so the per-token membership test below stays cheap.
stop_words = set(stopwords.words('english'))


def _lowercase_without_stop_words(tokens):
    """Lower-case every token and keep only those that are not stop words."""
    lowered = (token.lower() for token in tokens)
    return [token for token in lowered if token not in stop_words]


# Tokenise each cleaned sentence, then lower-case and filter its tokens.
preprocessed_premise_data = list(map(word_tokenize, cleaned_premise_data))
filtered_premise_data = list(map(_lowercase_without_stop_words, preprocessed_premise_data))

preprocessed_hypothesis_data = list(map(word_tokenize, cleaned_hypothesis_data))
filtered_hypothesis_data = list(map(_lowercase_without_stop_words, preprocessed_hypothesis_data))


In [70]:
from gensim.models import Word2Vec
from tensorflow.keras.layers import Embedding
import numpy as np

# Train a Word2Vec model on the tokenised premises and hypotheses. Nothing is
# loaded from disk here: the embeddings are learned from this corpus. Words
# occurring fewer than `min_count` times are left out of the vocabulary.
model = Word2Vec(filtered_premise_data+filtered_hypothesis_data, vector_size=300, window=5, min_count=5)
word_vectors = model.wv


def _embed_sentences(sentences, keyed_vectors):
    """Replace every token of every sentence with its embedding vector.

    Args:
        sentences: iterable of token lists.
        keyed_vectors: the trained word vectors (here `model.wv`).

    Returns:
        A list with one entry per sentence. Each entry is a list holding one
        1-D vector of length `keyed_vectors.vector_size` per token. Tokens
        missing from the vocabulary map to an all-zero vector.
    """
    # float32 is assumed to match the dtype gensim uses for its vectors
    # (confirm against the gensim docs); it keeps all vectors one dtype
    # instead of mixing in float64 zeros.
    # The same zero array is reused for every unknown token, so it must not
    # be modified in place downstream.
    oov_vector = np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return [
        [keyed_vectors[word] if word in keyed_vectors else oov_vector for word in sentence]
        for sentence in sentences
    ]


premise_vectors = _embed_sentences(filtered_premise_data, word_vectors)
hypothesis_vectors = _embed_sentences(filtered_hypothesis_data, word_vectors)


# # Apply the word vectors to an embedding layer
# vocab_size = len(word_vectors.key_to_index) + 1
# embedding_dim = model.vector_size
# embedding_matrix = np.zeros((vocab_size, embedding_dim))
# for word, i in word_vectors.key_to_index.items():
#     embedding_vector = word_vectors[word]
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector

In [71]:
# from keras.initializers import Constant
# # Create the embedding layer
# embedding_layer = Embedding(vocab_size, embedding_dim, embeddings_initializer=Constant(embedding_matrix), trainable=False)

In [72]:
# Longest token sequence across premises and hypotheses, as a built-in int.
sentence_lengths = [len(tokens) for tokens in filtered_premise_data]
sentence_lengths += [len(tokens) for tokens in filtered_hypothesis_data]
maxlen = int(np.max(sentence_lengths))

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (26936,) + inhomogeneous part.

In [81]:
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras.layers import Layer

# Both inputs take sequences that are already embedded: `maxlen + 1` time
# steps, each a vector of size `model.vector_size`.
sequence_shape = (maxlen + 1, model.vector_size)
input1 = Input(shape=sequence_shape)
input2 = Input(shape=sequence_shape)

# embedded_input1 = embedding_layer(input1)
# embedded_input2 = embedding_layer(input2)

# A single Bidirectional LSTM instance encodes both inputs, so the two
# branches use the same weights. return_sequences=False yields one vector
# per sequence rather than one per time step.
lstm = Bidirectional(LSTM(64, return_sequences=False))

lstm_output1, lstm_output2 = lstm(input1), lstm(input2)

class DoComputeLayer(Layer):
    """Fuse two sentence encodings into one feature vector.

    Given a pair ``(u, v)`` the layer returns ``u + v + |u - v| + <u, v>``,
    where ``<u, v>`` is the dot product taken over the last axis. In this
    notebook ``u`` and ``v`` are the two BiLSTM outputs, so the result has the
    same shape as either of them.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Combine the two tensors in ``inputs``.

        Args:
            inputs: a pair ``(u, v)`` of tensors with identical shapes.

        Returns:
            A tensor shaped like ``u``.
        """
        u, v = inputs
        # keepdims=True keeps the reduced axis with size 1 so the per-sample
        # dot product broadcasts across the feature axis. Without it the sum
        # drops the last axis and cannot be added to `u` for ordinary batch
        # sizes.
        dot_product = tf.reduce_sum(u * v, axis=-1, keepdims=True)
        return u + v + tf.abs(tf.subtract(u, v)) + dot_product

# Instantiate the custom fusion layer.
do_compute_layer = DoComputeLayer()

# Combine the two sentence encodings into a single feature vector.
compute_vector = do_compute_layer((lstm_output1, lstm_output2))

# Fully connected classification head. It consumes `compute_vector`; the
# earlier reference to `sum_output` pointed at a name that is never defined.
dense1 = Dense(32, activation='relu')(compute_vector)
dense2 = Dense(1, activation='sigmoid')(dense1)

# Build the two-input model.
model_NLI = Model(inputs=[input1, input2], outputs=dense2)

# Compile for binary classification (one sigmoid output unit).
model_NLI.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model architecture.
model_NLI.summary()

In [82]:
def _pad_embedded(sentences, length, dim):
    """Stack variable-length embedded sentences into one dense array.

    Args:
        sentences: sequence of sentences, each a list of 1-D vectors of size
            `dim` (one vector per token).
        length: number of time steps in the result. Shorter sentences are
            zero-padded at the end; longer ones are truncated.
        dim: size of each token vector.

    Returns:
        np.ndarray of shape (len(sentences), length, dim) and dtype float32.
    """
    padded = np.zeros((len(sentences), length, dim), dtype=np.float32)
    for row, vectors in enumerate(sentences):
        vectors = vectors[:length]
        if len(vectors) > 0:  # a sentence with no tokens stays all-zero
            padded[row, :len(vectors)] = np.asarray(vectors, dtype=np.float32)
    return padded


# The model inputs were declared as (maxlen + 1, model.vector_size); the
# per-sentence lists have different lengths and must be padded to that shape
# before they can be passed to fit().
seq_len, embed_dim = maxlen + 1, model.vector_size

# Hold out the last 20% of the samples for validation. Slicing with an explicit
# split index also works when validation_size is 0, where `[-0:]` / `[:-0]`
# would swap the two partitions.
validation_size = int(len(premise_vectors) * 0.2)
split_at = len(premise_vectors) - validation_size

# NOTE: each padded array holds n_samples * seq_len * embed_dim float32 values;
# for a large `maxlen` this may not fit in memory, in which case the padding
# should be done per batch instead.
p_train = _pad_embedded(premise_vectors[:split_at], seq_len, embed_dim)
h_train = _pad_embedded(hypothesis_vectors[:split_at], seq_len, embed_dim)
p_val = _pad_embedded(premise_vectors[split_at:], seq_len, embed_dim)
h_val = _pad_embedded(hypothesis_vectors[split_at:], seq_len, embed_dim)

# label_data is a tuple after cleaning; convert it to an array for Keras.
# Assumes the labels are numeric 0/1, matching the sigmoid output - TODO confirm.
labels = np.asarray(label_data)
label_train = labels[:split_at]
label_val = labels[split_at:]

# Skip validation entirely when the hold-out set is empty.
validation_data = ([p_val, h_val], label_val) if validation_size > 0 else None

model_NLI.fit([p_train, h_train], label_train, batch_size=32, epochs=90 , validation_data = validation_data)

KeyboardInterrupt: 