# 1. 与 GoogleDrive 链接，轻松导入文件

In [None]:
# 下面两步是如何链接colab
from google.colab import drive
drive.mount('/content/drive/')

# 通常，直接给出路径然后导入即可
# 另外，%cd命令到该路径，然后按文件名导入

dir_path = '/content/drive/MyDrive/2023NLPCourse/Assignment1/Part_A'
file_name = 'IMDB Dataset.csv'

# cd 到该路径下
%cd /content/drive/MyDrive/2023NLPCourse/Assignment1/Part_A

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/2023NLPCourse/Assignment1/Part_A


# 2.数据导入和数据标签处理

In [None]:
import pandas as pd

# Read the dataset from the Data/ folder under the current working directory.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines="skip"` is its replacement and likewise skips malformed rows.
data = pd.read_csv("Data/" + file_name, on_bad_lines="skip")
# Add a 'label' column that mirrors 'sentiment' with 'positive' -> '1' and
# 'negative' -> '0' (kept as strings, as in the original cell).
data['label'] = data['sentiment'].replace(['positive', 'negative'], ['1', '0'])



  data = pd.read_csv("Data/"+file_name, error_bad_lines=False)


# 3. 定义相关的超参数 并训练 Word2Vec 等模型

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from tensorflow.keras import utils
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
import gensim
import re
import numpy as np
from collections import Counter
import time

# Word2Vec hyperparameters
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# Keras model hyperparameters
SEQUENCE_LENGTH = 80
EPOCHS = 32
BATCH_SIZE = 1024

# Sentiment labels and thresholds
POSITIVE = "positive"
NEGATIVE = "negative"
NEUTRAL = "neutral"
# Score thresholds: below 0.4 counts as negative, above 0.7 counts as positive.
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# Number of rows per sentiment value.
target_cnt = Counter(data.sentiment)

nltk.download('stopwords')
# Stored as a set: preprocess() tests every token for membership, and a set
# lookup is constant-time whereas scanning the original list was linear.
stop_words = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# Simple text preprocessing: strip irrelevant characters, drop stop words and
# optionally stem the remaining tokens.
# Written as a raw string: "\S" inside a normal string literal is an invalid
# escape sequence that newer Python versions warn about. The pattern itself is
# unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"


def preprocess(text, stem=False):
    """Return a cleaned, lower-cased, space-joined version of ``text``.

    Every match of TEXT_CLEANING_RE (mentions, links, runs of non-alphanumeric
    characters) is replaced by a space, the result is split on whitespace, and
    tokens found in the module-level ``stop_words`` are dropped.

    Args:
        text: Input value; it is converted with ``str()`` before cleaning.
        stem: When true, each kept token is passed through the module-level
            ``stemmer``.

    Returns:
        The kept tokens joined by single spaces ("" if nothing is kept).
    """
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = (token for token in cleaned.split() if token not in stop_words)
    if stem:
        kept = (stemmer.stem(token) for token in kept)
    return " ".join(kept)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 4.数据预处理并将数据分割到训练测试集

In [None]:
%%time
data.review = data.review.apply(lambda x: preprocess(x))

# split data into train and test set
from sklearn.model_selection import train_test_split
X = data['review'].values
y = data['sentiment'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.20, random_state=42)
documents = [_text.split() for _text in data.review]

CPU times: user 20.9 s, sys: 251 ms, total: 21.2 s
Wall time: 22.4 s


# 5. 使用 gensim 模块训练 word2vec 模型（CBOW 和 Skip-gram）

In [None]:
# Settings shared by both Word2Vec variants; only `sg` differs between them.
# NOTE(review): W2V_EPOCH is defined with the other hyperparameters but is not
# passed here, so gensim's default epoch count applies — confirm this is intended.
w2v_shared_args = dict(
    vector_size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    workers=8,
)

# CBOW (sg=0)
CBOW_w2vmodel = gensim.models.Word2Vec(documents, sg=0, **w2v_shared_args)

# Skip-gram (sg=1)
Skipgram_w2vmodel = gensim.models.Word2Vec(documents, sg=1, **w2v_shared_args)

# 6. 将数据转换为 BiLSTM 模型的输入

In [None]:
# Build the word -> integer index from all cleaned reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data.review)
# +1 because Tokenizer word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1

# Pad/truncate every review to SEQUENCE_LENGTH tokens. The constant is used
# instead of a literal 80 so this stays in sync with the Embedding layers,
# which take input_length=SEQUENCE_LENGTH.
x_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=SEQUENCE_LENGTH)

# Distinct sentiment values found in the data, plus an extra "neutral" entry.
labels = data.sentiment.unique().tolist()
labels.append("neutral")

from sklearn.preprocessing import LabelEncoder

# Map the sentiment strings to integer class ids.
encoder = LabelEncoder()
encoder.fit(data.sentiment.tolist())

y_train = encoder.transform(Y_train.tolist())
y_test = encoder.transform(Y_test.tolist())

# Reshape the targets into column vectors of shape (n_samples, 1).
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# 7. 从经过训练的 word2vec 模型获取 Word Embedding 矩阵（目的是将参数导入到 TensorFlow 的 BiLSTM 模型的 Embedding 层中）

In [None]:
def build_embedding_matrix(w2v_model, word_index, num_rows, dim):
    """Build an embedding matrix whose row i holds the vector of the word indexed i.

    Args:
        w2v_model: Trained Word2Vec model; its ``wv`` attribute is queried.
        word_index: Mapping of word -> integer row index.
        num_rows: Number of rows to allocate.
        dim: Length of each word vector.

    Returns:
        A (num_rows, dim) array. Rows of words missing from ``w2v_model.wv``
        stay all-zero.
    """
    matrix = np.zeros((num_rows, dim))
    for word, row in word_index.items():
        if word in w2v_model.wv:
            matrix[row] = w2v_model.wv[word]
    return matrix


# Each matrix is filled from its own model. The original loop copied the CBOW
# vectors into the Skip-gram matrix as well, which made both matrices identical.
CBOW_embedding_matrix = build_embedding_matrix(
    CBOW_w2vmodel, tokenizer.word_index, vocab_size, W2V_SIZE)
Skipgram_embedding_matrix = build_embedding_matrix(
    Skipgram_w2vmodel, tokenizer.word_index, vocab_size, W2V_SIZE)

# 8.基于两个不同的词嵌入矩阵，构建两个不同的Bilstm模型

In [None]:
from tensorflow.keras.layers import Bidirectional

def build_bilstm_classifier(embedding_matrix):
    """Create and compile a BiLSTM binary classifier on top of fixed embeddings.

    The Embedding layer is initialised from ``embedding_matrix`` and frozen
    (trainable=False). It is followed by dropout, a bidirectional LSTM with
    100 units per direction, and a single sigmoid output unit.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix],
                        input_length=SEQUENCE_LENGTH, trainable=False))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(100, dropout=0.2)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])
    return model


# One classifier per embedding matrix: CBOW first, then Skip-gram.
CBOW_BiLSTM_model = build_bilstm_classifier(CBOW_embedding_matrix)
Skipgram_BiLSTM_model = build_bilstm_classifier(Skipgram_embedding_matrix)

# 8. 训练两个模型

In [None]:
# Stop training once val_loss has not improved for 10 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)

# Arguments common to both training runs; the test split doubles as the
# validation data.
shared_fit_args = dict(
    x=x_train,
    y=y_train,
    epochs=20,
    batch_size=BATCH_SIZE,
    validation_data=(x_test, y_test),
    verbose=1,
)

# Train the CBOW-based model first, then the Skip-gram-based one.
CBOW_BiLSTM_model.fit(callbacks=[early_stop], **shared_fit_args)
Skipgram_BiLSTM_model.fit(callbacks=[early_stop], **shared_fit_args)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ff80c377580>

# 9. 混淆矩阵定义

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def c_report(y_true, y_pred):
    """Print the classification report and accuracy, and return the accuracy.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The value of ``accuracy_score(y_true, y_pred)``.
    """
    print("Classification Report")
    report_text = classification_report(y_true, y_pred)
    print(report_text)
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy : " + str(accuracy))
    return accuracy

def plot_confusion_matrix(y_true, y_pred):
    """Draw the confusion matrix of ``y_true`` vs. ``y_pred`` as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    # Imported locally because no visible cell imports seaborn or matplotlib;
    # without these lines a call fails with NameError on `sns` / `plt`.
    import matplotlib.pyplot as plt
    import seaborn as sns

    mtx = confusion_matrix(y_true, y_pred)
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(mtx, annot=True, fmt='d', linewidths=.5,
                cmap="Blues", cbar=False)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# 10. 评估两个模型（基于 CBOW 与基于 Skip-gram 的 BiLSTM 模型）

In [None]:
# Sigmoid outputs for the test set, one score per review.
cbow_scores = CBOW_BiLSTM_model.predict(x_test)
skipgram_scores = Skipgram_BiLSTM_model.predict(x_test)

# Scores above 0.5 become class 1, everything else class 0.
CBOW_preds = (cbow_scores > 0.5).astype("int32")
Skipgram_preds = (skipgram_scores > 0.5).astype("int32")

print("Evalution of CBOW_BiLSTM_Model")
c_report(y_test, CBOW_preds)

print("\n")

print("Evalution of Skipgram_BiLSTM_model")
# Left as the last expression so the notebook also displays the returned accuracy.
c_report(y_test, Skipgram_preds)

Evalution of CBOW_BiLSTM_Model
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.93      0.89      4961
           1       0.93      0.83      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.89      0.88      0.88     10000
weighted avg       0.89      0.88      0.88     10000

Accuracy : 0.882


Evalution of Skipgram_BiLSTM_Model
Classification Report
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      4961
           1       0.89      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Accuracy : 0.8922


0.8922

# 11. 保存模型

In [None]:
import os
import pickle

# Create the output folder if it is missing so open() does not fail.
os.makedirs('Models', exist_ok=True)

# NOTE(review): the models are still pickled, as before, so existing loaders
# keep working. Consider Keras' own model.save() for more portable files —
# confirm against the Keras version in use.
models_to_save = (
    ('Models/CBOW_BiLSTM_model.save', CBOW_BiLSTM_model),
    ('Models/Skipgram_BiLSTM_model.save', Skipgram_BiLSTM_model),
)
for model_path, model in models_to_save:
    # `with` closes the file handle (and flushes the data) even if dump() raises;
    # the original left both handles open.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)