# 当代人工智能实验一：文本分类
## ——Word2Vec

### 一. 引入必要模块
numpy将用于数据的处理。
time用于记录代码运行时间。
nltk用于实现分词与去停用词，gensim用于实现Word2Vec。
LogisticRegression用于进行逻辑回归。
TfidfVectorizer用于进行TF-IDF值的计算。
train_test_split用于进行训练集与验证集的划分。
classification_report用于衡量模型的训练表现。

In [1]:
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
from sklearn.linear_model import LogisticRegression
import time

### 二. 下载停用词

In [3]:
import nltk

# Fetch every NLTK resource this notebook relies on, not just the stop words:
#   - 'stopwords': the English stop-word list used to filter tokens.
#   - 'punkt' / 'punkt_tab': tokenizer data needed by word_tokenize; which of
#     the two is loaded depends on the installed NLTK release, so both are
#     requested. Without them word_tokenize raises LookupError.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 三. 读取训练与测试数据
每条训练集数据都包含“文本”与给定的“标签”。每条测试集数据都有编号与“文本”。

In [4]:
# Load the training set. Every non-empty line of train_data.txt is parsed as a
# Python dict literal that provides a "label" entry and a "raw" (text) entry.
import ast

# The context manager closes the file as soon as its contents have been read.
with open('train_data.txt') as f_train:
    # Records are separated by newlines.
    train_text = f_train.read().split("\n")

labelList = []  # label of every training sample
rawList = []    # raw text of every training sample
for i, line in enumerate(train_text):
    # Skip blank lines, e.g. the empty string left behind by a trailing
    # newline; this also keeps the last record when the file does not end
    # with a newline.
    if not line.strip():
        continue
    # literal_eval accepts only Python literals, so a malformed or malicious
    # line cannot execute arbitrary code the way eval() would.
    train_text[i] = ast.literal_eval(line)
    labelList.append(train_text[i]["label"])
    rawList.append(train_text[i]["raw"])
labelList = np.array(labelList)
rawList = np.array(rawList)

In [5]:
# Load the test set. The first line of test.txt is dropped as a header; each
# following line is presumably "<id>, <text>" (confirm against the data file).
# The context manager closes the file as soon as its contents have been read.
with open('test.txt') as f_test:
    # Records are separated by newlines.
    result = f_test.read().split("\n")
# Drop the header row.
result.pop(0)
# Number of test samples read from the file.
TEST_LENGTH = 2000
result_id = list(range(TEST_LENGTH))
result_text = []
for i in range(TEST_LENGTH):
    comma_index = result[i].find(",")
    if comma_index != -1:
        # Keep everything after the first comma and the one character that
        # follows it.
        result_text.append(result[i][comma_index+2:])
    else:
        print("ERROR: COMMA NOT FOUND")
        # Keep a placeholder so result_text stays aligned with result_id.
        result_text.append("")
result_text = np.array(result_text)
# result_text

In [6]:
# Number of training texts.
LENGTH_TRAIN = len(rawList)
# Training texts followed by test texts, as one flat array.
ALL = np.concatenate((np.ravel(rawList), np.ravel(result_text)))
# Number of training and test texts combined.
LENGTH_ALL = len(ALL)
# ALL

### 四. 使用Word2Vec进行LogisticRegression
Word2Vec是将每个词进行向量化的过程。为了衡量一段文本的向量，我们将文本中的每个词对应的向量求和并求平均值，作为衡量段落向量的方法。

In [8]:
# Represent every training text by the average of its Word2Vec word vectors.
# Word2Vec arguments used below:
#   tokenized_texts: the corpus, one list of tokens per text.
#   vector_size: dimensionality of each word vector.
#   window: number of context words considered around each word in training.
#   min_count: words occurring fewer than min_count times are ignored.
#   sg: training algorithm. sg=1 is Skip-gram (predict the context from a
#       word); sg=0 is CBOW (predict a word from its context).
start_time = time.time()

# Lower-case each text, tokenize it and drop the stop words.
tokenized_texts = [
    [token for token in word_tokenize(document.lower()) if token not in stop_words]
    for document in rawList
]

model = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1, sg=1)

text_vectors = []
for document_tokens in tokenized_texts:
    # Only words present in the trained vocabulary contribute to the average.
    known_tokens = [token for token in document_tokens if token in model.wv]
    vector_sum = np.zeros(model.vector_size)
    for token in known_tokens:
        vector_sum = vector_sum + model.wv[token]
    # A text without any in-vocabulary word keeps the all-zero vector.
    if known_tokens:
        vector_sum = vector_sum / len(known_tokens)
    text_vectors.append(vector_sum)

text_vectors = np.array(text_vectors)
end_time = time.time()
print("运行时间: ", end_time - start_time)
text_vectors

运行时间:  15.154523611068726


array([[-0.19730163,  0.06354035,  0.0813565 , ..., -0.06578381,
        -0.26233182, -0.02703605],
       [-0.07018086,  0.02195625,  0.02862046, ..., -0.01766919,
        -0.3069617 , -0.04676938],
       [-0.18382921,  0.05678034,  0.03795623, ..., -0.04429037,
        -0.24023913, -0.03515332],
       ...,
       [-0.2005331 ,  0.09426859, -0.03197032, ..., -0.02192535,
        -0.2147369 ,  0.02126227],
       [-0.19297416,  0.12304204,  0.02049277, ...,  0.23851865,
        -0.17868695,  0.09136746],
       [ 0.03058167,  0.07475319, -0.09276127, ...,  0.06028539,
        -0.1637193 ,  0.01784925]])

In [9]:
# Evaluate the text vectors with logistic regression on repeated random splits:
# every round holds out 12.5% of the samples for evaluation and trains on the
# rest. random_state=None draws a fresh random split each round; a fixed
# integer would make the split reproducible.
start_time = time.time()
LOOP_NUMBER = 5
target_names = [f'class_{index}' for index in range(10)]
accuracyTotal = precisionTotal = recallTotal = f1Total = 0

for _ in range(LOOP_NUMBER):
    text_train, text_test, label_train, label_test = train_test_split(
        text_vectors, labelList, test_size=0.125, random_state=None
    )
    model = LogisticRegression(max_iter=1000)
    model.fit(text_train, label_train)

    y_pred = model.predict(text_test)
    classification_rep = classification_report(
        label_test, y_pred, target_names=target_names, output_dict=True
    )
    # Support-weighted averages over all classes.
    weighted = classification_rep['weighted avg']

    accuracyTotal += model.score(text_test, label_test)
    precisionTotal += weighted['precision']
    recallTotal += weighted['recall']
    f1Total += weighted['f1-score']

# Average every metric over the rounds.
accuracy_avg, precision_avg, recall_avg, f1_avg = (
    total / LOOP_NUMBER
    for total in (accuracyTotal, precisionTotal, recallTotal, f1Total)
)
end_time = time.time()

summary = (
    ("运行时间：", end_time - start_time),
    ("模型准确率：", accuracy_avg),
    ("模型精确度：", precision_avg),
    ("模型召回率：", recall_avg),
    ("模型F1-score：", f1_avg),
)
for caption, value in summary:
    print(caption, value)

运行时间： 5.794030666351318
模型准确率： 0.7267999999999999
模型精确度： 0.7308270142764218
模型召回率： 0.7267999999999999
模型F1-score： 0.7272711431073512


我们发现，准确率并不够理想。
在这里，我们对Word2Vec的参数进行调整，调高vector_size增加向量的维数，调高window增加每个词考虑的上下文词语个数，增加min_count减少被考虑的词语个数。

In [10]:
# Same averaging pipeline as before, with a larger Word2Vec configuration.
# Word2Vec arguments used below:
#   tokenized_texts: the corpus, one list of tokens per text.
#   vector_size: dimensionality of each word vector.
#   window: number of context words considered around each word in training.
#   min_count: words occurring fewer than min_count times are ignored.
#   sg: training algorithm. sg=1 is Skip-gram (predict the context from a
#       word); sg=0 is CBOW (predict a word from its context).
start_time = time.time()


def _tokenize(document):
    """Lower-case *document*, split it into tokens and drop stop words."""
    kept = []
    for token in word_tokenize(document.lower()):
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def _mean_word_vector(tokens, keyed_vectors, size):
    """Average the vectors of the tokens found in *keyed_vectors*.

    Returns an all-zero vector of length *size* when no token is found.
    """
    total = np.zeros(size)
    hits = 0
    for token in tokens:
        if token in keyed_vectors:
            total = total + keyed_vectors[token]
            hits += 1
    return total / hits if hits > 0 else total


tokenized_texts = [_tokenize(document) for document in rawList]

model = Word2Vec(tokenized_texts, vector_size=200, window=30, min_count=15, sg=1)

text_vectors = np.array([
    _mean_word_vector(tokens, model.wv, model.vector_size)
    for tokens in tokenized_texts
])
end_time = time.time()
print("运行时间: ", end_time - start_time)
text_vectors

运行时间:  28.94205117225647


array([[ 0.07211074, -0.07428951, -0.09086505, ...,  0.01681455,
        -0.07429978, -0.01893976],
       [ 0.12664423, -0.03567159, -0.1087318 , ..., -0.0536377 ,
        -0.06126191, -0.00337222],
       [ 0.08269291, -0.0120166 , -0.10447612, ..., -0.02787858,
        -0.08618506, -0.01647773],
       ...,
       [ 0.01953245, -0.06564941, -0.06536076, ...,  0.01694293,
        -0.19201713,  0.03092945],
       [ 0.0419427 , -0.10557784, -0.05688835, ...,  0.02594936,
        -0.16388077, -0.00737843],
       [ 0.06556548, -0.06687689, -0.09299784, ...,  0.00876947,
        -0.13964166,  0.04956957]])

In [11]:
# Repeated hold-out evaluation of the text vectors with logistic regression.
# Each round uses a random 12.5% of the samples for evaluation and the
# remaining samples for training; random_state=None means every round draws a
# new random split (pass an integer to fix the split).
start_time = time.time()
accuracyTotal, precisionTotal, recallTotal, f1Total = 0, 0, 0, 0
LOOP_NUMBER = 5
target_names = ['class_' + str(number) for number in range(10)]

round_index = 0
while round_index < LOOP_NUMBER:
    split = train_test_split(text_vectors, labelList, test_size=0.125, random_state=None)
    text_train, text_test, label_train, label_test = split

    model = LogisticRegression(max_iter=1000)
    model.fit(text_train, label_train)
    accuracy = model.score(text_test, label_test)

    y_pred = model.predict(text_test)
    classification_rep = classification_report(label_test, y_pred, target_names=target_names, output_dict=True)

    # Pull the support-weighted averages out of the report.
    weighted_avg = classification_rep['weighted avg']

    accuracyTotal = accuracyTotal + accuracy
    precisionTotal = precisionTotal + weighted_avg['precision']
    recallTotal = recallTotal + weighted_avg['recall']
    f1Total = f1Total + weighted_avg['f1-score']
    round_index += 1

# Mean of every metric across the rounds.
accuracy_avg = accuracyTotal / LOOP_NUMBER
precision_avg = precisionTotal / LOOP_NUMBER
recall_avg = recallTotal / LOOP_NUMBER
f1_avg = f1Total / LOOP_NUMBER
end_time = time.time()

elapsed = end_time - start_time
print("运行时间：", elapsed)
print("模型准确率：", accuracy_avg)
print("模型精确度：", precision_avg)
print("模型召回率：", recall_avg)
print("模型F1-score：", f1_avg)

运行时间： 4.952762842178345
模型准确率： 0.8942
模型精确度： 0.894911399801741
模型召回率： 0.8942
模型F1-score： 0.8939536882361626


可以看到，准确率由约72.7%提升至约89.4%，提高了约17个百分点，提升幅度非常显著。
需要注意的是，Word2Vec是针对词语的向量化技术；将一段文本中所有词语的向量直接求和并取平均来表示该文本，忽略了各个词语重要程度的差异，在逻辑上并不够严密。一种比较常用的改进方式是用每个词语的TF-IDF值对其词向量进行加权，因此我们在这里进行尝试。

In [12]:
# Represent every training text by the TF-IDF weighted sum of its Word2Vec
# word vectors.
start_time = time.time()

# Lower-case each text, tokenize it and drop the stop words.
tokenized_texts = [
    [word for word in word_tokenize(text.lower()) if word not in stop_words]
    for text in rawList
]

# TF-IDF weights are fitted on the raw texts; row i of tfidf_matrix belongs to
# rawList[i], columns are indexed through tfidf_vectorizer.vocabulary_.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(rawList)

model = Word2Vec(tokenized_texts, vector_size=200, window=30, min_count=15, sg=1)
text_vectors = []

for i, tokenized_text in enumerate(tokenized_texts):
    text_vector = np.zeros(model.vector_size)

    # Look up the non-zero TF-IDF entries of document i once, instead of
    # indexing the sparse matrix element by element inside the token loop.
    tfidf_row = tfidf_matrix[i]
    row_weights = dict(zip(tfidf_row.indices, tfidf_row.data))

    # NOTE(review): every occurrence of a word adds its weighted vector, so a
    # repeated word is counted once per occurrence although its TF-IDF value
    # already reflects term frequency. Kept as is to preserve the reported
    # results; confirm whether this is intended.
    # NOTE(review): TfidfVectorizer tokenizes rawList itself, independently of
    # word_tokenize, so some tokens have no TF-IDF column or no entry in this
    # row; those tokens contribute nothing.
    for word in tokenized_text:
        if word not in model.wv:
            continue
        word_index = tfidf_vectorizer.vocabulary_.get(word, -1)
        if word_index == -1:
            continue
        # TF-IDF value of column word_index in document i.
        tfidf_value = row_weights.get(word_index)
        if tfidf_value is None:
            # The stored value is zero, so the word would add a zero vector.
            continue
        text_vector = text_vector + model.wv[word] * tfidf_value

    text_vectors.append(text_vector)

text_vectors = np.array(text_vectors)
end_time = time.time()
print("运行时间为: ", end_time - start_time)
text_vectors

运行时间为:  40.57243084907532


array([[ 0.29739653, -0.40799748, -0.59443968, ..., -0.03522741,
        -0.11605332, -0.07182262],
       [ 0.86891719, -0.44466566, -1.27038685, ...,  0.00270309,
         0.24782205, -0.25306409],
       [ 0.63510258,  0.22956835, -1.07194985, ..., -0.39280625,
        -0.16968163, -0.21911345],
       ...,
       [ 0.19282071,  0.03819563,  0.58099265, ..., -0.53214407,
        -0.9995433 , -0.19288503],
       [ 0.10797803, -0.2047696 , -0.23695472, ..., -0.38171517,
        -1.12015148,  0.05703247],
       [-0.57062523,  0.62114971, -0.28332049, ...,  0.37524278,
        -2.14118437,  0.14609616]])

In [13]:
# Score the text vectors with logistic regression over several random splits.
# In each round 12.5% of the samples are held out for evaluation and the rest
# are used for training. random_state=None yields a different random split
# every round; an integer value would pin the split.
start_time = time.time()
LOOP_NUMBER = 5
target_names = ['class_%d' % class_id for class_id in range(10)]
accuracyTotal = 0
precisionTotal = 0
recallTotal = 0
f1Total = 0

for _ in range(LOOP_NUMBER):
    text_train, text_test, label_train, label_test = train_test_split(
        text_vectors,
        labelList,
        test_size=0.125,
        random_state=None,
    )
    model = LogisticRegression(max_iter=1000)
    model.fit(text_train, label_train)

    y_pred = model.predict(text_test)
    classification_rep = classification_report(
        label_test,
        y_pred,
        target_names=target_names,
        output_dict=True,
    )
    # Support-weighted precision, recall and F1 across all classes.
    overall = classification_rep['weighted avg']

    accuracyTotal += model.score(text_test, label_test)
    precisionTotal += overall['precision']
    recallTotal += overall['recall']
    f1Total += overall['f1-score']

# Average the accumulated metrics over the rounds.
accuracy_avg = accuracyTotal / LOOP_NUMBER
precision_avg = precisionTotal / LOOP_NUMBER
recall_avg = recallTotal / LOOP_NUMBER
f1_avg = f1Total / LOOP_NUMBER
end_time = time.time()

report_lines = [
    ("运行时间为：", end_time - start_time),
    ("模型准确率：", accuracy_avg),
    ("模型精确度：", precision_avg),
    ("模型召回率：", recall_avg),
    ("模型F1-score：", f1_avg),
]
for caption, value in report_lines:
    print(caption, value)

运行时间为： 15.353400468826294
模型准确率： 0.9096
模型精确度： 0.9106241785822583
模型召回率： 0.9096
模型F1-score： 0.9096586453335164


可以看到，准确率有了微弱的提升，说明加权乘以TF-IDF是一个可能有效的做法。然而，即使如此，其效果仍然不如直接使用TF-IDF。
基于这样的背景，我们只能认为对该问题而言，Word2Vec的性能劣于TF-IDF，不是最适合该问题的模型。
**在接下来的问题分析中，我们会放弃使用Word2Vec作为文本转换为向量的方式。**