# NLP套路模板

- 参考文档《手把手教你在Python中实现文本分类（附代码、数据集）》  
https://blog.csdn.net/Tw6cy6uKyDea86Z/article/details/80416475

In [None]:
#导入数据集预处理、特征工程和模型训练所需的库

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn import decomposition, ensemble



import pandas as pd, xgboost, numpy, textblob, string

from keras.preprocessing import text, sequence

from keras import layers, models, optimizers

In [None]:
#加载数据集

file = open('data/corpus')
data = file.read()

labels, texts = [], []

for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(content[1])
file.close()

In [None]:
# 创建dataframe
train_df = pd.DataFrame()
train_df['text'] = texts
train_df['label'] = labels

In [None]:
train_df.head()

In [None]:
train_df.shape

In [None]:
# 划分数据集
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(train_df['text'], train_df['label'], test_size=0.2)

# 将标签变为0/1的label编码
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.fit_transform(y_valid)

In [None]:
#创建一个向量计数器对象

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')

count_vect.fit(train_df['text'])

In [None]:
X_train_count = count_vect.transform(X_train)
X_valid_count = count_vect.transform(X_valid)

# 将每一个词转换成了一个统计词频的词袋模型
print(X_train[0])
print(X_train_count.toarray()[0])

In [None]:
# 词语级Tf_idf
tfidf_model = TfidfVectorizer()
tfidf_model.fit(train_df['text'])
X_train_tfidf = tfidf_model.transform(X_train)
X_valid_tfidf = tfidf_model.transform(X_valid)

In [None]:
# ngram级tf_idf
ngram_tfidf_model = TfidfVectorizer(analyzer='word', ngram_range=(1,2), max_features=5000)
ngram_tfidf_model.fit(train_df['text'])
X_train_tfidf_ngram = ngram_tfidf_model.transform(X_train)
X_valid_tfidf_ngram = ngram_tfidf_model.transform(X_train)

In [None]:
# 字符级tf_idf
char_tfidf_model = TfidfVectorizer(analyzer='char', ngram_range=(1,2), max_features=5000)
char_tfidf_model.fit(train_df['text'])
X_train_tfidf_char = char_tfidf_model.transform(X_train)
X_valid_tfidf_char = char_tfidf_model.transform(X_train)

In [None]:
#加载预先训练好的词嵌入向量
embeddings_index = {}

for i, line in enumerate(open('data/wiki-news-300d-1M.vec')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')



#创建一个分词器
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index



#将文本转换为分词序列，并填充它们保证得到相同长度的向量
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)



#创建分词嵌入映射
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [None]:
# 字符计数特征
train_df['char_count'] = train_df['text'].apply(len)

# 单词计数特征
train_df['word_count'] = train_df['text'].apply(lambda x:len(x.split()))

# 单词平均长度计数特征
train_df['word_density'] = train_df['char_count'] / (train_df['word_count'] + 1)

In [None]:
# 预训练主题模型
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter==20)
X_topic = lda_model.fit_transform(X_train_count)
topic_word = lda_model.components_
vocab = count_vect.get_feature_names()

In [None]:
# 可视化主题模型
n_top_words = 10 
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(''.join(topic_words))

In [None]:
# 构建一个所有模型的标准模板
def train_model(classifier, X_train, y_train, X_valid, y_valid, is_neural_net=False):
    classifier.fit(feature_vector_train, y_train)
    predictions = classifier.predict(X_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, y_valid)

In [None]:
#特征为计数向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

#特征为词语级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_tfidf, y_train, X_valid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)



#特征为多个词语级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_tfidf_ngram, y_train, X_valid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)



#特征为词性级别TF-IDF向量的朴素贝叶斯
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

#参考输出结果
# NB, Count Vectors:  0.7004
# NB, WordLevel TF-IDF:  0.7024
# NB, N-Gram Vectors:  0.5344
# NB, CharLevel Vectors:  0.6872

In [None]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

#特征为词语级别TF-IDF向量的线性分类器
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

#特征为多个词语级别TF-IDF向量的线性分类器
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

#特征为词性级别TF-IDF向量的线性分类器
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

#参考输出结果
# LR, Count Vectors:  0.7048
# LR, WordLevel TF-IDF:  0.7056
# LR, N-Gram Vectors:  0.4896
# LR, CharLevel Vectors:  0.7012

In [None]:
#特征为多个词语级别TF-IDF向量的SVM
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

#输出结果
#SVM, N-Gram Vectors:  0.5296

In [None]:
#特征为计数向量的RF
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

#特征为词语级别TF-IDF向量的RF
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

#输出结果
# RF, Count Vectors:  0.6972
# RF, WordLevel TF-IDF:  0.6988

In [None]:
#特征为计数向量的Xgboost
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

#特征为词语级别TF-IDF向量的Xgboost
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

#特征为词性级别TF-IDF向量的Xgboost
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: ", accuracy)

#输出结果
# Xgb, Count Vectors:  0.6324
# Xgb, WordLevel TF-IDF:  0.6364
# Xgb, CharLevel Vectors:  0.6548

In [None]:
# 构建浅层神经网络
def create_model_architecture(input_size):
    # create inputlayer
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer 
    hidden_layer = layers.Dense(100, activation='relu')(input_layer)
    
    #create output layer
    output_layer = layers.Dense(1, activation='sigmoid')(hidden_layer)
    
    classifier = models.Model(inputs=input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return classifier

accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print "NN, Ngram Level TF IDF Vectors",  accuracy

In [None]:
# 构建卷积神经网络
def create_cnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))
    
    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')

    return model


classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("CNN, Word Embeddings",  accuracy)

#输出结果
# Epoch 1/1
# 7500/7500 [==============================] - 12s 2ms/step - loss: 0.5847
# CNN, Word Embeddings 0.5296

In [None]:
#构建LSTM
def create_rnn_lstm():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')

    return model



classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-LSTM, Word Embeddings",  accuracy)

#输出结果
# Epoch 1/1
# 7500/7500 [==============================] - 22s 3ms/step - loss: 0.6899
# RNN-LSTM, Word Embeddings 0.5124

In [None]:
# 构建GRU
def create_rnn_gru():
   # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the GRU Layer
    lstm_layer = layers.GRU(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
  
    return model



classifier = create_rnn_gru()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-GRU, Word Embeddings",  accuracy)

#输出结果
# Epoch 1/1
# 7500/7500 [==============================] - 19s 3ms/step - loss: 0.6898
# RNN-GRU, Word Embeddings 0.5124

In [None]:
def create_bidirectional_rnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.GRU(100))(embedding_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')

    return model

classifier = create_bidirectional_rnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-Bidirectional, Word Embeddings",  accuracy)

#输出结果
# Epoch 1/1
# 7500/7500 [==============================] - 32s 4ms/step - loss: 0.6889
# RNN-Bidirectional, Word Embeddings 0.5124

In [None]:
# 构建RCNN
def create_rcnn():
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the recurrent layer
    rnn_layer = layers.Bidirectional(layers.GRU(50, return_sequences=True))(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)
    
    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')

    return model



classifier = create_rcnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("CNN, Word Embeddings",  accuracy)

#输出结果
# Epoch 1/1
# 7500/7500 [==============================] - 11s 1ms/step - loss: 0.6902
# CNN, Word Embeddings 0.5124

- 文本预处理套路
- https://blog.csdn.net/wizardforcel/article/details/83933459

In [None]:
# 词袋模型
# 加载库
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# 创建文本
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# 创建词袋特征矩阵
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# 展示特征矩阵
bag_of_words.toarray()

'''
array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64) 
'''

# 获取特征名称
feature_names = count.get_feature_names()

# 查看特征名称
feature_names

# ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden'] 

# 创建数据帧
pd.DataFrame(bag_of_words.toarray(), columns=feature_names)


In [None]:
# 解析HTML
# 加载库
from bs4 import BeautifulSoup

# 创建一些 HTML 代码
html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"

# 解析 html
soup = BeautifulSoup(html, "lxml")

# 寻找带有 "full_name" 类的 <div>，展示文本
soup.find("div", { "class" : "full_name" }).text

# 'Masego Azra' 

In [None]:
# 移除标点
# 加载库
import string
import numpy as np

# 创建文本
text_data = ['Hi!!!! I. Love. This. Song....', 
             '10000% Agree!!!! #LoveIT', 
             'Right?!?!']

# 创建函数，使用 string.punctuation 移除所有标点
def remove_punctuation(sentence: str) -> str:
    return sentence.translate(str.maketrans('', '', string.punctuation))

# 应用函数
[remove_punctuation(sentence) for sentence in text_data]

# ['Hi I Love This Song', '10000 Agree LoveIT', 'Right'] 

In [None]:
# 移除停用词
# 加载库
from nltk.corpus import stopwords

# 你第一次需要下载停止词的集合
import nltk
nltk.download('stopwords')

'''
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisalbon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

True 
'''

# 创建单词标记
tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

# 加载停止词
stop_words = stopwords.words('english')

# 展示停止词
stop_words[:5]

# ['i', 'me', 'my', 'myself', 'we'] 

# 移除停止词
[word for word in tokenized_words if word not in stop_words]

# ['going', 'go', 'store', 'park'] 


In [None]:
# 替换字符
# 导入库
import re

# 创建文本
text_data = ['Interrobang. By Aishwarya Henriette',
             'Parking And Going. By Karl Gautier',
             'Today Is The night. By Jarek Prakash']

# 移除句号
remove_periods = [string.replace('.', '') for string in text_data]

# 展示文本
remove_periods

'''
['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash'] 
'''

# 创建函数
def replace_letters_with_X(string: str) -> str:
    return re.sub(r'[a-zA-Z]', 'X', string)

# 应用函数
[replace_letters_with_X(string) for string in remove_periods]

'''
['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX'] 
'''

In [None]:
# 词干提取
# 加载库
from nltk.stem.porter import PorterStemmer

# 创建单词标记
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

# 创建提取器
porter = PorterStemmer()

# 应用提取器
[porter.stem(word) for word in tokenized_words]

# ['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet'] 

In [None]:
# 移除空白
# 创建文本
text_data = ['   Interrobang. By Aishwarya Henriette     ',
             'Parking And Going. By Karl Gautier',
             '    Today Is The night. By Jarek Prakash   ']

# 移除空白
strip_whitespace = [string.strip() for string in text_data]

# 展示文本
strip_whitespace

'''
['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash'] 
'''

In [None]:
# 词性标签
# 加载库
from nltk import pos_tag
from nltk import word_tokenize

# 创建文本
text_data = "Chris loved outdoor running"

# 使用预训练的词性标注器
text_tagged = pos_tag(word_tokenize(text_data))

# 展示词性
text_tagged

# [('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')] 

In [None]:
# TF-IDF

# 加载库
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# 创建文本
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# 创建 tf-idf 特征矩阵
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

# 展示 tf-idf 特征矩阵
feature_matrix.toarray()

'''
array([[ 0.        ,  0.        ,  0.        ,  0.89442719,  0.        ,
         0.        ,  0.4472136 ,  0.        ],
       [ 0.        ,  0.57735027,  0.        ,  0.        ,  0.        ,
         0.57735027,  0.        ,  0.57735027],
       [ 0.57735027,  0.        ,  0.57735027,  0.        ,  0.57735027,
         0.        ,  0.        ,  0.        ]]) 
'''

# 展示 tf-idf 特征矩阵
tfidf.get_feature_names()

# ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden'] 

# 创建数据帧
pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())

In [None]:
#文本分词
# 加载库
from nltk.tokenize import word_tokenize, sent_tokenize

# 创建文本
string = "The science of today is the technology of tomorrow. Tomorrow is today."

# 对文本分词
word_tokenize(string)

'''
['The',
 'science',
 'of',
 'today',
 'is',
 'the',
 'technology',
 'of',
 'tomorrow',
 '.',
 'Tomorrow',
 'is',
 'today',
 '.'] 
'''

# 对句子分词
sent_tokenize(string)

# ['The science of today is the technology of tomorrow.', 'Tomorrow is today.'] 

In [1]:
# 词袋模型 CountVectorizer  (一个小例子)
from sklearn.feature_extraction.text import CountVectorizer

sentences = ['John likes ice cream John', 'John hates chocolate.']

vectorizer = CountVectorizer(analyzer='word', min_df=0, lowercase=True)
vectorizer.fit(sentences)
# CountVectorizer训练将出现的每一个词建立索引并计算频次,注意，频次是指在1篇文档下的频次！
print(vectorizer.vocabulary_)
# CountVectorizer将出现的句子根据对应的索引和词频进行向量化表示
print(vectorizer.transform(sentences).toarray())

{'john': 4, 'likes': 5, 'ice': 3, 'cream': 1, 'hates': 2, 'chocolate': 0}
[[0 1 0 1 2 1]
 [1 0 1 0 1 0]]


In [None]:
# 绘制学习曲线
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    
plot_history(history)

In [None]:
# 使用CountVectorizer，每个向量的长度相同，等于总语料库的大小，其中的值表示该词在这句话中的出现次数。
# 使用Tokenizer，每个向量等于每个文本的长度，其数值并不表示计数，而是对应于字典tokenizer.word_index中的单词值。
# 词嵌入
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_valid = tokenizer.texts_to_sequences(sentences_valid)

# vocabsize一定程度代表了tokenizer的一个值最大有可能是多少
vocab_size = len(tokenizer.word_index) + 1
# tokenizer把每一个词转化成了一个字典的key，这样就把一句话进行了稠密表示，然而这种方式并不能表征词与词之间的关系
print(sentences_train[0])
print(X_train[0])

# 可以看到'out'对应的Value为35
print(tokenizer.word_index)

In [None]:
# 对长度不一的Tokenizer进行补齐操作
from keras.preprocessing.sequence import pad_sequences
# maxlen一定程度代表了这句话最长可能由多少个单词组成
maxlen = 100 
print(sentences_train[:2])
#首先看一下X_train,此时由每句话转成的向量长度不等
print(X_train[:2])

# 使用pad_sequence()进行补齐,maxlen控制补齐后的长度
print('开始填充序列...')
X_train = pad_sequences(X_train, maxlen=100)
X_valid = pad_sequences(X_valid, maxlen=100)
print(X_train[:2])

In [None]:
# Keras实现Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# 让每一个词由50维的稠密向量表示，这样我们就由不可学习的tokenizer稠密表示
# 变为可学习的embedding稠密表示
embedding_dim = 50 

model = Sequential()

# 构建嵌入层，注意各参数对应的指标
model.add(Embedding(input_dim=vocab_size,     # 词典大小，每一个值的最大值
                    output_dim=embedding_dim, # 每一个词用多大维度的稠密向量来表示
                    input_length=maxlen))     # 输入层的维度
# 将嵌入层平铺的展开层
model.add(Flatten())
# 是个神经元的隐藏层
model.add(Dense(10, activation='relu'))
# 激活函数为sigmoid的输出层
model.add(Dense(1, activation='sigmoid'))

# 编译模型，设定优化器，损失函数，评估指标
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# 查看一下该模型
model.summary()

history = model.fit(X_train, y_train,
                    epochs=10,
                    validation_data=(X_valid, y_valid),
                    verbose=1,                    
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_valid, y_valid, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

In [None]:
# 添加池化层
from keras.models import Sequential
from keras.layers import Embedding, GlobalMaxPool1D, Dense

embedding_dim = 50 

model = Sequential() 
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    input_length=maxlen))
model.add(GlobalMaxPool1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()

In [None]:
# 练习使用预训练好的Glove词嵌入
# 首先下载词嵌入,并针对word_index中出现的词做筛选
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='UTF-8') as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

# 创建嵌入矩阵
embedding_dim = 50
embedding_matrix = create_embedding_matrix('./data/glove.6B.50d.txt',tokenizer.word_index, embedding_dim)

# 统计Glove对本预料的覆盖比
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
print('{}%的向量是非零的,被预训练的词嵌入模型覆盖'.format(round((nonzero_elements / vocab_size) * 100,2)))

In [None]:
# 使用预训练词嵌入的DNN模型训练
model = Sequential() 
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim,
                    input_length=maxlen,
                    weights=[embedding_matrix], # 将词嵌入矩阵赋值给嵌入层
                    trainable=False))            # 参数不可更新    
model.add(GlobalMaxPool1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train,
                    epochs=50,
                    validation_data=(X_valid, y_valid),
                    verbose=1,                    
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_valid, y_valid, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

In [None]:
# 再试下fine tuning更新词嵌入模型
model = Sequential() 
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim,
                    input_length=maxlen,
                    weights=[embedding_matrix], # 将词嵌入矩阵赋值给嵌入层
                    trainable=True))            # 参数不可更新    
model.add(GlobalMaxPool1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# TextCNN
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D
model = Sequential() 
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    input_length=maxlen))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# 使用随机搜索调节参数，为了让keras可以应用sklearn中的随机搜索
# 需要将keras包装成KerasClassifier
def Create_model(num_filters, kernel_size, vocab_size, maxlen):
    model = Sequential() 
    model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(Conv1D(num_filters, kernel_size, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
param_grid = dict(num_filters=[64, 128],
                  kernel_size=[5, 7],
                  vocab_size=[5000],
                  embedding_dim=[50],
                  maxlen=[100])

In [None]:
# 使用随机搜索对Text CNN 进行调参
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

# Main settings 
epochs = 20 
embedding_dim = 50
maxlen = 100 
output_file = 'data/output.txt'

# Run grid search for each source (yelp, amazon, imdb)
for source, fram in df.groupby('source'):
    print('Running grid search for dataset:', source)
    sentences = df['sentence'].values
    y = df['label'].values
    
    # Train test split
    sentences_train, sentences_valid, y_train, y_valid = train_test_split(sentences, y, test_size=0.25, random_state=1000)
    
    # Tokenize words 
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    X_train = tokenizer.texts_to_sequences(sentences_train)
    X_valid = tokenizer.texts_to_sequences(sentences_train)
    
    # Adding 1 because of 0 in index
    vocab_size = len(tokenizer.word_index) + 1 
    
    # Pad sequences with zeros 
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)  # padding 在起始补还是在结尾补,truncating当需要截断序列时，从起始还是结尾截断
    X_valid = pad_sequences(X_valid, padding='post', maxlen=maxlen)
    
    # Parameter grid for grid search 
    param_grid = dict(num_filters=[32, 64, 128],
                      kernel_size=[1, 3, 5, 7],
                      vocab_size=[vocab_size],
                      maxlen=[maxlen])
    model = KerasClassifier(build_fn=Create_model,
                            epochs=10, 
                            batch_size=10,
                            verbose=True)
    grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,cv=2, verbose=1)
    grid_result = grid.fit(X_train, y_train)
    
    # Evaluate testing set 
    valid_accuracy = grid.score(X_valid, y_valid)
    
        # Save and evaluate results
    prompt = input(f'finished {source}; write to file and proceed? [y/n]')
    if prompt.lower() not in {'y', 'true', 'yes'}:
        break
    with open(output_file, 'a') as f:
        s = ('Running {} data set\nBest Accuracy : '
             '{:.4f}\n{}\nValid Accuracy : {:.4f}\n\n')
        output_string = s.format(
            source,
            grid_result.best_score_,
            grid_result.best_params_,
            valid_accuracy)
        print(output_string)
        f.write(output_string)

In [None]:
# Word2vec
实例：
#导入word2vec模块
from gensim.models import Word2Vec
#训练word2vec模型
model = Word2Vec(LineSentence('word2vec.txt'),min_count=1,size=200,iter=10)
#获取每个词和其表征向量
w2v = dict(zip(model.wv.index2word, model.wv.syn0))
# 通常来说在该方法在大量语料的时候能获得更好的效果，word2vec.txt是每条记录的分词结果，分词结果用空格隔开，每条记录用回车区分，如：
# 踢 下线 国际社区支局
# 合同号 推送 电子发票 可否 天宫殿支局
# 回不了笼 普子支局
在训练之后，可以先把模型保存下来再导入，如：
model.save('word2vec')
model = Word2Vec.load('word2vec')

In [None]:
# 在jupyter notebook中显示图片
<img src="./image/LSTM.png" width="500" height="40" align=center>