In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Nov  4 15:44:42 2018
Word2vec embeddings: https://radimrehurek.com/gensim/models/word2vec.html
@author: loewi
"""

import os
from time import time
from feature_extraction import __get
import numpy as np

path = r'/Users/loewi/Documents/Pre_Learn/classification/20news-bydate/'
os.chdir(path)
#print(os.getcwd())

In [2]:
print('Preparing data...')

t0 = time() 

newsgroups_train = __get('20news-bydate-train')
newsgroups_test = __get('20news-bydate-test')

duration = time() - t0
print('%0.2fs get data package ：）'%duration)

data_train,data_test = newsgroups_train['data'], newsgroups_test['data'] #list of strings
label_train, label_test = newsgroups_train['docs'], newsgroups_test['docs'] #array
print('Data prepared ：）')
print()

Preparing data...
9.54s get data package ：）
Data prepared ：）



In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import re

print('Cleaning...')

def token_pattern(string):
    token_pattern = re.compile(r'(?u)\b[a-zA-Z_][a-zA-Z_]+\b')
    return token_pattern.findall(string)

stop = set(stopwords.words('english'))

def clean(string):        
    result = [ _ for _ in token_pattern(string) if not _ in stop]
    return result

sentences = []
for data in data_train:    
    sentences.append(clean(data))
    
print('%d sentences '%len(sentences))

Cleaning...
11314 sentences 


In [15]:
# Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, 
#            window=5, min_count=5, max_vocab_size=None, sample=0.001, 
#            seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, 
#            ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, i
#            ter=5, null_word=0, trim_rule=None, sorted_vocab=1, 
#            batch_words=10000, compute_loss=False, callbacks=(), 
#            max_final_vocab=None)
model = Word2Vec(sentences,min_count=1)

MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100    
num_words = min(MAX_NUM_WORDS,len(model.wv.vocab))

embedding_matrix = np.zeros((num_words +1,EMBEDDING_DIM))
for i in range(num_words):
    if i>MAX_NUM_WORDS:
        continue
    embedding_vector = model.wv[model.wv.index2word[i]] 
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('shape of embedding matrix:',embedding_matrix.shape)

shape of embedding matrix: (20001, 100)


In [7]:
print('Tokenizing...')

tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)
#keras.preprocessing.text.Tokenizer(num_words=None, #None或整数(最常见的)
#                                   filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n',
#                                   lower=True,
#                                   split=" ",
#                                   char_level=False #char_level: 如果为 True, 每个字符将被视为一个标记
#                                   )
tokenizer.fit_on_texts(data_train)
sequences = tokenizer.texts_to_sequences(data_train)#返回值：2Dlist，每个list对应于一段输入文本
tokenizer.fit_on_texts(data_test)
sequences_test = tokenizer.texts_to_sequences(data_test)


word_index = tokenizer.word_index #dict{key= word, value = 排名或者索引(从1开始)}
print('Found %s unique tokens.'%len(word_index))
#word_counts:字典，将单词（字符串）映射为它们在训练期间出现的次数。仅在调用fit_on_texts之后设置。
#word_docs: 字典，将单词（字符串）映射为它们在训练期间所出现的文档或文本的数量。仅在调用fit_on_texts之后设置。
#word_index: 字典，将单词（字符串）映射为它们的排名或者索引。仅在调用fit_on_texts之后设置。
#document_count: 整数。分词器被训练的文档（文本或者序列）数量。仅在调用fit_on_texts或fit_on_sequences之后设置。
print()

Tokenizing...
Found 90215 unique tokens.



In [8]:
MAX_SEQUENCE_LENGTH = 1000
#keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None, dtype='int32'
#                                           padding='pre', truncating='pre', value=0.)
#sequences：浮点数或整数构成的两层嵌套列表 
#maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0 
#padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
#truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
#value：浮点数，此值将在填充时代替默认的填充值0
X_train = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)#长度不足1000的用0填充(前端填充)
X_test = pad_sequences(sequences_test, maxlen = MAX_SEQUENCE_LENGTH) 
#to_categorical(y, num_classes=None) 
#y: 类别向量，num_classes:总共类别数
y_train = to_categorical(label_train) #扩列，总类别20列
y_test = to_categorical(label_test)

print('shape of training data',X_train.shape)
print('shape of training labels',y_train.shape)
print('shape of testing data',X_test.shape)
print('shape of testing labels',y_test.shape)
print()

shape of training data (11314, 1000)
shape of training labels (11314, 20)
shape of testing data (7532, 1000)
shape of testing labels (7532, 20)



In [17]:
#LSTM
#keras.layers.embeddings.Embedding(
#                                input_dim, output_dim, 
#                                embeddings_initializer='uniform', embeddings_regularizer=None, 
#                                activity_regularizer=None, embeddings_constraint=None, 
#                                mask_zero=False, input_length=None
#                                )
#Embedding层只能作为模型的第一层
embedding_layer = Embedding(num_words + 1, #input_dim：大或等于0的整数，字典长度，即输入数据最大下标+1
                            EMBEDDING_DIM,#output_dim：大于0的整数，代表全连接嵌入的维度
                            weights=[embedding_matrix], #(20001, 100)
                            input_length=MAX_SEQUENCE_LENGTH, 
#当输入序列的长度固定时，该值为其长度。如果要在该层后接Flatten层，然后接Dense层，则必须指定该参数，否则Dense层的输出维度无法自动推断。
                            )
print('Building model...')

model = Sequential() #序贯模型是多个网络层的线性堆叠，也就是“一条路走到黑”
model.add(embedding_layer)
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))  #100维
model.add(Dense(1))#dense层，大于0的整数，代表该层的输出维度
model.add(Activation('sigmoid')) #激活层是对一个层的输出施加激活函数
model.add(Dense(len(newsgroups_train['classes']), activation='softmax'))#Softmax将连续数值转化成相对概率
model.layers[1].trainable=False

print('Model completed ：）')
model.summary()


Building model...
Model completed ：）
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1000, 100)         2000100   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 20)                40        
Total params: 2,080,641
Trainable params: 2,000,241
Non-trainable params: 80,400
_________________________________________________________________


In [19]:
#编译
#compile(self, optimizer, loss, metrics=None, 
#        loss_weights=None, sample_weight_mode=None, 
#        weighted_metrics=None, target_tensors=None)
model.compile(
            optimizer='adam',#优化器
            loss='binary_crossentropy',#损失函数
            metrics=['accuracy'],#指标列表
            )

print('Training...')
#fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, 
#        callbacks=None, validation_split=0.0, validation_data=None, 
#        shuffle=True, class_weight=None, sample_weight=None, 
#        initial_epoch=0, steps_per_epoch=None, validation_steps=None)
#batch_size:整数，指定进行梯度下降时每个batch包含的样本数。训练时一个batch的样本会被计算一次梯度下降，使目标函数优化一步
#epochs：整数，训练终止时的epoch值，训练将在达到该epoch值时停止，当没有设置
#validation_data：形式为（X，y）或（X，y，sample_weights）的tuple，是指定的验证集。此参数将覆盖validation_spilt
batch_size = 32
model.fit(X_train, y_train, batch_size=batch_size, epochs=5, validation_split = 0.2)

#evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None)
#x：输入数据，与fit一样，是numpy array或numpy array的list
#y：标签，numpy array
loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)

print('Loss:',loss)
print('Accuracy:',acc) 

Training...
Train on 9051 samples, validate on 2263 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.2090653621700933
Accuracy: 0.949999988079071


In [20]:
model.save('my_model_Word2Vec_LSTM.h5')
#model = load_model('my_model_Word2Vec_LSTM.h5') 
print('Model saved!')

Model saved!
