In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, Model
from keras.layers import Input, Embedding, LSTM, Dense

Using TensorFlow backend.


In [2]:
# Load the dataset; later cells read its 'words' and 'tag' columns.
train = pd.read_csv(filepath_or_buffer="data_small.csv")

In [3]:
# Fit a word-level tokenizer on the 'words' column (num_words caps the
# vocabulary at 90000 entries).
tokenizer = Tokenizer(num_words=90000)
tokenizer.fit_on_texts(train['words'])

# Encode each text as a sequence of token ids, then bring every sequence to
# exactly 1000 ids by padding / truncating at the end ('post').
X = pad_sequences(
    tokenizer.texts_to_sequences(train['words']),
    maxlen=1000,
    padding='post',
    truncating='post',
)

In [4]:
# One-hot encode the labels: every tag is converted to its string form first,
# then expanded into one indicator column per distinct value.
Y = pd.get_dummies(train['tag'].map(str)).values

In [5]:
# Hold out 10% of the samples for validation; the fixed random_state keeps
# the split reproducible across runs.
(X_train, X_val,
 y_train, y_val) = train_test_split(X, Y, random_state=30, test_size=0.1)

In [6]:
# Sanity-check the array shapes produced by the split.
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_val.shape:", X_val.shape)
print("y_val.shape:", y_val.shape)

X_train.shape: (408, 1000)
y_train.shape: (408, 2)
X_val.shape: (46, 1000)
y_val.shape: (46, 2)


In [7]:
# Keras Sequential model: embedding -> LSTM -> 2-way softmax classifier.
model = Sequential()

model.add(Embedding(100000, 300, input_length=1000, name='embedding'))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, name='LSTM'))
model.add(Dense(2, activation='softmax', name='Dense'))

# The targets are one-hot rows paired with a softmax output, so train with
# categorical cross-entropy (the same loss the functional model below uses)
# rather than binary cross-entropy.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Pass the held-out split so loss/accuracy on unseen data is reported after
# every epoch; previously X_val / y_val were created but never used.
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 300)         30000000  
_________________________________________________________________
LSTM (LSTM)                  (None, 100)               160400    
_________________________________________________________________
Dense (Dense)                (None, 2)                 202       
Total params: 30,160,602
Trainable params: 30,160,602
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f84403edfd0>

In [8]:
# Keras Functional model: embedding -> LSTM -> two 64-unit ReLU layers ->
# 2-way softmax classifier.
# NOTE(review): the input layer name 'imput' looks like a typo for 'input';
# it is kept unchanged so any lookup of the layer by name keeps working.
inputs = Input(shape=(1000, ), name='imput')
x = Embedding(100000, 300, input_length=1000, trainable=True, name='embedding')(inputs)
x = LSTM(100, dropout=0.2, recurrent_dropout=0.2, name='LSTM')(x)
x = Dense(64, activation='relu', name='Dense1')(x)
x = Dense(64, activation='relu', name='Dense2')(x)
predictions = Dense(2, activation='softmax', name='Dense3')(x)

# NOTE: this rebinds `model`, replacing the Sequential model built earlier.
model = Model(inputs=inputs, outputs=predictions)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Pass the held-out split so loss/accuracy on unseen data is reported after
# every epoch; previously X_val / y_val were created but never used.
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_val, y_val))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
imput (InputLayer)           (None, 1000)              0         
_________________________________________________________________
embedding (Embedding)        (None, 1000, 300)         30000000  
_________________________________________________________________
LSTM (LSTM)                  (None, 100)               160400    
_________________________________________________________________
Dense1 (Dense)               (None, 64)                6464      
_________________________________________________________________
Dense2 (Dense)               (None, 64)                4160      
_________________________________________________________________
Dense3 (Dense)               (None, 2)                 130       
Total params: 30,171,154
Trainable params: 30,171,154
Non-trainable params: 0
________________________________________________________________

<keras.callbacks.History at 0x7f83e4713860>