In [1]:
import pandas

# Load the tab-separated dataset. Because `names` is passed, pandas treats the
# first line of the file as data (no header row) and labels the two columns
# 'text' and 'is_positive'.
df = pandas.read_csv('semantic.tsv', sep='\t', names=['text', 'is_positive'])

# Shuffle all rows and rebuild a clean 0..n-1 index. The shuffle is seeded so
# that a fresh "Restart & Run All" yields the same row order; without a seed,
# the seeded train_test_split further down would still hold out different rows
# on every run, making the reported metrics irreproducible.
df = df.sample(frac=1, random_state=10).reset_index(drop=True)

In [11]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows as a test split; random_state pins which positions
# are selected.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['is_positive'],
    test_size=0.2,
    random_state=10,
)

In [4]:
from keras.preprocessing.text import Tokenizer

# Build the word -> integer-index vocabulary from the training texts only, so
# nothing from the test split influences the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert both splits from raw texts to lists of vocabulary indices.
# X_train / X_test are rebound here, so re-running this cell on its own would
# feed already-converted sequences back in; re-run the split cell first.
# NOTE(review): no oov_token is configured, so words unseen during fitting are
# presumably dropped from the test sequences -- confirm against the Keras docs.
X_train, X_test = map(tokenizer.texts_to_sequences, (X_train, X_test))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [5]:
from keras.preprocessing.sequence import pad_sequences

# Bring every sequence in both splits to a fixed length of 100, padding and
# truncating at the end ('post'). This length has to stay in sync with the
# Embedding layer's input_length in the model cell.
X_train, X_test = [
    pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
    for sequences in (X_train, X_test)
]

In [6]:
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Deconv2D
from keras import layers
from keras.models import Sequential

# Index 0 is reserved and never assigned to a word, so the embedding table
# needs one more row than there are entries in word_index.
vocab_size = len(tokenizer.word_index) + 1

# Small text classifier: embed each token index as a 30-dim vector, pool over
# the 100 sequence positions, then a 10-unit hidden layer with dropout and a
# single sigmoid output unit.
# GlobalMaxPool1D is used here; a Flatten layer was the commented-out alternative.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=30, input_length=100),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid'),
])

# Single sigmoid output -> binary cross-entropy loss; accuracy is tracked as
# the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [7]:
# Train for 10 epochs in mini-batches of 32 samples.
# NOTE(review): the test split is passed as validation data, so the per-epoch
# validation metrics are computed on the same rows that model.evaluate() is
# later run on; there is no separate validation split in this notebook.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)

Train on 1600 samples, validate on 400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x102e6f048>

In [8]:
# Score the fitted model on the training split; the displayed list holds the
# loss followed by the 'accuracy' metric configured in model.compile().
model.evaluate(x=X_train, y=y_train)



[0.23905182480812073, 0.9975]

In [9]:
# Score the fitted model on the held-out test split (loss, then accuracy).
# Compare against the training-split numbers to gauge overfitting.
model.evaluate(x=X_test, y=y_test)



[0.6549072742462159, 0.64]