In [2]:
import pandas as pd

xy_train_df = pd.read_csv('xy_train.csv')
x_test_df = pd.read_csv('x_test.csv')

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pprint import pprint

vocab_size = 40000
max_len = 40


x = xy_train_df.text
y = xy_train_df.label

x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2)

# build vocabulary from training set
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(x_train)


def _preprocess(list_of_text):
    return pad_sequences(
        tokenizer.texts_to_sequences(list_of_text),
        maxlen=max_len,
        padding='post',
    )
    

# padding is done inside: 
x_train = _preprocess(x_train)
x_valid = _preprocess(x_valid)

print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)

(48000, 40) (48000,)
(12000, 40) (12000,)


In [7]:
pprint(tokenizer.sequences_to_texts(x_train[:5]))

["a smart border wall would destroy historic gravesites in south texas is 'no "
 "longer a reliably red state'",
 'my uncle on a secret mission in vietnam to rescue forgotten pows he passed '
 'away in 1996 from agent orange and we had to fight the va to get his records '
 'and this picture released',
 'colorized violence erupts as drug dealer harasses youth on the streets of '
 'san francisco 1982',
 'escaped nazi genetic experiment found wandering on the banks of the rhine c '
 '1943 colorized',
 'canadian ambassador says nafta deal reached in 1992 cbc report on toronto '
 'lawyer toronto star']


In [8]:
print('total words in the dictionary:', tokenizer.num_words)

total words in the dictionary: 40000


In [34]:
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.compat.v1.keras.layers import CuDNNGRU


import tensorflow as tf
from tensorflow.keras.optimizers import Adam

seq_in = keras.Input(batch_shape=(None, max_len))
embedded = keras.layers.Embedding(tokenizer.num_words, 100)(seq_in)
averaged = tf.reduce_mean(embedded, axis=1)
pred = keras.layers.Dense(1, activation='sigmoid')(averaged)

model = keras.Model(
    inputs=seq_in,
    outputs=pred,
)

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC']
)



history = model.fit(x_train,
                    y_train,
                    epochs=20,
                    batch_size=64,
                    validation_data=(x_valid, y_valid),
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [35]:
x_test = _preprocess(x_test_df.text)
y_predict = np.squeeze(model.predict(x_test))


pd.DataFrame(
    {'id': x_test_df.index,
     'label':y_predict}).to_csv('sample_submission.csv', index=False)