In [1]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, BatchNormalization
from collections import Counter
import csv
import pandas as pd
import numpy as np
import math

Using TensorFlow backend.


In [2]:
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
def load_training_data(fp, labeled=True):
    with open(fp, 'r') as f:
        lines = list(map(lambda x: x.strip(), f.readlines()))
    if labeled:
        label = map(lambda x: int(x.split(' +++$+++ ')[0]), lines)
        lines = map(lambda x: x.split(' +++$+++ ')[1], lines)
    x = map(lambda x: x.split(), lines)
    return (list(x), list(label)) if labeled else list(x)

def load_testing_data(fp):
    with open(fp, 'r') as f:
        rows = csv.reader(f)
        next(rows)
        lines = [' '.join(row[1:]) for row in rows]
    lines = map(lambda x: x.split(' '), lines)
    return list(lines)

In [4]:
train_label_x, train_label_y = load_training_data('training_label.txt')
test_data = load_testing_data('testing_data.txt')
# train_no_label = load_data('training_nolabel.txt', False)

In [5]:
class BoWEncoder():
    def __init__(self, mincount=5, maxcount=100000, stop=[',', '.', '..', '...', '....', '.....', "'", '"', 'a', 'the']):
        self.mincount = mincount
        self.maxcount = maxcount
        self.counter = Counter()
        self.indexer = dict()
        self.stop = stop
    def update(self, s):
        self.counter.update(s)
    def conclude(self):
        for stp in self.stop:
            if stp in self.counter:
                del self.counter[stp]
        mc = list(zip(*self.counter.most_common()))[0]
        for s in reversed(mc):
            if self.counter[s] > self.mincount:
                break
            del self.counter[s]
        for s in mc:
            if self.counter[s] < self.maxcount:
                break
            del self.counter[s]
        for a in self.counter:
            self.indexer[a] = len(self.indexer)
    def encode(self, s):
        r = np.zeros(len(self.counter)).astype('uint32')
        for w in s:
            if w in self.counter:
                r[self.indexer[w]] += 1
        return r
    def __len__(self):
        return len(self.counter)

In [6]:
enc = BoWEncoder(10, math.inf)
for s in train_label_x:
    enc.update(s)
for s in test_data:
    enc.update(s)
enc.conclude()
print(len(enc))
enc_train_label_x = map(lambda x: enc.encode(x), train_label_x)
enc_test_data = map(lambda x: enc.encode(x), test_data)

14715


In [7]:
model = Sequential([
    Dense(2048, input_dim=len(enc)),
    BatchNormalization(),
    Activation('relu'),
    
#     Dropout(0.5),
    
#     Dense(1024),
#     BatchNormalization(),
#     Activation('relu'),
    
#     Dense(1024),
#     BatchNormalization(),
#     Activation('relu'),
    
#     Dropout(0.5),
    
#     Dense(512),
#     BatchNormalization(),
#     Activation('relu'),
    
    Dropout(0.5),
    
    Dense(128),
    BatchNormalization(),
    Activation('relu'),
    
    Dropout(0.5),
    
    Dense(32),
    BatchNormalization(),
    Activation('relu'),
    
    Dropout(0.5),
    
    Dense(32),
    BatchNormalization(),
    Activation('relu'),

    Dropout(0.5),
    
    Dense(1),
    BatchNormalization(),
    Activation('sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam')
# model.summary()

In [8]:
enc_train_label_x = np.array(list(enc_train_label_x))
train_label_y = np.array(train_label_y)
model.fit(enc_train_label_x, train_label_y, validation_split=0.1, batch_size=128, epochs=3, use_multiprocessing=True)

Train on 180000 samples, validate on 20000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3c64588a90>

In [9]:
model.save('bow.h5')

In [10]:
enc_test_data = np.array(list(enc_test_data))
outputs = model.predict(enc_test_data)

In [11]:
outputs = outputs.reshape(len(outputs))
outputs[outputs > 0.5] = 1
outputs[outputs <= 0.5] = 0
outputs = outputs.astype(np.uint8)
outputs

array([0, 0, 0, ..., 1, 0, 0], dtype=uint8)

In [12]:
tmp = pd.DataFrame({"id":[str(i) for i in range(len(enc_test_data))],"label":outputs.reshape(len(outputs))})
tmp.to_csv('bow.csv', index=False)

In [20]:
test_inp = enc.encode("today is a good day , but it is hot".split())
model.predict(np.array([test_inp]))

array([[0.76685613]], dtype=float32)