In [1]:
import pandas as pd
from Utils.misc import batch

In [2]:
# Only the title text and its score are used downstream, so read just those two
# columns instead of loading every column and date-parsing `created_at`.
data = pd.read_csv("./data/HN/HN_posts_year_to_Sep_26_2016.csv", usecols=["title", "num_points"])
# `usecols` keeps the file's column order; re-select to pin (title, num_points),
# because later cells index the matrix form of this frame positionally.
data = data[["title", "num_points"]]

In [3]:
# Summary statistics of the score column (displayed as the cell output).
data['num_points'].describe()

count    293119.000000
mean         15.025324
std          58.504103
min           1.000000
25%           1.000000
50%           2.000000
75%           4.000000
max        5771.000000
Name: num_points, dtype: float64

In [4]:
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same ndarray and works on old and new pandas alike.
# Column 0 is the title, column 1 is num_points.
data_mat = data.values

In [5]:
def wordGen():
    """Yield the words of every post title in `data`, one word at a time.

    Each title is split on single spaces and the pieces are passed through
    `batch(lambda x: x.lower())` before being yielded. `batch` is presumed to
    apply the function to each element (confirm against Utils.misc.batch).
    """
    # Walk the column directly: `data.iloc[i]['title']` builds a full row
    # Series for every lookup, which is needlessly slow over the whole frame.
    for title in data['title']:
        yield from batch(lambda x: x.lower())(title.split(' '))

In [6]:
from Utils.indexer import build_index

# Build the vocabulary index from every word produced by wordGen().
# `o2i` is later applied to tokens to encode them; `size` is printed below and
# handed to the embedding loaders.
# presumably o2i = object->index, i2o = index->object — confirm in Utils.indexer
o2i, i2o, size = build_index(wordGen())
print(size)

147039


In [7]:
from DataLoader import FastText

# Dimensionality passed to both embedding loaders (FastText here, GloVe below).
WORD_EMB_DIM = 300
# `ft` is used later as the initial weight matrix of an Embedding layer.
# NOTE(review): presumably selective_load keeps only vectors for indexed words
# and `orig_ft` is the unfiltered table — confirm in DataLoader.FastText.
ft, orig_ft = FastText.selective_load('./data/FastText/wiki.en.vec', WORD_EMB_DIM, o2i, i2o, size)

Start: Loading FastText Vectors
End: Loaded 2518844 rows.


In [8]:
from DataLoader import GloVe

# Same call shape as the FastText load; `glove` later initialises the second
# Embedding layer of the model.
# NOTE(review): relies on `size` still being the vocabulary size from
# build_index — a later cell rebinds `size` to a dict.
glove, orig_glove = GloVe.selective_load('./data/GloVe/glove.6B.300d.txt', WORD_EMB_DIM, o2i, i2o, size)

Start: Loading Glove Model
End: Loaded 400000 rows.


In [9]:
# Label cut-off: posts with num_points <= THRESHOLD get label 0, all others 1.
THRESHOLD = 50

In [10]:
# Share of posts labelled 0 (num_points <= THRESHOLD).
# Counted in one vectorised pass over the column rather than a Python-level
# loop over every row; `~(x <= T)` mirrors the original else-branch exactly.
count = int((~(data['num_points'] <= THRESHOLD)).sum())
print(1- (count / len(data)) )

0.9297247875436256


In [11]:
# Fixed number of tokens per title: used as the model's input length and as
# `maxlen` when batches are padded.
SEQ_LENGTH = 32

In [29]:
from keras.layers import Activation, dot, add, MaxPooling1D, MaxPooling2D, Bidirectional, Input, GRU, LSTM, SimpleRNN, Conv1D, Conv2D, Conv2DTranspose, Dense, Flatten, Dropout, Reshape, Embedding, Concatenate
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.constraints import unit_norm
from keras.initializers import Identity
from keras import backend as K
from keras.engine.topology import Layer
import numpy as np
import tensorflow as tf

def create_baseline():
    """Build and compile the two-branch bidirectional-GRU baseline classifier.

    A single integer input of length SEQ_LENGTH feeds two frozen embedding
    layers, initialised from `ft` and `glove` respectively. Each embedding is
    encoded by its own stack of two bidirectional GRUs; the two encodings are
    concatenated and mapped to one sigmoid unit.

    Returns:
        The compiled Keras Model (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    GRU_DIM = 512
    units_per_direction = GRU_DIM // 2

    def frozen_embedding(matrix):
        # Non-trainable lookup table seeded with the pretrained vectors.
        return Embedding(
            matrix.shape[0],
            matrix.shape[1],
            weights=[matrix],
            input_length=SEQ_LENGTH,
            trainable=False,
        )

    def encode(sequence):
        # Two stacked bidirectional GRUs; only the first returns the full sequence.
        hidden = Bidirectional(
            GRU(units_per_direction, activation='selu', return_sequences=True)
        )(sequence)
        return Bidirectional(GRU(units_per_direction, activation='selu'))(hidden)

    tokens = Input(shape=(SEQ_LENGTH,))

    # Layer construction order is kept (both embeddings first, then the
    # FastText branch, then the GloVe branch) so auto-generated layer names
    # stay the same.
    ft_embedded = frozen_embedding(ft)(tokens)
    glove_embedded = frozen_embedding(glove)(tokens)

    ft_encoded = encode(ft_embedded)
    glove_encoded = encode(glove_embedded)

    merged = Concatenate()([ft_encoded, glove_encoded])
    output = Dense(1, activation='sigmoid')(merged)

    classifier = Model(tokens, output)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return classifier

In [30]:
# Instantiate the baseline network and print its layer-by-layer summary.
model = create_baseline()
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 32)            0                                            
____________________________________________________________________________________________________
embedding_7 (Embedding)          (None, 32, 300)       44111700    input_4[0][0]                    
____________________________________________________________________________________________________
embedding_8 (Embedding)          (None, 32, 300)       44111700    input_4[0][0]                    
____________________________________________________________________________________________________
bidirectional_13 (Bidirectional) (None, 32, 512)       855552      embedding_7[0][0]                
___________________________________________________________________________________________

In [31]:
from Utils.generator import random_access

# Split the (title, num_points) matrix into train/test row sources and sizes.
# NOTE: this rebinds `size` — until now the vocabulary size returned by
# build_index — to a dict of split sizes, so re-running the embedding-loading
# cells after this one would pass them a dict instead of an int.
gen = {}
size = {}
gen['train'], gen['test'], size['train'], size['test'] = random_access(data_mat)

Training data size: 234495
Testing data size: 58624


In [32]:
from keras.preprocessing.sequence import pad_sequences

def sample_generator(gen, batch_size = 64):
    """Group rows from `gen` into padded (inputs, labels) batches.

    Args:
        gen: iterable of rows where row[0] is the title string and row[1] is
            the point score.
        batch_size: number of rows collected before a batch is yielded.

    Yields:
        A tuple (x, y). `x` is the output of pad_sequences(..., maxlen=SEQ_LENGTH)
        over `batch_size` encoded titles. `y` is an array of shape
        (batch_size, 1) holding 0 where the score is <= THRESHOLD and 1
        otherwise.

    Rows left over when `gen` is exhausted (fewer than `batch_size`) are not
    yielded.
    NOTE(review): fit_generator expects a generator that never ends; this
    relies on `gen` itself being endless — confirm in Utils.generator.
    """
    encoded_titles = []
    labels = []
    for row in gen:
        # Lower-case the space-separated tokens, then map each through o2i.
        tokens = batch(lambda x: x.lower())(row[0].split(' '))
        encoded_titles.append(batch(o2i)(tokens))
        labels.append([0 if row[1] <= THRESHOLD else 1])
        if len(encoded_titles) == batch_size:
            # Pass the plain list of sequences: wrapping variable-length lists
            # in np.array() builds a ragged object array, which newer NumPy
            # rejects, and pad_sequences accepts a list of lists directly.
            yield (pad_sequences(encoded_titles, maxlen=SEQ_LENGTH), np.array(labels))
            encoded_titles = []
            labels = []

In [None]:
# Smoke test: pull one small batch from each split and check the shapes —
# inputs are (batch, SEQ_LENGTH), labels are (batch, 1).
# NOTE(review): if gen[...] are one-shot iterators this consumes a few rows
# before training — confirm in Utils.generator.
print(next(sample_generator(gen['train'], 2))[0].shape)
print(next(sample_generator(gen['test'], 3))[1].shape)

(2, 32)
(3, 1)


In [None]:
from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import Callback, ModelCheckpoint

# Checkpoint the full model (not just weights) to disk, keeping only the best
# one as judged by validation loss, checked every epoch.
mc = ModelCheckpoint('./model/hn_fasttext_model.hdf5', monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)
class Metrics(Callback):
    """Keras callback that stores an F1 score of the validation set per epoch.

    NOTE(review): `f1` is not defined or imported anywhere in the visible
    notebook, so this hook raises NameError if it ever runs. As written, the
    training cell passes only `mc` as a callback, so it is never invoked.
    NOTE(review): confirm that Keras populates `self.validation_data` when
    validation data is supplied as a generator, as it is in this notebook.
    """

    def on_epoch_end(self, batch, logs=None):
        """Predict on the validation inputs and store the score in `self.f1s`.

        Args:
            batch: index passed by Keras to this hook (the epoch number for
                on_epoch_end); unused.
            logs: metrics dict supplied by Keras; unused. Defaults to None
                rather than a shared mutable `{}`.
        """
        predict = np.asarray(self.model.predict(self.validation_data[0]))
        targ = self.validation_data[1]
        self.f1s = f1(targ, predict)
# Instantiated but not passed to fit_generator below (only `mc` is).
metrics = Metrics()

# Rows per training/validation batch.
BATCH_SIZE = 4096
# Train from the batch generators. Steps per epoch and validation steps are
# the number of whole batches in each split; the partial last batch is ignored.
model.fit_generator(
    sample_generator(gen['train'], BATCH_SIZE),
    size['train'] // BATCH_SIZE,
    validation_data = sample_generator(gen['test'], BATCH_SIZE),
    validation_steps = size['test'] // BATCH_SIZE,
    # Effectively "run until interrupted": no early-stopping callback is set.
    epochs=200000,
    callbacks = [mc]
    # Alternative kept for reference: silent run with a tqdm progress bar.
    #verbose=0, callbacks=[TQDMNotebookCallback(),mc]
)

Epoch 1/200000
Epoch 2/200000
Epoch 3/200000
Epoch 4/200000
Epoch 5/200000