In [1]:
from utils import *
from evaluate import evaluate

Using TensorFlow backend.


## load data and vector

In [2]:
questions = pd.read_csv('./question_train_word.csv')
questions_topics = questions.topics.apply(lambda s: s.split(','))
questions_titles = questions.titles.astype('U').apply(lambda s: s.split(','))

In [3]:
print(questions_topics[0])
print(questions_titles[0])

['7739004195693774975', '3738968195649774859']
['w305', 'w13549', 'w22752', 'w11', 'w7225', 'w2565', 'w1106', 'w16', 'w31389', 'w6', 'w1019', 'w69288', 'w111', 'w3332', 'w109', 'w11', 'w25', 'w1110', 'w111']


In [4]:
from gensim.models import KeyedVectors
%time word_vec = KeyedVectors.load_word2vec_format('word_embedding.txt')

CPU times: user 1min 32s, sys: 2.18 s, total: 1min 34s
Wall time: 1min 46s


In [5]:
word_keys = list(set([w for t in questions_titles for w in t]))
print(len(word_keys))
word_keys_dict = {v: i for i, v in enumerate(word_keys)}

324960


In [6]:
from numpy.random import normal

def create_emb():
    n_fact = word_vec.vector_size
    emb = np.zeros((len(word_keys), n_fact))

    for i in range(0,len(emb)):
        word = word_keys[i]
        if word and word in word_vec:
            emb[i] = word_vec[word]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [7]:
emb = create_emb()

## transform training data structure

In [8]:
questions_titles_format = [[word_keys_dict[t] for t in title] for title in questions_titles]
print(questions_titles_format[0])

[228646, 299077, 272621, 161389, 83560, 116101, 4404, 244257, 226205, 106641, 283956, 170088, 102988, 139192, 18051, 161389, 118572, 155411, 102988]


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

In [10]:
mb = MultiLabelBinarizer(sparse_output=True)
y_train_sparse = mb.fit_transform(questions_topics)

In [11]:
x_train = sequence.pad_sequences(questions_titles_format, maxlen=35)

In [13]:
x_test_sample = sequence.pad_sequences(questions_titles_format[-100:], maxlen=35)
y_test_sample = y_train_sparse[-100:].toarray()

## fit function

In [39]:
def make_batches(size, batch_size):
    """Returns a list of batch indices (tuples of indices).

    # Arguments
        size: Integer, total size of the data to slice into batches.
        batch_size: Integer, batch size.

    # Returns
        A list of tuples of array indices.
    """
    num_batches = int(np.ceil(size / float(batch_size)))
    return [(i * batch_size, min(size, (i + 1) * batch_size))
            for i in range(0, num_batches)]

def train_generator(x, y, batch_size):
    sample_size = x.shape[0]

    index_array = np.arange(sample_size)
    np.random.shuffle(index_array)
    
    batch = make_batches(sample_size, batch_size)
    while True:
        for (batch_start, batch_end) in batch:
            batch_ids = index_array[batch_start:batch_end]
            yield (x[batch_ids], y[batch_ids].toarray())

In [41]:
def try_predict_generator(model, x_train, y_train, batch_size, num_epochs):
    
    steps_per_epoch = int(np.ceil(x_train.shape[0] / batch_size))

    history = model.fit_generator(train_generator(x_train, y_train, batch_size),
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=num_epochs,
                                  validation_data=(x_test_sample, y_test_sample))

    predict_train = model.predict(x_train[:100])
    result = [[mb.classes_[a] for a in np.array(predict_train[i]).argsort()[-5:][::-1]] for i in range(100)]
    print(evaluate(zip(result, questions_topics[:100])))

    predict_test = model.predict(x_test_sample)
    result = [[mb.classes_[a] for a in np.array(predict_test[i]).argsort()[-5:][::-1]] for i in range(100)]
    print(evaluate(zip(result, questions_topics[-100:])))

In [42]:
def predict_generator(model, batch_size, num_epochs):
    training_size = int(x_train.shape[0] * 0.9)
    x_train_sample = x_train[:training_size]
    y_train_sample = y_train_sparse[:training_size]
    
    steps_per_epoch = int(np.ceil(training_size / batch_size))
    
    history = model.fit_generator(train_generator(x_train_sample, y_train_sample, batch_size),
                                  steps_per_epoch=steps_per_epoch,
                                  epochs=num_epochs,
                                  validation_data=(x_test, y_test))

    predict_train = model.predict(x_train[:100])
    result = [[mb.classes_[a] for a in np.array(predict_train[i]).argsort()[-5:][::-1]] for i in range(100)]
    print(evaluate(zip(result, questions_topics[:100])))

    predict_test = model.predict(x_test_sample)
    result = [[mb.classes_[a] for a in np.array(predict_test[i]).argsort()[-5:][::-1]] for i in range(100)]
    print(evaluate(zip(result, questions_topics[-100:])))

## Three Convolution: 1, 2, 3

In [43]:
filter_sizes = [1,2,3]
num_filters = 1024
sequence_lenght = 35

graph_in = Input ((sequence_lenght, 256))
convs = [ ] 
for fsz in filter_sizes: 
    x = Convolution1D(num_filters, fsz, activation="relu")(graph_in)
    x = MaxPooling1D()(x) 
    x = Flatten()(x) 
    convs.append(x)
graph = Concatenate()(convs)
graph = Model(graph_in, graph)

In [44]:
embedding_dim = word_vec.vector_size
sequence_lenght = 35
dropout_prob = [0.5, 0.5]


model = Sequential ([
    Embedding(len(word_keys), embedding_dim, input_length=sequence_lenght, weights=[emb], trainable=False),
    Dropout (dropout_prob[0]),
    graph,
    Dropout (dropout_prob[1]),
#     Dense (hidden_dims, activation="relu"),
    Dense (1999, activation='sigmoid'),
    ])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [47]:
try_predict_generator(model, x_train=x_train[:10**4], y_train=y_train_sparse[:10**4],
                      batch_size=64, num_epochs=1)

Epoch 1/1
(0.0431562023197697, 0.12455187434852898, 0.0660377358490566, 14, 212)
(0.016636500807644984, 0.04253727349071142, 0.0273224043715847, 5, 183)


In [48]:
try_predict_generator(model, x_train=x_train[:10**4], y_train=y_train_sparse[:10**4],
                      batch_size=64, num_epochs=1)

Epoch 1/1
(0.17844122326699743, 0.5499424965533144, 0.2641509433962264, 56, 212)
(0.03750008372242727, 0.0996984769661445, 0.060109289617486336, 11, 183)


In [49]:
try_predict_generator(model, x_train=x_train[:10**4], y_train=y_train_sparse[:10**4],
                      batch_size=64, num_epochs=1)

Epoch 1/1
(0.3983126939781944, 1.318349465881896, 0.5707547169811321, 121, 212)
(0.03349516638739445, 0.10502303510876577, 0.04918032786885246, 9, 183)


In [50]:
try_predict_generator(model, x_train=x_train[:10**4], y_train=y_train_sparse[:10**4],
                      batch_size=64, num_epochs=1)

Epoch 1/1
(0.5565085227281474, 1.869113769289865, 0.7924528301886793, 168, 212)
(0.052405006551539506, 0.14530350875888856, 0.08196721311475409, 15, 183)


In [51]:
filter_sizes = [1,2,3]
num_filters = 1024
sequence_lenght = 35

graph_in = Input ((sequence_lenght, 256))
convs = [ ] 
for fsz in filter_sizes: 
    x = Convolution1D(num_filters, fsz, activation="relu")(graph_in)
    x = MaxPooling1D()(x) 
    x = Flatten()(x) 
    convs.append(x)
graph = Concatenate()(convs)
graph = Model(graph_in, graph)

In [52]:
from keras.layers import BatchNormalization

embedding_dim = word_vec.vector_size
sequence_lenght = 35
dropout_prob = [0.5, 0.5]


model = Sequential ([
    Embedding(len(word_keys), embedding_dim, input_length=sequence_lenght, weights=[emb], trainable=False),
    BatchNormalization(),
    Dropout (dropout_prob[0]),
    graph,
    BatchNormalization(),
    Dropout (dropout_prob[1]),
#     Dense (hidden_dims, activation="relu"),
    Dense (1999, activation='sigmoid'),
    ])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [53]:
try_predict_generator(model, x_train=x_train[:10**4], y_train=y_train_sparse[:10**4],
                      batch_size=64, num_epochs=1)

Epoch 1/1


ZeroDivisionError: float division by zero