In [1]:
from utils import *
from evaluate import evaluate

Using TensorFlow backend.


In [2]:
questions = pd.read_csv('./question_train_word.csv')
questions_topics = questions.topics.apply(lambda s: s.split(','))
questions_titles = questions.title.astype('U').apply(lambda s: s.split(','))

In [3]:
print(questions_topics[0])
print(questions_titles[0])

['7739004195693774975', '3738968195649774859']
['w305', 'w13549', 'w22752', 'w11', 'w7225', 'w2565', 'w1106', 'w16', 'w31389', 'w6', 'w1019', 'w69288', 'w111', 'w3332', 'w109', 'w11', 'w25', 'w1110', 'w111']


In [4]:
from gensim.models import KeyedVectors
%time word_vec = KeyedVectors.load_word2vec_format('word_embedding.txt')

CPU times: user 1min 33s, sys: 2.37 s, total: 1min 36s
Wall time: 1min 59s


In [5]:
word_keys = list(set([w for t in questions_titles for w in t]))
print(len(word_keys))
word_keys_dict = {v: i for i, v in enumerate(word_keys)}

324960


In [6]:
from numpy.random import normal

def create_emb():
    n_fact = word_vec.vector_size
    emb = np.zeros((len(word_keys), n_fact))

    for i in range(0,len(emb)):
        word = word_keys[i]
        if word and word in word_vec:
            emb[i] = word_vec[word]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [7]:
emb = create_emb()

In [8]:
questions_titles_format = [[word_keys_dict[t] for t in title] for title in questions_titles]
print(questions_titles_format[0])

[149974, 9443, 68003, 55983, 189022, 173461, 187765, 142848, 155059, 305967, 10301, 237145, 300187, 152072, 236982, 55983, 116722, 95361, 300187]


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer

In [10]:
mb_sparse = MultiLabelBinarizer(sparse_output=True)
y_sparse = mb_sparse.fit_transform(questions_topics)

In [36]:
# x_train = sequence.pad_sequences(questions_titles_format, maxlen=35)

def train_generator(x_train, y_sparse, batch_size):
    current = 0
    length = x_train.shape[0]
    while True:
        x = []
        y = []
        end = current + batch_size
        if end > length:
            yield (np.append(x_train[current: ], x_train[:end-length]), 
                   np.append(y_sparse[current: ].toarray(), y_sparse[:end-length].toarray()))
            current = end - length
        else:
            yield x_train[current: end], y_sparse[current: end].toarray()
            current = end

In [27]:
print(x_train.shape)
print(x_train[0].shape)
print(type(x_train[0]))
print(y_sparse[0].toarray().shape)
print(type(y_sparse[0].toarray()))

(2999967, 35)
(35,)
<class 'numpy.ndarray'>
(1, 1999)
<class 'numpy.ndarray'>


In [35]:
yy = y_sparse[:10].toarray()
print(yy.shape)
print(x_train[:10].shape)
y.shape

(10, 1999)
(10, 35)


In [33]:
x_test_sample = sequence.pad_sequences(questions_titles_format[-100:], maxlen=35)
y_test_sample = y[-100:]

In [42]:
def try_predict_generator(model, x_train, y_train, batch_size, samples_per_epoch, num_epochs):
    history = model.fit_generator(train_generator(x_train, y_train, batch_size),
                                  samples_per_epoch=samples_per_epoch,
                                  nb_epoch=num_epochs,
                                  validation_data=(x_test_sample, y_test_sample))

    predict_train = model.predict(x_train[:100])
    result = [[mb.classes_[a] for a in np.array(predict_train[i]).argsort()[-5:][::-1]] for i in range(100)]
    print(evaluate(zip(result, questions_topics[:100])))

    predict_test = model.predict(x_test_sample)
    result = [[mb.classes_[a] for a in np.array(predict_test[i]).argsort()[-5:][::-1]] for i in range(100)]
    print(evaluate(zip(result, questions_topics[-100:])))

## Three Convolution: 1, 2, 3

In [45]:
filter_sizes = [1,2,3]
num_filters = 1024
sequence_lenght = 35

graph_in = Input ((sequence_lenght, 256))
convs = [ ] 
for fsz in filter_sizes: 
    x = Convolution1D(num_filters, fsz, activation="relu")(graph_in)
    x = MaxPooling1D()(x) 
    x = Flatten()(x) 
    convs.append(x)
graph = Concatenate()(convs)
graph = Model(graph_in, graph)

In [46]:
embedding_dim = word_vec.vector_size
sequence_lenght = 35
dropout_prob = [0.5, 0.5]


model = Sequential ([
    Embedding(len(word_keys), embedding_dim, input_length=sequence_lenght, weights=[emb], trainable=False),
    Dropout (dropout_prob[0]),
    graph,
    Dropout (dropout_prob[1]),
#     Dense (hidden_dims, activation="relu"),
    Dense (1999, activation='sigmoid'),
    ])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [47]:
try_predict_generator(model, x_train=x_train[:10**5], y_train=y_sparse[:10**5],
                      batch_size=64, samples_per_epoch=157, num_epochs=1)



Epoch 1/1
(0.03528064537112816, 0.1102362585955132, 0.05188679245283019, 11, 212)
(0.01606928593298138, 0.039015987489955516, 0.0273224043715847, 5, 183)


In [48]:
try_predict_generator(model, x_train=x_train[:10**5], y_train=y_sparse[:10**5],
                      batch_size=64, samples_per_epoch=157, num_epochs=1)



Epoch 1/1
(0.1372372338170697, 0.42437261878844834, 0.2028301886792453, 43, 212)
(0.022884481246024765, 0.056964223899601055, 0.03825136612021858, 7, 183)


In [49]:
try_predict_generator(model, x_train=x_train[:10**5], y_train=y_sparse[:10**5],
                      batch_size=64, samples_per_epoch=157, num_epochs=1)



Epoch 1/1
(0.2446619903573918, 0.749611236328366, 0.3632075471698113, 77, 212)
(0.03403327379345225, 0.09022820192111107, 0.0546448087431694, 10, 183)


In [37]:
embedding_dim = word_vec.vector_size
sequence_lenght = 35
num_filters = 1024
dropout_prob = [0.5, 0.5]
filter_size = 1
# hidden_dims = 2000

model = Sequential ([
    Embedding(len(word_keys), embedding_dim, input_length=sequence_lenght, weights=[emb], trainable=False),
    Dropout (dropout_prob[0]),
    Convolution1D(num_filters, filter_size, activation="relu"),
    MaxPooling1D(3),
    Flatten(),
    Dropout (dropout_prob[1]),
#     Dense (hidden_dims, activation="relu"),
    Dense (1999, activation='sigmoid'),
    ])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

In [43]:
try_predict_generator(model, x_train=x_train[:10**5], y_train=y_sparse[:10**5],
                      batch_size=64, samples_per_epoch=157, num_epochs=1)



Epoch 1/1
(0.04083441996557766, 0.12222769378245382, 0.06132075471698113, 13, 212)
(0.015897035651000053, 0.03801586163110682, 0.0273224043715847, 5, 183)


In [18]:
try_predict(model, 64, 1)

Train on 10000 samples, validate on 100 samples
Epoch 1/1
(0.02651794079313444, 0.0706475965725332, 0.04245283018867924, 9, 212)
(0.012954365174587437, 0.031802512285510696, 0.02185792349726776, 4, 183)


In [19]:
try_predict(model, 64, 1)

Train on 10000 samples, validate on 100 samples
Epoch 1/1
(0.06343349805988843, 0.1763885196813408, 0.09905660377358491, 21, 212)
(0.022721357080904975, 0.05596409804075236, 0.03825136612021858, 7, 183)


In [None]:
try_predict(model, 64, 1)

Train on 10000 samples, validate on 100 samples
Epoch 1/1
 1664/10000 [===>..........................] - ETA: 89s - loss: 16.7577 - categorical_accuracy: 0.0180 