In [13]:
import os
import pickle
from string import punctuation

import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from keras.layers import Concatenate, Conv1D, Dense, Dropout, Embedding, Flatten, Input, MaxPooling1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder

# When the notebook is started from inside a directory whose path ends with "doc2vec", move the
# working directory one level up - presumably so that the `doc2vec.*` imports below resolve; confirm
# against the project layout.
if os.getcwd().endswith("doc2vec"):
    # After this chdir the path normally no longer ends with "doc2vec", so re-running the cell is a
    # no-op. It would climb another level only if the parent directory's path also ends with "doc2vec".
    os.chdir(os.path.dirname(os.getcwd()))

from doc2vec.constants import GENSIM_MODEL_PATH, TRAIN_PATH, TEST_PATH, PUNC_CHARS, STOPWORDS_SET, SGJP_PATH, \
    TOKENIZER_PATH
from doc2vec.preprocess_utils import prepare_sgjp_dict, clear_offers, tokenize_texts, tokenize, \
    prepare_gensim_word_index_dict
from doc2vec.model_utils import model_cnn, model_doc2vec, gensim_model

In [14]:
def load_preprocess_data(pickle_path, stopwords_set=None, lemmatize_dict=None, remove_punct=None):
    """Load pickled job offers and return their labels together with the cleaned texts.

    The unpickled object is indexed by the columns 'job_name', 'job_content' and 'label'.
    The two text columns are joined with a single space into a new 'text' column, rows whose
    'text' is missing are dropped, and the remaining rows are handed to `clear_offers`.

    :param pickle_path: path of the pickle file to read (opened in binary mode)
    :param stopwords_set: forwarded to `clear_offers` as `stopwords_list`
    :param lemmatize_dict: forwarded to `clear_offers` as `lemmatize_dict`
    :param remove_punct: forwarded to `clear_offers` as `remove_punct`
    :return: tuple of (the 'label' column, whatever `clear_offers` returns)
    """
    # NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
    with open(pickle_path, 'rb') as pickle_file:
        offers = pickle.load(pickle_file)

    offers['text'] = offers['job_name'].str.cat(offers['job_content'], sep=' ')
    # drop offers for which no text could be built
    offers.dropna(subset=['text'], inplace=True)

    # Labels are read before cleaning, matching the original evaluation order.
    labels = offers['label']
    cleaned_offers = clear_offers(
        data=offers,
        text_col='text',
        stopwords_list=stopwords_set,
        lemmatize_dict=lemmatize_dict,
        remove_punct=remove_punct,
    )
    return labels, cleaned_offers


In [15]:
# Build the lemmatization lookup once; the same object is passed to both splits below.
lemmatize_dict = prepare_sgjp_dict(SGJP_PATH)

# Training split
train_labels, train_data_prep = load_preprocess_data(
    TRAIN_PATH,
    stopwords_set=STOPWORDS_SET,
    lemmatize_dict=lemmatize_dict,
    remove_punct=PUNC_CHARS + punctuation,
)

# Test split - preprocessed with exactly the same settings as the training split
test_labels, test_data_prep = load_preprocess_data(
    TEST_PATH,
    stopwords_set=STOPWORDS_SET,
    lemmatize_dict=lemmatize_dict,
    remove_punct=PUNC_CHARS + punctuation,
)


2019-04-12 01:30:13.727831  Clearing data - DONE
2019-04-12 01:30:14.025829  Removing stopwords - DONE
2019-04-12 01:30:15.022792  Lemmatizing - DONE
2019-04-12 01:30:15.325831  Clearing data - DONE
2019-04-12 01:30:15.353831  Removing stopwords - DONE
2019-04-12 01:30:15.451830  Lemmatizing - DONE


In [16]:
# Let's see what our labels are: number of training offers per category, most frequent first.
# The classes are clearly imbalanced, which is worth keeping in mind when reading accuracy figures.
print(train_labels.value_counts())


Sprzedaż                                    3120
Finanse / Ekonomia                          1405
Inżynieria                                  1344
IT - Rozwój oprogramowania                  1197
Administracja biurowa                       1005
Produkcja                                    950
Human Resources / Zasoby ludzkie             705
Praca fizyczna                               696
IT - Administracja                           664
Obsługa klienta                              416
Marketing                                    406
Łańcuch dostaw                               375
Budownictwo                                  302
Hotelarstwo / Gastronomia / Turystyka        247
Zakupy                                       198
Prawo                                        188
Reklama / Grafika / Kreacja / Fotografia     182
Transport / Spedycja / Logistyka             179
Kontrola jakości                             176
Internet / e-Commerce / Nowe media           161
Nieruchomości       

In [17]:
# One-hot encode our labels.
# The encoder is fitted on the labels of BOTH splits, so a category that occurs in only one of them
# is still known when that split is transformed.
# `Series.append` was deprecated in pandas 1.4 and removed in 2.0, so the two label arrays are
# joined with NumPy instead; the resulting 1-D array is the same.
all_labels = np.concatenate([train_labels.values, test_labels.values])
ohe = OneHotEncoder()
ohe.fit(all_labels.reshape(-1, 1))

# NOTE(review): with default settings OneHotEncoder.transform returns a SciPy sparse matrix -
# confirm the Keras version in use accepts that as the target array.
train_labels_bin = ohe.transform(train_labels.values.reshape(-1, 1))
test_labels_bin = ohe.transform(test_labels.values.reshape(-1, 1))

In [18]:
# Tokenize the texts for neural network.
# Only the training offers are passed to `tokenize_texts`; the tokenizer it returns is then reused
# for the test offers. `oov_token='unk'` is forwarded as-is - presumably the placeholder for
# out-of-vocabulary words; confirm in doc2vec.preprocess_utils.
tokenizer, train_tokens = tokenize_texts(train_data_prep, TOKENIZER_PATH, oov_token='unk')
# Each test offer is joined into one space-separated string before conversion to index sequences.
test_tokens = tokenizer.texts_to_sequences(test_data_prep.str.join(' '))

In [19]:
# Hyper-parameters shared by the models trained in the following cells.
MODEL_PARAMS = {
    'input_len': 300,  # length every token sequence is padded/truncated to (see pad_sequences below)
    'word_emb_size': 100,  # width of the word embedding layer
    'doc_emb_size': 100,  # Doc2Vec vector size (passed as `vector_size` when training Doc2Vec)
    'filter_sizes': [5, 1],  # one convolution branch per entry; the value is the kernel size
    'num_filters': 300,  # filters per convolution branch
    'batch_size': 256,  # used in every `fit` call
    'n_epochs': 10,  # used in every `fit` call
    'nb_tokens': len(tokenizer.word_index) + 1,  # +1 presumably reserves index 0 for padding - confirm
    'output_len': len(ohe.categories_[0])  # number of distinct label categories seen by the encoder
}

In [20]:
# Train Doc2Vec on the training offers; every offer is tagged with its position in the series.
documents = [
    TaggedDocument(words=offer_tokens, tags=[position])
    for position, offer_tokens in enumerate(train_data_prep)
]
model = Doc2Vec(
    documents,
    vector_size=MODEL_PARAMS['doc_emb_size'],
    window=2,
    min_count=1,
    workers=4,
)
model.save(GENSIM_MODEL_PATH)
# Training is finished (no more updates, only querying), so drop the temporary training state to
# reduce memory usage while keeping the document vectors and the ability to infer new ones.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [21]:
# Infer Doc2Vec representations for docs.
# Each offer is mapped to one inferred vector and the vectors are stacked into a 2-D array with one
# row per offer. Plain ndarrays are used instead of `np.matrix`, which NumPy no longer recommends
# and which carries matrix-specific semantics (always 2-D, `*` is matrix multiplication).
train_doc2vec = np.array(train_data_prep.apply(model.infer_vector).to_list())
test_doc2vec = np.array(test_data_prep.apply(model.infer_vector).to_list())


In [22]:
# Pad offers: bring every token sequence to exactly MODEL_PARAMS['input_len'] entries, so that both
# splits can be fed to the network as fixed-width arrays (longer sequences are cut to that length).
train_pad_tokens = pad_sequences(train_tokens, maxlen=MODEL_PARAMS['input_len'])
test_pad_tokens = pad_sequences(test_tokens, maxlen=MODEL_PARAMS['input_len'])


In [23]:
# 1. Pure Word2Vec model
# Built from the padded token sequences only (no document-vector input).
word2vec_model = model_cnn(params=MODEL_PARAMS)
# `summary()` prints the table itself and returns None, so it is not wrapped in print()
# (doing so appended a stray "None" to the output).
word2vec_model.summary()
# The last 10% of the training data is held out for validation during fitting.
word2vec_model.fit(train_pad_tokens, train_labels_bin, batch_size=MODEL_PARAMS['batch_size'],
                   epochs=MODEL_PARAMS['n_epochs'], validation_split=0.1)
# Loss and metric values on the test split.
print(word2vec_model.evaluate(test_pad_tokens, test_labels_bin))


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 300, 100)     3063700     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 300, 300)     150300      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 300, 300)     30300       embedding_2[0][0]                
__________________________________________________________________________________________________
max_poolin

In [24]:
# 2. Doc2Vec + Word2Vec model
# Same builder as above with `doc2vec=True`; the model takes two inputs: the padded token
# sequences and the inferred Doc2Vec vectors.
doc2vec_word2vec_model = model_cnn(params=MODEL_PARAMS, doc2vec=True)
# `summary()` prints the table itself and returns None, so it is not wrapped in print()
# (doing so appended a stray "None" to the output).
doc2vec_word2vec_model.summary()
# The last 10% of the training data is held out for validation during fitting.
doc2vec_word2vec_model.fit([train_pad_tokens, train_doc2vec], train_labels_bin, batch_size=MODEL_PARAMS['batch_size'],
                           epochs=MODEL_PARAMS['n_epochs'], validation_split=0.1)
doc2vec_word2vec_model.save('doc2vec_word2vec_model.h5')
# Loss and metric values on the test split.
print(doc2vec_word2vec_model.evaluate([test_pad_tokens, test_doc2vec], test_labels_bin))


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 300)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 300, 100)     3063700     input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 300, 300)     150300      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 300, 300)     30300       embedding_3[0][0]                
__________________________________________________________________________________________________
max_poolin

In [25]:
# 3. Pure Doc2Vec model
# Trained on the inferred Doc2Vec vectors only (no token-sequence input).
doc2vec_model = model_doc2vec(params=MODEL_PARAMS)
# `summary()` prints the table itself and returns None, so it is not wrapped in print()
# (doing so appended a stray "None" to the output).
doc2vec_model.summary()
# The last 10% of the training data is held out for validation during fitting.
doc2vec_model.fit(train_doc2vec, train_labels_bin, batch_size=MODEL_PARAMS['batch_size'],
                  epochs=MODEL_PARAMS['n_epochs'], validation_split=0.1)
doc2vec_model.save('doc2vec_model.h5')
# Loss and metric values on the test split.
print(doc2vec_model.evaluate(test_doc2vec, test_labels_bin))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 140)               14140     
_________________________________________________________________
dropout_7 (Dropout)          (None, 140)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 140)               19740     
_________________________________________________________________
dropout_8 (Dropout)          (None, 140)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 70)                9870      
_________________________________________________________________
dropout_9 (Dropout)          (None, 70)                0         
__________

In [None]:
# TODO: EXERCISES

# EXERCISE 1 - model
# In this exercise you will use word vectors trained by gensim.
# You need to tokenize the documents using gensim's vocabulary,
# and pass the embedding matrix to the Embedding layer in model definition. Make sure to freeze the layer weights!

# `model` is the Doc2Vec model trained earlier in this notebook; its `wv` attribute holds the word vectors.
gensim_emb_matrix = model.wv.vectors
gensim_word_index_dict = prepare_gensim_word_index_dict(model.wv.vocab)
# Convert every offer to indices using gensim's word -> index mapping.
gensim_train_tokens = train_data_prep.apply(tokenize, args=(gensim_word_index_dict,))
gensim_test_tokens = test_data_prep.apply(tokenize, args=(gensim_word_index_dict,))

GENSIM_MODEL_PARAMS = {
    'input_len': 300,
    'word_emb_size': 100,
    'doc_emb_size': 100,
    'filter_sizes': [5, 1],
    'num_filters': 300,
    'batch_size': 256,
    'n_epochs': 1,
    # TODO: WRITE YOUR CODE BELOW
    'nb_tokens': ...,
    'embedding_matrix': ...,
    # TODO: END OF YOUR CHANGES
    'output_len': len(ohe.categories_[0])
}

# Pad to the length configured for THIS experiment, so that the padded width always matches the
# `input_len` the gensim model is built with (previously MODEL_PARAMS was read here by mistake).
gensim_padded_train_tokens = pad_sequences(gensim_train_tokens, maxlen=GENSIM_MODEL_PARAMS['input_len'])
gensim_padded_test_tokens = pad_sequences(gensim_test_tokens, maxlen=GENSIM_MODEL_PARAMS['input_len'])

def gensim_model(params):
    """Build and compile the two-input CNN classifier used in exercise 1.

    The network takes a token-index sequence and a document vector. The embedded tokens go through
    one Conv1D + MaxPooling1D branch per entry of `params['filter_sizes']`; the pooled branches are
    concatenated, flattened, joined with the document vector and passed through dropout / dense
    layers to an output of `params['output_len']` units.

    NOTE: this definition shadows the `gensim_model` imported from `doc2vec.model_utils` at the top
    of the notebook.

    :param params: dict providing 'input_len', 'filter_sizes', 'num_filters', 'word_emb_size',
        'doc_emb_size' and 'output_len' (plus whatever the Embedding layer of the exercise needs)
    :return: compiled Keras `Model` with inputs `[text_input, doc2vec_input]`
    """
    text_input = Input(shape=(params['input_len'],))
    # TODO: WRITE YOUR CODE BELOW
    x = Embedding(...)
    # TODO: END OF YOUR CHANGES
    maxpool_pool = []
    # One convolution + pooling branch per configured kernel size, all reading the same embeddings.
    for filter_size in params["filter_sizes"]:
        conv = Conv1D(params['num_filters'], kernel_size=filter_size,
                      kernel_initializer='he_normal', activation='relu', padding='same')(x)
        # NOTE(review): the pooling window is taken from 'word_emb_size' although pooling runs along
        # the sequence axis - confirm this is intended.
        maxpool_pool.append(MaxPooling1D(pool_size=params['word_emb_size'], strides=None, padding="valid")(conv))
    x = Concatenate(axis=1)(maxpool_pool)
    x = Flatten()(x)

    # Second input: the document vector, appended to the flattened convolution features.
    doc2vec_input = Input(shape=(params['doc_emb_size'],))
    x = Concatenate(axis=1)([x, doc2vec_input])

    x = Dropout(0.1)(x)
    x = Dense(params['output_len'] * 2)(x)
    x = Dropout(0.1)(x)

    # NOTE(review): sigmoid outputs are paired with categorical_crossentropy; for one-hot,
    # single-label targets softmax is the usual choice. Left unchanged here - confirm.
    outp = Dense(params['output_len'], activation="sigmoid")(x)
    model = Model(inputs=[text_input, doc2vec_input], outputs=outp)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

# Build, train, save and evaluate the model that uses gensim's word vectors.
pure_gensim_model = gensim_model(params=GENSIM_MODEL_PARAMS)
# `summary()` prints the table itself and returns None, so it is not wrapped in print()
# (doing so appended a stray "None" to the output).
pure_gensim_model.summary()
# The last 10% of the training data is held out for validation during fitting.
pure_gensim_model.fit([gensim_padded_train_tokens, train_doc2vec], train_labels_bin,
                      batch_size=GENSIM_MODEL_PARAMS['batch_size'],
                      epochs=GENSIM_MODEL_PARAMS['n_epochs'], validation_split=0.1)
pure_gensim_model.save('pure_gensim_model.h5')
# Loss and metric values on the test split.
print(pure_gensim_model.evaluate([gensim_padded_test_tokens, test_doc2vec], test_labels_bin))

In [None]:
# EXERCISE 2 - working with doc2vec representations
# Choose 3 different offers.
# For each of those, find 10 that are most similar using the gensim model and print their content to console
for index, offer in train_data_prep.iloc[:3].iteritems():
    # TODO: WRITE YOUR CODE BELOW
