# FastText-GRU

In this notebook, we utilize pretrained FastText embeddings to create POI embeddings.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence

Using TensorFlow backend.


Load train and test sets.

In [2]:
labels = [
    'Active Life', 'Arts & Entertainment', 'Automotive', 'Beauty & Spas',
    'Education', 'Event Planning & Services', 'Financial Services', 'Food',
    'Health & Medical', 'Home Services', 'Hotels & Travel', 'Local Flavor',
    'Local Services', 'Mass Media', 'Nightlife', 'Pets', 'Professional Services',
    'Public Services & Government', 'Real Estate', 'Religious Organizations',
    'Restaurants', 'Shopping'
]

train_df = pd.read_csv('data/train.csv', na_filter=False)
test_df = pd.read_csv('data/test.csv', na_filter=False)

train_texts = train_df['sequence']
test_texts = test_df['sequence']

train_labels = train_df['categories'].str.get_dummies(sep=', ')
test_labels = test_df['categories'].str.get_dummies(sep=', ')

Process sequences via Keras Tokenizer.

In [3]:
max_features = 30000
max_len = 256

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

X_train = tokenizer.texts_to_sequences(train_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

y_train = train_labels.values
y_test = test_labels.values

Create the embedding matrix, using the pretrained vectors.

In [4]:
# Download pretrained embeddings from https://www.kaggle.com/yekenot/fasttext-crawl-300d-2m
EMBEDDING_FILE = 'ft_models/fasttext-crawl-300d-2M.vec'
embed_size = 300

def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))

for word, i in list(word_index.items())[:nb_words]:
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i-1] = embedding_vector

Classification via a GRU architecture.

In [6]:
inp = Input(shape=(max_len, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(80, return_sequences=True))(x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
outp = Dense(len(labels), activation='sigmoid')(conc)

model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [7]:
batch_size = 32
epochs = 2

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8)

hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val), verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 122961 samples, validate on 30741 samples
Epoch 1/2
 - 860s - loss: 0.0906 - accuracy: 0.9718 - val_loss: 0.0652 - val_accuracy: 0.9801
Epoch 2/2
 - 858s - loss: 0.0605 - accuracy: 0.9813 - val_loss: 0.0608 - val_accuracy: 0.9814


In [8]:
y_pred = model.predict(X_test, batch_size=1024)
y_pred = y_pred > 0.5
scores = []

for label_idx, label_name in enumerate(labels):
    test_target = test_labels[label_name]
    preds = y_pred[:, label_idx]
    score = accuracy_score(test_target, preds)
    scores.append(score)
    print('Test score for class {} is {:.4f}'.format(label_name, score))

print('Mean test score is {:.4f}'.format(np.mean(scores)))

Test score for class Active Life is 0.9845
Test score for class Arts & Entertainment is 0.9813
Test score for class Automotive is 0.9839
Test score for class Beauty & Spas is 0.9874
Test score for class Education is 0.9881
Test score for class Event Planning & Services is 0.9668
Test score for class Financial Services is 0.9952
Test score for class Food is 0.9358
Test score for class Health & Medical is 0.9820
Test score for class Home Services is 0.9739
Test score for class Hotels & Travel is 0.9886
Test score for class Local Flavor is 0.9920
Test score for class Local Services is 0.9623
Test score for class Mass Media is 0.9982
Test score for class Nightlife is 0.9823
Test score for class Pets is 0.9951
Test score for class Professional Services is 0.9740
Test score for class Public Services & Government is 0.9958
Test score for class Real Estate is 0.9851
Test score for class Religious Organizations is 0.9990
Test score for class Restaurants is 0.9772
Test score for class Shopping i