In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss, accuracy_score
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [3]:
img_train = pd.read_pickle('train_images_valid.pickle')
img_test = pd.read_pickle('test_images_valid.pickle')

In [4]:
books_train = pd.read_pickle('all_books_train.pickle')
books_test = pd.read_pickle('all_books_test.pickle')

In [5]:
print(books_train.shape)
print(books_test.shape)

(36389, 14)
(12131, 14)


In [6]:
books_train = books_train[books_train.index.isin(img_train.index)]
books_test = books_test[books_test.index.isin(img_test.index)]

In [7]:
print(books_train.shape)
print(books_test.shape)

(36298, 14)
(12096, 14)


In [8]:
def transform_genres(genre):
    genres = genre.split('|')
    return list(set(genres))

In [9]:
len(set([item for sublist in books['genres_cut'] for item in sublist]))

NameError: name 'books' is not defined

In [10]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    leave_tokens = [token[0] for token in pos if token[1].startswith('VB') or token[1].startswith('JJ') or 
                   token[1] =='NN' or token[1] =='NNS']
    return ' '.join(leave_tokens)
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = text.split(' ')
    lemm_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemm_tokens)

In [11]:
def preprocess(text):
    clean = clean_text(text)
    token = tokenize(clean)
    lemm = lemmatize(token)
    return lemm

In [12]:
train = books_train[['book_desc', 'genres_cut']]
test = books_test[['book_desc', 'genres_cut']]

In [13]:
books_train['desc_proc'] = books_train['book_desc'].apply(preprocess)
books_test['desc_proc'] = books_test['book_desc'].apply(preprocess)

In [19]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(books_train['genres_cut'])
y_train = multilabel_binarizer.transform(books_train['genres_cut'])
y_test = multilabel_binarizer.transform(books_test['genres_cut'])

In [15]:
max_words = 10000

In [58]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(books_train['desc_proc'])
sequences_train = tokenizer.texts_to_sequences(books_train['desc_proc'])
sequences_test = tokenizer.texts_to_sequences(books_test['desc_proc'])

In [17]:
maxlen=500

In [18]:
x_train = pad_sequences(sequences_train, maxlen=maxlen)
x_test = pad_sequences(sequences_test, maxlen=maxlen)

In [20]:
genres = multilabel_binarizer.classes_

In [22]:
genres_count = []
for genre in genres:
    c = sum(l.count(genre) for l in list(books_train['genres_cut']))
    genres_count.append(c)

In [23]:
most_common_genres = pd.DataFrame({'genre': genres, 'count': genres_count})

In [29]:
most_common_genres['class_weight'] = len(books_train['genres_cut']) / most_common_genres['count']
class_weight = {}
for i, row in most_common_genres.iterrows():
    class_weight[i] = row['class_weight']

In [25]:
#https://stackoverflow.com/questions/54065733/how-to-employ-the-scikit-learn-evaluation-metrics-functions-with-keras-in-python
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [26]:
#https://groups.google.com/forum/#!topic/keras-users/_sjndHbejTY
def hn_multilabel_loss(y_true, y_pred):
    # Avoid divide by 0
    y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
    # Multi-task loss
    return K.mean(K.sum(- y_true * K.log(y_pred) - (1 - y_true) * K.log(1 - y_pred), axis=1))

In [27]:
def scores_to_labels(pred, threshold=0.5):
    new_pred = []
    for sample in pred:
        true_pred = [0 if x < threshold else 1 for x in sample]
        new_pred.append(true_pred)
    return np.array(new_pred)

### sum embeddings?

In [30]:
#simple feed-forward
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy

model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.15))
model.add(GlobalMaxPool1D())
model.add(Dense(len(genres), activation='sigmoid'))

model.compile(optimizer='adam', loss=hn_multilabel_loss, metrics=[f1])
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-simple.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 32668 samples, validate on 3630 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [31]:
metrics = model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[1], metrics[1]))
y_pred = scores_to_labels(model.predict(x_test))
print(f1_score(y_test, y_pred, average='micro'))
print(hamming_loss(y_test, y_pred))

f1: 0.4936594356777807
0.4948857132306783
0.09833649643432252


In [32]:
#1CNN
from keras.layers import Activation, GlobalMaxPool1D, Dropout,Conv1D

filter_length = 300

model2 = Sequential()
model2.add(Embedding(max_words, 20, input_length=maxlen))
model2.add(Dropout(0.1))
model2.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model2.add(GlobalMaxPool1D())
model2.add(Dense(len(genres)))
model2.add(Activation('sigmoid'))

model2.compile(optimizer='adam', loss=hn_multilabel_loss, metrics=[f1])

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model2.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 32668 samples, validate on 3630 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


In [33]:
metrics = model2.evaluate(x_test, y_test)
print("{}: {}".format(model2.metrics_names[1], metrics[1]))
y_pred = scores_to_labels(model2.predict(x_test))
print(f1_score(y_test, y_pred, average='micro'))
print(hamming_loss(y_test, y_pred))

f1: 0.610508267920484
0.6116163189037683
0.0896523464458247


In [34]:
#1LSTM
from keras.layers import LSTM

model3 = Sequential()
model3.add(Embedding(max_words, 20, input_length=maxlen))
model3.add(Dropout(0.15))
model3.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model3.add(Dense(len(genres)))
model3.add(Activation('sigmoid'))

model3.compile(optimizer='adam', loss=hn_multilabel_loss, metrics=[f1])

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='model-lstm.h5', save_best_only=True)
]

history = model3.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 32668 samples, validate on 3630 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20


In [31]:
def optimize_threshold(x, y, model):
    best_f1=0
    best_t = 0
    for t in np.linspace(0.0, 0.8, 25):
        f1 = f1_score(y, scores_to_labels(model3.predict(x), t), average='micro')
        if f1 > best_f1:
            best_t=t
            best_f1=f1
    return t

In [35]:
metrics = model3.evaluate(x_test, y_test)
print("{}: {}".format(model3.metrics_names[1], metrics[1]))
#threshold = optimize_threshold(x_val, y_val, model3)
y_pred = scores_to_labels(model3.predict(x_test), 0.35)
print(f1_score(y_test, y_pred, average='micro'))
print(hamming_loss(y_test, y_pred))

f1: 0.6333734587071433
0.6455325735190657
0.09055095468138946


In [36]:
#2CNN + 1LSTM
from keras.layers import MaxPooling1D

model4 = Sequential()
model4.add(Embedding(max_words, 20, input_length=maxlen))
model4.add(Dropout(0.1))
model4.add(Conv1D(128, 5))
model4.add(MaxPooling1D(5))
model4.add(Dropout(0.2))
model4.add(Conv1D(128, 5))
model4.add(MaxPooling1D(5))
model4.add(Dropout(0.2))
model4.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model4.add(Dense(len(genres)))
model4.add(Activation('sigmoid'))

model4.compile(optimizer='adam', loss=hn_multilabel_loss, metrics=[f1])

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=7), 
    ModelCheckpoint(filepath='model-lstm-cnn.h5', save_best_only=True)
]

history = model4.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 32668 samples, validate on 3630 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20


In [37]:
metrics = model4.evaluate(x_test, y_test)
print("{}: {}".format(model4.metrics_names[1], metrics[1]))
y_pred = scores_to_labels(model4.predict(x_test), 0.35)
print(f1_score(y_test, y_pred, average='micro'))
print(hamming_loss(y_test, y_pred))

f1: 0.6029014637072881
0.6152377968008887
0.09709641706924316


In [38]:
#1CNN + 1LSTM
from keras.layers import MaxPooling1D

model5 = Sequential()
model5.add(Embedding(max_words, 20, input_length=maxlen))
model5.add(Dropout(0.1))
model5.add(Conv1D(128, 5))
model5.add(MaxPooling1D(5))
model5.add(Dropout(0.2))
model5.add(LSTM(128, dropout=0.2, recurrent_dropout=0.3))
model5.add(Dense(len(genres)))
model5.add(Activation('sigmoid'))

model5.compile(optimizer='adam', loss=hn_multilabel_loss, metrics=[f1])

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=5), 
    ModelCheckpoint(filepath='model-lstm-cnn2.h5', save_best_only=True)
]

history = model5.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

Train on 32668 samples, validate on 3630 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


In [39]:
metrics = model5.evaluate(x_test, y_test)
print("{}: {}".format(model5.metrics_names[1], metrics[1]))
y_pred = scores_to_labels(model5.predict(x_test), 0.35)
print(f1_score(y_test, y_pred, average='micro'))
print(hamming_loss(y_test, y_pred))

f1: 0.6117301418510064
0.6233747999213726
0.0964170692431562
