In [1]:
import pickle
import urllib.request as req
import pandas as pd
import numpy as np
import os

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import itertools

In [4]:
from keras import backend as K

Using TensorFlow backend.


In [5]:
#computes F1 score
def f1(y_true, y_pred):
    """Batch-wise F1 score expressed with Keras backend ops.

    Targets and predictions are clipped to [0, 1] and rounded to 0/1 before
    counting. Precision and recall are computed over every entry of the
    batch at once and then combined into their harmonic mean; K.epsilon()
    is added to each denominator to avoid division by zero.

    Args:
        y_true: tensor of ground-truth label indicators.
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    Returns:
        A scalar tensor holding the F1 value for the batch.
    """
    def _count_ones(tensor):
        # Number of entries that round to 1 once clipped into [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _count_ones(y_true * y_pred)
    # Share of predicted positives that are correct.
    precision_value = hits / (_count_ones(y_pred) + K.epsilon())
    # Share of actual positives that were found.
    recall_value = hits / (_count_ones(y_true) + K.epsilon())
    harmonic = (precision_value*recall_value)/(precision_value+recall_value+K.epsilon())
    return 2*(harmonic)

In [6]:
#computes the multi-label loss (binary cross-entropy summed over labels)
def hn_multilabel_loss(y_true, y_pred):
    """Multi-label loss: per-label binary cross-entropy, summed then averaged.

    For every sample the binary cross-entropy of each label is summed along
    axis 1, and the result is averaged over the remaining axis.

    Args:
        y_true: tensor of 0/1 label indicators.
        y_pred: tensor of predicted probabilities, same shape as ``y_true``.

    Returns:
        A scalar loss tensor.
    """
    eps = K.epsilon()
    # Keep probabilities strictly inside (0, 1) so both logarithms stay finite.
    clipped = K.clip(y_pred, eps, 1 - eps)
    positive_part = - y_true * K.log(clipped)
    negative_part = (1 - y_true) * K.log(1 - clipped)
    per_sample = K.sum(positive_part - negative_part, axis=1)
    return K.mean(per_sample)

In [7]:
#downloads images from links in 'image_url' column
def download_images(dataframe,out_folder):
    """Download every URL in ``dataframe['image_url']`` into ``out_folder``.

    Each file is saved as ``<index label>.jpg``. Files that already exist are
    skipped, so an interrupted run can simply be started again.

    Args:
        dataframe: frame with an ``image_url`` column; the index labels are
            used as file names.
        out_folder: destination directory; created if it does not exist.

    Returns:
        Always ``True``. Errors raised by ``urlretrieve`` are not caught and
        propagate to the caller.
    """
    # urlretrieve cannot write into a folder that does not exist yet.
    os.makedirs(out_folder, exist_ok=True)
    # Walk the column once instead of a chained .loc[...][...] lookup per row;
    # this also stays well-defined when index labels repeat.
    for image_id, url in dataframe['image_url'].items():
        target = os.path.join(out_folder, str(image_id) + '.jpg')
        if not os.path.isfile(target):
            req.urlretrieve(url, target)
    return True

In [8]:
#return a column with names of downloaded images
def return_image_name_col(df):
    """Return the image file name (``<index label>.jpg``) for each row of ``df``.

    Args:
        df: frame whose index labels identify the downloaded images.

    Returns:
        numpy array of file names, in the same order as the rows of ``df``.
    """
    return np.array([str(label)+'.jpg' for label in df.index.values])

In [9]:
#load training set
# NOTE: pickle.load can execute arbitrary code - only load pickle files that
# come from a trusted source.
# The with-block closes the file handle even if unpickling fails.
with open("all_books_train.pickle","rb") as pickle_in_train:
    train = pickle.load(pickle_in_train)

In [10]:
#load test set
# NOTE: pickle.load can execute arbitrary code - only load pickle files that
# come from a trusted source.
# The with-block closes the file handle even if unpickling fails.
with open("all_books_test.pickle","rb") as pickle_in_test:
    test = pickle.load(pickle_in_test)

In [11]:
train.shape

(36389, 14)

In [12]:
test.shape

(12131, 14)

In [13]:
#show training data
train.head()

Unnamed: 0,book_authors,book_desc,book_edition,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,genres,image_url,genres_list,genres_cut
6377,Nicole Jacquelyn,"""How is it, that someone can make decision aft...",,Kindle Edition,,305 pages,4.02,17962,1159,Craving Constellations,Romance|Romance|Contemporary Romance|Contemporary,https://images.gr-assets.com/books/1377563339l...,"[Romance, Contemporary, Contemporary Romance]","[Romance, Contemporary]"
19880,Adam Schell,A village in Tuscany is the setting for this j...,,Hardcover,9780390000000.0,340 pages,3.74,646,161,"Tomato Rhapsody: A Fable of Love, Lust & Forbi...",Fiction|Historical|Historical Fiction|Cultural...,https://images.gr-assets.com/books/1320422913l...,"[Historical Fiction, Fiction, Food and Drink, ...","[Fiction, Romance, Historical, Cultural]"
23352,Robert Moss,Have you ever said something was “only a dream...,,Hardcover,9781580000000.0,224 pages,4.04,345,41,"The Three ""Only"" Things: Tapping the Power of ...",Nonfiction|Spirituality|Psychology|Spiritualit...,https://images.gr-assets.com/books/1328746102l...,"[Nonfiction, New Age, Psychology, Spirituality]",[Nonfiction]
16021,David Ambrose,A train of seemingly random events - coinciden...,,Unknown Binding,9780740000000.0,,3.35,166,18,Coincidence,Fiction|Suspense|Historical|Science Fiction|My...,https://images.gr-assets.com/books/1408924887l...,"[Suspense, Fiction, Mystery, Historical, Scien...","[Fiction, Mystery, Historical, Science Fiction]"
49981,Richard North Patterson,When the body of eleven-year-old Thuy Sen is f...,,Hardcover,9780350000000.0,463 pages,3.89,2400,154,Conviction,Fiction|Mystery|Thriller|Legal Thriller|Thriller,https://images.gr-assets.com/books/1399495278l...,"[Thriller, Legal Thriller, Fiction, Mystery]","[Thriller, Fiction, Mystery]"


In [14]:
#download training images
#download_images(train,"train_images")

In [15]:
#download test images
#download_images(test, "test_images")

In [16]:
train_filename_col = return_image_name_col(train)

In [17]:
# .copy() makes train_df an independent frame, so adding a column below is a
# plain assignment rather than a write into a selection of `train`
# (the pattern pandas reports as SettingWithCopyWarning).
train_df = train[['genres_cut']].copy()
train_df['file_name'] = train_filename_col

In [18]:
train_df.head()

Unnamed: 0,genres_cut,file_name
6377,"[Romance, Contemporary]",6377.jpg
19880,"[Fiction, Romance, Historical, Cultural]",19880.jpg
23352,[Nonfiction],23352.jpg
16021,"[Fiction, Mystery, Historical, Science Fiction]",16021.jpg
49981,"[Thriller, Fiction, Mystery]",49981.jpg


In [19]:
test_filename_col = return_image_name_col(test)
# .copy() makes test_df an independent frame, so adding a column below is a
# plain assignment rather than a write into a selection of `test`
# (the pattern pandas reports as SettingWithCopyWarning).
test_df = test[['genres_cut']].copy()
test_df['file_name'] = test_filename_col
test_df.head()

Unnamed: 0,genres_cut,file_name
9102,"[Romance, Fiction]",9102.jpg
34116,"[History, Biography, Nonfiction]",34116.jpg
53879,"[Crime, Fiction, Mystery, Thriller]",53879.jpg
30327,"[Childrens, Fantasy]",30327.jpg
36487,"[Young Adult, Fantasy]",36487.jpg


While the images were being downloaded the website went down, so some of the downloaded files are not valid images and cannot be used for training.

train_df_valid, test_df_valid - datasets containing only valid images

In [20]:
train_df_valid = train_df.copy()

In [21]:
# Keep only the rows whose downloaded file is at least 1024 bytes; smaller
# files are treated as failed downloads. os.path.getsize raises OSError when a
# file is missing, exactly like the os.stat call it replaces.
# The frame is rebuilt from train_df with one boolean mask instead of dropping
# rows one by one with inplace=True, which raised KeyError on a second run of
# this cell because the labels were already gone.
# NOTE: assumes index labels are unique (the label-based drop removed every
# row sharing a label; the mask works per row).
train_is_valid = np.array(
    [os.path.getsize('train_images/' + name) >= 1024
     for name in train_df['file_name']],
    dtype=bool)
train_df_valid = train_df[train_is_valid].copy()
    

In [22]:
train_df_valid.shape

(36298, 2)

In [23]:
# Keep only the rows whose downloaded file is at least 1024 bytes; smaller
# files are treated as failed downloads. os.path.getsize raises OSError when a
# file is missing, exactly like the os.stat call it replaces.
# A single boolean mask replaces the row-by-row inplace drop.
# NOTE: assumes index labels are unique (the label-based drop removed every
# row sharing a label; the mask works per row).
test_is_valid = np.array(
    [os.path.getsize('test_images/' + name) >= 1024
     for name in test_df['file_name']],
    dtype=bool)
test_df_valid = test_df[test_is_valid].copy()

In [24]:
test_df_valid.shape

(12096, 2)

In [25]:
#pickle_out = open("train_images_valid.pickle","wb")
#pickle.dump(train_df_valid, pickle_out)
#pickle_out.close()

In [26]:
#pickle_out = open("test_images_valid.pickle","wb")
#pickle.dump(test_df_valid, pickle_out)
#pickle_out.close()

In [27]:
from keras.models import Sequential
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers

In [28]:
from sklearn.model_selection import KFold

In [29]:
# Sorted array of every distinct genre found in the training labels.
all_classes = np.unique([genre for genres in train_df['genres_cut'] for genre in genres])

In [30]:
# 5-fold split without shuffling. The cross-validation loop below appends each
# fold's predictions to predictions_lst in fold order and that list is later
# assigned to train_df_valid row by row, so the held-out folds must come back
# in row order - presumably guaranteed by shuffle=False; verify against the
# scikit-learn KFold documentation before changing it.
kf = KFold(n_splits=5,random_state=None, shuffle=False)

### Model construction

In [31]:
# Both generators only rescale pixel values by 1/255; no augmentation is
# configured. Training and prediction therefore see identically scaled input.
datagen=ImageDataGenerator(rescale=1./255.)
test_datagen=ImageDataGenerator(rescale=1./255.)

In [32]:
#build the model
# Two convolutional stages (32 then 64 filters), each followed by 2x2
# max-pooling and dropout, then a 512-unit dense layer and a sigmoid output.
model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=(100,100,3)))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512))
# Was 'tahn': Keras rejects that name as an unknown activation function.
model.add(Activation('tanh'))
model.add(Dropout(0.5))
# One sigmoid unit per genre, giving an independent probability for each label.
# The width follows all_classes (the class list handed to the generators)
# instead of a hard-coded 23.
model.add(Dense(len(all_classes), activation='sigmoid'))
# RMSprop is the optimizer class itself; the lowercase `rmsprop` alias is not
# available in every Keras version.
model.compile(optimizers.RMSprop(lr=0.0001, decay=1e-6),loss=[hn_multilabel_loss],metrics=[f1])

In [33]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [34]:
# No fit_generator call in this notebook passes validation data, so the
# default monitor 'val_loss' is never logged and these callbacks could not act
# on it. They monitor the training loss instead.
callbacks = [ ReduceLROnPlateau(monitor='loss'),
            EarlyStopping(monitor='loss', patience=4),
            ModelCheckpoint(filepath='model.h5', monitor='loss', save_best_only=True)
            ]

### New train feature construction

In [35]:
predictions_lst = []

In [36]:
# Predict genres for every of 5 folds and save the predictions
# in predictions_lst
from keras.models import clone_model

# Start from an empty list so that re-running this cell (for example after an
# interrupted run) does not append to predictions left over from before.
predictions_lst = []

for train_index, test_index in kf.split(train_df_valid):
    # Fresh, re-initialised copy of the architecture for this fold. Fitting one
    # shared model fold after fold would mean the model predicting fold k has
    # already been trained on fold k's rows in an earlier iteration, so the
    # saved predictions would not be out-of-fold. `model` itself is not trained
    # here. The compile settings mirror the model-construction cell.
    fold_model = clone_model(model)
    fold_model.compile(optimizers.RMSprop(lr=0.0001, decay=1e-6),loss=[hn_multilabel_loss],metrics=[f1])

    #train generator
    train_generator=datagen.flow_from_dataframe(
        dataframe=train_df_valid.iloc[train_index],
        directory="train_images",
        x_col="file_name",
        y_col="genres_cut",
        batch_size=32,
        seed=42,
        shuffle=True,
        class_mode="categorical",
        classes=list(all_classes),
        target_size=(100,100))

    #test generator
    # shuffle=False keeps the predictions in the row order of the held-out fold.
    test_generator=test_datagen.flow_from_dataframe(
        dataframe=train_df_valid.iloc[test_index],
        directory="train_images",
        x_col="file_name",
        y_col=None,
        batch_size=1,
        seed=42,
        shuffle=False,
        class_mode=None,
        target_size=(100,100))

    #fitting the model
    # Floor division: a final partial training batch is not counted per epoch.
    STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
    STEP_SIZE_TEST=test_generator.n//test_generator.batch_size
    fold_model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    epochs=10,
                    callbacks=callbacks)
    #prediction
    test_generator.reset()
    pred=fold_model.predict_generator(test_generator,
                                      steps=STEP_SIZE_TEST,
                                      verbose=1)

    # A genre is predicted when its sigmoid output exceeds 0.5.
    pred_bool = (pred > 0.5)

    # Invert class_indices (name -> column) into column -> name.
    labels = dict((v,k) for k,v in train_generator.class_indices.items())
    predictions = [
        [labels[column] for column, is_on in enumerate(row) if is_on]
        for row in pred_bool
    ]
    predictions_lst.extend(predictions)

# Every valid training row must have received exactly one prediction, otherwise
# the positional assignment to the data frame later on would be misaligned.
assert len(predictions_lst) == len(train_df_valid), "prediction count does not match train_df_valid"


Found 29038 validated image filenames belonging to 23 classes.
Found 7260 validated image filenames.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Found 29038 validated image filenames belonging to 23 classes.
Found 7260 validated image filenames.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
190/907 [=====>........................] - ETA: 16:34 - loss: 6.2861 - f1: 0.4270

KeyboardInterrupt: 

In [None]:
len(predictions_lst)

In [None]:
new_feature_train = train_df_valid.copy()

In [None]:
#make dataset with new feature to use it for stacking later
# The list is assigned by position, so predictions_lst must hold one entry per
# row of train_df_valid, in row order.
new_feature_train['keras_pred'] = predictions_lst

In [None]:
new_feature_train.head()

In [None]:
#saving new dataset to pickle file
# The with-block closes (and flushes) the file even if pickle.dump raises.
with open("train_with_new_feature2.pickle","wb") as pickle_out:
    pickle.dump(new_feature_train, pickle_out)

### New test feature construction

In [None]:
# Generator over all valid training images, used to fit the model on the
# whole training set. Batches of 32 are shuffled with a fixed seed; the genre
# lists in 'genres_cut' are the targets, with the class list taken from
# all_classes. target_size matches the model's input_shape of (100,100,3).
train_generator=datagen.flow_from_dataframe(
dataframe=train_df_valid,
directory="train_images",
x_col="file_name",
y_col="genres_cut",
batch_size=32,
seed=42,
shuffle=True,
class_mode="categorical",
classes=list(all_classes),
target_size=(100,100))

In [None]:
# Generator over the valid test images, without labels (y_col=None,
# class_mode=None). shuffle=False and batch_size=1 yield the images one at a
# time in the row order of test_df_valid, which the later positional
# assignment of the predictions relies on.
test_generator=test_datagen.flow_from_dataframe(
dataframe=test_df_valid,
directory="test_images",
x_col="file_name",
y_col=None,
batch_size=1,
seed=42,
shuffle=False,
class_mode=None,
target_size=(100,100))

In [None]:
# Floor division: a final partial training batch is not counted in the steps.
# The test generator uses batch_size=1, so STEP_SIZE_TEST equals the number of
# test images.
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size

In [None]:
#train the model on the whole (valid) training set
# Runs for at most 10 epochs with the shared callbacks list.
model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    epochs=10,
                    callbacks=callbacks
)

In [None]:
# Reset the generator before predicting - presumably so iteration starts from
# the first test image; confirm against the Keras iterator documentation.
# One prediction row is produced per step.
test_generator.reset()
pred=model.predict_generator(test_generator,
steps=STEP_SIZE_TEST,
verbose=1)

In [None]:
pred[0]

In [None]:
# A genre counts as predicted when its sigmoid output exceeds 0.5.
pred_bool = (pred > 0.5)

In [None]:
#make predictions for the test set
# Invert class_indices (name -> column) into column -> name, then collect for
# every row the names of the columns that are switched on.
labels = {column: name for name, column in train_generator.class_indices.items()}
predictions2 = [
    [labels[column] for column, is_on in enumerate(flags) if is_on]
    for flags in pred_bool
]

In [None]:
new_feature_test = test_df_valid.copy()

In [None]:
#make test dataset with new feature
# The list is assigned by position, so predictions2 must hold one entry per
# row of test_df_valid, in row order.
new_feature_test['keras_pred'] = predictions2
new_feature_test.head()

In [None]:
#save new dataset to pickle file
# The with-block closes (and flushes) the file even if pickle.dump raises.
with open("test_with_new_feature2.pickle","wb") as pickle_out:
    pickle.dump(new_feature_test, pickle_out)