In [1]:
import pickle
import urllib.request as req
import pandas as pd
import numpy as np
import os

In [2]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas and Keras warnings that may point at real problems. Consider narrowing the
# filter to a specific category or module.
warnings.filterwarnings('ignore')

In [3]:
import itertools

In [4]:
from keras import backend as K

Using TensorFlow backend.


In [5]:
def f1(y_true, y_pred):
    """F1 score computed over the whole batch with Keras backend ops.

    Tensors are clipped to [0, 1] and rounded before counting, so all counts
    are taken over hard 0/1 values. ``K.epsilon()`` is added to every
    denominator to keep the divisions finite when a count is zero.
    """
    def _positive_count(tensor):
        # Number of entries that round to 1 after clipping into [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    def recall(y_true, y_pred):
        """Share of the actual positives that were also predicted."""
        hits = _positive_count(y_true * y_pred)
        actual = _positive_count(y_true)
        return hits / (actual + K.epsilon())

    def precision(y_true, y_pred):
        """Share of the predicted positives that are actual positives."""
        hits = _positive_count(y_true * y_pred)
        selected = _positive_count(y_pred)
        return hits / (selected + K.epsilon())

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    # Harmonic mean of precision and recall.
    return 2*((p*r)/(p+r+K.epsilon()))

In [6]:
# Multi-label loss: binary cross-entropy summed over the label axis
def hn_multilabel_loss(y_true, y_pred):
    """Binary cross-entropy summed over axis 1, then averaged.

    Note that this is a log loss, not a Hamming loss: for every label the
    term ``-y*log(p) - (1-y)*log(1-p)`` is evaluated, the terms are summed
    along axis 1 and the mean of those sums is returned.
    """
    eps = K.epsilon()
    # Keep probabilities strictly inside (0, 1) so neither log() is fed 0.
    probs = K.clip(y_pred, eps, 1 - eps)
    positive_term = y_true * K.log(probs)
    negative_term = (1 - y_true) * K.log(1 - probs)
    per_sample = K.sum(- positive_term - negative_term, axis=1)
    return K.mean(per_sample)

In [7]:
def return_image_name_col(df):
    """Build the image file name for every row of ``df``.

    Each index label is converted to ``"<label>.jpg"``. The names are
    returned as a NumPy array in the same order as the index.
    """
    return np.array([str(label) + '.jpg' for label in df.index.values])

In [8]:
#load training set
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as the object has been read.
with open("all_books_train.pickle","rb") as pickle_in_train:
    train = pickle.load(pickle_in_train)

In [9]:
#load test set
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle as soon as the object has been read.
with open("all_books_test.pickle","rb") as pickle_in_test:
    test = pickle.load(pickle_in_test)

In [10]:
# Sanity check: (rows, columns) of the loaded training frame
train.shape

(36389, 14)

In [11]:
# Sanity check: (rows, columns) of the loaded test frame
test.shape

(12131, 14)

In [12]:
# One "<index label>.jpg" file name per training row, in row order
train_filename_col = return_image_name_col(train)

In [13]:
# Keep only the genre lists and pair every row with its image file name.
# The explicit copy makes train_df an independent frame, so adding a column does
# not write to a slice of `train` (which raises SettingWithCopyWarning).
train_df = train[['genres_cut']].copy()
train_df['file_name'] = train_filename_col

In [14]:
# Preview the first rows: genre lists next to the image file names
train_df.head()

Unnamed: 0,genres_cut,file_name
6377,"[Romance, Contemporary]",6377.jpg
19880,"[Fiction, Romance, Historical, Cultural]",19880.jpg
23352,[Nonfiction],23352.jpg
16021,"[Fiction, Mystery, Historical, Science Fiction]",16021.jpg
49981,"[Thriller, Fiction, Mystery]",49981.jpg


In [15]:
# Same preparation as for the training set: genre lists plus image file names.
test_filename_col = return_image_name_col(test)
# The explicit copy makes test_df an independent frame, so adding a column does
# not write to a slice of `test` (which raises SettingWithCopyWarning).
test_df = test[['genres_cut']].copy()
test_df['file_name'] = test_filename_col
test_df.head()

Unnamed: 0,genres_cut,file_name
9102,"[Romance, Fiction]",9102.jpg
34116,"[History, Biography, Nonfiction]",34116.jpg
53879,"[Crime, Fiction, Mystery, Thriller]",53879.jpg
30327,"[Childrens, Fantasy]",30327.jpg
36487,"[Young Adult, Fantasy]",36487.jpg


In [16]:
# Independent copy of train_df; rows with unusable images are removed from this
# frame, not from train_df itself
train_df_valid = train_df.copy()

In [17]:
# Keep only the training rows whose image file is at least 1024 bytes on disk
# (smaller files are presumably failed or placeholder downloads; TODO confirm).
# The labels are collected first and dropped in a single call. This avoids
# re-building the frame once per rejected row and makes the cell idempotent,
# because train_df_valid is always derived from the untouched train_df.
# os.path.getsize raises OSError if a file is missing, as os.stat did before.
undersized = [
    index for index, file_name in train_df['file_name'].items()
    if os.path.getsize('train_images/' + file_name) < 1024
]
train_df_valid = train_df.drop(index=undersized)

In [18]:
# Rows remaining after the image-size filter
train_df_valid.shape

(36298, 2)

In [19]:
# Keep only the test rows whose image file is at least 1024 bytes on disk
# (smaller files are presumably failed or placeholder downloads; TODO confirm).
# The labels are collected first and dropped in a single call, which returns a
# new frame and leaves test_df untouched.
# os.path.getsize raises OSError if a file is missing, as os.stat did before.
undersized = [
    index for index, file_name in test_df['file_name'].items()
    if os.path.getsize('test_images/' + file_name) < 1024
]
test_df_valid = test_df.drop(index=undersized)

In [20]:
# Rows remaining after the image-size filter
test_df_valid.shape

(12096, 2)

In [21]:
from keras.models import Sequential
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers

In [22]:
from sklearn.model_selection import KFold

In [23]:
# Sorted array of every distinct genre label that appears in train_df
# (the frame before the image-size filter is applied).
all_classes = np.unique([genre for genres in train_df['genres_cut'] for genre in genres])

In [24]:
# 5 folds taken as consecutive blocks of rows: shuffle=False, so random_state has no effect
kf = KFold(n_splits=5,random_state=None, shuffle=False)

In [25]:
# Number of occurrences of each genre across the size-filtered training rows,
# listed in the same order as all_classes.
genre_lists = list(train_df_valid['genres_cut'])
genres_count = [
    sum(genres.count(genre) for genres in genre_lists)
    for genre in all_classes
]

In [26]:
# One row per genre with its occurrence count; rows follow the order of all_classes
most_common_genres = pd.DataFrame({'genre':all_classes, 'count':genres_count})

In [27]:
# Inverse-frequency weight per genre: (number of training rows) / (genre count).
n_rows = len(train_df_valid['genres_cut'])
most_common_genres['class_weight'] = n_rows/most_common_genres['count']
# Map each row label of most_common_genres (its position in all_classes) to its weight.
class_weights = most_common_genres['class_weight'].to_dict()

In [28]:
# Inspect the weights; keys are the row labels of most_common_genres
class_weights

{0: 21.839951865222623,
 1: 20.741714285714284,
 2: 12.691608391608392,
 3: 7.965328066710555,
 4: 8.068015114469882,
 5: 23.646905537459283,
 6: 10.091187100361413,
 7: 16.85926614026939,
 8: 3.424662704028682,
 9: 1.895852919669905,
 10: 6.9007604562737646,
 11: 18.472264631043256,
 12: 21.214494447691408,
 13: 22.872085696282294,
 14: 13.408939785740673,
 15: 8.599384032219852,
 16: 6.41647516351423,
 17: 20.06522940851299,
 18: 9.408501814411613,
 19: 3.7905179615705933,
 20: 9.577308707124011,
 21: 16.061061946902655,
 22: 4.622182605373743}

In [29]:
# Both generators only rescale pixel values by 1/255; no augmentation is configured
datagen=ImageDataGenerator(rescale=1./255.)
test_datagen=ImageDataGenerator(rescale=1./255.)

In [30]:
#build the model
# Two conv-conv-pool-dropout stages on 100x100 RGB input, followed by a dense head.
model = Sequential()
model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=(100,100,3)))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(64, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# One sigmoid unit per genre so that several genres can be active for one image.
# The width is taken from all_classes (the class list given to the generators)
# instead of a hard-coded 23, so the model and the label encoding cannot drift apart.
model.add(Dense(len(all_classes), activation='sigmoid'))
model.compile(optimizers.rmsprop(lr=0.0001, decay=1e-6),loss=[hn_multilabel_loss],metrics=[f1])

In [31]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [32]:
# All three callbacks monitor 'val_loss' by default, but no validation data is
# passed to fit_generator in this notebook, so that quantity is never reported
# and the callbacks would do nothing. Monitor the training loss instead so that
# LR reduction, early stopping and checkpointing actually take effect.
callbacks = [ ReduceLROnPlateau(monitor='loss'),
            EarlyStopping(monitor='loss', patience=4),
            ModelCheckpoint(filepath='model.h5', monitor='loss', save_best_only=True)
            ]

In [33]:
# Accumulates the decoded predictions produced for each cross-validation fold
predictions_lst = []

In [34]:
# Out-of-fold predictions for the training set: every row is predicted by a model
# fitted on the other folds only. kf is created with shuffle=False, so the folds
# are consecutive row ranges and concatenating the per-fold predictions follows
# the row order of train_df_valid.
predictions_lst = []  # start clean so re-running this cell cannot append duplicates

# The same `model` object is reused for every fold. Remember its starting weights
# and learning rate so each fold trains from that state instead of continuing
# from the previous fold, which has already seen this fold's held-out rows.
# NOTE(review): optimizer accumulators are not reset here; rebuild and recompile
# the model per fold if fully independent folds are required.
initial_weights = model.get_weights()
initial_lr = K.get_value(model.optimizer.lr)

for train_index, test_index in kf.split(train_df_valid):
    model.set_weights(initial_weights)
    K.set_value(model.optimizer.lr, initial_lr)

    #train generator
    train_generator = datagen.flow_from_dataframe(
        dataframe=train_df_valid.iloc[train_index],
        directory="train_images",
        x_col="file_name",
        y_col="genres_cut",
        batch_size=32,
        seed=42,
        shuffle=True,
        class_mode="categorical",
        classes=list(all_classes),
        target_size=(100,100))

    #held-out fold generator: no labels, fixed order, one image per batch
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=train_df_valid.iloc[test_index],
        directory="train_images",
        x_col="file_name",
        y_col=None,
        batch_size=1,
        seed=42,
        shuffle=False,
        class_mode=None,
        target_size=(100,100))

    #fitting the model
    # Floor division: a trailing partial batch is not counted in the epoch length.
    STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
    STEP_SIZE_TEST = test_generator.n//test_generator.batch_size
    # NOTE(review): class_weight is keyed by class index; confirm how Keras applies
    # it to multi-hot targets.
    model.fit_generator(generator=train_generator,
                        steps_per_epoch=STEP_SIZE_TRAIN,
                        epochs=30,
                        callbacks=callbacks,
                        class_weight=class_weights)

    #prediction
    test_generator.reset()
    pred = model.predict_generator(test_generator,
                                   steps=STEP_SIZE_TEST,
                                   verbose=1)

    # A genre is predicted when its sigmoid output exceeds 0.5.
    pred_bool = (pred > 0.5)

    # Invert class_indices (name -> column) to decode flagged columns back to names.
    labels = {position: name for name, position in train_generator.class_indices.items()}
    predictions = [
        [labels[position] for position, flagged in enumerate(row) if flagged]
        for row in pred_bool
    ]
    predictions_lst.extend(predictions)

Found 29038 validated image filenames belonging to 23 classes.
Found 7260 validated image filenames.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Found 29038 validated image filenames belonging to 23 classes.
Found 7260 validated image filenames.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Found 29038 validated image filenames belonging to 23 classes.
Found 7260 validated image filena

Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Found 29039 validated image filenames belonging to 23 classes.
Found 7259 validated image filenames.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Found 29039 validated image filenames belonging to 23 classes.
Found 7259 validated image filenames.
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30


In [35]:
# New frame: the size-filtered training rows plus their cross-validation predictions
new_feature_train = train_df_valid.assign(keras_pred=predictions_lst)


In [36]:
# Persist the training frame together with its predictions.
# The file is opened here (the open() call had been commented out, leaving
# pickle_out undefined on a fresh run) and is closed by the context manager
# even if dumping fails.
with open("train_with_weights.pickle","wb") as pickle_out:
    pickle.dump(new_feature_train, pickle_out)

In [46]:
# Generator over the complete size-filtered training set, used for the final fit
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df_valid,
    directory="train_images",
    x_col="file_name",
    y_col="genres_cut",
    class_mode="categorical",
    classes=list(all_classes),
    target_size=(100, 100),
    batch_size=32,
    shuffle=True,
    seed=42,
)

Found 36298 validated image filenames belonging to 23 classes.


In [48]:
# Generator over the size-filtered test set: no labels, fixed order, one image per batch
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df_valid,
    directory="test_images",
    x_col="file_name",
    y_col=None,
    class_mode=None,
    target_size=(100, 100),
    batch_size=1,
    shuffle=False,
    seed=42,
)

Found 12096 validated image filenames.


In [49]:
# Number of batches per pass over each generator. Floor division means a trailing
# partial batch is not counted.
STEP_SIZE_TRAIN=train_generator.n//train_generator.batch_size
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size

In [50]:
# Fit on the full training generator. The call is the last expression of the cell,
# so the returned History object is displayed.
model.fit_generator(
    generator=train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    epochs=30,
    callbacks=callbacks,
    class_weight=class_weights,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7eef9b080128>

In [51]:
# Rewind the generator so predictions start at the first test row
test_generator.reset()
pred = model.predict_generator(
    test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1,
)



In [52]:
# Boolean mask: True where a predicted value exceeds 0.5
pred_bool = (pred > 0.5)

In [53]:
# Decode each boolean prediction row into the list of genre names it flags.
# class_indices maps name -> column, so invert it to look names up by column.
labels = {position: name for name, position in train_generator.class_indices.items()}
predictions2 = [
    [labels[position] for position, flagged in enumerate(row) if flagged]
    for row in pred_bool
]

In [54]:
# Work on a copy so test_df_valid itself stays unchanged
new_feature_test = test_df_valid.copy()

In [55]:
# Attach the decoded predictions as a new column and preview the result
new_feature_test['keras_pred'] = predictions2
new_feature_test.head()

Unnamed: 0,genres_cut,file_name,keras_pred
9102,"[Romance, Fiction]",9102.jpg,[]
34116,"[History, Biography, Nonfiction]",34116.jpg,[Nonfiction]
53879,"[Crime, Fiction, Mystery, Thriller]",53879.jpg,[Fiction]
30327,"[Childrens, Fantasy]",30327.jpg,[Nonfiction]
36487,"[Young Adult, Fantasy]",36487.jpg,[]


In [56]:
# Persist the test frame together with its predictions. The context manager
# closes the file even if dumping fails.
with open("test_with_weights.pickle","wb") as pickle_out:
    pickle.dump(new_feature_test, pickle_out)

In [58]:
# Number of prediction lists collected for the training rows
len(predictions_lst)

36298

In [59]:
# Number of prediction lists produced for the test rows
len(predictions2)

12096

In [60]:
#load the pickled test frame with the new feature
# NOTE(review): this reads "test_with_new_feature.pickle", whereas the frame
# written earlier in this notebook goes to "test_with_weights.pickle". Confirm
# that this is the intended input.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("test_with_new_feature.pickle","rb") as pickle_in_test:
    test2 = pickle.load(pickle_in_test)