In [63]:
import pandas as pd
import numpy as np
import random
import os
# Keras imports
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.callbacks import EarlyStopping

from transformers import DistilBertTokenizer

In [64]:
def vgg16_load_model(input_shape):
    """Instantiate the VGG16 network without its top classifier layers.

    Args:
        input_shape: shape forwarded to VGG16 as ``input_shape``.

    Returns:
        The VGG16 model created with ImageNet weights and ``include_top=False``.
    """
    vgg16_options = dict(weights='imagenet', include_top=False, input_shape=input_shape)
    return VGG16(**vgg16_options)


def vgg16_set_nontrainable_layers(model):
    """Mark ``model`` as non-trainable and hand the same object back.

    The flag is set in place on the object that was passed in; no copy is made.
    """
    frozen = model
    frozen.trainable = False
    return frozen


def vgg16_add_last_layers(model):
    """Freeze a pre-trained model and stack a trainable classification head on it.

    The head is: Flatten, then three Dense(relu) + Dropout pairs
    (224 units / 0.4, 112 units / 0.3, 60 units / 0.2), then a 30-unit
    softmax prediction layer.

    Args:
        model: the pre-trained base model; it is marked non-trainable in place.

    Returns:
        A Sequential model made of the frozen base followed by the new head.
    """
    frozen_backbone = vgg16_set_nontrainable_layers(model)

    # (units, dropout rate) for each hidden Dense/Dropout pair, in stacking order.
    hidden_spec = ((224, 0.4), (112, 0.3), (60, 0.2))

    head = [layers.Flatten()]
    for units, drop_rate in hidden_spec:
        head.append(layers.Dense(units, activation='relu'))
        head.append(layers.Dropout(rate=drop_rate))
    head.append(layers.Dense(30, activation='softmax'))

    return Sequential([frozen_backbone, *head])


def vgg16_build_model(input_shape, optimizer):
    """Build and compile the VGG16-based classifier.

    Args:
        input_shape: shape passed to ``vgg16_load_model``.
        optimizer: optimizer handed to ``compile``.

    Returns:
        The model returned by ``vgg16_add_last_layers``, compiled with
        categorical cross-entropy loss and the accuracy metric.
    """
    backbone = vgg16_load_model(input_shape)
    classifier = vgg16_add_last_layers(backbone)
    classifier.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [94]:
# Build the image-only classifier for 224x224 RGB inputs with a small Adam
# learning rate, then print its layer summary.
adam_opt = optimizers.Adam(learning_rate = 0.0001)
model_vgg16 = vgg16_build_model((224, 224, 3), adam_opt)
model_vgg16.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vgg16 (Functional)          (None, 7, 7, 512)         14714688  
                                                                 
 flatten_5 (Flatten)         (None, 25088)             0         
                                                                 
 dense_24 (Dense)            (None, 224)               5619936   
                                                                 
 dropout_91 (Dropout)        (None, 224)               0         
                                                                 
 dense_25 (Dense)            (None, 112)               25200     
                                                                 
 dropout_92 (Dropout)        (None, 112)               0         
                                                                 
 dense_26 (Dense)            (None, 60)               

In [95]:
# DistilBERT tokenizer.
# NOTE(review): later visible cells tokenize with `distilbert_tokenizer`, not
# this `tokenizer` -- confirm whether this instance is still needed.
import transformers
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# NOTE(review): imports in the middle of the notebook; consider moving them to
# the import cell at the top.
from tensorflow.keras import models, layers, metrics

# Load the pre-trained DistilBERT encoder with custom dropout rates. It is a
# module-level object that build_distibert_model() reuses on every call.
# `trainable=False` is forwarded as an extra keyword -- presumably to freeze the
# encoder weights; confirm via the trainable-parameter count in the summary.
config = transformers.DistilBertConfig(dropout=0.2, attention_dropout=0.2)
dbert_tf = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config, trainable=False)

def build_distibert_model(max_length=40, num_classes=30):
    """Build and compile the text classifier on top of the shared DistilBERT encoder.

    Args:
        max_length: number of token ids per sample accepted by the
            ``input_token`` input. It must equal the length the tokenizer
            pads/truncates to. Defaults to 40.
        num_classes: size of the softmax output layer. Defaults to 30.

    Returns:
        A compiled Keras model named ``ClassificationModelTF`` that maps int32
        token ids of shape ``(max_length,)`` to ``num_classes`` probabilities.
        It is compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and sparse categorical accuracy.
    """
    input_ids_in = layers.Input(shape=(max_length,), name='input_token', dtype='int32')

    # Element [0] of the encoder output is the last hidden state; [:, 0, :]
    # keeps only the vector at the first token position of every sequence.
    x = dbert_tf(input_ids=input_ids_in)[0][:, 0, :]
    x = layers.Dropout(0.2, name='dropout')(x)
    x = layers.Dense(64, activation='relu', name='dense')(x)
    x = layers.Dense(num_classes, activation='softmax', name='img_classification')(x)

    model_tf = models.Model(inputs=input_ids_in, outputs=x, name='ClassificationModelTF')

    model_tf.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=[metrics.SparseCategoricalAccuracy()],
    )
    return model_tf

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [96]:
# Build the text-only classifier and print its layer summary.
model_nlp = build_distibert_model()
model_nlp.summary()

Model: "ClassificationModelTF"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_token (InputLayer)    [(None, 40)]              0         
                                                                 
 tf_distil_bert_model_4 (TFD  TFBaseModelOutput(last_h  66362880 
 istilBertModel)             idden_state=(None, 40, 7            
                             68),                                
                              hidden_states=None, att            
                             entions=None)                       
                                                                 
 tf.__operators__.getitem_7   (None, 768)              0         
 (SlicingOpLambda)                                               
                                                                 
 dropout (Dropout)           (None, 768)               0         
                                             

In [102]:
# Grab the symbolic input/output tensors of each branch so they can be wired
# into a single two-input model.
model_nlp = build_distibert_model() # rebuilds the text branch with a new head; comment out to reuse the existing model_nlp
input_text = model_nlp.input
output_text = model_nlp.output

model_img = vgg16_build_model((224, 224, 3), adam_opt) # rebuilds the image branch with a new head; comment out to reuse the existing model_img
input_img = model_img.input
output_img = model_img.output

In [103]:
# Combine the two data streams and add two dense layers on top.
# Input order is [text, image]; the data generator yields its inputs in the
# same order.
inputs = [input_text, input_img]

# Concatenate the two branch outputs (each branch ends in a 30-way softmax).
combined = layers.concatenate([output_text, output_img])
# Two dense layers on top: a 10-unit relu layer, then the final 30-way softmax.
x = layers.Dense(10, activation='relu')(combined)
outputs = layers.Dense(30, activation='softmax', name='classification')(x)


# Assemble the two-input, single-output model.
model_combined = models.Model(inputs=inputs, outputs=outputs)

In [104]:
model_combined.summary()  # inspect the merged two-branch architecture (long output)

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 vgg16_input (InputLayer)       [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 vgg16 (Functional)             (None, 7, 7, 512)    14714688    ['vgg16_input[0][0]']            
                                                                                                  
 flatten_7 (Flatten)            (None, 25088)        0           ['vgg16[0][0]']                  
                                                                                                  
 dense_33 (Dense)               (None, 224)          5619936     ['flatten_7[0][0]']        

In [105]:
# Load features and targets; both files are space-delimited and indexed by
# their "Id" column. The generator below reads the 'Image_name' and 'Title'
# columns from data_df and 'Genre_id' from targets_df.
data_df = pd.read_csv("../raw_data/train_x.csv", delimiter=" ", index_col="Id")
targets_df = pd.read_csv("../raw_data/train_y.csv", delimiter=" ", index_col="Id")

def load_and_preprocess_image(image_names):
    """Load images by file name and prepare them as a single VGG16 input batch.

    Each name is resolved under ``../raw_data/Images``, loaded at 224x224,
    converted to an array and passed through VGG16's ``preprocess_input``.

    Args:
        image_names: iterable of image file names.

    Returns:
        A NumPy array stacking the preprocessed images in input order.
    """
    def _prepare(file_name):
        path = os.path.join("..", "raw_data", "Images", file_name)
        pixels = img_to_array(load_img(path, target_size=(224, 224)))
        return preprocess_input(pixels)

    return np.array([_prepare(file_name) for file_name in image_names])


# Create the DistilBERT tokenizer used by preprocess_text() below.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def preprocess_text(text_titles, max_length=40):
    """Tokenize titles into fixed-length DistilBERT input ids.

    Args:
        text_titles: iterable of title strings.
        max_length: length every title is padded or truncated to. The default
            of 40 matches the ``input_token`` shape ``(40,)`` declared by the
            text model; the previous hard-coded 60 produced batches the model
            was not built for.

    Returns:
        A list with one entry per title, in input order: the ``'input_ids'``
        value produced by ``encode_plus`` (requested with
        ``return_tensors='tf'``).
    """
    return [
        distilbert_tokenizer.encode_plus(
            title,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf',
        )['input_ids']
        for title in text_titles
    ]


def custom_data_generator(data_df, targets_df, batch_size):
    """Endlessly yield shuffled ``([texts, images], targets)`` batches.

    Rows are reshuffled at the start of every pass over the data. The final
    batch of a pass may hold fewer than ``batch_size`` samples.

    Args:
        data_df: DataFrame with ``'Image_name'`` and ``'Title'`` columns.
        targets_df: DataFrame with a ``'Genre_id'`` column, sharing index
            labels with ``data_df``.
        batch_size: maximum number of samples per batch (positive integer).

    Yields:
        ``([batch_texts, batch_images], batch_targets)`` where ``batch_texts``
        has the size-1 axis of each encoded title removed and ``batch_targets``
        is a NumPy array of the ``'Genre_id'`` values for the same rows.

    Raises:
        ValueError: on first iteration, if ``data_df`` is empty or
            ``batch_size`` is not positive (the loop would otherwise spin
            forever without yielding).
    """
    num_samples = len(data_df)
    if num_samples == 0:
        raise ValueError("data_df is empty; there is nothing to generate batches from")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    positions = list(range(num_samples))
    while True:
        # Shuffle the row order for each pass over the data.
        random.shuffle(positions)

        for start in range(0, num_samples, batch_size):
            # Select rows by position: the frames are indexed by Id, so plain
            # `series[list_of_ints]` would be a label lookup and fail unless
            # the Ids happen to be exactly 0..n-1.
            batch_rows = data_df.iloc[positions[start:start + batch_size]]

            batch_images = load_and_preprocess_image(batch_rows['Image_name'])

            encoded_titles = preprocess_text(batch_rows['Title'])
            # Drop only the per-title size-1 axis; an unrestricted squeeze
            # would also remove the batch axis when the batch has one sample.
            batch_texts = np.squeeze(np.array(encoded_titles), axis=1)

            # Fetch targets by the same Id labels so rows stay aligned.
            batch_targets = targets_df['Genre_id'].loc[batch_rows.index].values

            yield [batch_texts, batch_images], batch_targets

In [107]:
model_combined.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 32
epochs = 100
validation_fraction = 0.2

# EarlyStopping monitors "val_accuracy", a metric Keras only reports when fit()
# is given validation data. Hold out part of the training set so the callback
# (and restore_best_weights) actually has something to act on.
split_rng = np.random.default_rng(42)
shuffled_positions = split_rng.permutation(len(data_df))
num_val = max(1, int(len(data_df) * validation_fraction))
val_positions = shuffled_positions[:num_val]
train_positions = shuffled_positions[num_val:]

# Align targets to data_df's rows by Id before splitting positionally, then
# reset to a 0-based index so the generator can address rows by position.
targets_aligned = targets_df.loc[data_df.index]
train_data_df = data_df.iloc[train_positions].reset_index(drop=True)
train_targets_df = targets_aligned.iloc[train_positions].reset_index(drop=True)
val_data_df = data_df.iloc[val_positions].reset_index(drop=True)
val_targets_df = targets_aligned.iloc[val_positions].reset_index(drop=True)

es = EarlyStopping(patience=10, restore_best_weights=True, monitor="val_accuracy")

data_generator = custom_data_generator(train_data_df, train_targets_df, batch_size)
val_generator = custom_data_generator(val_data_df, val_targets_df, batch_size)

# The generators never terminate, so both step counts must be given explicitly.
steps_per_epoch = max(1, len(train_data_df) // batch_size)
validation_steps = max(1, len(val_data_df) // batch_size)

history = model_combined.fit(
    data_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_data=val_generator,
    validation_steps=validation_steps,
    callbacks=[es],
)

Epoch 1/100


2023-06-18 10:34:31.895583: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]




KeyboardInterrupt: 

(32,)