In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt

## **Ingestion**

In [10]:
# Read csv file
df = pd.read_csv('../data/cleaned_dataset.csv', sep=',')
df.sample(5)

Unnamed: 0,username,id,caption,Fashion and Style,Food and Dining,Family and Relationships,Sports and Fitness,Entertainment,Business and Industry,Travel and Adventure,Arts and Culture,News,Pets,Technology and Gadgets
560,discodaydream,17903285383934736,make your mind a good place to be,1,0,0,0,0,0,0,0,0,0,0
11265,flowersfordreams,18159336298206323,is your christmas gift delayedon back order or...,0,0,1,0,0,1,0,0,1,0,0
13805,adamdodsworth,17904106711381951,definitely missing some proper adventures at t...,0,0,0,0,0,0,1,0,0,0,0
5102,evasonaike,17938162949250204,very honoured to be part of this years judging...,0,0,1,0,0,0,0,0,0,0,0
13293,aprilrossbeach,17895505051851035,tighten your pony its a new weekregardless of ...,0,0,0,1,0,0,1,0,0,0,0


In [11]:
all_interests= [
    'Fashion and Style',
    'Food and Dining',
    'Family and Relationships',
    'Sports and Fitness',
    'Entertainment',
    'Business and Industry',
    'Travel and Adventure',
    'Arts and Culture',
    'News',
    'Pets',
    'Technology and Gadgets'
]

## **Preprocessing**

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [13]:
token = tokenizer.encode_plus(
    df['caption'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [14]:
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [17]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['caption']), total=len(df), desc='Generating training data', ncols=100):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [18]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

Generating training data: 100%|██████████████████████████████| 23749/23749 [00:29<00:00, 799.80it/s]


In [19]:
labels = df[all_interests].values.astype('int8')
labels.shape

(23749, 11)

In [20]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [21]:
def dataset_map_function(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [22]:
dataset = dataset.map(dataset_map_function) # converting to required format for tensorflow dataset 

In [23]:
batch_size = 16
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True) # batch size, drop any left out tensor

In [24]:
p = 0.7 # 70% of data for training
train_size = int((len(df)//batch_size)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.
train_size

1038

In [25]:
# split the data into train and validation and test sets
train_dataset = dataset.take(train_size)
val_test_dataset = dataset.skip(train_size)
val_dataset = val_test_dataset.take(len(val_test_dataset)//2)
test_dataset = val_test_dataset.skip(len(val_test_dataset)//2)
# size of train, validation and test sets
len(train_dataset), len(val_dataset), len(test_dataset)

(1038, 223, 223)

## **Model & Training**

In [None]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(labels.shape[1], activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

interest_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
interest_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [None]:
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)
loss = tf.keras.losses.BinaryCrossentropy()
metrics = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'), 
    tf.keras.metrics.Precision(name='precision'), # how many of the samples you predicted as positive are actually positive
    tf.keras.metrics.Recall(name='recall'), # how many of this class you find over the whole number of element of this class
]

interest_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
history = interest_model.fit(train_dataset, epochs=5, validation_data=val_dataset, verbose=1, batch_size=batch_size, shuffle=True)

## **Evaluation**

In [None]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def prepare_data(input_text, tokenizer):
    input_text = clean_text(input_text)
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=all_interests):
    probs = model.predict(processed_data)[0]
    threshold = 0.1
    # Return a pandas series of interests with probability greater than threshold
    return pd.Series(data=probs, index=classes)[lambda x: x > threshold].sort_values(ascending=False)

In [None]:
# Score the model on the test data and show random samples
test_loss, test_acc, test_prec, test_rec = interest_model.evaluate(test_dataset.take(10), verbose=1)
print(f'Test loss: {test_loss}')
print(f'Test accuracy: {test_acc}')

2023-05-30 21:52:18.362210: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int8 and shape [24277,11]
	 [[{{node Placeholder/_2}}]]
2023-05-30 21:52:18.362685: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int8 and shape [24277,11]
	 [[{{node Placeholder/_2}}]]


Test loss: 0.08449555933475494
Test accuracy: 0.9011363387107849


In [None]:
# Save the model
interest_model.save('../artifacts/bert_interest_model_v2.h5')

## **Inference**

In [None]:
interest_model = tf.keras.models.load_model('../artifacts/bert_interest_model.h5')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
caption = 'We are looking for a Data Scientist with 3+ years of experience in Machine Learning and Deep Learning'
# caption = 'Cats are the best animals in the world'
# caption="Siraje ate my peanut butter and he eats pizza once a week."
processed_data = prepare_data(caption, tokenizer)
result = make_prediction(interest_model, processed_data=processed_data)
result



Technology and Gadgets    0.992274
dtype: float32