In [1]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
import tensorflow as tf
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig
import transformers



In [None]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  
config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=False)
bert_model = TFBertModel.from_pretrained('bert-base-uncased', config=config)

In [4]:
sent = 'This is a example Text that we are using for Understanding Purpose, another word that we gonna use is Kaggle'
tokens = bert_tokenizer.tokenize(sent)
print(tokens)

['this', 'is', 'a', 'example', 'text', 'that', 'we', 'are', 'using', 'for', 'understanding', 'purpose', ',', 'another', 'word', 'that', 'we', 'gonna', 'use', 'is', 'ka', '##ggle']


In [5]:
ids = bert_tokenizer.convert_tokens_to_ids(tokens)
print("convert_tokens_to_ids:",ids)

ids_encode = bert_tokenizer.encode(sent,add_special_tokens = True,max_length =30,pad_to_max_length = True,
return_attention_mask = True)
print("\n\nencode:",ids_encode)

ids_encode_plus = bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =30,pad_to_max_length = True,
return_attention_mask = True)
print("\n\nencode_plus:",ids_encode_plus)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


convert_tokens_to_ids: [2023, 2003, 1037, 2742, 3793, 2008, 2057, 2024, 2478, 2005, 4824, 3800, 1010, 2178, 2773, 2008, 2057, 6069, 2224, 2003, 10556, 24679]


encode: [101, 2023, 2003, 1037, 2742, 3793, 2008, 2057, 2024, 2478, 2005, 4824, 3800, 1010, 2178, 2773, 2008, 2057, 6069, 2224, 2003, 10556, 24679, 102, 0, 0, 0, 0, 0, 0]


encode_plus: {'input_ids': [101, 2023, 2003, 1037, 2742, 3793, 2008, 2057, 2024, 2478, 2005, 4824, 3800, 1010, 2178, 2773, 2008, 2057, 6069, 2224, 2003, 10556, 24679, 102, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}


In [6]:
print("convert_ids_to_tokens:",bert_tokenizer.convert_ids_to_tokens(ids))

print("\ndecode:",bert_tokenizer.decode(ids_encode_plus['input_ids']))

convert_ids_to_tokens: ['this', 'is', 'a', 'example', 'text', 'that', 'we', 'are', 'using', 'for', 'understanding', 'purpose', ',', 'another', 'word', 'that', 'we', 'gonna', 'use', 'is', 'ka', '##ggle']

decode: [CLS] this is a example text that we are using for understanding purpose, another word that we gonna use is kaggle [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [7]:
import pandas as pd
import re
import numpy as np
from tqdm import tqdm

train = pd.read_csv('F://bbc-text.csv')
train.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [8]:
train['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [9]:
def clean_text(temp):
    temp = re.sub("@\S+", " ", temp)
    temp = re.sub("https*\S+", " ", temp)
    temp = re.sub("#\S+", " ", temp)
    temp = re.sub("\'\w+", '', temp)
    temp = re.sub(r'\w*\d+\w*', '', temp)
    temp = re.sub('\s{2,}', " ", temp)
    
    return temp.strip()

In [10]:
train['text_clean'] = train['text'].apply(clean_text)
sentences = train['text_clean']

In [11]:
input_ids=[]
attention_masks=[]

for sent in tqdm(sentences):
    bert_inp = bert_tokenizer.encode_plus(sent,
                                          add_special_tokens=True,
                                          max_length =128,
                                          pad_to_max_length=True,
                                          return_attention_mask=True)
    
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
target = np.array(pd.get_dummies(train['category']))

100%|█████████████████████████████████████████████████████████████████████████████| 2225/2225 [00:15<00:00, 144.69it/s]


In [12]:
bert_tokenizer.convert_ids_to_tokens(101)

'[CLS]'

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, train_mask, test_mask = train_test_split(input_ids, target, attention_masks, test_size=0.2)

In [14]:
bert_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [16]:
def create_model(model_):
    input_ids = tf.keras.Input(shape=(128,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(128,), dtype='int32')
    
    output = model_(input_ids, attention_masks)
    output = output[0]      # this is inline in config.output_hidden_states as we want only the top head
    
    output = output[:,0,:]  #  We are only interested in <cls> or classification token of the model which can be extracted
                            #  using the slice operation. Now we have 2D data and build the network as one desired.
                            #  While converting 3D data to 2D we may miss on valuable info.
    
    output = tf.keras.layers.Dense(32, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(5, activation='softmax')(output)
    model = tf.keras.models.Model(inputs=[input_ids,attention_masks], outputs=output)
    
    
    for layer in model.layers[:3]:
        layer.trainable = False
    return model

model = create_model(bert_model)
model.summary()  

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_3[0][0]',                
                                thPoolingAndCrossAt               'input_4[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [58]:
type(X_train)

numpy.ndarray

In [19]:
history = model.fit([X_train, train_mask], y_train, batch_size=32, epochs=4, validation_data=([X_test, test_mask], y_test))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [31]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_3[0][0]',                
                                thPoolingAndCrossAt               'input_4[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                         

In [98]:
Ind2Label = {
    0: 'politics',
    1: 'entertainment',
    2: 'business',
    3: 'sport',
    4: 'technology'
}

def predict(model, text):
    bert_inputs = bert_tokenizer.encode_plus(text, max_length=128, padding='max_length', truncation=True)
    input_ids = np.array([bert_inputs['input_ids']])
    attention_mask = np.array([bert_inputs['attention_mask']])
    predictions = model.predict([input_ids, attention_mask])
    print(f"predictions: {predictions}")
    predicted_index = np.argmax(predictions)
    print(f"prediction is: {Ind2Label[predicted_index]}")
    
text = "Serie A leaders Napoli moved 18 points clear at the top of the table thanks to a routine away victory over Sassuolo."
predict(model, text)

predictions: [[7.4171767e-02 2.2354107e-03 8.1301958e-04 9.0162307e-01 2.1156736e-02]]
prediction is: sport


In [94]:
tf.keras.models.save_model(model=model, filepath='text-classifierr.h5')

In [103]:
loaded_model = tf.keras.models.load_model('text-classifierr.h5', custom_objects={"TFBertModel": transformers.TFBertModel})

Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}





In [104]:
predict(loaded_model, text)

predictions: [[7.4171767e-02 2.2354107e-03 8.1301958e-04 9.0162307e-01 2.1156736e-02]]
prediction is: sport
