<a href="https://colab.research.google.com/github/GeorgeMarica/XLNET-text-classification/blob/main/BERT_full_sentence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import dependencies

In [None]:
# Install the bert-for-tf2 package (presumably the source of the `bert` module imported in the next cell — confirm).
!pip install bert-for-tf2

In [None]:
import bert
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
from google.colab import drive
from tensorflow.keras.models import Model
from tensorflow.keras.layers  import Input, GlobalAveragePooling1D, Dense, Conv1D, Dropout, LSTM, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, f1_score, precision_score, recall_score
from keras.models import model_from_json

BERT model and tokenizer definition

In [None]:
def build_model(max_seq_length, max_no_categories, bert_layer,
                lstm_units=150, lstm_dropout=0.2, optimizer='adam'):
  """Build and compile a BERT + bidirectional-LSTM classifier.

  The model takes three int32 inputs of length ``max_seq_length``
  (``input_word_ids``, ``input_mask``, ``segment_ids``), feeds them through
  ``bert_layer``, runs a bidirectional LSTM over the per-token output and ends
  in a softmax layer with ``max_no_categories`` units.

  Args:
    max_seq_length: Length of every input sequence.
    max_no_categories: Number of output classes (softmax units).
    bert_layer: Callable layer that accepts ``[word_ids, mask, segment_ids]``
      and returns ``(pooled_output, sequence_output)``.
    lstm_units: Units per LSTM direction (default 150, the previous constant).
    lstm_dropout: Input dropout rate of the LSTM (default 0.2, the previous
      constant).
    optimizer: Anything accepted by ``Model.compile`` (default ``'adam'``).
      NOTE(review): the string ``'adam'`` uses the Keras default learning
      rate, which is presumably far larger than the rates usually used when
      the BERT weights themselves are trainable — consider passing an
      optimizer instance with a small learning rate; confirm empirically.

  Returns:
    The compiled Keras ``Model`` (sparse categorical cross-entropy loss,
    accuracy metric).
  """
  input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
  input_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
  segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")
  model_inputs = [input_word_ids, input_mask, segment_ids]
  # Only the per-token sequence output is consumed; the pooled output is unused.
  _pooled_output, sequence_output = bert_layer(model_inputs)
  x = Bidirectional(LSTM(units=lstm_units, activation='tanh', dropout=lstm_dropout))(sequence_output)
  out = Dense(max_no_categories, activation="softmax", name="dense_output")(x)
  model = Model(inputs=model_inputs, outputs=out)
  # Sparse loss: labels are expected as integer class ids, not one-hot vectors.
  model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model

In [None]:
def build_tokenizer(bert_layer):
  """Create a WordPiece ``FullTokenizer`` that matches ``bert_layer``.

  The vocabulary file and the casing flag are both read from the hub module so
  that the tokenizer agrees with the checkpoint. Previously the casing flag was
  omitted, which left the tokenizer on its default (lower-casing) even when a
  cased checkpoint was loaded.

  Args:
    bert_layer: A ``hub.KerasLayer`` whose ``resolved_object`` exposes
      ``vocab_file`` (and, when available, ``do_lower_case``).

  Returns:
    A ``bert.bert_tokenization.FullTokenizer`` instance.
  """
  FullTokenizer = bert.bert_tokenization.FullTokenizer
  resolved = bert_layer.resolved_object
  vocab_file = resolved.vocab_file.asset_path.numpy()
  # Fall back to the tokenizer's own default when the module does not publish
  # a casing flag, which keeps the previous behaviour for such modules.
  lower_case_flag = getattr(resolved, "do_lower_case", None)
  do_lower_case = True if lower_case_flag is None else bool(lower_case_flag.numpy())
  return FullTokenizer(vocab_file, do_lower_case)

BERT full-sentence input encoding (token ids, attention masks, segment ids)

In [None]:
def get_masks(tokens, max_seq_length):
    """Return the attention mask for ``tokens``.

    The mask holds 1 for every real token followed by 0 for each padding
    position up to ``max_seq_length``. Nothing is truncated: if ``tokens`` is
    longer than ``max_seq_length`` the result contains one 1 per token.
    """
    real_count = len(tokens)
    total_length = max(real_count, max_seq_length)
    return [1 if position < real_count else 0 for position in range(total_length)]

def get_segments(tokens, max_seq_length):
    """Return BERT segment (token-type) ids for ``tokens``.

    Tokens up to and including the first ``"[SEP]"`` get id 0, every later
    token gets id 1, and padding positions up to ``max_seq_length`` get 0.

    The previous implementation started its toggle at ``True``, so a
    single-sentence input received id 1 at every position (padding included)
    and sentence pairs were labelled in the opposite order.

    Args:
        tokens: Token strings, including the ``[CLS]``/``[SEP]`` markers.
        max_seq_length: Target length after padding.

    Returns:
        A list of ints (0/1). It is not truncated when ``tokens`` is longer
        than ``max_seq_length``.
    """
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            # Everything after the first separator belongs to the second segment.
            current_segment_id = 1
    padding = [0] * (max_seq_length - len(tokens))
    return segments + padding

def get_ids(tokens, tokenizer, max_seq_length):
    """Map ``tokens`` to vocabulary ids and right-pad with 0.

    ``tokenizer.convert_tokens_to_ids`` performs the lookup; the result is
    extended with zeros until it has ``max_seq_length`` entries. No truncation
    happens when there are already more ids than ``max_seq_length``.
    """
    vocab_ids = tokenizer.convert_tokens_to_ids(tokens)
    missing = max_seq_length - len(vocab_ids)
    zero_padding = [0] * missing
    return vocab_ids + zero_padding

def create_single_input(sentence,max_seq_length,tokenizer):
  """Encode one sentence as ``(ids, masks, segments)``.

  The sentence is tokenized, cut so that the ``[CLS]`` and ``[SEP]`` markers
  still fit into ``max_seq_length`` positions, wrapped in those markers and
  then converted with ``get_ids``, ``get_masks`` and ``get_segments``.
  """
  # Two positions are reserved for the [CLS] and [SEP] markers.
  content_budget = max_seq_length - 2
  wordpieces = tokenizer.tokenize(sentence)[:content_budget]
  framed_tokens = ["[CLS]", *wordpieces, "[SEP]"]
  ids = get_ids(framed_tokens, tokenizer, max_seq_length)
  masks = get_masks(framed_tokens, max_seq_length)
  segments = get_segments(framed_tokens, max_seq_length)
  return ids, masks, segments

def create_input_array(sentences, max_seq_length, tokenizer):
  """Encode every sentence and stack the results into three int32 arrays.

  Returns a list ``[ids, masks, segments]`` where each element is a NumPy
  array built from the corresponding output of ``create_single_input`` for
  all sentences, in input order.
  """
  encoded_rows = [
      create_single_input(sentence, max_seq_length, tokenizer)
      for sentence in sentences
  ]
  # Column 0 = token ids, column 1 = attention masks, column 2 = segment ids.
  return [
      np.asarray([row[column] for row in encoded_rows], dtype=np.int32)
      for column in range(3)
  ]

Model evaluation and saving

In [None]:
def model_evaluation(test_x, test_y, model):
  """Predict on ``test_x`` and compute classification metrics against ``test_y``.

  The predicted class of each sample is the argmax over the model's output
  probabilities. Precision, recall and F1 use weighted averaging; Cohen's
  kappa uses linear weights.

  Args:
    test_x: Inputs passed unchanged to ``model.predict``.
    test_y: True integer class labels.
    model: Object exposing ``predict(x, verbose=..., batch_size=...)`` that
      returns a 2-D array of per-class scores.

  Returns:
    A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
    ``'F1 score'``, ``'Cohens kappa'`` and ``'Confusion_matrix'``.
  """
  yhat_probs = model.predict(test_x, verbose=0, batch_size=64)
  yhat_classes = np.argmax(yhat_probs, axis=1)
  return {
      'accuracy': accuracy_score(test_y, yhat_classes),  # accuracy: (tp + tn) / (p + n)
      'precision': precision_score(test_y, yhat_classes, average='weighted'),  # precision: tp / (tp + fp)
      'recall': recall_score(test_y, yhat_classes, average='weighted'),  # recall: tp / (tp + fn)
      'F1 score': f1_score(test_y, yhat_classes, average='weighted'),
      'Cohens kappa': cohen_kappa_score(test_y, yhat_classes, weights='linear'),
      'Confusion_matrix': confusion_matrix(test_y, yhat_classes),
  }

In [None]:
def save_model(model, model_save_name, save_dir='/content/gdrive/My Drive/drug_data/'):
  """Write the model's weights (``.h5``) and architecture (``.json``) to disk.

  Args:
    model: Object exposing ``save_weights(path)`` and ``to_json()``.
    model_save_name: File name stem used for both output files.
    save_dir: Target directory; defaults to the previously hard-coded Drive
      folder. The directory is not created and must already exist.

  Returns:
    None.
  """
  # Accept directories given with or without a trailing slash.
  prefix = save_dir if save_dir.endswith('/') else save_dir + '/'
  base_path = prefix + model_save_name
  model.save_weights(base_path + '.h5')
  # The context manager closes the file; no explicit close() is required.
  with open(base_path + '.json', 'w') as f:
    f.write(model.to_json())

MAIN EXECUTION

Import data and initialize static parameters

In [None]:
# Mount Google Drive at /content/gdrive; save_model() writes its files under this mount point.
drive.mount('/content/gdrive')

In [None]:
# Mount Google Drive at /content/drive; the training CSV is read from this mount point.
# NOTE(review): `drive` is already imported in the imports cell and Drive is also
# mounted at /content/gdrive in the previous cell — consider a single mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def truncate_text(text, max_seq_length):
    """Keep at most ``max_seq_length`` whitespace-separated words of ``text``.

    The kept words are re-joined with single spaces, so runs of whitespace in
    the original text are collapsed.
    """
    words = text.split()
    kept_words = words[:max_seq_length]
    return ' '.join(kept_words)
# --- Static run parameters ---------------------------------------------------
epochs = 5
max_seq_length = 256
model_save_name = 'bert_full_sentence'
text_column = 'user_review'
classes_column = 'user_suggestion'

# --- Load the labelled data and add a word-truncated copy of the text --------
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/XLNET/data/train.csv', sep=',')
df['short'] = df[text_column].map(lambda review: truncate_text(review, max_seq_length))

Train, evaluate, save

In [None]:
# Features: the truncated review texts; labels: the class column cast to int.
x = df['short'].values
y = df[classes_column].values.astype(int)
# Size of the softmax layer: largest label value plus one
# (assumes class ids start at 0 — TODO confirm against the data).
max_no_categories = np.amax(y)+1

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ between the two parts.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Load the cased BERT-base module from TF Hub with trainable=True, build the
# matching tokenizer from the same module, then encode the training texts
# into the three model inputs.
bert_layer=hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/2", trainable=True)
bert_tokenizer = build_tokenizer(bert_layer)
train_inputs = create_input_array(train_x, max_seq_length, bert_tokenizer)

In [None]:
# Build the classifier on top of the loaded BERT layer and train it.
# validation_split=0.3 makes Keras set aside part of train_inputs for validation,
# in addition to the separate test split created earlier.
bert_model =  build_model(max_seq_length, max_no_categories,bert_layer)
bert_model.fit(train_inputs, train_y, epochs=epochs, batch_size=16,validation_split=0.3,shuffle=True, verbose=1)

In [None]:
# Encode the held-out texts, predict class probabilities, take the argmax as the
# predicted label and print a per-class precision/recall/F1 report.
test_inputs = create_input_array(test_x, max_seq_length, bert_tokenizer)
yhat_probs = bert_model.predict(test_inputs, verbose=0, batch_size=32)
yhat_classes = np.argmax(yhat_probs,axis=1)
report = classification_report(y_pred=yhat_classes, y_true=test_y)
print(report)

In [None]:
# Persist the trained model's weights (.h5) and architecture (.json) via save_model().
save_model(bert_model, model_save_name)

In [None]:
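# Optional: reload the architecture and weights written by save_model() instead of retraining.
# NOTE(review): deserializing a model that contains a hub.KerasLayer presumably requires
# custom_objects={'KerasLayer': hub.KerasLayer} in model_from_json — confirm before use.
# with open('/content/gdrive/My Drive/drug_data/bert_full_sentence.json') as f:
#   bert_model = model_from_json(f.read())
# bert_model.load_weights('/content/gdrive/My Drive/drug_data/bert_full_sentence.h5')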