Let's use DistilBERT to do the Twitter sentiment analysis task: https://www.kaggle.com/kazanova/sentiment140. The task is to classify tweets as positive (4) or negative (0). I'll relabel to 1 and 0.

Some of this is adapted from the tutorial here: https://swatimeena989.medium.com/bert-text-classification-using-keras-903671e0207d, but on a new dataset (and with some additions).

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import string
from sklearn.utils import shuffle
import pickle

from tensorflow.keras.callbacks import ModelCheckpoint

import re
import time
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from transformers import TFDistilBertModel, DistilBertConfig, DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

import tensorflow.keras 
from tensorflow.keras.models import Sequential, Model 
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding

In [None]:
# Column names for the Sentiment140 CSV; the file is read with header=None,
# so the names are supplied explicitly.
dataset_cols = ["target", "ids", "date", "flag", "user", "text"]
dataset = pd.read_csv(
    '/home/garrett/KagglesData/training.1600000.processed.noemoticon.csv',
    names=dataset_cols,
    header=None,
    encoding='ISO-8859-1',
)

In [None]:
# A notebook cell only echoes its final expression, so the shape has to be
# printed explicitly; as a bare statement it was silently discarded.
print(dataset.shape)
dataset.head()

Put all text in lowercase, remove Twitter handles, punctuation, websites, text in brackets or html tags, and words containing numbers. This does remove words like '2nite', but identifying and parsing 'textspeak' is a challenge that's a little beyond the scope of what I'm trying to investigate here (which is mainly just learning to correctly implement a form of BERT).

In [None]:
def preprocess(text):
    """Clean a pandas Series of raw tweets.

    Each tweet is processed in this order:

    1. lower-cased;
    2. Twitter handles (``@name``) removed;
    3. text inside square brackets removed;
    4. URLs (``http(s)://...`` or ``www. ...``) removed;
    5. HTML-style tags (``<...>``) removed;
    6. every ASCII punctuation character replaced by one space;
    7. newline characters removed;
    8. every word containing a digit removed (this also drops textspeak
       such as ``2nite``).

    Whitespace left behind by removed spans is not collapsed.

    Args:
        text: pandas Series whose values are strings.

    Returns:
        A new Series of cleaned strings carrying the same index as ``text``.
    """
    # Raw strings stop regex escapes (\w, \S, \d, \[) from being parsed as
    # invalid string escapes. Patterns are compiled once per call and applied
    # in a single pass over the Series instead of one pass per rule.
    substitutions = (
        (re.compile(r'@\w+'), ''),
        (re.compile(r'\[.*?\]'), ''),
        (re.compile(r'https?://\S+|www\.\S+'), ''),
        (re.compile(r'<.*?>+'), ''),
        (re.compile('[%s]' % re.escape(string.punctuation)), ' '),
        (re.compile(r'\n'), ''),
        (re.compile(r'\w*\d\w*'), ''),
    )

    def _clean(tweet):
        # Order matters: e.g. handles and URLs must go before punctuation
        # is blanked out, otherwise '@' and '://' would no longer match.
        tweet = tweet.lower()
        for pattern, replacement in substitutions:
            tweet = pattern.sub(replacement, tweet)
        return tweet

    return text.apply(_clean)

In [None]:
# Shuffle the rows with a fixed seed so the row order is reproducible across runs.
df = shuffle(dataset,random_state=42)
# Preview the first rows of the shuffled frame.
df.head()

In [None]:
# Drop every column whose name contains one of these substrings
# (case-insensitive), one pattern at a time.
for unwanted in ('ids', 'date', 'flag', 'user'):
    df = df.loc[:, ~df.columns.str.contains(unwanted, case=False)]

# Print one tweet (row label 671155) before and after cleaning as a
# quick sanity check of preprocess().
sample_label = 671155
print(df.loc[sample_label, 'text'])
df['text'] = preprocess(df['text'])
print(df.loc[sample_label, 'text'])

Map the target onto a ground-truth label of 0 or 1, then check that every tweet is labeled.

In [None]:
# Relabel the raw targets: 0 stays 0, 4 becomes 1. Any other value
# maps to NaN.
label_map = {0: 0, 4: 1}
df['gt'] = df['target'].map(label_map)
sentences, labels = df['text'], df['gt']
# Cell output: number of tweets and number of labels.
tuple(len(column) for column in (sentences, labels))

In [None]:
# Discard rows with any missing value (a target other than 0 or 4 maps
# to NaN in 'gt') and renumber the remaining rows from 0.
df = df.dropna()
df = df.reset_index(drop=True)
# A mapping that produced NaN leaves the column as float; with the NaNs
# gone the labels can be stored as integers again.
df['gt'] = df['gt'].astype(int)
# Re-derive the model inputs from the cleaned frame. The earlier
# `sentences`/`labels` were captured before dropna and would still
# contain any rows that were just removed.
sentences = df['text']
labels = df['gt']
print('Available labels: ',df['gt'].unique())

Data exploration:

In [None]:
# The tokenizer and the classifier are loaded from the same pretrained checkpoint.
pretrained_name = "distilbert-base-uncased"
bert_tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_name)
# num_labels=2 requests a two-class classification head.
bert_model = TFDistilBertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=2,
)

In [None]:
# Tokenise all tweets into fixed-length id / attention-mask arrays.
#
# - `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
# - `truncation=True` makes the cut-off at `max_len` tokens explicit instead
#   of relying on the tokenizer's implicit fallback (which emits a warning).
# - The fast tokenizer is called on batches rather than once per tweet, and
#   returns NumPy arrays directly, so no nested Python lists are built.
max_len = 64
chunk_size = 100000  # bounds peak memory while tokenising

texts = list(sentences)
id_chunks = []
mask_chunks = []
for start in range(0, len(texts), chunk_size):
    encoded = bert_tokenizer(
        texts[start:start + chunk_size],
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='np',
    )
    id_chunks.append(encoded['input_ids'])
    mask_chunks.append(encoded['attention_mask'])

# Row i of both arrays corresponds to tweet i of `sentences`.
input_ids = np.concatenate(id_chunks)
attention_masks = np.concatenate(mask_chunks)
labels=np.array(labels)

In [None]:
# Cell output: the three lengths, which should all be equal.
tuple(len(array) for array in (input_ids, attention_masks, labels))

In [None]:
# Cache the tokenised arrays on disk so tokenisation need not be repeated.
print('Preparing the pickle file.....')

pickle_inp_path='./bert_inp.pkl'
pickle_mask_path='./bert_mask.pkl'
pickle_label_path='./bert_label.pkl'

# `with` guarantees each file is flushed and closed; the bare open(...)
# calls used previously leaked the handles.
for path, payload in (
    (pickle_inp_path, input_ids),
    (pickle_mask_path, attention_masks),
    (pickle_label_path, labels),
):
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)


print('Pickle files saved as ',pickle_inp_path,pickle_mask_path,pickle_label_path)

In [None]:
# Reload the cached arrays written to the pickle_*_path files.
# NOTE: pickle.load executes arbitrary code from the file; only load
# pickles that this notebook itself produced.
print('Loading the saved pickle files..')

# `with` closes each file as soon as it has been read; the bare open(...)
# calls used previously leaked the handles.
with open(pickle_inp_path, 'rb') as handle:
    input_ids = pickle.load(handle)
with open(pickle_mask_path, 'rb') as handle:
    attention_masks = pickle.load(handle)
with open(pickle_label_path, 'rb') as handle:
    labels = pickle.load(handle)

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape,labels.shape))

In [None]:
# Two-stage split: hold out 10% as the test set, then 10% of the remainder
# as the validation set. The seed matches the one used for the initial
# shuffle so the same rows land in each split on every run; without it the
# test set changes each time the cell is executed.
split_seed = 42
trainval_inp, test_inp, trainval_label, test_label, trainval_mask, test_mask = train_test_split(
    input_ids, labels, attention_masks, test_size=0.1, random_state=split_seed
)
train_inp, val_inp, train_label, val_label, train_mask, val_mask = train_test_split(
    trainval_inp, trainval_label, trainval_mask, test_size=0.1, random_state=split_seed
)

In [None]:
# Output locations for TensorBoard logs and the checkpointed weights.
log_dir = 'tensorboard_data/tb_bert'
model_save_path = './models/bert_model.h5'

# Save weights only, and only when val_loss reaches a new minimum.
checkpoint_cb = tensorflow.keras.callbacks.ModelCheckpoint(
    filepath=model_save_path,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
tensorboard_cb = tensorflow.keras.callbacks.TensorBoard(log_dir=log_dir)
callbacks = [checkpoint_cb, tensorboard_cb]

# Integer class labels with the loss computed from logits.
loss = tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tensorflow.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tensorflow.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

bert_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)

In [None]:
# summary() prints the table itself and returns None, so it must not be
# passed to print(): that appended a stray "None" after the heading.
print('\nBert Model')
bert_model.summary()


In [None]:
# Fine-tune on the training split; the validation split is evaluated each
# epoch and drives the callbacks passed in.
history = bert_model.fit(
    [train_inp, train_mask],
    train_label,
    batch_size=128,
    epochs=4,
    validation_data=([val_inp, val_mask], val_label),
    callbacks=callbacks,
)

In [None]:
%load_ext tensorboard
log_dir='tensorboard_data/bert_model'
%tensorboard --logdir {log_dir}