BERT Classification using TFBertForSequenceClassification

In [None]:
import numpy as np 
import pandas as pd 
import random as rn
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tokenizers import BertWordPieceTokenizer

import transformers
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

import logging
# Restrict the transformers library's own logger to error-level messages.
transformers.logging.set_verbosity_error()
# Left disabled: would raise the "tensorflow" logger threshold to WARNING.
# logging.getLogger("tensorflow").setLevel(logging.WARNING)



In [None]:
# Read the review table from a CSV located relative to the notebook's working
# directory, then preview its first rows.
REVIEWS_CSV = 'game_train.csv'
df_reviews = pd.read_csv(REVIEWS_CSV)
df_reviews.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,473,Sakura Clicker,2017.0,This has helped me through my stage 8 terminal...,1
1,22971,Crusaders of the Lost Idols,2017.0,Awesome idea. I support this game. I love that...,1
2,18225,RaceRoom Racing Experience,2014.0,"This game is just, such a♥♥♥♥♥♥take. The devel...",0
3,17132,Black Squad,2018.0,Early Access Reviewhere what i honesty think a...,1
4,8103,DCS World Steam Edition,2014.0,Very detailed sim and a joy to fly using a fli...,1


Data preprocessing: convert the review text to strings, remove reviews that consist only of the "Early Access Review" banner, drop reviews with missing text, and split the dataset into train, validation and holdout sets.

In [None]:
# Flag rows whose review, once surrounding whitespace is trimmed, is exactly the
# 'Early Access Review' banner and nothing else. Reviews that merely start with
# the banner are not flagged. Missing reviews compare as False here, so they are kept.
is_banner_only = df_reviews["user_review"].str.strip().eq('Early Access Review')
df_reviews = df_reviews[~is_banner_only]
df_reviews.shape

(10494, 5)

In [None]:
# Convert the review text to str and trim surrounding whitespace.
# `assign` builds a new frame rather than writing a column into `df_reviews`,
# which is itself a filtered selection of the loaded data; writing into such a
# selection is what triggers pandas' SettingWithCopyWarning.
df_reviews = df_reviews.assign(
    user_review=df_reviews["user_review"].astype(str).str.strip()
)

# Drop the reviews whose text was missing (a missing value becomes the string
# 'nan' after the cast above), then drop exact (review, label) duplicates.
# `drop_duplicates` is used without `inplace=True`, and `.copy()` makes
# `df_reviews_2` an independent frame that can be modified safely afterwards.
df_reviews_2 = (
    df_reviews[~df_reviews["user_review"].isin(['nan'])]
    .drop_duplicates(['user_review', 'user_suggestion'])
    .copy()
)
print(df_reviews_2.shape)

df_reviews_2["user_suggestion"].value_counts()

(10494, 5)


1    5986
0    4508
Name: user_suggestion, dtype: int64

In [None]:
import re

# Matches one or more consecutive ♥ characters; compiled once and reused.
_HEART_RUN = re.compile(r"[♥]+")


def replace_hearts_with_PAD(text):
    """Return `text` with each run of ♥ characters replaced by ' **** '.

    A whole run of consecutive ♥ collapses into a single placeholder, and the
    placeholder carries one space on each side. Text without ♥ is returned
    unchanged.
    """
    return _HEART_RUN.sub(' **** ', text)

# Apply the ♥ replacement to every review. `assign` returns a fresh frame, so no
# column is written into a frame that may still be a filtered selection of
# `df_reviews` (which would raise SettingWithCopyWarning).
df_reviews_2 = df_reviews_2.assign(
    user_review=df_reviews_2["user_review"].apply(replace_hearts_with_PAD)
)

# Plain Python lists taken from the same frame, so reviews[i] pairs with labels[i].
reviews = df_reviews_2["user_review"].tolist()
labels = df_reviews_2["user_suggestion"].tolist()

In [None]:
# Split the dataset into train, validation and holdout sets (60-20-20).
# A fixed seed keeps the split identical across kernel restarts.
SPLIT_SEED = 42

# Step 1: keep 60% for training and hold back the remaining 40%.
training_sentences, heldback_sentences, training_labels, heldback_labels = train_test_split(
    reviews, labels, test_size=.4, random_state=SPLIT_SEED)

# Step 2: halve the held-back 40%. `test_*` is the validation set (it is what gets
# passed as validation data when fitting); `holdout_*` is kept out of training
# entirely so it can be used for a final evaluation.
test_sentences, holdout_sentences, test_labels, holdout_labels = train_test_split(
    heldback_sentences, heldback_labels, test_size=.5, random_state=SPLIT_SEED)

Sentiment Classification with BERT: we used BertTokenizer and TFBertForSequenceClassification from 'bert-base-cased'.

In [None]:
# Load the pretrained tokenizer published under the 'bert-base-cased' name.
BERT_CHECKPOINT = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Show what the tokenizer emits for a single review: a batch of one, with
# truncation and padding switched on and max_length set to 512.
sample_batch = [training_sentences[0]]
tokenizer(sample_batch, truncation=True, padding=True, max_length=512)


{'input_ids': [[101, 4503, 11737, 4960, 1708, 1186, 146, 1309, 2140, 1354, 146, 112, 173, 1176, 1126, 3294, 3621, 1342, 119, 146, 1108, 1579, 1702, 1120, 1122, 1115, 1115, 4106, 1104, 1773, 170, 3621, 1342, 1110, 27799, 2716, 4353, 1106, 2520, 1106, 1103, 1825, 1506, 1103, 1952, 1121, 1128, 117, 1120, 1655, 1111, 1143, 119, 1409, 1178, 1177, 1128, 1169, 4137, 3451, 10049, 1105, 179, 1389, 3781, 9304, 5773, 1128, 1508, 1487, 119, 1252, 1114, 1103, 1342, 1217, 1714, 117, 146, 1879, 1106, 1660, 1142, 1141, 170, 2046, 117, 1105, 1122, 112, 188, 7284, 7310, 119, 1188, 1110, 1103, 13710, 1104, 1800, 1150, 1215, 1106, 1505, 6734, 131, 1109, 26532, 117, 1133, 8186, 112, 189, 1541, 1125, 1103, 1159, 1106, 6799, 1122, 1290, 6282, 1344, 21050, 117, 1105, 1150, 1541, 117, 2276, 1193, 7871, 1459, 5426, 22444, 131, 146, 1567, 1142, 1342, 1170, 1178, 170, 1374, 2005, 117, 1105, 146, 1138, 1185, 2255, 1106, 2059, 1122, 112, 188, 1280, 1106, 3968, 1228, 119, 146, 112, 182, 7688, 15604, 21155, 1174, 110

In [None]:
def _encode_sentences(sentences):
    """Run the shared tokenizer over `sentences` with truncation and padding enabled."""
    return tokenizer(sentences, truncation=True, padding=True)


# Both splits go through the same helper so they are encoded identically.
train_encodings = _encode_sentences(training_sentences)
test_encodings = _encode_sentences(test_sentences)

In [None]:
def _as_tf_dataset(encodings, targets):
    """Slice tokenizer output (converted to a plain dict) and its labels into a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), targets))


# One (features, label) element per review, for each split.
train_dataset = _as_tf_dataset(train_encodings, training_labels)
test_dataset = _as_tf_dataset(test_encodings, test_labels)

Training the model

In [None]:
# Pretrained 'bert-base-cased' sequence classifier, configured with num_labels=2.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=2,
)

# Adam optimizer, the loss function exposed by the model itself, and accuracy
# reported as the training/validation metric.
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08)
model.compile(
    optimizer=optimizer,
    loss=model.compute_loss,
    metrics=['accuracy'],
)

In [None]:
pip install Numpy==1.18.5

In [None]:
pip install TensorFlow==2.4

We set epochs=2 because we do not want to spend a lot of time training the model.

In [None]:
# train and fine-tune the model
BATCH_SIZE = 8

# Batching is done by the tf.data pipeline (`.batch(...)`), so `batch_size` is
# deliberately not passed to fit(): Keras documents that it must not be specified
# when the input is a Dataset, since the dataset already yields batches.
# Only the training data is shuffled; shuffling the validation data would not
# change the reported validation metrics.
history = model.fit(
    train_dataset.shuffle(100).batch(BATCH_SIZE),
    epochs=2,
    validation_data=test_dataset.batch(BATCH_SIZE),
    verbose=1,
)

Epoch 1/2


  return py_builtins.overload_of(f)(*args)




As we can see, the model reaches an accuracy of only 0.7345 after the first epoch, which is relatively low compared to the simple transformer. We therefore chose to use the trained simple transformer to predict the results.