In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the review dataset. Only the first 200K rows are read because there is
# not enough memory to hold more.
data = pd.read_csv(
    "dataset.csv",
    nrows=200000,
)

# Here is the first version with LSTM

In [None]:
# Work on a string-typed copy of the review column instead of overwriting
# data['review_text'] in place. The BERT section further down keeps only rows
# where isinstance(review_text, str) holds; an in-place astype(str) here would
# turn non-string entries (e.g. NaN) into text and silently defeat that filter
# whenever this cell has been run first.
review_texts = data['review_text'].astype(str)

# Fit the word-level vocabulary on every review, then encode each review as a
# list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_texts)
sequences = tokenizer.texts_to_sequences(review_texts)

In [None]:
# Bring every encoded review to a fixed length of 686 tokens, adding the
# padding at the end of the sequence ('post').
# NOTE(review): 686 is presumably the longest sequence length observed in this
# dataset -- confirm, and consider deriving it instead of hard-coding it.
padded_sequences = pad_sequences(
    sequences,
    maxlen=686,
    padding='post',
)

In [None]:
# Binary target: 1 when review_score is strictly positive, 0 otherwise.
sentiment_labels = data['review_score'].gt(0).astype(int)

# Size of the embedding table: one row per known word plus one extra index.
# NOTE(review): the extra index is presumably 0, which the Keras tokenizer does
# not assign to any word and which the padding uses -- confirm.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Hold out 20% of the padded sequences for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, sentiment_labels, test_size=0.2, random_state=42
)

# Embedding -> LSTM -> single sigmoid unit for binary sentiment.
# input_length is read from the padded array itself so that it always matches
# the width of the data actually fed to the model. Recomputing it as the
# longest raw sequence disagrees with the padded width whenever that length
# differs from the maxlen passed to pad_sequences.
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=padded_sequences.shape[1]))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): validation_data is the test split, so the per-epoch validation
# metrics are not independent of the final test evaluation.
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

In [None]:
# Sigmoid outputs for the held-out reviews, thresholded at 0.5 into 0/1 labels.
y_pred_prob = model.predict(x=X_test)
y_pred = np.greater(y_pred_prob, 0.5).astype(int)

# Loss and accuracy on the held-out split, as reported by Keras.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print('Test Accuracy: ', accuracy)

# F1 score of the thresholded predictions against the true labels.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print('F1 Score: ', f1)

# The BERT version, fine-tuning the pretrained `bert-base-uncased` model

In [3]:
# Build the BERT inputs: keep only the rows whose review_text is an actual
# Python string (anything else, e.g. a NaN for a missing review, is dropped)
# and pair each kept text with a boolean label that is True when review_score
# is strictly positive.
# A vectorised mask replaces the row-by-row iterrows() loop, which builds a
# Series object for every one of the rows and is needlessly slow at this size.
has_text = data['review_text'].map(lambda text: isinstance(text, str)).astype(bool)

X = data.loc[has_text, 'review_text'].tolist()
y = (data.loc[has_text, 'review_score'] > 0).tolist()

In [4]:
# 80/20 train/test split of the raw texts and their labels; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Tokenize the input texts with the tokenizer that matches the pretrained model.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode both splits with identical settings: padded, truncated to at most 150
# tokens, and returned as TensorFlow tensors. The train split is encoded first,
# then the test split.
train_encodings, test_encodings = (
    bert_tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=150,
        return_tensors='tf',
    )
    for texts in (X_train, X_test)
)

In [6]:
# Training pipeline: (encoding dict, label) pairs, shuffled with a buffer the
# size of the training set, then grouped into batches of 16.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(train_encodings), y_train))
    .shuffle(len(X_train))
    .batch(16)
)

# Evaluation pipeline: same structure, but left unshuffled so the prediction
# order stays aligned with y_test.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(test_encodings), y_test))
    .batch(16)
)

In [7]:
# Load the pretrained BERT encoder with a sequence-classification head. As the
# load message below reports, the classifier weights are newly initialized and
# must be trained before the model is useful for prediction.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Ask TensorFlow to enable memory growth on every visible GPU, then report how
# many physical and logical GPUs are available. Nothing is printed when no GPU
# is found.
# NOTE(review): this cell sits after the model has been loaded; memory growth
# presumably has to be configured before the GPUs are first used, in which case
# the RuntimeError branch below is what runs -- confirm and consider moving
# this cell to the top of the notebook.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as err:
    # Best effort: report the configuration error and carry on.
    print(err)

In [8]:
# Fine-tuning setup: accuracy as the tracked metric, sparse categorical
# cross-entropy computed on the raw logits the model emits, and Adam with a
# learning rate of 5e-5.
metrics = ['accuracy']
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

bert_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

# Train for 5 epochs.
# NOTE(review): validation_data is the test split, so the per-epoch validation
# metrics are not independent of the final report.
bert_model.fit(
    train_dataset,
    epochs=5,
    validation_data=test_dataset,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x27aca1bd2e0>

In [9]:
# The first element of the prediction output holds the per-class scores; the
# predicted class is the index of the highest score in each row.
y_pred = bert_model.predict(test_dataset)[0]
y_pred_labels = y_pred.argmax(axis=1)

# NOTE(review): in the recorded report below the False class has zero
# precision and recall, i.e. every review was predicted as True. The headline
# accuracy therefore only reflects the class imbalance.
print(classification_report(y_test, y_pred_labels))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00      3984
        True       0.90      1.00      0.95     35950

    accuracy                           0.90     39934
   macro avg       0.45      0.50      0.47     39934
weighted avg       0.81      0.90      0.85     39934



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
