In [29]:
import re

import tensorflow as tf

import pandas as pd

import swifter

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from transformers import (
    AutoTokenizer, 
    TFAutoModelForSequenceClassification,
    default_data_collator
)
    
from datasets import Dataset

In [2]:
# Replicate the model across GPUs 2 and 3 of this machine.
# NOTE(review): the device indices are host-specific — adjust when running elsewhere.
gpu_devices = ["GPU:2", "GPU:3"]
strategy = tf.distribute.MirroredStrategy(devices=gpu_devices)

2023-04-29 02:36:52.562024: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-29 02:36:52.562416: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-04-29 02:36:52.562746: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')


In [3]:
# --- Notebook-wide configuration -------------------------------------------
MODEL_TYPE = "bert-base-uncased"  # checkpoint used for both the tokenizer and the model
DATA_DIRECTORY = "."  # folder that contains "IMDB Dataset.csv"
BATCH_SIZE = 32  # batch size handed to to_tf_dataset
EPOCHS = 100  # upper bound only; the fit call uses early stopping

In [4]:
# Load the raw IMDB reviews table from the configured data directory.
csv_path = f"{DATA_DIRECTORY}/IMDB Dataset.csv"
df = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# Encode the string labels as floats for the single-logit binary classifier.
# NOTE: "positive" is class 0.0 and "negative" is class 1.0, so the sigmoid
# output computed later is the probability of a *negative* review.
SENTIMENT_TO_LABEL = {"positive": 0.0, "negative": 1.0}

# Guard against re-running this cell: once the column is numeric, encoding it
# again would silently relabel every row.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    # Fail loudly on anything that is not a known label (NaN, typos, ...)
    # instead of quietly treating it as "negative".
    unexpected = set(df["sentiment"].unique()) - set(SENTIMENT_TO_LABEL)
    if unexpected:
        raise ValueError(
            f"Unexpected sentiment values: {sorted(map(str, unexpected))}"
        )
    df["sentiment"] = df["sentiment"].map(SENTIMENT_TO_LABEL)

Pandas Apply:   0%|          | 0/50000 [00:00<?, ?it/s]

In [7]:
def clean_text(text):
    """Normalize a raw review string.

    The text is lowercased, HTML-style tags (``<...>``) and http(s) URLs are
    each replaced by a space, and every run of whitespace is collapsed into a
    single space (leading/trailing whitespace is removed).
    """
    lowered = text.lower()
    without_tags = re.sub(r"<.*?>", " ", lowered)
    without_urls = re.sub(r"https?://\S+", " ", without_tags)
    return " ".join(without_urls.split())

In [8]:
# Normalize every review with clean_text; swifter is allowed to use Dask for
# this string column.
review_accessor = df["review"].swifter.allow_dask_on_strings(enable=True)
df["review"] = review_accessor.apply(clean_text)

Dask Apply:   0%|          | 0/96 [00:00<?, ?it/s]

In [9]:
# Hold out 30% of the rows, stratified by label; the held-out part is divided
# into validation and test sets in the next cell.
train, val = train_test_split(
    df,
    test_size=0.3,
    stratify=df["sentiment"],
    random_state=42,
)

In [10]:
# Split the held-out rows in half: a seeded random 50% becomes the test set,
# and the rows that were not sampled remain as the validation set.
test = val.sample(frac=0.5, random_state=42)
val = val.loc[~val.index.isin(test.index)]

In [11]:
# Discard the original row labels so that each split is indexed from zero.
train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

In [12]:
# Wrap each pandas split in a Hugging Face Dataset.
train_ds = Dataset.from_pandas(train)
val_ds = Dataset.from_pandas(val)
test_ds = Dataset.from_pandas(test)

In [13]:
# Load the tokenizer that belongs to the checkpoint named by MODEL_TYPE.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_TYPE,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
def tokenize(example):
    """Tokenize the ``review`` field of ``example`` with the notebook tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``, and the
    encoded fields are returned as NumPy arrays.
    """
    reviews = example["review"]
    encoded = tokenizer(
        reviews,
        truncation=True,
        padding="max_length",
        return_tensors="np",
    )
    return encoded

In [15]:
# Tokenize the three splits in batches (train, then validation, then test);
# the raw text column is removed once it has been encoded.
train_ds, val_ds, test_ds = (
    split.map(tokenize, batched=True, remove_columns=["review"])
    for split in (train_ds, val_ds, test_ds)
)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

In [16]:
def _as_tf_dataset(split, shuffle):
    """Convert a tokenized Hugging Face split into a batched ``tf.data.Dataset``.

    Args:
        split: tokenized dataset holding the ``input_ids``, ``token_type_ids``
            and ``attention_mask`` columns plus the ``sentiment`` label column.
        shuffle: whether rows are shuffled (enabled for the training split only).

    Returns:
        The result of ``split.to_tf_dataset`` with batches of ``BATCH_SIZE``
        rows, the three input columns as features and ``sentiment`` as label.
    """
    return split.to_tf_dataset(
        columns=["input_ids", "token_type_ids", "attention_mask"],
        label_cols=["sentiment"],
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=default_data_collator,
        # default_data_collator builds PyTorch tensors unless told otherwise;
        # ask for NumPy so this TensorFlow pipeline does not depend on torch.
        collate_fn_args={"return_tensors": "np"},
    )


# Only the training split is shuffled; validation and test keep their order so
# that predictions can later be aligned with the labels.
train_ds = _as_tf_dataset(train_ds, shuffle=True)
val_ds = _as_tf_dataset(val_ds, shuffle=False)
test_ds = _as_tf_dataset(test_ds, shuffle=False)

In [17]:
# Build the model inside the strategy scope so that its variables are created
# under the distribution strategy. num_labels=1 requests a single-output head.
with strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(
        MODEL_TYPE,
        num_labels=1,
    )

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

2023-04-29 02:38:13.171099: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Compile inside the strategy scope so optimizer and metric variables are
# created under the same distribution strategy as the model.
with strategy.scope():
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        # The classification head emits raw logits, hence from_logits=True.
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        # BinaryAccuracy thresholds its inputs directly (default 0.5). Since the
        # inputs here are logits, the decision boundary p = 0.5 corresponds to
        # logit = 0.0; this matches the sigmoid + 0.5 rule used at inference.
        metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)],
    )

In [19]:
# Stop once the monitored quantity has not improved for 3 epochs and roll the
# model back to its best weights; EPOCHS therefore acts as an upper bound.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=3,
    restore_best_weights=True,
)

model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[early_stopping],
)

2023-04-29 02:38:18.750560: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:38:18.750850: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:38:18.752366: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:786] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
i

Epoch 1/100
INFO:tensorflow:batch_all_reduce: 198 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:2 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:2 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:GPU:2 then broadcast to ('/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/de

2023-04-29 02:39:05.488289: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7fd12cacb200 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-29 02:39:05.488324: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2023-04-29 02:39:05.488332: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (1): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2023-04-29 02:39:05.488341: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (2): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2023-04-29 02:39:05.488347: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (3): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2023-04-29 02:39:05.494937: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
20



2023-04-29 02:48:47.992529: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:48:47.992771: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:48:47.993706: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:786] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
inp

Epoch 2/100


2023-04-29 02:49:18.634938: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:49:18.635319: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]




2023-04-29 02:57:54.961491: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:57:54.962153: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:57:55.409307: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	



2023-04-29 02:58:20.408946: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]
2023-04-29 02:58:20.409288: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]


Epoch 3/100

2023-04-29 03:06:45.653618: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:06:45.654245: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:06:46.011667: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	



2023-04-29 03:07:11.043919: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:07:11.044368: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]


Epoch 4/100

2023-04-29 03:15:30.853224: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:15:30.853880: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:15:30.994886: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	

Epoch 5/100


2023-04-29 03:15:56.050911: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:15:56.051628: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [35000]
	 [[{{node Placeholder/_0}}]]




2023-04-29 03:24:17.686556: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:24:17.687239: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:24:17.826087: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	



<keras.callbacks.History at 0x7fd414099310>

In [21]:
# Feed only the feature part of each (features, label) batch to the model,
# then squash the returned logits into probabilities.
test_features = test_ds.map(lambda features, labels: features)
model_outputs = model.predict(test_features)
predictions = tf.math.sigmoid(model_outputs.logits)

2023-04-29 03:32:52.785429: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:32:52.785980: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]
2023-04-29 03:32:52.788346: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:786] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorSliceDataset/_1"
op: "TensorSliceDataset"
inp



In [26]:
# Turn probabilities into hard labels: values below 0.5 become 0.0 and
# everything else becomes 1.0 (first column of each prediction row).
first_column_probs = predictions.numpy()[:, 0]
predictions = (~(first_column_probs < 0.5)).astype(float).tolist()

In [28]:
# Collect the ground-truth labels one example at a time, in dataset order.
label_batches = test_ds.map(lambda features, labels: labels)
y_test = list(label_batches.unbatch().as_numpy_iterator())

2023-04-29 03:35:17.715960: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int64 and shape [7500]
	 [[{{node Placeholder/_0}}]]


In [32]:
# Report test accuracy as a percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)
print(f"Accuracy on test set: {accuracy_pct}%")

Accuracy on test set: 94.37%
