In [1]:
import pandas as pd
import torch
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Setting up the parameters
maximum_features = 30522  # Maximum number of words to consider as features
maximum_length = 128  # Maximum length of input sequences
word_embedding_dims = 50  # Dimension of word embeddings
no_of_filters = 128  # Number of filters in the convolutional layer
kernel_size = 3  # Size of the convolutional filters
hidden_dim_1 = 128  # Number of neurons in the hidden layer

batch_size = 64  # Batch size for training
epochs = 10  # Number of training epochs
threshold = 0.7  # Threshold for binary classification

DATASET_SIZE = 10_000

df = pd.read_csv("../jigsaw/dataset_text_target.csv")
df_true = df[df.target > threshold]
df_false = df[df.target <= threshold]
df = pd.concat([df_true[DATASET_SIZE // 2:DATASET_SIZE], df_false[DATASET_SIZE // 2:DATASET_SIZE]], axis=0)
mapper = lambda x: 1 if x > 0.5 else 0
df.target = df.target.apply(mapper)


In [24]:

x_test = df.comment_text
y_test = df.target

In [25]:
# Tokenize and encode the data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

X_test_encoded = tokenizer.batch_encode_plus(
    x_test.tolist(),
    padding='max_length',
    truncation=True,
    max_length=maximum_length,
    add_special_tokens=True,
    return_tensors='pt',  # Return PyTorch tensors
)

# Create PyTorch Datasets
test_dataset = TensorDataset(X_test_encoded['input_ids'], torch.tensor(y_test.values, dtype=torch.float32))

# Create DataLoaders
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [26]:
import coremltools as ct

In [27]:
model = ct.models.MLModel("../conversions/ToxicCNN.mlpackage")

In [28]:
X_test_encoded.to("cpu")

{'input_ids': tensor([[  101,  2012,  2023,  ...,     0,     0,     0],
        [  101,  2129,  5223,  ...,     0,     0,     0],
        [  101,  7087, 10231,  ...,     0,     0,     0],
        ...,
        [  101,  1998,  2054,  ...,     0,     0,     0],
        [  101,  1998,  2023,  ...,     0,     0,     0],
        [  101,  1998,  2065,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [29]:
results = []
for i, row in enumerate(X_test_encoded["input_ids"]):
    res = model.predict({"input_ids": [row.numpy().astype("float32")]})
    results.append(res["var_72"][0][0] > threshold)
    print(f"{((i + 1) / len(X_test_encoded['input_ids'])) * 100:.2f}%", end="\r")

100.00%

In [30]:
print(classification_report(results, y_test))

              precision    recall  f1-score   support

       False       0.97      0.91      0.94      5168
        True       0.91      0.97      0.94      4832

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000



In [31]:
sentences = x_test.tolist()
predicted = results
truth = y_test.tolist()
size = len(sentences)

mismatches = []
for i in range(size):
    if bool(predicted[i]) == bool(truth[i]):
        continue
    mismatches.append((sentences[i], bool(predicted[i]), bool(truth[i])))

In [32]:
errors = pd.DataFrame(mismatches, columns=['sentence', 'predicted', 'truth'])
errors.to_csv("cnn_errors_coreml.csv", index=False)