In [4]:
# importing the necessary libraries
import pandas as pd
from sklearn.metrics import classification_report
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Setting up the parameters
maximum_features = 30522  # Maximum number of words to consider as features
maximum_length = 128  # Maximum length of input sequences
word_embedding_dims = 50  # Dimension of word embeddings
no_of_filters = 128  # Number of filters in the convolutional layer
kernel_size = 3  # Size of the convolutional filters
hidden_dim_1 = 128  # Number of neurons in the hidden layer

batch_size = 64  # Batch size for training
epochs = 10  # Number of training epochs
threshold = 0.6  # Threshold for binary classification


In [19]:
DATASET_SIZE = 10_000

In [20]:
df = pd.read_csv("../jigsaw/dataset_text_target.csv")
df_true = df[df.target > threshold]
df_false = df[df.target <= threshold]
df = pd.concat([df_true[DATASET_SIZE // 2:DATASET_SIZE], df_false[DATASET_SIZE // 2:DATASET_SIZE]], axis=0)
mapper = lambda x: 1 if x > 0.5 else 0
df.target = df.target.apply(mapper)

In [21]:
x_test = df.comment_text
y_test = df.target

In [22]:
#Tokenize and encode the data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [23]:
X_test_encoded = tokenizer.batch_encode_plus(
    x_test.tolist(),
    padding='max_length',
    truncation=True,
    max_length=maximum_length,
    add_special_tokens=True,
    return_tensors='tf'
)

In [24]:
from keras.models import load_model

In [25]:
model = load_model("../cnn_model_trained/cnn_model_trained.keras")



In [26]:
model.summary()

In [27]:
# Predicting the probabilities for test data
y_pred_prob = model.predict(X_test_encoded["input_ids"])

# Converting the probabilities to binary classes based on a threshold
y_pred = (y_pred_prob > threshold).astype(int)

print(classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
              precision    recall  f1-score   support

           0       0.96      0.88      0.92      4917
           1       0.89      0.97      0.93      5083

    accuracy                           0.92     10000
   macro avg       0.93      0.92      0.92     10000
weighted avg       0.93      0.92      0.92     10000



In [28]:
sentences = x_test.tolist()
predicted = y_pred.flatten().tolist()
truth = y_test.tolist()
size = len(sentences)

mismatches = []
for i in range(size):
    if predicted[i] == truth[i]:
        continue
    mismatches.append((sentences[i], predicted[i], truth[i]))

In [29]:
errors = pd.DataFrame(mismatches, columns=['sentence', 'predicted', 'truth'])

In [31]:
errors.to_csv("cnn_errors_tf.csv", index=False)