In [3]:
# importing the necessary libraries
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, MaxPooling1D
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [4]:
# Tunable settings for the experiment, grouped by what they control.

# --- Input / vocabulary ---
maximum_features = 30522  # upper bound on the number of words treated as features
maximum_length = 128  # input sequences are padded/truncated to this many tokens

# --- Network shape ---
word_embedding_dims = 50  # size of each word-embedding vector
no_of_filters = 128  # filter count of the convolutional layer
kernel_size = 3  # width of each convolutional filter
hidden_dim_1 = 128  # neuron count of the hidden layer

# --- Training / evaluation ---
batch_size = 64  # samples per training batch
epochs = 10  # number of training epochs
threshold = 0.5  # probability cut-off used for the binary decision


In [5]:
# Total number of rows kept for this experiment; the loading cell takes
# DATASET_SIZE // 2 rows from each of the two classes.
DATASET_SIZE = 10_000

In [6]:
# Read the full comment/target table, then build a class-balanced subset.
df = pd.read_csv("../jigsaw/dataset_text_target.csv")

# Split on the raw score: strictly above 0.5 is the positive class.
df_true = df.loc[df["target"] > 0.5]
df_false = df.loc[df["target"] <= 0.5]

# Keep the first DATASET_SIZE // 2 rows of each class, positives first.
df = pd.concat(
    [df_true.head(DATASET_SIZE // 2), df_false.head(DATASET_SIZE // 2)],
    axis=0,
)


def mapper(x):
    """Binarize a raw target score: 1 when it exceeds 0.5, otherwise 0."""
    return 1 if x > 0.5 else 0


# Replace the continuous score with the 0/1 label.
df["target"] = df["target"].apply(mapper)

In [7]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df["comment_text"],
    df["target"],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


In [8]:
# Load the pretrained 'bert-base-uncased' tokenizer with lower-casing enabled.
# This cell only creates the tokenizer; the texts are encoded in a later cell.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [9]:
def encode_texts(texts):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Every text is truncated or padded to exactly ``maximum_length`` tokens,
    so the train and test inputs are encoded with identical settings.

    Args:
        texts: list of raw comment strings.

    Returns:
        The tokenizer's batch encoding; its ``"input_ids"`` entry holds the
        token ids that are passed to the model.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated ``batch_encode_plus`` and takes the same arguments.
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=maximum_length,
        add_special_tokens=True,
        return_tensors='tf',
    )


# One shared helper keeps the train and test encodings from drifting apart.
X_train_encoded = encode_texts(x_train.tolist())
X_test_encoded = encode_texts(x_test.tolist())

2025-07-22 21:35:05.594295: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-07-22 21:35:05.594518: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-07-22 21:35:05.594538: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1753200305.594932 6123510 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1753200305.595189 6123510 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
from keras.models import load_model

In [12]:
# Load the saved Keras model from its .keras file; the cells below use it for
# prediction on the test split.
model = load_model("../cnn_model_trained/cnn_model_trained.keras")



In [13]:
# Print a summary of the loaded model as a check that it loaded as expected.
model.summary()

In [16]:
# Run the loaded model on the encoded test comments; only the token ids
# ("input_ids") are fed to it.
y_pred_prob = model.predict(X_test_encoded["input_ids"])

# Hard labels: 1 where the score is strictly above `threshold`, otherwise 0.
y_pred = (y_pred_prob > threshold).astype(int)

# Per-class precision / recall / F1 against the held-out labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       988
           1       0.91      0.95      0.93      1012

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000



In [27]:
# Bring the test texts, predicted labels and true labels into plain,
# position-aligned Python lists.
sentences = x_test.tolist()
predicted = y_pred.flatten().tolist()
truth = y_test.tolist()
size = len(sentences)

# Collect (sentence, predicted, truth) for every test row the model got wrong.
mismatches = [
    (sentence, guess, label)
    for sentence, guess, label in zip(sentences, predicted, truth)
    if guess != label
]

In [31]:
# Tabulate the misclassified rows, one column per tuple field.
errors = pd.DataFrame(
    data=mismatches,
    columns=["sentence", "predicted", "truth"],
)

In [32]:
# Display the misclassified examples; the rendered output below is truncated
# to the first and last few rows.
errors

Unnamed: 0,sentence,predicted,truth
0,I'd never defend anyone being inconsiderate or...,1,0
1,This isn't rocket science.\nThe Constitution s...,1,0
2,Indeed it has worked very well when compared t...,1,0
3,Bernie has a tiny bit of the Donald in him - ....,1,0
4,"I agree. Go Trump, and take Ted with you.",1,0
...,...,...,...
148,She went to a chiropractor for a shoulder inju...,1,0
149,"It was not murder. LaVoy fled, endangered oth...",1,0
150,“Will Whiteness History Month make all white p...,1,0
151,"When this first went on the ballot, I research...",0,1


In [34]:
# Save the misclassified examples to a CSV file in the working directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if downstream readers do not need it.
errors.to_csv("cnn_errors.csv")