In [37]:
# importing the necessary libraries
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [38]:
# Setting up the parameters
maximum_features = 30522  # Maximum number of words to consider as features
maximum_length = 128  # Maximum length of input sequences
word_embedding_dims = 50  # Dimension of word embeddings
no_of_filters = 250  # Number of filters in the convolutional layer
kernel_size = 3  # Size of the convolutional filters
hidden_dims = 250  # Number of neurons in the hidden layer
batch_size = 32  # Batch size for training
epochs = 10  # Number of training epochs
threshold = 0.5  # Threshold for binary classification


In [39]:
df = pd.read_csv("jigsaw/dataset_text_target.csv")
df_true = df[df.target > 0.5]
df_false = df[df.target <= 0.5]
df = pd.concat([df_true[:500], df_false[:500]], axis=0)
mapper = lambda x: 1 if x > 0.5 else 0
df.target = df.target.apply(mapper)

In [40]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.comment_text, df.target, test_size=0.2, random_state=42, stratify=df.target, shuffle=True)


In [41]:
#Tokenize and encode the data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [42]:
X_train_encoded = tokenizer.batch_encode_plus(
    x_train.tolist(),
    padding='max_length',
    truncation=True,
    max_length=maximum_length,
    add_special_tokens=True,
    return_tensors='tf',
)
X_test_encoded = tokenizer.batch_encode_plus(
    x_test.tolist(),
    padding='max_length',
    truncation=True,
    max_length=maximum_length,
    add_special_tokens=True,
    return_tensors='tf'
)

In [43]:
# Building the model
model = Sequential()
# Adding the embedding layer to convert input sequences to dense vectors
model.add(Embedding(maximum_features, word_embedding_dims,
                    input_length=maximum_length))
# Adding the 1D convolutional layer with ReLU activation
model.add(Conv1D(no_of_filters, kernel_size, padding='valid',
                 activation='relu', strides=1))
# Adding the global max pooling layer to reduce dimensionality
model.add(GlobalMaxPooling1D())
# Adding the dense hidden layer with ReLU activation
model.add(Dense(hidden_dims, activation='relu'))

# Adding the output layer with sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))



In [44]:
# Compiling the model with binary cross-entropy loss and Adam optimizer
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

In [45]:

# Training the model
model.fit(X_train_encoded["input_ids"], y_train, batch_size=batch_size,
          epochs=epochs, validation_data=(X_test_encoded["input_ids"], y_test))

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.4965 - loss: 0.6939 - val_accuracy: 0.5150 - val_loss: 0.6882
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6774 - loss: 0.6672 - val_accuracy: 0.7350 - val_loss: 0.6423
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9549 - loss: 0.5133 - val_accuracy: 0.7650 - val_loss: 0.5199
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9778 - loss: 0.1646 - val_accuracy: 0.8250 - val_loss: 0.4845
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0213 - val_accuracy: 0.8300 - val_loss: 0.4951
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0050 - val_accuracy: 0.8450 - val_loss: 0.5029
Epoch 7/10
[1m25/25[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x306f8afd0>

In [47]:

# Predicting the probabilities for test data
y_pred_prob = model.predict(X_test_encoded["input_ids"])

# Converting the probabilities to binary classes based on threshold
y_pred = (y_pred_prob > threshold).astype(int)

# Calculating the evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Printing the evaluation metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Accuracy: 0.835
Precision: 0.819047619047619
Recall: 0.86
F1-score: 0.8390243902439024


In [50]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.81      0.83       100
           1       0.82      0.86      0.84       100

    accuracy                           0.83       200
   macro avg       0.84      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200

