In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### 2. Setting up Parameters and Loading Data

In [12]:
# Model / tokenization hyperparameters
maximum_features = 30522  # Embedding vocabulary size (passed to the model as vocab_size)
maximum_length = 128  # Token sequences are padded/truncated to this length
word_embedding_dims = 50  # Width of each word-embedding vector
no_of_filters = 128  # Output channels of every convolutional layer
kernel_size = 3  # Width of the convolutional filters
hidden_dim_1 = 128  # Units in the hidden dense layer

# Run settings
batch_size = 64  # Samples per DataLoader batch
epochs = 10  # Number of training epochs (not referenced by the evaluation cells shown here)
threshold = 0.7  # Cut-off used to split the raw data and to binarize predicted probabilities

DATASET_SIZE = 10_000

df = pd.read_csv("../jigsaw/dataset_text_target.csv")

# Split the raw rows into the two groups around `threshold`.
# Rows whose target is NaN satisfy neither comparison and are dropped.
df_true = df.loc[df["target"] > threshold]
df_false = df.loc[df["target"] <= threshold]

# From each group keep the rows at positions DATASET_SIZE // 2 .. DATASET_SIZE - 1.
second_half = slice(DATASET_SIZE // 2, DATASET_SIZE)
df = pd.concat([df_true.iloc[second_half], df_false.iloc[second_half]])


# NOTE(review): labels are binarized at 0.5 while the groups above are split at
# `threshold` (0.7), so rows with 0.5 < target <= 0.7 are drawn from df_false
# but end up labelled 1. Confirm this matches how the model was trained.
def mapper(x):
    """Turn a raw target score into a 0/1 label (1 when the score exceeds 0.5)."""
    return 1 if x > 0.5 else 0


df["target"] = df["target"].apply(mapper)


In [13]:

# Evaluation inputs (raw comment text) and their 0/1 labels.
x_test = df["comment_text"]
y_test = df["target"]

In [14]:
# Tokenize and encode the data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments and returns the same encoding.
X_test_encoded = tokenizer(
    x_test.tolist(),
    padding='max_length',  # pad every sequence to exactly `maximum_length` tokens
    truncation=True,  # cut longer sequences down to `maximum_length`
    max_length=maximum_length,
    add_special_tokens=True,
    return_tensors='pt',  # Return PyTorch tensors
)

# Create PyTorch Datasets
# Only the token ids are used as model input; labels are stored as float32.
test_dataset = TensorDataset(
    X_test_encoded['input_ids'],
    torch.tensor(y_test.values, dtype=torch.float32),
)

# Create DataLoaders
# shuffle=False keeps predictions in the same order as x_test / y_test, which
# the later cells rely on when pairing sentences with predictions by position.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
class CNNTextClassifier(nn.Module):
    """1-D convolutional binary text classifier over token ids.

    Pipeline: embedding -> (conv + ReLU + max-pool) x 2 -> conv + ReLU ->
    global max-pool over the sequence axis -> dense + ReLU -> dense + sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector (input channels of conv1).
        n_filters: Output channels of each convolutional layer.
        filter_size: Kernel width shared by the three convolutions.
        hidden_dim: Width of the hidden dense layer.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_size, hidden_dim):
        super().__init__()

        # Token id -> dense vector lookup
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Three un-padded convolutions; the first two are each followed by a
        # max-pool of width 3.
        self.conv1 = nn.Conv1d(embedding_dim, n_filters, kernel_size=filter_size, padding='valid')
        self.pool1 = nn.MaxPool1d(kernel_size=3)

        self.conv2 = nn.Conv1d(n_filters, n_filters, kernel_size=filter_size, padding='valid')
        self.pool2 = nn.MaxPool1d(kernel_size=3)

        self.conv3 = nn.Conv1d(n_filters, n_filters, kernel_size=filter_size, padding='valid')
        # An adaptive max-pool with output size 1 acts as global max pooling
        self.global_pool = nn.AdaptiveMaxPool1d(1)

        # Classifier head
        self.fc1 = nn.Linear(n_filters, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids):
        """Return sigmoid scores of shape (batch_size, 1) for a batch of token ids.

        Args:
            input_ids: Integer tensor of shape (batch_size, seq_len).
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        token_vectors = self.embedding(input_ids)

        # Conv1d wants channels before the sequence axis:
        # (batch_size, embedding_dim, seq_len)
        features = token_vectors.transpose(1, 2)

        features = F.relu(self.conv1(features))
        features = self.pool1(features)

        features = F.relu(self.conv2(features))
        features = self.pool2(features)

        features = F.relu(self.conv3(features))

        # Collapse the sequence axis: (batch_size, n_filters, 1) -> (batch_size, n_filters)
        pooled = self.global_pool(features).squeeze(2)

        hidden = F.relu(self.fc1(pooled))
        logits = self.fc2(hidden)

        return torch.sigmoid(logits)


# Instantiate the model with the same hyperparameters as the saved checkpoint;
# load_state_dict below raises if any parameter shape does not match.
model = CNNTextClassifier(
    vocab_size=maximum_features,
    embedding_dim=word_embedding_dims,
    n_filters=no_of_filters,
    filter_size=kernel_size,
    hidden_dim=hidden_dim_1
)

# Use Apple's Metal backend when present, otherwise run on the CPU.
# torch.backends.mps.is_available() is the documented availability check.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# map_location="cpu" lets the checkpoint load even when it was saved from a
# device that does not exist on this machine; the model is moved to `device`
# afterwards. weights_only=True restricts unpickling to tensors/containers.
state_dict = torch.load(
    "../cnn_model_trained_torch/cnn_model_trained_pytorch.pth",
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)
model.to(device)

CNNTextClassifier(
  (embedding): Embedding(30522, 50)
  (conv1): Conv1d(50, 128, kernel_size=(3,), stride=(1,), padding=valid)
  (pool1): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=valid)
  (pool2): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=valid)
  (global_pool): AdaptiveMaxPool1d(output_size=1)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

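### 5. Evaluating the Model

We switch the model (with its loaded weights) to evaluation mode, run it over the test set without gradient tracking, binarize the predicted probabilities with the configured threshold, and print a classification report. No training (loss function, optimizer, or 10-epoch loop) happens in this notebook.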

In [16]:
# Inference mode for the model's layers.
model.eval()
y_pred_prob = []
y_true = []

# No gradients are needed for evaluation.
with torch.no_grad():
    for input_ids, labels in test_loader:
        input_ids = input_ids.to(device)
        # The model returns (batch, 1); drop only that trailing axis.
        # A bare .squeeze() would also collapse a final batch of size 1 into a
        # 0-d tensor, and extending a list with a 0-d array raises TypeError.
        outputs = model(input_ids).squeeze(1)
        y_pred_prob.extend(outputs.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Probabilities strictly above `threshold` are predicted as class 1.
y_pred = (np.array(y_pred_prob) > threshold).astype(int)

# Calculating and printing evaluation metrics
print('\nClassification Report:')
print(classification_report(y_true, y_pred))


Classification Report:
              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94      4843
         1.0       0.97      0.91      0.94      5157

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000



In [17]:
# Display the binarized (0/1) predictions as this cell's output.
y_pred

array([1, 1, 1, ..., 0, 0, 0])

In [18]:
# Plain Python lists so sentences, predictions and labels can be paired by position.
sentences = x_test.tolist()
predicted = y_pred.tolist()
truth = y_test.tolist()
size = len(sentences)

# One (sentence, predicted label, true label) tuple per misclassified example.
mismatches = [
    (sentences[i], predicted[i], truth[i])
    for i in range(size)
    if predicted[i] != truth[i]
]

In [19]:
# Tabulate the misclassified examples and save them for inspection.
error_columns = ['sentence', 'predicted', 'truth']
errors = pd.DataFrame(mismatches, columns=error_columns)
errors.to_csv("cnn_errors_pt.csv", index=False)