In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from sklearn.model_selection import train_test_split
import pandas as pd
from cnn import CNN, ComplexCNN


In [2]:
### load and preprocess dataset

# Load the dataset
df = pd.read_csv('data/train.csv/train.csv')

# Split dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenizer
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Build vocab
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# Build vocab and include '<pad>' and '<unk>' tokens
vocab = build_vocab_from_iterator(yield_tokens(train_df['comment_text']), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [3]:
# custom dataset since dataset is large

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for (_text, _label) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(_text, dtype=torch.int64)
         text_list.append(processed_text)
         lengths.append(processed_text.size(0))
    # Pad the sequence
    text_list = pad_sequence(text_list, batch_first=True)
    label_list = torch.stack(label_list)
    lengths = torch.tensor(lengths)
    return text_list, label_list, lengths

class CommentDataset(Dataset):
    def __init__(self, dataframe, text_field, label_fields, tokenizer, vocab):
        self.dataframe = dataframe
        self.text_field = text_field
        self.label_fields = label_fields
        self.tokenizer = tokenizer
        self.vocab = vocab

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = [vocab[token] for token in tokenizer(self.dataframe.iloc[idx][self.text_field])]
        # Convert label columns to a consistent numeric type (e.g., float)
        labels = self.dataframe.iloc[idx][self.label_fields].astype(float).values
        return torch.tensor(text, dtype=torch.long), torch.tensor(labels, dtype=torch.float)


# Create instances of the CommentDataset
train_dataset = CommentDataset(train_df, 'comment_text', ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], tokenizer, vocab)
test_dataset = CommentDataset(test_df, 'comment_text', ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], tokenizer, vocab)

In [4]:
# data loaders init

BATCH_SIZE = 64

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)


In [5]:
### define CNN model

INPUT_DIM = len(vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
OUTPUT_DIM = len(train_dataset.label_fields)  # Number of labels
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
# model = ComplexCNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

100


In [6]:
dummy_input = torch.randn(1, 1, 100, 100) 
dummy_output = model.embedded_to_flattened(dummy_input)  # You might need to implement this method in your model
print(dummy_output.size())

AttributeError: 'CNN' object has no attribute 'embedded_to_flattened'

In [7]:
# train model

import warnings

# Suppress all UserWarnings
warnings.filterwarnings('ignore', category=UserWarning)

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# Move model and criterion to GPU (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device: ', device)
model = model.to(device)
criterion = criterion.to(device)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    counter = 0
    for texts, labels, _ in train_loader:
        if (counter%20) == 0:
            print(f'Training epoch {epoch}, batch {counter}')
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        counter+=1
    print(f'Epoch: {epoch+1}, Loss: {epoch_loss/len(train_loader)}')

device:  cuda
Training epoch 0, batch 0
Training epoch 0, batch 20
Training epoch 0, batch 40
Training epoch 0, batch 60
Training epoch 0, batch 80
Training epoch 0, batch 100
Training epoch 0, batch 120
Training epoch 0, batch 140
Training epoch 0, batch 160
Training epoch 0, batch 180
Training epoch 0, batch 200
Training epoch 0, batch 220
Training epoch 0, batch 240
Training epoch 0, batch 260
Training epoch 0, batch 280
Training epoch 0, batch 300
Training epoch 0, batch 320
Training epoch 0, batch 340
Training epoch 0, batch 360
Training epoch 0, batch 380
Training epoch 0, batch 400
Training epoch 0, batch 420
Training epoch 0, batch 440
Training epoch 0, batch 460
Training epoch 0, batch 480
Training epoch 0, batch 500
Training epoch 0, batch 520
Training epoch 0, batch 540
Training epoch 0, batch 560
Training epoch 0, batch 580
Training epoch 0, batch 600
Training epoch 0, batch 620
Training epoch 0, batch 640
Training epoch 0, batch 660
Training epoch 0, batch 680
Training epo

In [8]:
# evaluate
model.eval()
test_loss = 0
with torch.no_grad():
    for texts, labels, _ in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels)
        test_loss += loss.item()
print(f'Test Loss: {test_loss/len(test_loader)}')

Test Loss: 0.054111703961550114


In [42]:
# try our own texts

def preprocess_text(text, tokenizer, vocab, max_length):
    tokens = tokenizer(text)
    token_ids = [vocab[token] for token in tokens]  # Use vocab directly
    
    # Pad or truncate the sequence to a fixed length
    if len(token_ids) < max_length:
        token_ids += [vocab['<pad>']] * (max_length - len(token_ids))
    else:
        token_ids = token_ids[:max_length]

    return torch.tensor(token_ids, dtype=torch.long)

# Example usage
text_good = "good job"
text_bad = "brotha, fuk you"
max_length = 100  # or whatever length your model expects
processed_text = preprocess_text(text_bad, tokenizer, vocab, max_length)

In [43]:
# Switch model to evaluation mode
model.eval()

# Move data to the appropriate device
processed_text = processed_text.unsqueeze(0)  # Add batch dimension
processed_text = processed_text.to(device)  # Assuming 'device' is defined

with torch.no_grad():
    predictions = model(processed_text)
    predictions = torch.sigmoid(predictions)
    predicted_labels = (predictions > 0.5).int()

# Convert predicted labels to a readable format
predicted_labels = predicted_labels.squeeze(0).cpu().numpy()
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
results = {label: bool(pred) for label, pred in zip(label_names, predicted_labels)}

print(results)

{'toxic': True, 'severe_toxic': True, 'obscene': True, 'threat': False, 'insult': True, 'identity_hate': False}
