In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from collections import Counter

from cnn import *

import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




In [5]:
df = pd.read_csv('data/train.csv/train.csv')
print('rows:', df.shape[0])
# toxic,severe_toxic,obscene,threat,insult,identity_hate
print('# toxic:', df[(df['toxic'] == 1) | (df['severe_toxic'] == 1) | (df['obscene'] == 1) | (df['threat'] == 1) | (df['insult'] == 1) | (df['identity_hate'] == 1)].shape[0])

rows: 159571
# toxic: 16225


In [6]:
# Count the number of labels per row (you might already have this)
df['label_count'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)

# Separate the minority and majority instances
minority_df = df[df['label_count'] > 0]
majority_df = df[df['label_count'] == 0]

# Under-sample the majority dataframe
sampled_majority_df = majority_df.sample(n=len(minority_df))

# Combine back the minority and downsampled majority instances
balanced_df = pd.concat([minority_df, sampled_majority_df])

# Now balanced_df is your under-sampled DataFrame
df = balanced_df

In [19]:
'''
Basic Data Cleaning and Preprocessing
NOTE:
run time: 70 to 90 sec
Tokenization: turn text into tokens
Lemmatization: extracting a word's base form (ex: running -> run)
'''


# remove line breaks and special characters
df['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'\n', ' ', x))
df['comment_text'] = df['comment_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
print(df['comment_text'][6])

# Convert to lowercase
df['comment_text'] = df['comment_text'].apply(lambda x: x.lower())
print(df['comment_text'][6])

# Tokenization 
# different from xgb
# df['comment_text'] = df['comment_text'].apply(lambda x: nltk.word_tokenize(x))
# print(df['comment_text'][6])
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['comment_text'])
sequences = tokenizer.texts_to_sequences(df['comment_text'])
# print(sequences)

# Padding sequences (needed for CNN)
max_sequence_length = 200  
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)


cocksucker before you piss around on my work
cocksucker before you piss around on my work


In [8]:
'''
Train Test split
'''
# Prepare the target variable
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y = df[label_columns]
labels  = y.values

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.float)
y_test_tensor = torch.tensor(y_test, dtype=torch.float)

train_data = TensorDataset(X_train_tensor, y_train_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_data, shuffle=True, batch_size=32)
test_loader = DataLoader(test_data, batch_size=32)

In [9]:
''' 
model initialization
'''

# Model parameters
vocab_size = 10000  # Size of your vocabulary
embed_dim = 300    # Embedding size
num_classes = 6    # Number of output classes
filter_sizes = [13, 4, 5]  # Convolution kernel sizes
num_filters = 100  # Number of filters per kernel size

# Create the model
model = TextCNN(vocab_size, embed_dim, num_classes, filter_sizes, num_filters)

# Define a loss function - Binary Cross Entropy is a good choice for multi-label classification
criterion = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [7]:
num_epochs = 10  # Number of training epochs
print(device)
model.to(device)

for epoch in range(num_epochs):
    # Training phase
    model.train()  # Set the model to training mode
    train_loss = 0.0
    for inputs, labels in train_loader:
        # Transfer Data to GPU if available
        if torch.cuda.is_available():
            inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Evaluation phase
    model.eval()  # Set the model to evaluation mode
    eval_loss = 0.0
    with torch.no_grad():  # Turn off gradients for validation
        for inputs, labels in test_loader:
            # Transfer Data to GPU if available
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            eval_loss += loss.item()

    # Calculate average losses
    train_loss = train_loss / len(train_loader)
    eval_loss = eval_loss / len(test_loader)

    # Print training and evaluation statistics 
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}')

cuda
Epoch 1/10
Train Loss: 0.3034, Eval Loss: 0.2531
Epoch 2/10
Train Loss: 0.2254, Eval Loss: 0.2269
Epoch 3/10
Train Loss: 0.1897, Eval Loss: 0.2143
Epoch 4/10
Train Loss: 0.1636, Eval Loss: 0.2089
Epoch 5/10
Train Loss: 0.1423, Eval Loss: 0.2057
Epoch 6/10
Train Loss: 0.1226, Eval Loss: 0.2055
Epoch 7/10
Train Loss: 0.1059, Eval Loss: 0.2081
Epoch 8/10
Train Loss: 0.0913, Eval Loss: 0.2083
Epoch 9/10
Train Loss: 0.0787, Eval Loss: 0.2117
Epoch 10/10
Train Loss: 0.0684, Eval Loss: 0.2148


In [14]:
''' 
Testing dataset (with preprocessing)
'''

# Load the test data
test_df = pd.read_csv('data/test.csv/test.csv')
test_labels_df = pd.read_csv('data/test_labels.csv/test_labels.csv')
# Find ids where any label is -1
ids_with_minus_one = test_labels_df[test_labels_df.eq(-1).any(axis=1)]['id']

# Filter out these ids from both dataframes
test_df_filtered = test_df[~test_df['id'].isin(ids_with_minus_one)]
test_labels_df_filtered = test_labels_df[~test_labels_df['id'].isin(ids_with_minus_one)]

# Now test_df_filtered and test_labels_df_filtered have rows with -1 labels removed
print(test_labels_df_filtered.shape[0])
test_df = test_df_filtered
test_labels_df = test_labels_df_filtered

#--------------------------------------------------------------------

# Merge the test data and labels on 'id'
test_merged_df = pd.merge(test_df, test_labels_df, on='id')
print(test_merged_df.head())

# Preprocess the text data (same as done with the training data)
# remove line breaks and special characters
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: re.sub(r'\n', ' ', x))
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
print(test_merged_df['comment_text'][0])

# Convert to lowercase
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: x.lower())
print(test_merged_df['comment_text'][0])

# Tokenize
sequences = tokenizer.texts_to_sequences(test_merged_df['comment_text'])
# print(sequences)

# Padding sequences (needed for CNN)
test_padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)


# Actual labels
y_test = test_merged_df.iloc[:, 2:].values  # Assuming labels start from the 3rd column

X_test_tensor = torch.tensor(test_padded_sequences, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float)

test_data = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_data, batch_size=32)

# Evaluation phase
model.eval()  # Set the model to evaluation mode
eval_loss = 0.0
all_preds = []
with torch.no_grad():  # Turn off gradients for validation
    for inputs, labels in test_loader:
        # Transfer Data to GPU if available
        inputs = inputs.to(device)
        outputs = model(inputs)
        preds = torch.sigmoid(outputs).cpu()  # Apply sigmoid to get probabilities
        all_preds.append(preds.numpy())

# Concatenate all predictions
all_preds = np.concatenate(all_preds, axis=0)

# Convert predictions to binary (0 or 1)
binary_preds = np.round(all_preds)
accuracy = accuracy_score(y_test, binary_preds)
print(f"Accuracy: {accuracy}")

63978
                 id                                       comment_text  toxic  \
0  0001ea8717f6de06  Thank you for understanding. I think very high...      0   
1  000247e83dcc1211                   :Dear god this site is horrible.      0   
2  0002f87b16116a7f  "::: Somebody will invariably try to add Relig...      0   
3  0003e1cccfd5a40a  " \n\n It says it right there that it IS a typ...      0   
4  00059ace3e3e9a53  " \n\n == Before adding a new product to the l...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
Thank you for understanding I think very highly of you and would not revert without discussion
thank you for understanding i think very highly of you and would

AttributeError: 'BertTokenizer' object has no attribute 'texts_to_sequences'

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Assuming 'model' is your model instance
model_size = count_parameters(model)
print(f"The model has {model_size} trainable parameters")

print('rows:', test_merged_df.shape[0])
# toxic,severe_toxic,obscene,threat,insult,identity_hate
print('# toxic:', test_merged_df[(test_merged_df['toxic'] == 1) | (test_merged_df['severe_toxic'] == 1) | (test_merged_df['obscene'] == 1) | (test_merged_df['threat'] == 1) | (test_merged_df['insult'] == 1) | (test_merged_df['identity_hate'] == 1)].shape[0])

The model has 2517286 trainable parameters
rows: 63978
# toxic: 6243


In [11]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            # Ensure labels are a 2D tensor of shape [batch_size, num_labels]
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# Tokenize the input (Assuming df is your DataFrame and 'text' is the column with text)
texts = df['comment_text'].tolist()
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

# Prepare labels
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
labels = df[label_columns].values.tolist()  # Convert DataFrame columns to list of lists
print(np.shape(labels))

# Create a dataset
dataset = TextDataset(encodings, labels)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=20,              
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=16,   
    warmup_steps=500,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=None  
)

# Train the model
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(32450, 6)




  0%|          | 0/81140 [00:00<?, ?it/s]

{'loss': 0.6832, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 0.6744, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}
{'loss': 0.6585, 'learning_rate': 3e-06, 'epoch': 0.01}
{'loss': 0.6423, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}
{'loss': 0.6177, 'learning_rate': 5e-06, 'epoch': 0.01}
{'loss': 0.5894, 'learning_rate': 6e-06, 'epoch': 0.01}
{'loss': 0.5611, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.02}
{'loss': 0.5335, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}
{'loss': 0.4942, 'learning_rate': 9e-06, 'epoch': 0.02}
{'loss': 0.4665, 'learning_rate': 1e-05, 'epoch': 0.02}
{'loss': 0.4499, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.03}
{'loss': 0.3951, 'learning_rate': 1.2e-05, 'epoch': 0.03}
{'loss': 0.3741, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.03}
{'loss': 0.3043, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.03}
{'loss': 0.3063, 'learning_rate': 1.5e-05, 'epoch': 0.04}
{'loss': 0.2963, 'learnin

KeyboardInterrupt: 

In [12]:
# Load the test data
test_df = pd.read_csv('data/test.csv/test.csv')
test_labels_df = pd.read_csv('data/test_labels.csv/test_labels.csv')
# Find ids where any label is -1
ids_with_minus_one = test_labels_df[test_labels_df.eq(-1).any(axis=1)]['id']

# Filter out these ids from both dataframes
test_df_filtered = test_df[~test_df['id'].isin(ids_with_minus_one)]
test_labels_df_filtered = test_labels_df[~test_labels_df['id'].isin(ids_with_minus_one)]

# Now test_df_filtered and test_labels_df_filtered have rows with -1 labels removed
print(test_labels_df_filtered.shape[0])
test_df = test_df_filtered
test_labels_df = test_labels_df_filtered

#--------------------------------------------------------------------

# Merge the test data and labels on 'id'
test_merged_df = pd.merge(test_df, test_labels_df, on='id')
print(test_merged_df.head())

# Preprocess the text data (same as done with the training data)
# remove line breaks and special characters
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: re.sub(r'\n', ' ', x))
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
print(test_merged_df['comment_text'][0])

# Convert to lowercase
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: x.lower())
print(test_merged_df['comment_text'][0])


test_texts  = test_merged_df['comment_text'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# Prepare labels for test data
test_labels = test_merged_df[label_columns].values.tolist()

# Create a dataset for test data
test_dataset = TextDataset(test_encodings, test_labels)

# Evaluate the model
trainer.evaluate(test_dataset)

63978
                 id                                       comment_text  toxic  \
0  0001ea8717f6de06  Thank you for understanding. I think very high...      0   
1  000247e83dcc1211                   :Dear god this site is horrible.      0   
2  0002f87b16116a7f  "::: Somebody will invariably try to add Relig...      0   
3  0003e1cccfd5a40a  " \n\n It says it right there that it IS a typ...      0   
4  00059ace3e3e9a53  " \n\n == Before adding a new product to the l...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
Thank you for understanding I think very highly of you and would not revert without discussion
thank you for understanding i think very highly of you and would

  0%|          | 0/3999 [00:00<?, ?it/s]

{'eval_loss': 0.2804158329963684, 'eval_runtime': 74.9789, 'eval_samples_per_second': 853.28, 'eval_steps_per_second': 53.335, 'epoch': 10.0}


{'eval_loss': 0.2804158329963684,
 'eval_runtime': 74.9789,
 'eval_samples_per_second': 853.28,
 'eval_steps_per_second': 53.335,
 'epoch': 10.0}

In [13]:
# Putting model in evaluation mode
model.eval()

# Making predictions
predictions = []
with torch.no_grad():
    for item in test_dataset:
        inputs = {k: v.to(device).unsqueeze(0) for k, v in item.items() if k != 'labels'}
        output = model(**inputs)
        predictions.append(output.logits.squeeze().cpu().numpy())

# Convert predictions to binary format
threshold = 0.5  # You may need to adjust this threshold
binary_predictions = np.array(predictions) > threshold

exact_matches = np.all(binary_predictions == test_labels, axis=1)
accuracy = np.mean(exact_matches)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7780


In [20]:
# Load the test data
test_df = pd.read_csv('data/test.csv/test.csv')
test_labels_df = pd.read_csv('data/test_labels.csv/test_labels.csv')
# Find ids where any label is -1
ids_with_minus_one = test_labels_df[test_labels_df.eq(-1).any(axis=1)]['id']

# Filter out these ids from both dataframes
test_df_filtered = test_df[~test_df['id'].isin(ids_with_minus_one)]
test_labels_df_filtered = test_labels_df[~test_labels_df['id'].isin(ids_with_minus_one)]

# Now test_df_filtered and test_labels_df_filtered have rows with -1 labels removed
print(test_labels_df_filtered.shape[0])
test_df = test_df_filtered
test_labels_df = test_labels_df_filtered

#--------------------------------------------------------------------

# Merge the test data and labels on 'id'
test_merged_df = pd.merge(test_df, test_labels_df, on='id')
print(test_merged_df.head())

# Preprocess the text data (same as done with the training data)
# remove line breaks and special characters
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: re.sub(r'\n', ' ', x))
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
print(test_merged_df['comment_text'][0])

# Convert to lowercase
test_merged_df['comment_text'] = test_merged_df['comment_text'].apply(lambda x: x.lower())
print(test_merged_df['comment_text'][0])

# Tokenize
sequences = tokenizer.texts_to_sequences(test_merged_df['comment_text'])
# print(sequences)

# Padding sequences (needed for CNN)
test_padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)


# Actual labels
y_test = test_merged_df.iloc[:, 2:].values  # Assuming labels start from the 3rd column

X_test_tensor = torch.tensor(test_padded_sequences, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.float)

test_data = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_data, batch_size=32)

63978
                 id                                       comment_text  toxic  \
0  0001ea8717f6de06  Thank you for understanding. I think very high...      0   
1  000247e83dcc1211                   :Dear god this site is horrible.      0   
2  0002f87b16116a7f  "::: Somebody will invariably try to add Relig...      0   
3  0003e1cccfd5a40a  " \n\n It says it right there that it IS a typ...      0   
4  00059ace3e3e9a53  " \n\n == Before adding a new product to the l...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
Thank you for understanding I think very highly of you and would not revert without discussion
thank you for understanding i think very highly of you and would

In [21]:
import os

test_loader = DataLoader(test_dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def evaluate_model(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            # Assuming batch is a dict with 'input_ids', 'attention_mask', 'labels'
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            outputs = model(**inputs)
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct / total

# Define your test/validation dataset and DataLoader
# test_dataset = ...
# test_loader = DataLoader(test_dataset, batch_size=32)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
results_dir = 'results'
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)  # Adjust as per your model

for checkpoint_dir in os.listdir(results_dir):
    checkpoint_path = os.path.join(results_dir, checkpoint_dir, 'pytorch_model.bin')  # Adjust the file name as needed
    if os.path.isfile(checkpoint_path):
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        model.to(device)
        accuracy = evaluate_model(model, test_loader)
        print(f"Checkpoint: {checkpoint_dir}, Accuracy: {accuracy}")
    else:
        print(f"Skipped: {checkpoint_path} (Not a file)")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Checkpoint: checkpoint-1000, Accuracy: 5.773390853105755
Checkpoint: checkpoint-10000, Accuracy: 5.761840007502579
Checkpoint: checkpoint-10500, Accuracy: 3.9889336959579857
Checkpoint: checkpoint-11000, Accuracy: 5.6788583575604115


KeyboardInterrupt: 