In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import os

# Hyperparameters
MAX_LEN = 250                               # maximum number of tokens kept per comment
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 2 * TRAIN_BATCH_SIZE     # evaluation batches are twice the training size
EPOCHS = 5
LEARNING_RATE = 1e-5
EMBEDDING_FILE = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
print("Device:", DEVICE)

# Load data
train = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')

# Use AutoTokenizer from Hugging Face
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def _encode_comments(frame):
    """Tokenize the ``comment_text`` column of *frame*.

    Missing comments are replaced by the literal string "fillna" before
    tokenization; sequences are truncated to ``MAX_LEN`` tokens and padded to
    the longest sequence of this call.
    """
    texts = frame["comment_text"].fillna("fillna").tolist()
    return tokenizer(texts, truncation=True, padding=True, max_length=MAX_LEN)


# Tokenize data
train_encodings = _encode_comments(train)
test_encodings = _encode_comments(test)

# Only the token ids are kept; the model rebuilds the attention mask from them.
x_train = np.array(train_encodings['input_ids'])
x_test = np.array(test_encodings['input_ids'])

# One binary column per toxicity label.
y_train = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].to_numpy()

# Load FastText embeddings
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: The token (first field of an embedding row).
        *arr: The remaining fields of the row, convertible to floats.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector
# Width of every FastText vector. Defined before the file is read so rows with
# a different number of fields can be rejected while parsing.
embed_size = 300

# Build the token -> vector lookup. The file is read inside a context manager
# so the handle is closed deterministically, and decoded explicitly as UTF-8
# instead of relying on the platform default encoding.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as vec_file:
    for line in vec_file:
        fields = line.rstrip().rsplit(' ')
        # A valid row is "<token> v1 ... v300". This skips the leading
        # "<count> <dim>" header of the .vec format as well as any malformed
        # row; a short row would otherwise yield a vector that NumPy silently
        # broadcasts over a whole matrix row below.
        if len(fields) != embed_size + 1:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

# Row i of the matrix holds the FastText vector of the tokenizer token with
# id i; tokens without a FastText entry keep an all-zero row.
word_index = tokenizer.get_vocab()
max_features = min(150000, len(word_index) + 1)
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

class MultiLabelDataset(Dataset):
    """Dataset of token-id sequences with optional multi-label targets.

    Args:
        inputs: Indexable collection of token-id sequences.
        targets: Optional indexable collection of label vectors aligned with
            ``inputs``. When omitted, items consist of the token ids only.
    """

    def __init__(self, inputs, targets=None):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ``(input_ids, targets)`` or, without targets, just ``input_ids``."""
        token_ids = torch.tensor(self.inputs[index], dtype=torch.long)
        if self.targets is None:
            return token_ids
        # Labels are returned as floats.
        labels = torch.tensor(self.targets[index], dtype=torch.float)
        return token_ids, labels

def _build_loader(dataset, batch_size, shuffle):
    """Wrap *dataset* in a DataLoader backed by two worker processes."""
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2)


# Hold out 20% of the labelled data for validation (fixed seed for a repeatable split).
train_size = 0.8
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    x_train,
    y_train,
    train_size=train_size,
    random_state=42,
)

train_dataset = MultiLabelDataset(train_inputs, train_targets)
val_dataset = MultiLabelDataset(val_inputs, val_targets)
test_dataset = MultiLabelDataset(x_test)  # unlabelled: items are token ids only

# Only the training split is shuffled; validation and test keep their order.
train_loader = _build_loader(train_dataset, TRAIN_BATCH_SIZE, True)
val_loader = _build_loader(val_dataset, VALID_BATCH_SIZE, False)
test_loader = _build_loader(test_dataset, VALID_BATCH_SIZE, False)

class TransformerModel(nn.Module):
    """DistilBERT first-token features combined with pooled FastText vectors.

    The first-position hidden state of ``distilbert-base-uncased`` is
    concatenated with the mean FastText vector of the sequence and passed
    through a two-layer classifier that emits six logits.

    Args:
        max_features: Number of rows expected in ``embedding_matrix``. Kept
            for interface compatibility; the size of the lookup table is taken
            from ``embedding_matrix`` itself.
        embed_size: Width of each FastText vector.
        embedding_matrix: 2-D array-like of pre-trained vectors, one row per
            token id.

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D or its row width does
            not equal ``embed_size``.
    """

    def __init__(self, max_features, embed_size, embedding_matrix):
        super().__init__()

        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        if weights.dim() != 2 or weights.shape[1] != embed_size:
            # Fail early with a clear message instead of a shape error deep
            # inside the classifier's first linear layer.
            raise ValueError(
                f"embedding_matrix must have shape (n, {embed_size}), "
                f"got {tuple(weights.shape)}"
            )
        # Frozen lookup table initialised from the pre-trained vectors.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)

        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        # Read the encoder width from its config rather than hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + embed_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_size, 6)
        )

    def forward(self, input_ids, fasttext_embeddings=None):
        """Compute the six label logits for a batch of token ids.

        Args:
            input_ids: Long tensor of token ids, one row per sequence. Id 0 is
                treated as padding.
            fasttext_embeddings: Per-token FastText vectors for ``input_ids``
                (the result of ``self.embedding(input_ids)``). When omitted,
                they are looked up here.

        Returns:
            Tensor of raw logits with six columns per sequence.
        """
        # Positions with id 0 are masked out as padding.
        attention_mask = (input_ids > 0).long()
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls = bert_outputs.last_hidden_state[:, 0, :]

        if fasttext_embeddings is None:
            fasttext_embeddings = self.embedding(input_ids)

        # Average over real tokens only. A plain mean over the sequence axis
        # also divides by the padded positions, so the pooled vector would
        # shrink with the amount of padding in the batch.
        token_mask = attention_mask.unsqueeze(-1).to(fasttext_embeddings.dtype)
        token_counts = token_mask.sum(dim=1).clamp(min=1.0)  # guard all-padding rows
        pooled = (fasttext_embeddings * token_mask).sum(dim=1) / token_counts

        combined = torch.cat((bert_cls, pooled), dim=1)
        logits = self.classifier(combined)
        return logits

# Build the model directly on the target device.
model = TransformerModel(max_features, embed_size, embedding_matrix).to(DEVICE)

# The model emits raw logits, so the sigmoid is folded into the loss.
loss_func = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# Multiply the learning rate by 0.1 on every scheduler step.
# NOTE(review): the training loop steps this once per epoch, so the rate drops
# by four orders of magnitude over five epochs - confirm this is intended.
lr_sched = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=1, gamma=0.1)

def train_one_epoch(train_loader, model, loss_func, optimizer):
    """Run one optimisation pass over ``train_loader`` and report metrics.

    Batches are moved to the device that holds the model's parameters, so the
    function keeps working when the model is placed somewhere other than the
    module-level ``DEVICE`` (which is used only if the model has no
    parameters).

    Args:
        train_loader: Loader yielding ``(input_ids, targets)`` batches; must
            support ``len()`` and expose ``.dataset``.
        model: Module exposing an ``embedding`` layer and callable as
            ``model(input_ids, fasttext_embeddings)``; its outputs are treated
            as logits.
        loss_func: Callable computing the loss from ``(outputs, targets)``.
        optimizer: Optimizer updating the model's parameters.

    Returns:
        Tuple ``(avg_loss, accuracy, roc_auc, f1)``: the mean per-batch loss
        and the metrics computed over every sample seen in the epoch.
    """
    model.train()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    size = len(train_loader.dataset)
    all_targets = []
    all_outputs = []
    total_loss = 0

    for i, (input_ids, targets) in enumerate(train_loader):
        input_ids = input_ids.to(device)
        targets = targets.to(device)

        # Get FastText embeddings
        fasttext_embeddings = model.embedding(input_ids)

        optimizer.zero_grad()
        outputs = model(input_ids, fasttext_embeddings)
        loss = loss_func(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Keep detached CPU copies so the metrics can be computed after the loop.
        all_targets.append(targets.cpu().numpy())
        all_outputs.append(outputs.detach().sigmoid().cpu().numpy())

        # Progress line every 100 batches.
        if i % 100 == 0:
            loss_val = loss.item()
            current = i * len(input_ids)
            print(f"loss: {loss_val:>7f}  [{current:>5d}/{size:>5d}]")

    avg_loss = total_loss / len(train_loader)
    all_targets = np.concatenate(all_targets)
    all_outputs = np.concatenate(all_outputs)
    # Accuracy and F1 use probabilities rounded to 0/1; ROC AUC uses the raw probabilities.
    train_acc = accuracy_score(all_targets, all_outputs.round())
    train_roc_auc = roc_auc_score(all_targets, all_outputs, average='macro')
    train_f1 = f1_score(all_targets, all_outputs.round(), average='macro')

    print(f"Training Error: \n Avg loss: {avg_loss:>8f} \n Accuracy: {train_acc:>8f} \n ROC AUC: {train_roc_auc:>8f} \n F1 Score: {train_f1:>8f} \n")
    return avg_loss, train_acc, train_roc_auc, train_f1

def validate_one_epoch(val_loader, model, loss_func):
    """Evaluate ``model`` on ``val_loader`` without updating its weights.

    Batches are moved to the device that holds the model's parameters, so the
    function keeps working when the model is placed somewhere other than the
    module-level ``DEVICE`` (which is used only if the model has no
    parameters).

    Args:
        val_loader: Loader yielding ``(input_ids, targets)`` batches; must
            support ``len()``.
        model: Module exposing an ``embedding`` layer and callable as
            ``model(input_ids, fasttext_embeddings)``; its outputs are treated
            as logits.
        loss_func: Callable computing the loss from ``(outputs, targets)``.

    Returns:
        Tuple ``(val_loss, accuracy, roc_auc, f1)``: the mean per-batch loss
        and the metrics computed over the whole loader.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    num_batches = len(val_loader)
    val_loss = 0
    all_targets = []
    all_outputs = []
    with torch.no_grad():
        for input_ids, targets in val_loader:
            input_ids = input_ids.to(device)
            targets = targets.to(device)

            # Get FastText embeddings
            fasttext_embeddings = model.embedding(input_ids)

            outputs = model(input_ids, fasttext_embeddings)
            val_loss += loss_func(outputs, targets).item()

            all_targets.append(targets.cpu().numpy())
            all_outputs.append(outputs.sigmoid().cpu().numpy())

    val_loss /= num_batches
    all_targets = np.concatenate(all_targets)
    all_outputs = np.concatenate(all_outputs)
    # Accuracy and F1 use probabilities rounded to 0/1; ROC AUC uses the raw probabilities.
    val_acc = accuracy_score(all_targets, all_outputs.round())
    val_roc_auc = roc_auc_score(all_targets, all_outputs, average='macro')
    val_f1 = f1_score(all_targets, all_outputs.round(), average='macro')

    print(f"Validation Error: \n Avg loss: {val_loss:>8f} \n Accuracy: {val_acc:>8f} \n ROC AUC: {val_roc_auc:>8f} \n F1 Score: {val_f1:>8f} \n")
    return val_loss, val_acc, val_roc_auc, val_f1

def evaluate(loader, model):
    """Return the model's sigmoid probabilities for every batch in ``loader``.

    Batches are moved to the device that holds the model's parameters, so the
    function keeps working when the model is placed somewhere other than the
    module-level ``DEVICE`` (which is used only if the model has no
    parameters).

    Args:
        loader: Iterable yielding batches of token ids (no targets).
        model: Module exposing an ``embedding`` layer and callable as
            ``model(input_ids, fasttext_embeddings)``; its outputs are treated
            as logits.

    Returns:
        NumPy array with the per-batch probabilities concatenated along the
        first axis, in loader order.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE

    all_outputs = []
    with torch.no_grad():
        for input_ids in loader:
            input_ids = input_ids.to(device)

            # Get FastText embeddings
            fasttext_embeddings = model.embedding(input_ids)

            outputs = model(input_ids, fasttext_embeddings)
            all_outputs.append(outputs.sigmoid().cpu().numpy())

    all_outputs = np.concatenate(all_outputs)
    return all_outputs

# Training loop: each epoch prints the current learning rate, trains on the
# training loader, evaluates on the validation loader, and only then steps the
# scheduler so the new rate takes effect from the next epoch.
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1} (lr = {lr_sched.get_last_lr()[0]:.2e})\n-------------------------------")
    train_loss, train_acc, train_roc_auc, train_f1 = train_one_epoch(train_loader, model, loss_func, optimizer)
    val_loss, val_acc, val_roc_auc, val_f1 = validate_one_epoch(val_loader, model, loss_func)
    lr_sched.step()

# Predict on the test set; `test_outputs` holds the sigmoid probabilities
# returned by `evaluate`, in test-loader order.
test_outputs = evaluate(test_loader, model)
# The metric code below stays disabled because no test labels are loaded in
# this script. To enable it, first load them into `y_test`:
# y_test = ... # Load the test labels similarly to y_train

# Once y_test is available, calculate the metrics:
# test_acc = accuracy_score(y_test, test_outputs.round())
# test_roc_auc = roc_auc_score(y_test, test_outputs, average='macro')
# test_f1 = f1_score(y_test, test_outputs.round(), average='macro')

# print(f"Test Accuracy: {test_acc:>8f} \n Test ROC AUC: {test_roc_auc:>8f} \n Test F1 Score: {test_f1:>8f}")
