In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset,DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
import pandas as pd
df=pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv',encoding='utf-8')
sentences=df['comment_text'].tolist()
labels=df['toxic'].tolist()

In [3]:
train_texts,test_texts,train_labels,test_labels=train_test_split(sentences,labels,test_size=0.2,random_state=1)

In [4]:
class SentenceDataset(Dataset):
    def __init__(self,texts,labels,tokenizer,max_len):
        self.texts=texts
        self.labels=labels
        self.tokenizer=tokenizer
        self.max_len=max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self,idx):
        text=str(self.texts[idx])
        label=self.labels[idx]
        
        encoding=self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',       
        )
        
        return{
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels':torch.tensor(label,dtype=torch.long)
        }
        

In [5]:
tokenizer=AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
model=AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased',num_labels=2)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
MAX_LEN=128
BATCH_SIZE=16

In [7]:
train_dataset=SentenceDataset(train_texts,train_labels,tokenizer,MAX_LEN)
test_dataset=SentenceDataset(test_texts,test_labels,tokenizer,MAX_LEN)


In [8]:
train_loader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=BATCH_SIZE)


In [9]:
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [10]:
optimizer=torch.optim.Adam(model.parameters(), lr=1e-5)
epochs=1

In [11]:
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids=batch['input_ids'].to(device)
        attention_mask=batch['attention_mask'].to(device)
        labels=batch['labels'].to(device)
        
        outputs=model(input_ids,attention_mask=attention_mask,labels=labels)
        loss=outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f'Epoch {epoch+1}/{epochs} done')

Epoch 1/1 done


In [15]:
model.eval()
predictions=[]
actual_labels=[]

with torch.no_grad():
    for batch in test_loader:
        input_ids=batch['input_ids'].to(device)
        attention_mask=batch['attention_mask'].to(device)
        labels=batch['labels'].to(device)
        
        outputs=model(input_ids,attention_mask=attention_mask)
        _,preds=torch.max(outputs.logits,dim=1)
        
        predictions.extend(preds.cpu().tolist())
        actual_labels.extend(labels.cpu().tolist())
        
accuracy=accuracy_score(actual_labels,predictions)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(actual_labels,predictions))

Accuracy: 0.96
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     40426
           1       0.86      0.67      0.75      4284

    accuracy                           0.96     44710
   macro avg       0.91      0.83      0.87     44710
weighted avg       0.96      0.96      0.96     44710

