In [5]:
import pandas as pd
# Path to the training CSV, resolved relative to the working directory.
csv_file_path = 'MOLD_train.csv' 
df = pd.read_csv(csv_file_path)

# Print the first rows (head() default) as a quick sanity check of the load.
print(df.head())


                                               Tweet          Class
0   भारत 15 ऑगस्ट 1947 ला स्वतंत्र झाला आणि त्यान...  not offensive
1   स्वत ला हवा तसा बाइट किंवा प्रतिक्रिया घेण्या...  not offensive
2   5 व्या नंबरची अर्थव्यवस्था आहे भारताची जगात 2...  not offensive
3     च्यायला म्हणजे दुबईचा फोन ही पुडीच निघाली की.       offensive
4   ह्याला खरंतर कधीच आत टाकला पाहिजे होता. पैसा ...      offensive


In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

# Tokenizer matching the 'bert-base-multilingual-cased' checkpoint; it is shared
# by the training and validation datasets built below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item for classification.

    Args:
        tweets: Indexable, sized collection of raw tweet texts.
        labels: Indexable, sized collection of integer class ids, aligned
            position-by-position with ``tweets``.
        tokenizer: Callable tokenizer (Hugging Face style) that accepts the
            keyword arguments used in ``__getitem__`` and returns a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Fixed sequence length; each encoding is padded or truncated
            to exactly this many tokens.

    Raises:
        ValueError: If ``tweets`` and ``labels`` differ in length.
    """

    def __init__(self, tweets, labels, tokenizer, max_len):
        if len(tweets) != len(labels):
            raise ValueError(
                f'tweets and labels must have the same length, '
                f'got {len(tweets)} and {len(labels)}'
            )
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, index):
        """Tokenize the tweet at ``index`` and return model-ready tensors.

        Returns:
            dict with the raw text (``'tweet_text'``), 1-D ``'input_ids'`` and
            ``'attention_mask'`` tensors, and a scalar long ``'labels'`` tensor.
        """
        tweet = self.tweets[index]
        label = self.labels[index]

        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
        # and `truncation=True` makes the cut-off at `max_length` explicit instead
        # of relying on the implicit fallback that emits a warning.
        encoding = self.tokenizer(
            tweet,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'tweet_text': tweet,
            # The tokenizer returns a leading batch axis of size 1; flatten it away
            # so the DataLoader can stack items into (batch, max_len).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Binary targets: 1 for the 'offensive' class, 0 for any other value.
train_labels = [int(label == 'offensive') for label in train_df['Class']]
val_labels = [int(label == 'offensive') for label in val_df['Class']]

# Wrap each split so items are tokenized lazily, padded/truncated to 128 tokens.
train_dataset = TweetDataset(
    tweets=train_df['Tweet'].to_numpy(), labels=train_labels,
    tokenizer=tokenizer, max_len=128,
)
val_dataset = TweetDataset(
    tweets=val_df['Tweet'].to_numpy(), labels=val_labels,
    tokenizer=tokenizer, max_len=128,
)

# Training batches are drawn in random order; validation keeps dataset order.
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [8]:

import numpy as np
# Number of passes over the training set. The learning-rate schedule below is
# sized from this value, so it must equal the epoch count the training loop runs;
# deriving the schedule length from a bare literal lets the two drift apart.
num_epochs = 4

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained encoder with a 2-class classification head on top.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
# Place the model on its final device before the optimizer captures its parameters.
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# scheduler.step() is called once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps, decaying linearly with no warmup.
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader`` and report its metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors; must expose
            ``.dataset`` with a length.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a double-precision scalar
        tensor (correct predictions / dataset size) and mean_loss is the
        average of the per-batch loss values.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    correct_predictions = 0

    # Discard gradients left on the parameters by earlier (e.g. interrupted)
    # work; otherwise they would be accumulated into the first update.
    optimizer.zero_grad()

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Predicted class = index of the largest logit in each row.
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    if not losses:
        # Without this guard the return line fails with an opaque
        # AttributeError because the counter is still a plain int.
        raise ValueError('data_loader yielded no batches; cannot compute training metrics')

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

def eval_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
            It is switched to eval mode and left that way.
        data_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors; must expose
            ``.dataset`` with a length.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a double-precision scalar
        tensor (correct predictions / dataset size) and mean_loss is the
        average of the per-batch loss values.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0

    # Gradient tracking is disabled for the whole pass; nothing is backpropagated.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            # Predicted class = index of the largest logit in each row.
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    if not losses:
        # Without this guard the return line fails with an opaque
        # AttributeError because the counter is still a plain int.
        raise ValueError('data_loader yielded no batches; cannot compute evaluation metrics')

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


# Number of full passes over the training data.
num_epochs = 4

# Each epoch: train on the training loader, then score the validation loader,
# printing loss and accuracy for both.
for epoch in range(num_epochs):
    # Human-readable, 1-based epoch header.
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    # One optimisation pass; the optimizer and scheduler are stepped inside train_epoch.
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Validation pass on the held-out split (no weight updates).
    val_acc, val_loss = eval_model(model, val_loader, device)
    print(f'Validation loss {val_loss} accuracy {val_acc}')
    print()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4
----------




Train loss 0.5605297095651897 accuracy 0.7241992882562278
Validation loss 0.37404008706410724 accuracy 0.8670212765957447

Epoch 2/4
----------
Train loss 0.3580679053897565 accuracy 0.8653618030842231
Validation loss 0.2455349393809835 accuracy 0.9308510638297872

Epoch 3/4
----------
Train loss 0.24896165422814073 accuracy 0.9139976275207593
Validation loss 0.24525759958972534 accuracy 0.9148936170212766

Epoch 4/4
----------
Train loss 0.18698784459452583 accuracy 0.938908659549229
Validation loss 0.23938164208084345 accuracy 0.9202127659574468



In [10]:
import os
from transformers import BertTokenizer, BertForSequenceClassification

output_dir = '/mnt/data/fine_tuned_bert_model'
# exist_ok=True makes the cell safely re-runnable and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# Write the model and the tokenizer into the same directory so both can be
# restored later from a single path.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to /mnt/data/fine_tuned_bert_model


In [14]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
# Directory the fine-tuned model and tokenizer were saved to.
model_dir = '/mnt/data/fine_tuned_bert_model'

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Restore both artefacts from disk and place the model on the chosen device.
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir).to(device)

def predict_abusive(comment, model, tokenizer, device):
    """Classify a single comment and return a human-readable verdict.

    Args:
        comment: Raw text to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits`` of shape (1, num_classes).
        tokenizer: Callable tokenizer (Hugging Face style) returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to; the model is expected
            to already live there.

    Returns:
        ``'abusive word(s) detected'`` when the predicted class index is 1,
        otherwise ``'safe'``.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and `truncation=True` makes the 128-token cut-off explicit instead of
    # relying on the implicit fallback that emits a warning.
    encoding = tokenizer(
        comment,
        max_length=128,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Run the forward pass in eval mode so layers such as dropout are inactive,
    # then restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
    finally:
        model.train(was_training)

    return 'abusive word(s) detected' if preds.item() == 1 else 'safe'


# Smoke-test the reloaded model on one hand-written sample comment and print
# the verdict returned by predict_abusive.
new_comment = "माझा हंटर चांगलाच घुसलाय आईच्या पुच्चीत तुझ्या कशी खवळली रांड लगेच"
prediction = predict_abusive(new_comment, model, tokenizer, device)
print(f"The comment '{new_comment}' is: {prediction}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


The comment 'माझा हंटर चांगलाच घुसलाय आईच्या पुच्चीत तुझ्या कशी खवळली रांड लगेच' is: abusive word(s) detected
