In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

In [62]:
# Load the dataset
dataset_path = "/kaggle/input/dataset-text/TELUGU_METADATA.xlsx"  # Update the path as per your dataset location
data = pd.read_excel(dataset_path,sheet_name="BinaryClass")

# Display the first few rows of the dataset
print(data.head())
# Check the shape of the dataset
print(f"Dataset shape: {data.shape}")

        AUDIO FILE NAME CLASS LABLE  \
0  H_TE_001_R_F_001_001           H   
1  H_TE_001_R_F_001_002           H   
2  H_TE_001_R_M_001_003           H   
3  H_TE_001_R_M_001_004           H   
4  H_TE_001_R_M_001_005           H   

                                       TRANSCRIPTION  
0               ఎస్సీలుగా పుట్టాలని ఎవరు కోరుకుంటారు  
1  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు అం...  
2        ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు  
3  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు డబ...  
4  అందరూ రాజుల కులంలో పుడితే రాజ్యాన్ని ఎలచ్చనుకు...  
Dataset shape: (601, 3)


In [5]:
# Drop unnecessary columns and keep only relevant ones
data = data[['AUDIO FILE NAME', 'CLASS LABLE', 'TRANSCRIPTION']]

# Map labels to binary values (Hate: 1, Non-Hate: 0)
data['CLASS LABLE'] = data['CLASS LABLE'].map({'H': 1, 'NH': 0})

# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values (if any)
data = data.dropna()

# Display the cleaned dataset
print(data.head())

AUDIO FILE NAME    0
CLASS LABLE        0
TRANSCRIPTION      0
dtype: int64
        AUDIO FILE NAME  CLASS LABLE  \
0  H_TE_001_R_F_001_001            1   
1  H_TE_001_R_F_001_002            1   
2  H_TE_001_R_M_001_003            1   
3  H_TE_001_R_M_001_004            1   
4  H_TE_001_R_M_001_005            1   

                                       TRANSCRIPTION  
0               ఎస్సీలుగా పుట్టాలని ఎవరు కోరుకుంటారు  
1  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు అం...  
2        ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు  
3  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు డబ...  
4  అందరూ రాజుల కులంలో పుడితే రాజ్యాన్ని ఎలచ్చనుకు...  


In [6]:
# Define a custom Dataset class
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Set up device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [20]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Load the LaBSE tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

# Define parameters
MAX_LEN = 128
BATCH_SIZE = 16

# Create datasets
train_dataset = HateSpeechDataset(train_data['TRANSCRIPTION'].tolist(), train_data['CLASS LABLE'].tolist(), tokenizer, MAX_LEN)
test_dataset = HateSpeechDataset(test_data['TRANSCRIPTION'].tolist(), test_data['CLASS LABLE'].tolist(), tokenizer, MAX_LEN)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [24]:
# Load the LaBSE model
model = AutoModel.from_pretrained("sentence-transformers/LaBSE").to(device)

# Add a classification head
class HateSpeechClassifier(nn.Module):
    def __init__(self, n_classes):
        super(HateSpeechClassifier, self).__init__()
        self.labse = model
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.labse.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.labse(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        output = self.drop(pooled_output)
        return self.out(output)

# Initialize the model
model = HateSpeechClassifier(n_classes=2).to(device)

# Define optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss().to(device)

In [25]:
# Training function
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Training loop
EPOCHS = 20
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    print(f'Train loss: {train_loss}, Train accuracy: {train_acc}')

Epoch 1/20
Train loss: 0.5754983445008596, Train accuracy: 0.6791666666666667
Epoch 2/20
Train loss: 0.38521497597297033, Train accuracy: 0.8270833333333333
Epoch 3/20
Train loss: 0.24501242687304814, Train accuracy: 0.91875
Epoch 4/20
Train loss: 0.13579967767000198, Train accuracy: 0.9604166666666667
Epoch 5/20
Train loss: 0.0588349886238575, Train accuracy: 0.9833333333333333
Epoch 6/20
Train loss: 0.01972611063780884, Train accuracy: 0.9958333333333333
Epoch 7/20
Train loss: 0.00938042439520359, Train accuracy: 1.0
Epoch 8/20
Train loss: 0.005005022763119389, Train accuracy: 1.0
Epoch 9/20
Train loss: 0.003037321373509864, Train accuracy: 1.0
Epoch 10/20
Train loss: 0.0028546103955401727, Train accuracy: 1.0
Epoch 11/20
Train loss: 0.0017946678349593033, Train accuracy: 1.0
Epoch 12/20
Train loss: 0.0014349457924254239, Train accuracy: 1.0
Epoch 13/20
Train loss: 0.001425603588965411, Train accuracy: 1.0
Epoch 14/20
Train loss: 0.0028009679246072967, Train accuracy: 1.0
Epoch 15/20

In [26]:
# Evaluation function
def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Evaluate the model
test_acc, test_loss = eval_model(model, test_loader, loss_fn, device)
print(f'Test loss: {test_loss}, Test accuracy: {test_acc}')

# Generate classification report
y_true = []
y_pred = []

with torch.no_grad():
    for d in test_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

print(classification_report(y_true, y_pred, target_names=['Non-Hate', 'Hate']))

Test loss: 0.48502410924993455, Test accuracy: 0.8925619834710744
              precision    recall  f1-score   support

    Non-Hate       0.87      0.81      0.84        42
        Hate       0.90      0.94      0.92        79

    accuracy                           0.89       121
   macro avg       0.89      0.87      0.88       121
weighted avg       0.89      0.89      0.89       121



**MULTICLASS**

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

In [3]:
# Load the dataset
dataset_path = "/kaggle/input/dataset-text/TELUGU_METADATA.xlsx"  # Update the path as per your dataset location
data = pd.read_excel(dataset_path,sheet_name="MultiClass")

# Display the first few rows of the dataset
print(data.head())
# Check the shape of the dataset
print(f"Dataset shape: {data.shape}")

        AUDIO FILE NAME CLASS LABLE  \
0  H_TE_001_R_F_001_001           R   
1  H_TE_001_R_F_001_002           R   
2  H_TE_001_R_M_001_003           R   
3  H_TE_001_R_M_001_004           R   
4  H_TE_001_R_M_001_005           R   

                                                TEXT  
0               ఎస్సీలుగా పుట్టాలని ఎవరు కోరుకుంటారు  
1  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు అం...  
2        ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు  
3  ఎవరు మాత్రం SC కులంలో పుట్టాలని కోరుకుంటారు డబ...  
4  అందరూ రాజుల కులంలో పుడితే రాజ్యాన్ని ఎలచ్చనుకు...  
Dataset shape: (601, 3)


In [4]:
# Define the label mapping
label_mapping = {
    'NH': 0,  # Non-Hate
    'C': 1,   # Character
    'G': 2,   # Gender
    'R': 3,   # Religion
    'P': 4    # Political
}

# Apply the mapping to the CLASS LABLE column
data['CLASS LABLE'] = data['CLASS LABLE'].map(label_mapping)

# Check the updated class distribution
print(data['CLASS LABLE'].value_counts())

CLASS LABLE
0    208
1    132
2    111
3     82
4     68
Name: count, dtype: int64


In [5]:
# Drop rows with missing or invalid labels
data = data.dropna(subset=['CLASS LABLE'])

# Check the cleaned dataset
print(data['CLASS LABLE'].value_counts())

CLASS LABLE
0    208
1    132
2    111
3     82
4     68
Name: count, dtype: int64


In [6]:
# Define a custom Dataset class
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Set up device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Load the LaBSE tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")

# Define parameters
MAX_LEN = 128
BATCH_SIZE = 8

# Create datasets
train_dataset = HateSpeechDataset(train_data['TEXT'].tolist(), train_data['CLASS LABLE'].tolist(), tokenizer, MAX_LEN)
test_dataset = HateSpeechDataset(test_data['TEXT'].tolist(), test_data['CLASS LABLE'].tolist(), tokenizer, MAX_LEN)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [14]:
# Load the LaBSE model
model = AutoModel.from_pretrained("sentence-transformers/LaBSE").to(device)

# Add a classification head
class HateSpeechClassifier(nn.Module):
    def __init__(self, n_classes):
        super(HateSpeechClassifier, self).__init__()
        self.labse = model
        self.drop = nn.Dropout(p=0.4)
        self.out = nn.Linear(self.labse.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.labse(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        output = self.drop(pooled_output)
        return self.out(output)

# Initialize the model
model = HateSpeechClassifier(n_classes=5).to(device)  # 5 classes: Gender, Religion, Character, Political, Non-Hate

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
loss_fn = nn.CrossEntropyLoss().to(device)

In [15]:
# Training function
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Training loop
EPOCHS = 10
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(model, train_loader, loss_fn, optimizer, device)
    print(f'Train loss: {train_loss}, Train accuracy: {train_acc}')

Epoch 1/10
Train loss: 1.265400559703509, Train accuracy: 0.5041666666666667
Epoch 2/10
Train loss: 0.62682274132967, Train accuracy: 0.7791666666666667
Epoch 3/10
Train loss: 0.2638416544223825, Train accuracy: 0.9270833333333334
Epoch 4/10
Train loss: 0.09579355257252851, Train accuracy: 0.9916666666666667
Epoch 5/10
Train loss: 0.02927128605855008, Train accuracy: 1.0
Epoch 6/10
Train loss: 0.01281562818524738, Train accuracy: 1.0
Epoch 7/10
Train loss: 0.008377978399706384, Train accuracy: 1.0
Epoch 8/10
Train loss: 0.006358478311449289, Train accuracy: 1.0
Epoch 9/10
Train loss: 0.005048331358314802, Train accuracy: 1.0
Epoch 10/10
Train loss: 0.0041293374573191, Train accuracy: 1.0


In [13]:
# Evaluation function
def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Evaluate the model
test_acc, test_loss = eval_model(model, test_loader, loss_fn, device)
print(f'Test loss: {test_loss}, Test accuracy: {test_acc}')

# Generate classification report
y_true = []
y_pred = []

with torch.no_grad():
    for d in test_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

# Define target names for multi-class classification
target_names = ['Gender', 'Religion', 'Character', 'Political', 'Non-Hate']
print(classification_report(y_true, y_pred, target_names=target_names))

Test loss: 1.1372189662251913, Test accuracy: 0.7603305785123967
              precision    recall  f1-score   support

      Gender       0.83      0.81      0.82        42
    Religion       0.67      0.73      0.70        22
   Character       0.64      0.70      0.67        23
   Political       0.79      0.58      0.67        19
    Non-Hate       0.88      1.00      0.94        15

    accuracy                           0.76       121
   macro avg       0.76      0.76      0.76       121
weighted avg       0.76      0.76      0.76       121

