In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.utils import shuffle

# Sample your data
def sample_data(df, n_samples=15000):
    """Return the first ``n_samples`` rows of ``df`` after a seeded shuffle.

    The shuffle always uses ``random_state=42``, so repeated calls on the
    same frame select the same rows.
    """
    return shuffle(df, random_state=42).head(n_samples)

class CrimeDataset(Dataset):
    """Dataset that tokenizes one text per item at access time.

    Every item is a dict holding ``'input_ids'`` and ``'attention_mask'``
    (the tokenizer's tensors flattened to 1-D) plus ``'label'`` (a
    ``torch.long`` tensor built from ``labels[idx]``).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Nothing is tokenized here; the work is deferred to __getitem__.
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples, i.e. the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and pair it with its label."""
        encoding = self.tokenizer.encode_plus(
            str(self.texts[idx]),  # entries are coerced to str before tokenizing
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Collapse each tokenizer tensor to one dimension.
        sample = {
            key: encoding[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

def prepare_data(df, tokenizer, is_training=True, le=None):
    """Turn a raw DataFrame into processed texts and integer-encoded labels.

    The ``'crimeaditionalinfo'`` column is passed through ``process_text``
    and the ``'label'`` column is encoded with a ``LabelEncoder``. The input
    frame is copied first, so the caller's ``df`` is not given a ``'text'``
    column.

    Args:
        df: DataFrame with ``'crimeaditionalinfo'`` and ``'label'`` columns.
        tokenizer: Unused here; kept so existing call sites keep working.
        is_training: When true, a fresh ``LabelEncoder`` is fitted on
            ``df['label']`` (any ``le`` passed in is ignored). When false,
            the supplied ``le`` is used to transform the labels.
        le: A fitted ``LabelEncoder``; required when ``is_training`` is false.

    Returns:
        ``(texts, labels, le)`` when ``is_training`` is true, otherwise
        ``(texts, labels)``.

    Raises:
        ValueError: If ``is_training`` is false and ``le`` is ``None``.
    """
    # Fail fast: without this check the text processing below would run to
    # completion and then die with an opaque AttributeError on None.
    if not is_training and le is None:
        raise ValueError(
            "prepare_data(is_training=False) requires the fitted LabelEncoder "
            "returned by the training call (pass it as `le`)."
        )

    df = df.copy()
    df['text'] = df['crimeaditionalinfo'].apply(process_text)
    texts = df['text'].tolist()

    if is_training:
        le = LabelEncoder()
        labels = le.fit_transform(df['label'])
        return texts, labels, le

    labels = le.transform(df['label'])
    return texts, labels

def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=10):
    """Train ``model`` and print train/validation metrics after every epoch.

    Args:
        model: Called as ``model(input_ids, attention_mask=..., labels=...)``;
            its output must expose ``.loss`` and ``.logits``.
        train_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        val_loader: Iterable of batches in the same format, evaluated once
            per epoch without gradient tracking.
        criterion: Unused. The loss is taken from ``outputs.loss``, which the
            model computes itself because ``labels`` is passed to it. The
            parameter is kept so existing call sites keep working.
        optimizer: Optimizer stepped once per training batch.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Metrics are printed; averages are per batch, and a split that
        yields no batches is reported as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    nan = float('nan')
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        # Batches are counted while iterating so loaders without a usable
        # len() work, and so an empty loader can be detected.
        train_batches = 0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            train_batches += 1

            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                val_batches += 1

                preds = torch.argmax(outputs.logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        # Guard every average against an empty split.
        avg_train_loss = total_loss / train_batches if train_batches else nan
        avg_val_loss = val_loss / val_batches if val_batches else nan
        val_accuracy = correct / total if total else nan

        print(f'Epoch {epoch+1}/{epochs}')
        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}\n')

# Main execution
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Prepare training data. The encoder fitted on the training sample is passed
# to the second call so both splits share one label -> index mapping.
train_texts, train_labels, le = prepare_data(sample_data(df), tokenizer, is_training=True)
test_texts, test_labels = prepare_data(sample_data(test_df, n_samples=500), tokenizer, is_training=False, le=le)

# Create datasets and dataloaders
train_dataset = CrimeDataset(train_texts, train_labels, tokenizer)
test_dataset = CrimeDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128)

# Prepare model
# The class count comes from the fitted encoder rather than
# np.unique(train_labels): numpy is not imported in this file's import block,
# and the encoder was fitted on exactly these labels, so the count is the same.
num_labels = len(le.classes_)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)

# Prepare optimizer and loss
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): lr=5e-4 is roughly 10x the rates usually used when fine-tuning
# BERT-style models (2e-5 to 5e-5) - confirm this value is intentional.
optimizer = optim.AdamW(model.parameters(), lr=5e-4)
# train_model never reads `criterion` (it uses the loss returned by the
# model); it is built only to satisfy that function's signature.
criterion = nn.CrossEntropyLoss()

# Train the model. The sampled test split doubles as the validation set.
train_model(model, train_loader, test_loader, criterion, optimizer, device)

In [None]:
# Put the model in evaluation mode and place it on the GPU when one is
# available, otherwise on the CPU.
model.eval()
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model.to(device)
def preprocess_text(text, tokenizer, max_length=128):
    """Tokenize ``text`` for a single forward pass.

    Calls ``tokenizer.encode_plus`` with special tokens enabled, padding and
    truncation to ``max_length``, and ``return_tensors='pt'``.

    Returns:
        The ``(input_ids, attention_mask)`` pair taken from the encoding.
    """
    encode_options = {
        'add_special_tokens': True,
        'max_length': max_length,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(text, **encode_options)
    return encoded['input_ids'], encoded['attention_mask']
def classify_text(text, model, tokenizer, device):
    """Classify a single text with ``model``.

    The text is tokenized via ``preprocess_text``, moved to ``device`` and
    run through the model without gradient tracking.

    Returns:
        ``(predicted_class, probabilities)``: the argmax index as a Python
        int, and the softmax of the logits as a NumPy array.
    """
    input_ids, attention_mask = (
        tensor.to(device) for tensor in preprocess_text(text, tokenizer)
    )

    # Inference only, so no autograd bookkeeping is needed.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        probabilities = logits.softmax(dim=-1)
        predicted_class = probabilities.argmax(dim=-1).item()

    return predicted_class, probabilities.cpu().numpy()
# Example complaint text to classify.
input_text = " ".join([
    "I had continue received random calls and abusive messages in my",
    "whatsapp Someone added my number in a unknown facebook group name",
    "with Only Girls and still getting calls from unknown numbers pls",
    "help me and sort out the issue as soon as possible Thank you",
])

predicted_class, probabilities = classify_text(
    text=input_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

print(f"Predicted Class: {predicted_class}")
print(f"Class Probabilities: {probabilities}")

# Map the integer class index back to the original label value.
(predicted_label,) = le.inverse_transform([predicted_class])
print(f"Predicted Label: {predicted_label}")