In [None]:
%pip install transformers torch torchtext scikit-learn pandas
%pip install torch==2.0.1 torchtext==0.15.2
!pip install tensorflow
%pip install transformers torch scikit-learn
!pip install datasets





In [None]:
import pandas as pd 
import torch 
from torch.utils.data import Dataset, DataLoader 
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

# Read the crime reports and discard every row that has a missing value
df = pd.read_csv("crime_data.csv").dropna()

# Quick look at the first rows to confirm the file was parsed as expected
print(df.head())

# Give each distinct sub_category an integer id (ids follow the order in which
# the labels first appear), then replace the column with those ids
label_to_int = {name: code for code, name in enumerate(pd.unique(df['sub_category']))}
df['sub_category'] = df['sub_category'].map(label_to_int)

# Training hyperparameters
MAX_LENGTH, BATCH_SIZE = 128, 4
EPOCHS, LEARNING_RATE = 30, 5e-5

# Tokenizer belonging to the bert-base-uncased checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Define CustomTextDataset 
class CustomTextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per sample.

    Args:
        dataframe: pandas DataFrame holding the texts and their labels.
        tokenizer: callable tokenizer; it is invoked with ``padding``,
            ``max_length``, ``truncation`` and ``return_tensors="pt"`` and must
            return a mapping of tensors that carry a leading batch dimension.
        max_length: length every sample is padded / truncated to.
        text_column: name of the column containing the input text.
        label_column: name of the column containing the integer class id.
    """

    def __init__(self, dataframe, tokenizer, max_length, text_column, label_column):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text_column = text_column
        self.label_column = label_column
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for the row at position ``idx``.

        ``tokens`` is a dict of tensors with the batch dimension removed;
        ``label`` is a scalar ``torch.long`` tensor.
        """
        # Look the two cells up by column first: `iloc[idx]` on the whole frame
        # would materialise a full row Series for every access.
        raw_text = self.data[self.text_column].iloc[idx]
        label = self.data[self.label_column].iloc[idx]

        # pandas may have parsed a digits-only text as a number; the tokenizer
        # only accepts strings, so coerce explicitly.
        text = raw_text if isinstance(raw_text, str) else str(raw_text)

        tokens = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )

        # The tokenizer returns tensors shaped (1, ...); drop that batch axis so
        # the DataLoader can stack samples into a batch itself.
        tokens = {key: val.squeeze(0) for key, val in tokens.items()}

        return tokens, torch.tensor(label, dtype=torch.long)

def predict(model, tokenizer, texts, max_length=128):
    """Return the predicted class id for each input text.

    Args:
        model: sequence-classification model; must expose ``eval()``, a
            ``device`` attribute, and return an object with ``.logits`` when
            called with the tokenizer output as keyword arguments.
        tokenizer: callable tokenizer producing a mapping of tensors.
        texts: an iterable of strings, or a single string (treated as a
            one-element list).
        max_length: length each text is padded / truncated to.

    Returns:
        A list of ints, one predicted class index per text, in input order.

    Note:
        The model is switched to eval mode and left that way.
    """
    # A bare string is itself iterable; without this guard it would be
    # classified one character at a time.
    if isinstance(texts, str):
        texts = [texts]

    model.eval()
    predictions = []
    with torch.no_grad():  # inference only, no gradients needed
        for text in texts:
            tokens = tokenizer(
                text,
                padding='max_length',
                max_length=max_length,
                truncation=True,
                return_tensors="pt"
            )
            # Inputs must live on the same device as the model weights.
            tokens = {key: val.to(model.device) for key, val in tokens.items()}
            output = model(**tokens)
            # Highest logit along the class axis is the predicted class.
            predicted_class = torch.argmax(output.logits, dim=1)
            predictions.append(predicted_class.item())
    return predictions

# ---- Fine-tune BERT on the crime reports ----
# `tokenizer`, `df` and the hyperparameter constants are defined earlier in
# this cell; the tokenizer is reused rather than loaded a second time.

# Train on a GPU when one is visible, otherwise fall back to the CPU.
# predict() reads model.device, so inference follows the model's placement.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per distinct (integer-encoded) sub_category.
unique_labels = df['sub_category'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=unique_labels)
model.to(device)

train_dataset = CustomTextDataset(df, tokenizer, MAX_LENGTH, text_column='crimeaditionalinfo', label_column='sub_category')
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers optimizer
# used by default, so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

model.train()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        tokens, labels = batch
        # Move the batch to the model's device; the dataset already yields
        # torch.long labels, so no dtype cast is needed.
        tokens = {key: val.to(device) for key, val in tokens.items()}
        labels = labels.to(device)
        outputs = model(**tokens, labels=labels)

        # Passing `labels` makes the model compute the classification loss.
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Average Loss: {average_loss:.4f}")

# Save the trained model and the tokenizer it was trained with
model.save_pretrained("trained_bert_model")
tokenizer.save_pretrained("trained_bert_tokenizer")

# Predict on unknown texts



In [None]:
# Reverse lookup: integer class id -> original sub_category name
int_to_label = dict(zip(label_to_int.values(), label_to_int.keys()))

# Free-text complaints to run through the fine-tuned classifier
unknown_texts = [
    "spam message i recieve msg from unwanted number they say you take loan and today is repayment date but i did not take loani recieve text and whatsapp message any time for make repayment of loan but i did not take any loan "
]

predicted_classes = predict(model, tokenizer, unknown_texts)

# Report each text next to its predicted class id and readable label
for text, prediction in zip(unknown_texts, predicted_classes):
    predicted_label = int_to_label[prediction]
    print(f"Text: {text} | Predicted Class: {prediction} | Class Label: {predicted_label}")