In [None]:
import pandas as pd
import re
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

In [None]:
# Load datasets: the training split first, then the test split, both read
# from CSV files in the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first rows of the training split (head() shows 5 rows by default).
train_data.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [None]:
# Preview the first rows of the test split (head() shows 5 rows by default).
test_data.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,RapeGang Rape RGRSexually Abusive Content,,Sir namaskar mein Ranjit Kumar PatraPaise neh...
1,Online Financial Fraud,DebitCredit Card FraudSim Swap Fraud,KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT
2,Cyber Attack/ Dependent Crimes,SQL Injection,The issue actually started when I got this ema...
3,Online Financial Fraud,Fraud CallVishing,I am amit kumar from karwi chitrakoot I am tot...
4,Any Other Cyber Crime,Other,I have ordered saree and blouse from rinki s...


In [None]:
def clean_text(text, lowercase=True):
    """Normalise one free-text complaint for tokenisation.

    Steps, in order: optional lower-casing, removal of every character that
    is neither a word character nor whitespace, collapsing of whitespace runs
    (spaces, tabs, ``\\r\\n`` ...) into a single space, and trimming of the
    ends.

    Args:
        text: The raw value from the text column. Anything that is not a
            ``str`` (for example the float NaN pandas uses for missing
            cells) is treated as missing.
        lowercase: Lower-case the text (default, the historical behaviour).
            Pass ``False`` to keep the original casing, e.g. when the text is
            fed to a case-sensitive tokenizer.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    if lowercase:
        text = text.lower()

    # Strip punctuation BEFORE collapsing whitespace: deleting a symbol that
    # sits between two spaces ("a - b") would otherwise leave a double space.
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Run clean_text over the complaint column of both splits, overwriting the
# raw text in place (training split first, then test split).
for _split in (train_data, test_data):
    _split['crimeaditionalinfo'] = _split['crimeaditionalinfo'].apply(clean_text)

In [None]:
# Turn the string labels of BOTH splits into integer class ids.
#
# The encoders are fitted on the training labels only, so every id lies in
# [0, number of training classes) and matches the size of the model heads.
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()

# Missing labels (empty CSV cells are read as NaN) become an explicit class
# so that the encoders only ever see strings.
for _frame in (train_data, test_data):
    for _column in ('category', 'sub_category'):
        _frame[_column] = _frame[_column].fillna('Unknown')

train_data['category'] = category_encoder.fit_transform(train_data['category'])
train_data['sub_category'] = subcategory_encoder.fit_transform(train_data['sub_category'])

# The test split must be encoded as well: the dataset class converts labels
# with torch.tensor(..., dtype=torch.long), which fails on raw strings.
# Rows whose label never occurs in the training split have no class id (and
# the model has no output for them), so they are dropped and reported.
_known_labels = (
    test_data['category'].isin(category_encoder.classes_)
    & test_data['sub_category'].isin(subcategory_encoder.classes_)
)
print(f"Dropping {int((~_known_labels).sum())} test rows with labels unseen in training")
test_data = test_data.loc[_known_labels].reset_index(drop=True)
test_data['category'] = category_encoder.transform(test_data['category'])
test_data['sub_category'] = subcategory_encoder.transform(test_data['sub_category'])

In [None]:
from transformers import BertTokenizer, BertModel

# Tokenizer of the multilingual, case-sensitive ("cased") BERT checkpoint.
# NOTE(review): clean_text() lower-cases the complaint text before it reaches
# this tokenizer, although the checkpoint is a cased one — confirm that this
# combination is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



In [None]:
class CybercrimeDataset(Dataset):
    """Map-style dataset yielding tokenised complaint text plus both labels.

    Args:
        data: DataFrame with a 'crimeaditionalinfo' text column and
            'category' / 'sub_category' columns whose values can be
            converted to ``torch.long`` (i.e. already label-encoded).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=max_len, return_tensors="pt")``; its result must
            provide 'input_ids' and 'attention_mask' tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th example (positional index) as a dict of tensors.

        Keys: 'input_ids' and 'attention_mask' (flattened to 1-D), and
        'category' / 'sub_category' (scalar ``torch.long`` tensors).
        """
        # Build the row once; every `.iloc[idx]` call constructs a new Series,
        # and this method runs for each sample of every epoch.
        row = self.data.iloc[idx]

        encoded = self.tokenizer(
            row['crimeaditionalinfo'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; flatten
            # drops it so the DataLoader can stack examples itself.
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'category': torch.tensor(row['category'], dtype=torch.long),
            'sub_category': torch.tensor(row['sub_category'], dtype=torch.long)
        }


In [None]:
# Every complaint is padded / truncated to this many tokens.
max_len = 128

# Wrap both splits in the same dataset class (training split first).
train_dataset, test_dataset = (
    CybercrimeDataset(split_frame, tokenizer, max_len)
    for split_frame in (train_data, test_data)
)

# Only the training batches are shuffled; the test loader keeps row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

In [None]:
class LSTMClassifier(nn.Module):
    """BERT encoder followed by a bidirectional LSTM and two linear heads.

    One head predicts the category, the other the sub-category; both read the
    same sentence representation.

    Args:
        hidden_dim: Hidden size of each LSTM direction.
        num_categories: Number of output classes of the category head.
        num_subcategories: Number of output classes of the sub-category head.
        dropout: Dropout probability applied to the sentence representation.
        bert_model_name: Pretrained checkpoint handed to
            ``BertModel.from_pretrained``.
    """

    def __init__(self, hidden_dim, num_categories, num_subcategories, dropout=0.5,
                 bert_model_name='bert-base-multilingual-cased'):
        super(LSTMClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_dim, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence hidden_dim * 2 input features.
        self.fc_category = nn.Linear(hidden_dim * 2, num_categories)
        self.fc_subcategory = nn.Linear(hidden_dim * 2, num_subcategories)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return ``(category_logits, subcategory_logits)`` for a batch.

        Args:
            input_ids: Token ids, one row per example.
            attention_mask: 1 for real tokens, 0 for padding, same layout as
                ``input_ids``. Assumes right-padded sequences (real tokens
                first), which is what the packing below requires.
        """
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Inputs are padded to a fixed length, so the last time step is
        # usually a padding position: reading lstm_out[:, -1, :] would give a
        # forward state computed over padding and a backward state that has
        # seen a single token. Packing by true length makes the LSTM stop at
        # the last real token of each sequence instead.
        lengths = attention_mask.sum(dim=1).clamp(min=1).cpu()  # lengths must live on the CPU
        packed = nn.utils.rnn.pack_padded_sequence(
            bert_out.last_hidden_state, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n holds the final state of each direction: [-2] forward (after the
        # last real token), [-1] backward (after the first token).
        sentence_repr = self.dropout(torch.cat((h_n[-2], h_n[-1]), dim=1))

        category_out = self.fc_category(sentence_repr)
        subcategory_out = self.fc_subcategory(sentence_repr)

        return category_out, subcategory_out

In [None]:
# Hidden size of each LSTM direction.
hidden_dim = 256

# Size each classification head by the number of distinct label values found
# in the training split.
num_categories, num_subcategories = (
    train_data[label_column].nunique()
    for label_column in ('category', 'sub_category')
)

model = LSTMClassifier(
    hidden_dim=hidden_dim,
    num_categories=num_categories,
    num_subcategories=num_subcategories,
)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [None]:
# Cross-entropy loss, and an Adam optimiser over every parameter of the model.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Each batch is a mapping with 'input_ids', 'attention_mask', 'category'
    and 'sub_category' tensors. The model is called as
    ``model(input_ids, attention_mask)`` and must return the category logits
    and the sub-category logits; the two criterion values are summed into a
    single loss that is back-propagated.

    Returns:
        The summed per-batch loss divided by ``len(data_loader)``.
    """
    batch_keys = ('input_ids', 'attention_mask', 'category', 'sub_category')

    model = model.train()
    running_loss = 0

    for batch in data_loader:
        # Move the four tensors of the batch to the target device.
        token_ids, mask, category_target, subcategory_target = (
            batch[key].to(device) for key in batch_keys
        )

        optimizer.zero_grad()

        category_logits, subcategory_logits = model(token_ids, mask)
        # Joint objective: unweighted sum of the two head losses.
        loss = (
            criterion(category_logits, category_target)
            + criterion(subcategory_logits, subcategory_target)
        )
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(data_loader)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Last expression of the cell: moves the model and displays its structure.
model.to(device)

LSTMClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [None]:
# Training: run a fixed number of passes over the training loader and report
# the mean batch loss after each pass.
num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    print(f"Epoch {epoch+1}, Training Loss: {train_loss:.4f}")

In [None]:
# Scratch cell cleared: it contained only the undefined name `j`, which made
# a fresh "Restart Kernel -> Run All" stop with an error.

SyntaxError: illegal target for annotation (<ipython-input-31-aef0c77623c5>, line 1)