In [None]:
import pandas as pd
import re
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

In [None]:
# Read the training and test splits from CSV files located in the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [None]:
# Preview the first five rows of the test split.
test_data.head(n=5)

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,RapeGang Rape RGRSexually Abusive Content,,Sir namaskar mein Ranjit Kumar PatraPaise neh...
1,Online Financial Fraud,DebitCredit Card FraudSim Swap Fraud,KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT
2,Cyber Attack/ Dependent Crimes,SQL Injection,The issue actually started when I got this ema...
3,Online Financial Fraud,Fraud CallVishing,I am amit kumar from karwi chitrakoot I am tot...
4,Any Other Cyber Crime,Other,I have ordered saree and blouse from rinki s...


In [None]:
def clean_text(text):
    """Normalise a free-text complaint for tokenisation.

    The text is lower-cased, every character that is neither a word character nor
    whitespace is removed, runs of whitespace (including ``\\r\\n``) are collapsed to a
    single space, and leading/trailing whitespace is trimmed.

    Args:
        text: The raw value from the text column. Anything that is not a ``str``
            (for example ``None`` or a float NaN from a missing cell) is accepted.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    # Collapse whitespace only AFTER punctuation is gone: removing a symbol that sits
    # between two blanks ("a , b") would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# Normalise the complaint text of both splits element by element with clean_text.
train_data['crimeaditionalinfo'] = train_data['crimeaditionalinfo'].map(clean_text)
test_data['crimeaditionalinfo'] = test_data['crimeaditionalinfo'].map(clean_text)

In [None]:
# Fit one encoder per label column on the training split and replace the string labels
# with integer class ids.
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()
train_data['category'] = category_encoder.fit_transform(train_data['category'])
train_data['sub_category'] = subcategory_encoder.fit_transform(train_data['sub_category'])

# The test split has to go through the SAME fitted encoders: CybercrimeDataset converts
# both label columns with torch.tensor(..., dtype=torch.long), which fails on raw strings.
# LabelEncoder.transform raises on labels it was not fitted on, so test rows whose
# category or sub_category never occurs in the training split are dropped first (the model
# has no output unit for them anyway). The number of dropped rows is reported.
known_labels = (
    test_data['category'].isin(category_encoder.classes_)
    & test_data['sub_category'].isin(subcategory_encoder.classes_)
)
print(f"Dropping {int((~known_labels).sum())} test rows whose labels were not seen in training")
test_data = test_data[known_labels].reset_index(drop=True)
test_data['category'] = category_encoder.transform(test_data['category'])
test_data['sub_category'] = subcategory_encoder.transform(test_data['sub_category'])

In [None]:
# Load the tokenizer of the pretrained 'bert-base-uncased' checkpoint. It is used to turn
# text into token ids and to size the embedding table of the LSTM model defined below.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path='bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
class CybercrimeDataset(Dataset):
    """Map-style dataset that tokenises one complaint per item.

    Each item is built from one row of ``data``: the ``'crimeaditionalinfo'`` text is
    tokenised (padded/truncated to ``max_len``) and the ``'category'`` and
    ``'sub_category'`` values are converted to ``torch.long`` tensors, so both label
    columns must already hold integer class ids.
    """

    def __init__(self, data, tokenizer, max_len):
        """Store the frame, the tokenizer callable and the fixed sequence length."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenised text and both labels for the row at position ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (tokenizer outputs
            flattened to 1-D) and ``'category'`` and ``'sub_category'`` (scalar
            ``torch.long`` tensors).
        """
        # Positional lookup, done once and reused for the text and both labels.
        record = self.data.iloc[idx]

        encoded = self.tokenizer(
            record['crimeaditionalinfo'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer is asked for tensors ("pt"); flatten them to one dimension.
        sample = {
            key: encoded[key].reshape(-1)
            for key in ('input_ids', 'attention_mask')
        }
        for label_column in ('category', 'sub_category'):
            sample[label_column] = torch.tensor(record[label_column], dtype=torch.long)
        return sample

In [None]:
# Every complaint is padded or truncated to this many tokens.
max_len = 128

# Wrap both splits in the tokenising dataset.
train_dataset = CybercrimeDataset(data=train_data, tokenizer=tokenizer, max_len=max_len)
test_dataset = CybercrimeDataset(data=test_data, tokenizer=tokenizer, max_len=max_len)

# Batches of 16; only the training batches are reshuffled every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM with two classification heads (category and sub-category).

    Token ids are embedded and run through a single-layer bidirectional LSTM. The final
    hidden states of the forward and backward directions are concatenated into one
    feature vector, which is passed through dropout and fed to both linear heads.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM, per direction.
        num_categories: Number of output classes of the category head.
        num_subcategories: Number of output classes of the sub-category head.
        vocab_size: Number of rows in the embedding table.
        dropout: Dropout probability applied to the sequence feature vector.
    """

    def __init__(self, embedding_dim, hidden_dim, num_categories, num_subcategories, vocab_size, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc_category = nn.Linear(hidden_dim * 2, num_categories)
        self.fc_subcategory = nn.Linear(hidden_dim * 2, num_subcategories)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Compute the logits of both heads for a batch of token id sequences.

        Args:
            input_ids: Integer tensor of token ids, shape ``(batch, seq_len)``.
            attention_mask: Tensor of the same shape with 1 for real tokens and 0 for
                padding, or ``None`` to treat every position as a real token.
                NOTE(review): lengths are taken as the per-row sum of the mask, which
                assumes the padding sits at the END of each sequence - confirm this
                matches the tokenizer's padding side.

        Returns:
            Tuple ``(category_out, subcategory_out)`` of logits with shapes
            ``(batch, num_categories)`` and ``(batch, num_subcategories)``.
        """
        embeds = self.embedding(input_ids)

        if attention_mask is None:
            lengths = torch.full((input_ids.size(0),), input_ids.size(1), dtype=torch.int64)
        else:
            # pack_padded_sequence needs int64 lengths on the CPU and rejects zero-length
            # rows, hence the clamp.
            lengths = attention_mask.sum(dim=1).to(torch.int64).clamp(min=1).cpu()

        # Packing keeps padding out of the recurrence. Reading lstm_out[:, -1, :] on a
        # padded batch would return a forward state that has run over the padding and a
        # backward state that has only seen the very last (padding) position.
        packed = nn.utils.rnn.pack_padded_sequence(
            embeds, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n is (num_directions, batch, hidden_dim) for this single-layer LSTM:
        # h_n[-2] is the forward direction after the last real token, h_n[-1] the
        # backward direction after the first token.
        features = torch.cat((h_n[-2], h_n[-1]), dim=1)
        features = self.dropout(features)

        category_out = self.fc_category(features)
        subcategory_out = self.fc_subcategory(features)

        return category_out, subcategory_out

In [None]:
# Model sizes: embedding width and per-direction LSTM hidden width.
embedding_dim, hidden_dim = 128, 256
# The embedding table gets one row per entry of the tokenizer vocabulary.
vocab_size = tokenizer.vocab_size
# Each head gets one output per distinct label value present in the training split.
num_categories, num_subcategories = (
    train_data[label_column].nunique() for label_column in ('category', 'sub_category')
)

model = LSTMClassifier(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_categories=num_categories,
    num_subcategories=num_subcategories,
    vocab_size=vocab_size,
)

In [None]:
# Cross-entropy loss for the classification logits; Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and returning a
            ``(category_logits, subcategory_logits)`` pair.
        data_loader: Sized iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'``, ``'category'`` and ``'sub_category'``.
        criterion: Loss callable applied separately to each head.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Sum of the per-batch losses (category loss + sub-category loss) divided by the
        number of batches.
    """
    model.train()
    batch_losses = []

    for batch in data_loader:
        # Move the four batch tensors to the target device.
        on_device = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'category', 'sub_category')
        }

        optimizer.zero_grad()

        category_logits, subcategory_logits = model(
            on_device['input_ids'], on_device['attention_mask']
        )
        # The two heads contribute equally to the objective.
        loss = (
            criterion(category_logits, on_device['category'])
            + criterion(subcategory_logits, on_device['sub_category'])
        )
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(data_loader)

In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU, and move the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

LSTMClassifier(
  (embedding): Embedding(30522, 128)
  (lstm): LSTM(128, 256, batch_first=True, bidirectional=True)
  (fc_category): Linear(in_features=512, out_features=15, bias=True)
  (fc_subcategory): Linear(in_features=512, out_features=36, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# Train for a fixed number of passes over the training loader, logging the mean loss of
# each pass.
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    print(f"Epoch {epoch+1}, Training Loss: {train_loss:.4f}")

Epoch 1, Training Loss: 2.7756
Epoch 2, Training Loss: 2.2897
Epoch 3, Training Loss: 2.1385
Epoch 4, Training Loss: 2.0202
Epoch 5, Training Loss: 1.8924


In [None]:
# Intentionally empty: the stray `j` expression that used to be here referenced a name
# that is never defined in this notebook and made a fresh "Restart & Run All" fail.

SyntaxError: illegal target for annotation (<ipython-input-31-aef0c77623c5>, line 1)