In [1]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from scipy.sparse import hstack, csr_matrix
import joblib
import numpy as np


In [2]:
# Load dataset
df = pd.read_csv("test.csv")  # Replace with your dataset path
# Handle missing values in the dataset
df['crimeaditionalinfo'] = df['crimeaditionalinfo'].fillna("")  # Replace NaN in the text column with an empty string
df['category'] = df['category'].fillna("Unknown")  # Replace NaN in categories with "Unknown"
df['sub_category'] = df['sub_category'].fillna("Unknown")  # Replace NaN in subcategories with "Unknown"
texts = df['crimeaditionalinfo']
categories = df['category']  # Broad types (e.g., Phishing, Malware)
subcategories = df['sub_category']  # Specific types (e.g., Spear Phishing, Ransomware)

# Split dataset into train and validation sets
train_texts, val_texts, train_categories, val_categories, train_subcategories, val_subcategories = train_test_split(
    texts, categories, subcategories, test_size=0.2, random_state=42
)
# Ensure no NaN values in train and validation sets
train_texts = train_texts.fillna("")
val_texts = val_texts.fillna("")

In [3]:
# Initialize CountVectorizer for Bag-of-Words
vectorizer = CountVectorizer(lowercase=True, stop_words="english", max_features=5000)
bow_train = vectorizer.fit_transform(train_texts)
bow_val = vectorizer.transform(val_texts)

# Initialize DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')




In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)  # Move model to GPU

# Function to extract BERT embeddings on GPU
def extract_bert_embeddings_on_gpu(texts, batch_size=64):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize and move inputs to GPU
        inputs = tokenizer(list(batch_texts), return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
        
        # Forward pass through the model
        with torch.no_grad():
            outputs = bert_model(**inputs)
        
        # Extract embeddings and move them back to CPU
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        all_embeddings.append(batch_embeddings)
        
        # Clear GPU cache
        torch.cuda.empty_cache()
    
    return np.vstack(all_embeddings)

# Generate or load cached embeddings
try:
    bert_train = joblib.load("bert_train.pkl")
    bert_val = joblib.load("bert_val.pkl")
    print("Loaded cached BERT embeddings.")
except FileNotFoundError:
    print("Generating BERT embeddings on GPU...")
    bert_train = extract_bert_embeddings_on_gpu(train_texts)
    bert_val = extract_bert_embeddings_on_gpu(val_texts)
    joblib.dump(bert_train, "bert_train.pkl")
    joblib.dump(bert_val, "bert_val.pkl")


Generating BERT embeddings on GPU...


In [6]:
# Convert BERT embeddings to sparse matrices
bert_train_sparse = csr_matrix(bert_train)
bert_val_sparse = csr_matrix(bert_val)

# Combine BoW and BERT features
combined_train = hstack([bow_train, bert_train_sparse])
combined_val = hstack([bow_val, bert_val_sparse])


In [8]:
# Train a category classifier
category_model = LogisticRegression(max_iter=1000)
category_model.fit(combined_train, train_categories)

# Evaluate the category classifier
val_category_preds = category_model.predict(combined_val)
print("Category Classification Report:")
print(classification_report(val_categories, val_category_preds))


Category Classification Report:
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.30      0.28      0.29       687
Child Pornography CPChild Sexual Abuse Material CSAM       0.50      0.19      0.28        26
                      Crime Against Women & Children       0.00      0.00      0.00         2
                                Cryptocurrency Crime       0.58      0.35      0.44        31
                      Cyber Attack/ Dependent Crimes       0.99      1.00      1.00       267
                                     Cyber Terrorism       0.00      0.00      0.00        15
      Hacking  Damage to computercomputer system etc       0.32      0.19      0.24       126
                            Online Cyber Trafficking       0.00      0.00      0.00        12
                              Online Financial Fraud       0.83      0.89      0.86      3787
                           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Train a subcategory classifier
subcategory_model = LogisticRegression(max_iter=1000)
subcategory_model.fit(combined_train, train_subcategories)

# Evaluate the subcategory classifier
val_subcategory_preds = subcategory_model.predict(combined_val)
print("Subcategory Classification Report:")
print(classification_report(val_subcategories, val_subcategory_preds))


Subcategory Classification Report:
                                                                      precision    recall  f1-score   support

                             Business Email CompromiseEmail Takeover       0.00      0.00      0.00        22
                                           Cheating by Impersonation       0.14      0.10      0.12       145
                                        Computer Generated CSAM/CSEM       0.00      0.00      0.00         2
                                                Cryptocurrency Fraud       0.48      0.42      0.45        31
                                   Cyber Bullying  Stalking  Sexting       0.35      0.35      0.35       272
                                                     Cyber Terrorism       0.00      0.00      0.00        15
                             Damage to computer computer systems etc       0.00      0.00      0.00         9
                                                   Data Breach/Theft       0.19     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
# Function to predict category and subcategory
def predict_category_and_subcategory(text, category_model, subcategory_model, vectorizer, bert_model, tokenizer, device):
    # Extract BoW features
    bow_features = vectorizer.transform([text])

    # Extract BERT embeddings
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        bert_features = bert_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()

    # Combine features
    combined_features = hstack([bow_features, csr_matrix(bert_features)])

    # Predict category and subcategory
    predicted_category = category_model.predict(combined_features)[0]
    predicted_subcategory = subcategory_model.predict(combined_features)[0]

    return predicted_category, predicted_subcategory

# Test prediction
test_text = "My credit card got stolen."
predicted_category, predicted_subcategory = predict_category_and_subcategory(
    test_text, category_model, subcategory_model, vectorizer, bert_model, tokenizer, device
)

print(f"Text: {test_text}")
print(f"Predicted Category: {predicted_category}")
print(f"Predicted Subcategory: {predicted_subcategory}")


Text: My credit card got stolen.
Predicted Category: Any Other Cyber Crime
Predicted Subcategory: Other


In [25]:
# Save category classifier
joblib.dump(category_model, "category_model.pkl")

# Save subcategory classifier
joblib.dump(subcategory_model, "subcategory_model.pkl")
# Save CountVectorizer
joblib.dump(vectorizer, "vectorizer.pkl")
# Save tokenizer and model
tokenizer.save_pretrained("bert_tokenizer/")
bert_model.save_pretrained("bert_model/")
# Save BERT embeddings
joblib.dump(bert_train, "bert_train.pkl")
joblib.dump(bert_val, "bert_val.pkl")



['bert_val.pkl']

In [26]:
# Load models, vectorizer, and BERT components
category_model = joblib.load("category_model.pkl")
subcategory_model = joblib.load("subcategory_model.pkl")
vectorizer = joblib.load("vectorizer.pkl")
tokenizer = DistilBertTokenizer.from_pretrained("bert_tokenizer/")
bert_model = DistilBertModel.from_pretrained("bert_model/").to(device)

# Predict for new text
test_text = "An attacker used ransomware to encrypt files."
predicted_category, predicted_subcategory = predict_category_and_subcategory(
    test_text, category_model, subcategory_model, vectorizer, bert_model, tokenizer, device
)

print(f"Text: {test_text}")
print(f"Predicted Category: {predicted_category}")
print(f"Predicted Subcategory: {predicted_subcategory}")


Text: An attacker used ransomware to encrypt files.
Predicted Category: Hacking  Damage to computercomputer system etc
Predicted Subcategory: Unauthorised AccessData Breach
