In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Paths to your datasets
train_path = '/content/drive/My Drive/train.csv'
test_path = '/content/drive/My Drive/test.csv'


Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


In [3]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Inspect data
print(train_df.head())
print(test_df.head())



                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  
0  I had continue received random calls and abusi...  
1  The above fraudster is continuously messaging ...  
2  He is acting like a police and demanding for m...  
3  In apna Job I have applied for job interview f...  
4  I received a call from lady stating that she w...  
                                    category  \
0  RapeGang Rape RGRSexually Abusive Content   
1                     Online Financial Fraud   
2             Cyber Attack/ Dependent Crimes   
3  

In [4]:
# Download necessary NLTK data
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize lemmatizer and stop words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function for preprocessing
def preprocess_text(text):
    text = str(text) if isinstance(text, str) else ""
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing to 'crimeaditionalinfo' in both train and test datasets
train_df['cleaned_text'] = train_df['crimeaditionalinfo'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['crimeaditionalinfo'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df['cleaned_text']).toarray()
X_test = tfidf.transform(test_df['cleaned_text']).toarray()

# Separate target labels for both category and sub_category
y_train = train_df[['category', 'sub_category']]
y_test = test_df[['category', 'sub_category']]


In [6]:
from sklearn.preprocessing import LabelEncoder
# Combine both train and test labels for category and sub_category for LabelEncoder fitting
combined_labels_category = pd.concat([train_df['category'], test_df['category']], axis=0)
combined_labels_subcategory = pd.concat([train_df['sub_category'], test_df['sub_category']], axis=0)

# Initialize LabelEncoders
category_encoder = LabelEncoder()
subcategory_encoder = LabelEncoder()

# Fit the encoders on the combined labels (train + test) to avoid unseen labels in test data
category_encoder.fit(combined_labels_category)
subcategory_encoder.fit(combined_labels_subcategory)

# Encode the labels for both train and test datasets
train_df['category_encoded'] = category_encoder.transform(train_df['category'])
test_df['category_encoded'] = category_encoder.transform(test_df['category'])
train_df['subcategory_encoded'] = subcategory_encoder.transform(train_df['sub_category'])
test_df['subcategory_encoded'] = subcategory_encoder.transform(test_df['sub_category'])


In [7]:
from transformers import BertTokenizer

# Initialize BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(list(train_df['cleaned_text']), padding=True, truncation=True, max_length=128)
test_encodings = tokenizer(list(test_df['cleaned_text']), padding=True, truncation=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [8]:
!pip install datasets
from datasets import Dataset, DatasetDict

# Create Dataset for PyTorch
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'category_labels': train_df['category_encoded'].values,
    'subcategory_labels': train_df['subcategory_encoded'].values
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'category_labels': test_df['category_encoded'].values,
    'subcategory_labels': test_df['subcategory_encoded'].values
})

# Prepare DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [9]:
from transformers import BertForSequenceClassification

# Define separate models for category and subcategory
category_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(category_encoder.classes_))
subcategory_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(subcategory_encoder.classes_))


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define metrics function to calculate accuracy, precision, recall, and f1 score
def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(axis=-1)  # Get the predicted class labels

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [15]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define metrics function to calculate accuracy, precision, recall, and f1 score
def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(axis=-1)  # Get the predicted class labels

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Increase the dataset size for more data to train on (train on the first 5000 samples instead of 1000)
subset_train_dataset = train_dataset.select(range(5000))  # Use 5000 samples for better training

# Define optimized training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,                # Set to 5 epochs for longer training
    per_device_train_batch_size=16,    # Use a larger batch size (16) to train faster
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,     # Accumulate gradients to simulate larger batch size
    learning_rate=2e-5,                # Set a smaller learning rate for better convergence
    logging_dir='./logs',
    logging_steps=500,                 # Log every 500 steps for better monitoring
    evaluation_strategy="epoch",       # Evaluate after every epoch
    save_strategy="epoch",             # Save model after each epoch
    fp16=True,                         # Enable mixed-precision training for faster computation
    load_best_model_at_end=True,       # Automatically load the best model when training finishes
    metric_for_best_model="accuracy"  # Use accuracy to determine the best model
)

# Train for category
trainer_category = Trainer(
    model=category_model,
    args=training_args,
    train_dataset=subset_train_dataset.rename_column("category_labels", "labels"),
    eval_dataset=test_dataset.rename_column("category_labels", "labels"),
    compute_metrics=compute_metrics
)
print("Training model for category classification...")
trainer_category.train()

# Train for subcategory
trainer_subcategory = Trainer(
    model=subcategory_model,
    args=training_args,
    train_dataset=subset_train_dataset.rename_column("subcategory_labels", "labels"),
    eval_dataset=test_dataset.rename_column("subcategory_labels", "labels"),
    compute_metrics=compute_metrics
)
print("Training model for subcategory classification...")
trainer_subcategory.train()


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training model for category classification...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,No log,0.819185,0.725832,0.694597,0.725832,0.69681
2,No log,0.863466,0.728361,0.692843,0.728361,0.705608
4,0.651700,0.939706,0.727977,0.689503,0.727977,0.705192


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Training model for subcategory classification...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,No log,1.706919,0.480579,0.446372,0.480579,0.436216
2,No log,1.64576,0.508502,0.481915,0.508502,0.475807
4,1.515400,1.686814,0.508502,0.46296,0.508502,0.474563


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=780, training_loss=1.4023419893704927, metrics={'train_runtime': 649.3778, 'train_samples_per_second': 38.498, 'train_steps_per_second': 1.201, 'total_flos': 1640252821266432.0, 'train_loss': 1.4023419893704927, 'epoch': 4.984025559105431})

In [17]:
# Evaluate the category model
category_results = trainer_category.evaluate()
print("Category Model Evaluation Results:")
print(f"Accuracy: {category_results['eval_accuracy']}")
print(f"Precision: {category_results['eval_precision']}")
print(f"Recall: {category_results['eval_recall']}")
print(f"F1-Score: {category_results['eval_f1']}")

# Evaluate the subcategory model
subcategory_results = trainer_subcategory.evaluate()
print("Subcategory Model Evaluation Results:")
print(f"Accuracy: {subcategory_results['eval_accuracy']}")
print(f"Precision: {subcategory_results['eval_precision']}")
print(f"Recall: {subcategory_results['eval_recall']}")
print(f"F1-Score: {subcategory_results['eval_f1']}")

# Make predictions on the test dataset for category
category_predictions = trainer_category.predict(test_dataset.rename_column("category_labels", "labels"))
category_preds = category_predictions.predictions.argmax(axis=-1)  # Get predicted category labels

# Reverse label encoding for category
category_pred_labels = category_encoder.inverse_transform(category_preds)
category_true_labels = category_encoder.inverse_transform(test_dataset['category_labels'])

# Print first 10 predictions and true labels for category
print("\nFirst 10 Category Predictions vs True Labels:")
for i in range(10):
    print(f"Predicted: {category_pred_labels[i]}, Actual: {category_true_labels[i]}")

# Classification Report for category
from sklearn.metrics import classification_report
category_report = classification_report(category_true_labels, category_pred_labels)
print("\nCategory Classification Report:")
print(category_report)

# Make predictions on the test dataset for subcategory
subcategory_predictions = trainer_subcategory.predict(test_dataset.rename_column("subcategory_labels", "labels"))
subcategory_preds = subcategory_predictions.predictions.argmax(axis=-1)  # Get predicted subcategory labels

# Ensure predicted labels and true labels are both strings (for categorical data)
subcategory_pred_labels = subcategory_encoder.inverse_transform(subcategory_preds).astype(str)
subcategory_true_labels = subcategory_encoder.inverse_transform(test_dataset['subcategory_labels']).astype(str)

# Print first 10 predictions and true labels for subcategory
print("\nFirst 10 Subcategory Predictions vs True Labels:")
for i in range(10):
    print(f"Predicted: {subcategory_pred_labels[i]}, Actual: {subcategory_true_labels[i]}")

# Classification Report for subcategory
subcategory_report = classification_report(subcategory_true_labels, subcategory_pred_labels)
print("\nSubcategory Classification Report:")
print(subcategory_report)


Category Model Evaluation Results:
Accuracy: 0.7297383841941785
Precision: 0.6914551182930204
Recall: 0.7297383841941785
F1-Score: 0.7014119212038828


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Subcategory Model Evaluation Results:
Accuracy: 0.5085017131512376
Precision: 0.48191532195856757
Recall: 0.5085017131512376
F1-Score: 0.47580721890638844


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



First 10 Category Predictions vs True Labels:
Predicted: Online Financial Fraud, Actual: RapeGang Rape RGRSexually Abusive Content
Predicted: Online Financial Fraud, Actual: Online Financial Fraud
Predicted: Cyber Attack/ Dependent Crimes, Actual: Cyber Attack/ Dependent Crimes
Predicted: Online Financial Fraud, Actual: Online Financial Fraud
Predicted: Online Financial Fraud, Actual: Any Other Cyber Crime
Predicted: Online Financial Fraud, Actual: Online Financial Fraud
Predicted: Any Other Cyber Crime, Actual: Hacking  Damage to computercomputer system etc
Predicted: Online Financial Fraud, Actual: Online Financial Fraud
Predicted: Online Financial Fraud, Actual: Online Financial Fraud
Predicted: Online Financial Fraud, Actual: Online Financial Fraud


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Category Classification Report:
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.34      0.29      0.31      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.00      0.00      0.00       123
                      Crime Against Women & Children       0.00      0.00      0.00         4
                                Cryptocurrency Crime       0.00      0.00      0.00       166
                      Cyber Attack/ Dependent Crimes       0.98      1.00      0.99      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.33      0.04      0.06       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.83      0.90      0.86     18896
                          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



First 10 Subcategory Predictions vs True Labels:
Predicted: Other, Actual: nan
Predicted: UPI Related Frauds, Actual: DebitCredit Card FraudSim Swap Fraud
Predicted: Ransomware Attack, Actual: SQL Injection
Predicted: UPI Related Frauds, Actual: Fraud CallVishing
Predicted: Other, Actual: Other
Predicted: DebitCredit Card FraudSim Swap Fraud, Actual: Internet Banking Related Fraud
Predicted: Other, Actual: Unauthorised AccessData Breach
Predicted: DebitCredit Card FraudSim Swap Fraud, Actual: UPI Related Frauds
Predicted: Internet Banking Related Fraud, Actual: Internet Banking Related Fraud
Predicted: DebitCredit Card FraudSim Swap Fraud, Actual: DebitCredit Card FraudSim Swap Fraud


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Subcategory Classification Report:
                                                                      precision    recall  f1-score   support

                             Business Email CompromiseEmail Takeover       0.00      0.00      0.00        90
                                           Cheating by Impersonation       0.00      0.00      0.00       719
                                        Computer Generated CSAM/CSEM       0.00      0.00      0.00         2
                                                Cryptocurrency Fraud       0.00      0.00      0.00       166
                                    Cyber Blackmailing & Threatening       0.00      0.00      0.00         1
                                   Cyber Bullying  Stalking  Sexting       0.39      0.61      0.47      1366
                                                     Cyber Terrorism       0.00      0.00      0.00        52
                             Damage to computer computer systems etc       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
# Define the path where you want to save the models on Google Drive
category_model_save_path = '/content/drive/MyDrive/category_model'
subcategory_model_save_path = '/content/drive/MyDrive/subcategory_model'

# Save category model and tokenizer to Google Drive
category_model.save_pretrained(category_model_save_path)
tokenizer.save_pretrained(category_model_save_path)

# Save subcategory model and tokenizer to Google Drive
subcategory_model.save_pretrained(subcategory_model_save_path)
tokenizer.save_pretrained(subcategory_model_save_path)

print("Models and tokenizers saved to Google Drive!")


Models and tokenizers saved to Google Drive!
