In [44]:
%pip install transformers torch torchtext scikit-learn pandas




In [45]:
%pip install torch==2.0.1 torchtext==0.15.2




In [46]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from tqdm import tqdm


data = [
    {"crimeaditionalinfo": "I have continued receiving random calls and abusive messages on WhatsApp. Someone added my number to an unknown Facebook group called 'Only Girls' and I am still getting calls from unknown numbers. Please help.", "category": "Harassment"},
    {"crimeaditionalinfo": "The fraudster is continuously messaging me, asking me to pay him money or he will send fake nude photos of me to my contacts through WhatsApp.", "category": "Fraud"},
    {"crimeaditionalinfo": "He is pretending to be a police officer and demanding money with legal-sounding threats in text messages.", "category": "Harassment"},
    {"crimeaditionalinfo": "I applied for a telecalling job and paid security fees, but the job didn’t match the description. Please help recover my financial loss.", "category": "Scam"},
    {"crimeaditionalinfo": "Received a call from someone asking for OTP verification, claiming to be from my bank. I did not provide it, but I suspect it was an attempt to steal my information.", "category": "Phishing"},
    {"crimeaditionalinfo": "An app on the Play Store accessed my contacts without permission and is now harassing me to repay a loan I never took.", "category": "Harassment"},
    {"crimeaditionalinfo": "Received a message from someone pretending to be a friend asking for an urgent loan. Later found out it wasn’t my friend.", "category": "Identity Theft"},
    {"crimeaditionalinfo": "Someone created a fake social media profile in my name and has been messaging people with offensive content.", "category": "Identity Theft"},
    {"crimeaditionalinfo": "A company promised an investment opportunity with high returns, but after I invested, they stopped responding.", "category": "Fraud"},
    {"crimeaditionalinfo": "Received multiple calls asking me to update KYC for my bank account, requesting personal details. It seemed suspicious.", "category": "Phishing"},
    {"crimeaditionalinfo": "A website claimed to sell electronics at a discount, but after placing an order, there was no response from them.", "category": "Scam"},
    {"crimeaditionalinfo": "Received an email claiming I won a prize and asking for payment to cover taxes. I suspect it's a scam.", "category": "Scam"},
    {"crimeaditionalinfo": "Someone called, pretending to be a government official and threatened legal action unless I paid a fee.", "category": "Fraud"},
    {"crimeaditionalinfo": "An unknown person has been sending harassing messages and calling my phone, affecting my personal life.", "category": "Harassment"},
    {"crimeaditionalinfo": "I was contacted by someone claiming to represent my bank, asking for my account details to prevent account suspension.", "category": "Phishing"},
    {"crimeaditionalinfo": "A man I met online asked for emergency money. He claimed to be in a difficult situation, but I believe it was a scam.", "category": "Scam"},
    {"crimeaditionalinfo": "Someone used my identity to apply for a loan and is now harassing my contacts for repayment.", "category": "Identity Theft"},
    {"crimeaditionalinfo": "Received a call from a 'representative' of a tech company, asking for remote access to my computer to fix an issue.", "category": "Phishing"},
    {"crimeaditionalinfo": "I received multiple calls and messages threatening me to repay a loan I never took. Please help.", "category": "Harassment"},
    {"crimeaditionalinfo": "Someone used my profile picture and created a fake account to impersonate me on social media.", "category": "Identity Theft"},
    {"crimeaditionalinfo": "An online store claimed to have a limited-time offer, but after paying, I didn’t receive the product.", "category": "Scam"},
    {"crimeaditionalinfo": "A person contacted me through LinkedIn offering a job but asked for upfront fees, which seemed suspicious.", "category": "Scam"},
    {"crimeaditionalinfo": "Received a suspicious email with a link to claim a refund from my bank. They requested my account and password details.", "category": "Phishing"},
    {"crimeaditionalinfo": "A man is pretending to be my relative and is calling my office, asking for personal information. This is impacting my work.", "category": "Harassment"},
    {"crimeaditionalinfo": "Received a fake message from someone posing as my friend, asking for money for an emergency.", "category": "Identity Theft"}
]




df = pd.DataFrame(data)


class CustomTextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length, text_column='crimeaditionalinfo', label_column='category'):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length


        self.label_encoder = LabelEncoder()
        self.data['encoded_label'] = self.label_encoder.fit_transform(self.data[label_column])
        self.text_column = text_column
        self.label_column = 'encoded_label'

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        text = self.data.iloc[idx][self.text_column]
        label = self.data.iloc[idx][self.label_column]


        tokens = self.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        tokens = {key: val.squeeze(0) for key, val in tokens.items()}
        return tokens, torch.tensor(label)





In [47]:
def predict(model, tokenizer, texts, max_length=128):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in texts:
            tokens = tokenizer(
                text,
                padding='max_length',
                max_length=max_length,
                truncation=True,
                return_tensors="pt"
            )
            tokens = {key: val.to(model.device) for key, val in tokens.items()}
            output = model(**tokens)
            _, predicted_class = torch.max(output.logits, dim=1)
            predictions.append(predicted_class.item())
    return predictions


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

unique_labels = df['category'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=unique_labels)

MAX_LENGTH = 128
BATCH_SIZE = 16
EPOCHS = 10


train_dataset = CustomTextDataset(df, tokenizer, MAX_LENGTH, text_column='crimeaditionalinfo', label_column='category')
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)


optimizer = AdamW(model.parameters(), lr=4e-5)


model.train()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        tokens, labels = batch
        outputs = model(**tokens, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f"Average Loss: {average_loss:.4f}")


model.save_pretrained("trained_bert_model")
tokenizer.save_pretrained("trained_bert_tokenizer")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


100%|██████████| 2/2 [00:47<00:00, 23.82s/it]


Average Loss: 1.6199
Epoch 2/10


100%|██████████| 2/2 [00:25<00:00, 12.82s/it]


Average Loss: 1.5648
Epoch 3/10


100%|██████████| 2/2 [00:25<00:00, 12.64s/it]


Average Loss: 1.5390
Epoch 4/10


100%|██████████| 2/2 [00:25<00:00, 12.58s/it]


Average Loss: 1.5358
Epoch 5/10


100%|██████████| 2/2 [00:25<00:00, 12.77s/it]


Average Loss: 1.4448
Epoch 6/10


100%|██████████| 2/2 [00:24<00:00, 12.48s/it]


Average Loss: 1.3267
Epoch 7/10


100%|██████████| 2/2 [00:24<00:00, 12.19s/it]


Average Loss: 1.2326
Epoch 8/10


100%|██████████| 2/2 [00:24<00:00, 12.17s/it]


Average Loss: 1.1382
Epoch 9/10


100%|██████████| 2/2 [00:23<00:00, 11.68s/it]


Average Loss: 1.0894
Epoch 10/10


100%|██████████| 2/2 [00:23<00:00, 11.88s/it]


Average Loss: 0.9585


('trained_bert_tokenizer/tokenizer_config.json',
 'trained_bert_tokenizer/special_tokens_map.json',
 'trained_bert_tokenizer/vocab.txt',
 'trained_bert_tokenizer/added_tokens.json')

In [48]:

unknown_texts = [
    "someone posing as my friend, asking for money for an emergency. "
]
predicted_classes = predict(model, tokenizer, unknown_texts)


for text, prediction in zip(unknown_texts, predicted_classes):
    print(f"Text: {text} | Predicted Class: {prediction} | Class Label: {train_dataset.label_encoder.inverse_transform([prediction])[0]}")

Text: someone posing as my friend, asking for money for an emergency.  | Predicted Class: 2 | Class Label: Identity Theft


In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


data = {
    'category': [
        "Online and Social Media Related Crime", "Online Financial Fraud", "Online Gambling Betting",
        "Online and Social Media Related Crime", "Online Financial Fraud", "Online Financial Fraud",
        "Online Financial Fraud", "Online Financial Fraud", "RapeGang Rape RGRSexually Abusive Content",
        "Any Other Cyber Crime", "Online and Social Media Related Crime", "Any Other Cyber Crime",
        "Online and Social Media Related Crime", "Online Financial Fraud", "Online Financial Fraud",
        "Online Financial Fraud", "Online Financial Fraud", "Online Financial Fraud", "Online Financial Fraud",
        "Cyber Attack/ Dependent Crimes", "Online Financial Fraud", "Online Gambling Betting",
        "Online and Social Media Related Crime", "Cyber Attack/ Dependent Crimes"
    ],
    'sub_category': [
        "Cyber Bullying Stalking Sexting", "Fraud CallVishing", "Online Gambling Betting",
        "Online Job Fraud", "Fraud CallVishing", "UPI Related Frauds", "Fraud CallVishing",
        "Internet Banking Related Fraud", "Sexually Abusive Content", "Other", "Cyber Bullying Stalking Sexting",
        "Other", "Profile Hacking Identity Theft", "DebitCredit Card FraudSim Swap Fraud",
        "UPI Related Frauds", "UPI Related Frauds", "UPI Related Frauds", "EWallet Related Fraud",
        "Internet Banking Related Fraud", "Data Breach/Theft", "Fraud CallVishing", "Online Gambling Betting",
        "Cheating by Impersonation", "Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks"
    ]
}
df = pd.DataFrame(data)


X_train, X_test, y_train, y_test = train_test_split(df['sub_category'], df['category'], test_size=0.2, random_state=42)


tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_tfidf, y_train)


y_pred = classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


sample_sub_category = ["Fraud CallVishing"]
sample_tfidf = tfidf.transform(sample_sub_category)
predicted_category = classifier.predict(sample_tfidf)
print("\nPredicted Category:", predicted_category[0])


Accuracy: 0.8

Classification Report:
                                            precision    recall  f1-score   support

                    Any Other Cyber Crime       1.00      1.00      1.00         1
                   Online Financial Fraud       0.67      1.00      0.80         2
    Online and Social Media Related Crime       1.00      1.00      1.00         1
RapeGang Rape RGRSexually Abusive Content       0.00      0.00      0.00         1

                                 accuracy                           0.80         5
                                macro avg       0.67      0.75      0.70         5
                             weighted avg       0.67      0.80      0.72         5


Predicted Category: Online Financial Fraud


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Text Filtering


In [50]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


sentences = [
    "I received a call from someone asking for my bank account details.",
    "You've won a lottery! Send us your bank details to claim the prize.",
    "Please update your account information to prevent deactivation.",
    "Hey, just wanted to check in on our meeting next week.",
    "I need help with my order; it didn't arrive on time.",
    "Your account has been compromised; please send your password to fix it.",

]
labels = [1, 1, 1, 0, 0, 1]


tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=20, padding='post', truncating='post')


X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=20),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, np.array(y_train), epochs=20, batch_size=1, validation_data=(X_test, np.array(y_test)))

loss, accuracy = model.evaluate(X_test, np.array(y_test))
print(f"Test Accuracy: {accuracy:.2f}")



Epoch 1/20




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 268ms/step - accuracy: 0.1667 - loss: 0.7028 - val_accuracy: 1.0000 - val_loss: 0.6705
Epoch 2/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.2667 - loss: 0.7021 - val_accuracy: 1.0000 - val_loss: 0.6820
Epoch 3/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.7333 - loss: 0.6845 - val_accuracy: 1.0000 - val_loss: 0.6720
Epoch 4/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.4333 - loss: 0.6982 - val_accuracy: 1.0000 - val_loss: 0.6758
Epoch 5/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.8333 - loss: 0.6803 - val_accuracy: 1.0000 - val_loss: 0.6817
Epoch 6/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.3667 - loss: 0.6964 - val_accuracy: 1.0000 - val_loss: 0.6861
Epoch 7/20
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [51]:
new_sentence = ["Yourrrrrrrrr"]
new_sequence = tokenizer.texts_to_sequences(new_sentence)
new_padded = pad_sequences(new_sequence, maxlen=20, padding='post', truncating='post')
prediction = model.predict(new_padded)
print("Fraud probability:", prediction[0][0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step
Fraud probability: 0.9886178


In [54]:
%pip install transformers torch scikit-learn
!pip install datasets



Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [96]:
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score


sentences = [
    "I received a call from someone asking for my bank account details.",  # fraud
    "You've won a lottery! Send us your bank details to claim the prize.",  # fraud
    "Please update your account information to prevent deactivation.",  # fraud
    "Hey, just wanted to check in on our meeting next week.",  # not fraud
    "I need help with my order; it didn't arrive on time.",  # not fraud
    "Your account has been compromised; please send your password to fix it.",  # fraud
    "Urgent: Your bank account is under review. Kindly verify your personal information immediately to avoid restrictions.",  # fraud
    "Warning: Unauthorized login attempts detected on your account. Please confirm your identity to secure your account.",  # fraud
    "We noticed suspicious activity in your account. Please reply with your account number to verify your identity.",  # fraud
    "You have been selected to receive a special reward. Please send your payment details to claim your prize.",  # fraud
    "Your subscription is about to expire. To prevent service interruption, please update your payment information as soon as possible.",  # not fraud
    "Important: Your account has been locked due to multiple failed login attempts. Click here to reset your password.",  # fraud
    "We are conducting a security check. Kindly provide your social security number and date of birth to verify your account.",  # fraud
    "Exclusive Offer: You've won a free vacation! Please provide your payment details to confirm your booking.",  # fraud
    "We need to verify your identity. Please send a copy of your ID and recent utility bill to proceed.",  # fraud
    "Congratulations! You've won a gift card worth $500. Please reply with your email address and payment info to claim it.",  # fraud
    "Immediate action required: Your account has been flagged for suspicious activity. Please log in and verify your account details immediately.",  # fraud

]

labels = [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,1,1,1,1]


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the input data
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=32)
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=32)

# Create a Dataset object for the Hugging Face Trainer API
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'label': y_train})
test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'label': y_test})

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)



# Define the compute_metrics function to calculate accuracy
def compute_metrics(p):
    predictions, labels = p
    # Convert numpy.ndarray to torch.Tensor
    predictions = torch.tensor(predictions)
    # Apply argmax to get the predicted labels
    preds = torch.argmax(predictions, dim=-1)
    return {'accuracy': accuracy_score(labels, preds)}


# Define training arguments, disable wandb
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=4,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',
    # learning_rate=5e-5,# directory for storing logs
    report_to="none",                # Disable Wandb
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=compute_metrics      # pass the compute_metrics function
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.2f}")

# Test with a new sentence
new_sentence = "Hey, just wanted to check in on our meeting next week."
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=True, padding=True, max_length=32)
output = model(**inputs)
prediction = torch.argmax(output.logits, dim=-1).item()
print(f"Fraud prediction for new sentence: {'fraud' if prediction == 1 else 'not fraud'}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Test Accuracy: 1.00
Fraud prediction for new sentence: not fraud


In [97]:

# Test with a new sentence
new_sentence = "I received a call from someone asking for my bank account details."
inputs = tokenizer(new_sentence, return_tensors="pt", truncation=True, padding=True, max_length=32)
output = model(**inputs)
prediction = torch.argmax(output.logits, dim=-1).item()
print(f"Fraud prediction for new sentence: {'fraud' if prediction == 1 else 'not fraud'}")

Fraud prediction for new sentence: fraud
