In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

In [2]:
# Read the training and test splits from the mounted Google Drive (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train55.csv',
        '/content/drive/MyDrive/test55.csv',
    )
)


In [3]:
# Print the first rows of each split under its own heading; `sep="\n"` puts every
# piece on its own line, and the leading "\n" in the second heading leaves a blank
# line between the two previews.
print(
    "Train Data Sample:",
    train_df.head(),
    "\nTest Data Sample:",
    test_df.head(),
    sep="\n",
)

Train Data Sample:
                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  
0  I had continue received random calls and abusi...  
1  The above fraudster is continuously messaging ...  
2  He is acting like a police and demanding for m...  
3  In apna Job I have applied for job interview f...  
4  I received a call from lady stating that she w...  

Test Data Sample:
                                    category  \
0  RapeGang Rape RGRSexually Abusive Content   
1                     Online Financial Fraud   
2            

In [4]:
# Free-text complaint column is the model input; the top-level `category` column is the target.
X_train, y_train = train_df['crimeaditionalinfo'], train_df['category']
X_test, y_test = test_df['crimeaditionalinfo'], test_df['category']

In [5]:

# Fit the encoder on the categories of BOTH splits. Not every category occurs in the
# training split, so a train-only fit yields fewer classes than the test split needs:
# `len(label_encoder.classes_)` would then under-size the classification head and the
# integer ids would not line up with an encoder fitted on the combined labels.
label_encoder = LabelEncoder()
label_encoder.fit(list(train_df['category']) + list(test_df['category']))

# Integer-encoded training targets (kept under the original name `y`).
y = label_encoder.transform(train_df['category'])


In [6]:
# Load the pretrained BERT tokenizer and a sequence-classification model built on the
# same checkpoint. The classification head gets one output per class known to
# `label_encoder`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

# Replace missing complaint texts with empty strings and coerce everything to `str`
# so the tokenizer only ever receives strings.
X_train, X_test = (texts.fillna("").astype(str) for texts in (X_train, X_test))

# Tokenize each split on its own: sequences are cut at 128 tokens and padded to the
# longest sequence within that split.
train_encodings = tokenizer(X_train.tolist(), max_length=128, padding=True, truncation=True)
test_encodings = tokenizer(X_test.tolist(), max_length=128, padding=True, truncation=True)


In [9]:
# Minimal training configuration: evaluate once per epoch, write checkpoints to
# ./results and logs to ./logs. Batch size, epoch count, etc. are not passed here
# and therefore stay at the library defaults.
training_args = TrainingArguments(
    evaluation_strategy="epoch",
    output_dir='./results',
    logging_dir='./logs',
)



In [10]:
# Build a Trainer from the model and the training arguments.
# NOTE(review): the datasets passed here are the raw tokenizer outputs, which carry
# no "labels" entry; later cells rebuild `trainer` on label-bearing datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_encodings,
    train_dataset=train_encodings,
)

In [11]:
# NOTE(review): this cell repeats the previous cell's Trainer construction with the
# same arguments; it simply rebinds `trainer` to a fresh instance.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_encodings,
    train_dataset=train_encodings,
)

In [12]:
# Collect the raw category names from both splits so the encoder knows every class,
# including categories that occur in only one of the splits.
combined_labels = list(train_df['category']) + list(test_df['category'])

# Fit a fresh LabelEncoder on the combined labels.
label_encoder = LabelEncoder()
label_encoder.fit(combined_labels)

# Encode straight from the raw DataFrame columns rather than from `y_train` / `y_test`.
# Those names are overwritten with integer ids below, so reading from them would make
# a second run of this cell try to transform integers the encoder has never seen.
y_train = label_encoder.transform(train_df['category'])
y_test = label_encoder.transform(test_df['category'])


In [14]:
import torch  # Add this import

from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset

# Define the custom dataset including labels
class CustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with integer class labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name (e.g. ``"input_ids"``) to a sequence holding one
        entry per sample.
    labels : sequence of int
        One integer class id per sample, aligned by position with ``encodings``.

    Raises
    ------
    ValueError
        If any feature in ``encodings`` does not contain exactly ``len(labels)``
        entries.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs; otherwise __len__ would disagree with the
        # encodings and the mismatch would only surface as an IndexError mid-epoch.
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"encodings[{key!r}] has {len(values)} entries but there are "
                    f"{expected} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including a ``"labels"`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Pin the label to int64 regardless of how the labels are stored, so the
        # target dtype is always the integer class-index type.
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

# Wrap each split's tokenized inputs together with its encoded labels.
train_dataset = CustomDataset(encodings=train_encodings, labels=y_train)
test_dataset = CustomDataset(encodings=test_encodings, labels=y_test)

# Training configuration: a single epoch, batches of 8 per device for both training
# and evaluation, a weight decay of 0.01, and an evaluation pass at the end of each epoch.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Bind model, configuration and the two label-bearing datasets into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)




In [17]:
from transformers import BertForSequenceClassification, BertTokenizer

# Re-create the classifier from the pretrained checkpoint, sizing its output layer to
# the number of classes currently held by `label_encoder` (which must already be fitted).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Show the category names known to the fitted encoder; the position of a name in this
# array is the integer id the encoder assigns to it.
print(label_encoder.classes_)


['Any Other Cyber Crime'
 'Child Pornography CPChild Sexual Abuse Material CSAM'
 'Crime Against Women & Children' 'Cryptocurrency Crime'
 'Cyber Attack/ Dependent Crimes' 'Cyber Terrorism'
 'Hacking  Damage to computercomputer system etc'
 'Online Cyber Trafficking' 'Online Financial Fraud'
 'Online Gambling  Betting' 'Online and Social Media Related Crime'
 'Ransomware' 'RapeGang Rape RGRSexually Abusive Content'
 'Report Unlawful Content' 'Sexually Explicit Act'
 'Sexually Obscene material']


In [19]:
# Sanity check: compare these ids against `label_encoder.classes_` to spot categories
# that never occur in the training split.
print(set(y_train))  # distinct encoded label ids present in the training set


{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}


In [26]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [27]:
def compute_metrics(p):
    """Compute accuracy and support-weighted precision/recall/F1 for one evaluation pass.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` holds the per-class scores (or a tuple whose first element
        does); ``label_ids`` holds the true integer class ids.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    scores = p.predictions
    # Some models hand back a tuple of outputs; the class scores are assumed to be the
    # first element in that case.
    if isinstance(scores, tuple):
        scores = scores[0]
    preds = scores.argmax(axis=-1)
    labels = p.label_ids
    # zero_division=0 keeps the same numeric result as the default (a class that is
    # never predicted contributes 0) but without raising an UndefinedMetricWarning on
    # every evaluation, which is likely when some classes are rare.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(labels, preds)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}


In [28]:
# Rebuild the Trainer so that evaluation also reports the custom metrics computed by
# `compute_metrics`, using the current `model` and the label-bearing datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run the trainer's evaluation loop on its eval dataset and print the returned metrics.
# NOTE(review): no `trainer.train()` call is visible before this cell, so this may be
# scoring a classification head that was never fine-tuned — confirm before trusting
# these numbers.
results = trainer.evaluate()
print(results)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
