In [6]:
from google.colab import files
uploaded=files.upload()

Saving test.csv to test.csv
Saving train .csv to train .csv


In [9]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset

# Load training and testing data
train_data = pd.read_csv("train .csv")
test_data = pd.read_csv("test.csv")

# Handle NaN values
train_data.dropna(subset=['crimeaditionalinfo', 'category', 'sub_category'], inplace=True)
test_data.dropna(subset=['crimeaditionalinfo', 'category', 'sub_category'], inplace=True)

# Label Encoding
category_labels = {label: i for i, label in enumerate(train_data['category'].unique())}
sub_category_labels = {label: i for i, label in enumerate(train_data['sub_category'].unique())}

train_data['category'] = train_data['category'].map(category_labels)
train_data['sub_category'] = train_data['sub_category'].map(sub_category_labels)
test_data['category'] = test_data['category'].map(category_labels)
test_data['sub_category'] = test_data['sub_category'].map(sub_category_labels)

train_data.dropna(subset=['category', 'sub_category'], inplace=True)
test_data.dropna(subset=['category', 'sub_category'], inplace=True)

# Combine category and sub_category into a single label
num_categories = len(category_labels)
train_data["combined_label"] = train_data["category"] * len(sub_category_labels) + train_data["sub_category"]
test_data["combined_label"] = test_data["category"] * len(sub_category_labels) + test_data["sub_category"]

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_len = 128

# Define Dataset class for PyTorch
class ComplaintDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# Prepare Dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data["crimeaditionalinfo"].values,
    train_data["combined_label"].values,
    test_size=0.1,
    random_state=42
)

train_dataset = ComplaintDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = ComplaintDataset(val_texts, val_labels, tokenizer, max_len)

# Model Initialization
num_labels = len(category_labels) * len(sub_category_labels)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# Custom Callback to Print Loss and Batch Number
class LossCallback(TrainerCallback):
    def on_step_begin(self, args, state, control, **kwargs):
        print(f"Epoch: {state.epoch}, Batch: {state.global_step}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            print(f"Epoch: {state.epoch}, Batch: {state.global_step}, Loss: {logs.get('loss', 'N/A')}")

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Metric Function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Decode combined labels back into category and sub-category
    category_labels_pred = preds // len(sub_category_labels)
    sub_category_labels_pred = preds % len(sub_category_labels)
    category_labels_true = labels // len(sub_category_labels)
    sub_category_labels_true = labels % len(sub_category_labels)

    # Calculate metrics for category
    accuracy_cat = accuracy_score(category_labels_true, category_labels_pred)
    precision_cat, recall_cat, f1_cat, _ = precision_recall_fscore_support(category_labels_true, category_labels_pred, average="weighted")

    # Calculate metrics for sub_category
    accuracy_sub_cat = accuracy_score(sub_category_labels_true, sub_category_labels_pred)
    precision_sub_cat, recall_sub_cat, f1_sub_cat, _ = precision_recall_fscore_support(sub_category_labels_true, sub_category_labels_pred, average="weighted")

    # Organize metrics into a dictionary
    metrics = {
        "accuracy_category": accuracy_cat,
        "precision_category": precision_cat,
        "recall_category": recall_cat,
        "f1_category": f1_cat,
        "accuracy_sub_category": accuracy_sub_cat,
        "precision_sub_category": precision_sub_cat,
        "recall_sub_category": recall_sub_cat,
        "f1_sub_category": f1_sub_cat,
    }

    return metrics

# Trainer Initialization with Custom Callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[LossCallback()]
)

# Training
trainer.train()

# Evaluation on test dataset
test_texts = test_data["crimeaditionalinfo"].values
test_labels = test_data["combined_label"].values
test_dataset = ComplaintDataset(test_texts, test_labels, tokenizer, max_len)

eval_result = trainer.evaluate(test_dataset)

# Save Evaluation Results to File
with open("output_train_results.txt", "w") as f:
    f.write("Evaluation Results for Category and Sub-Category\n")
    f.write("Accuracy for Category: {:.4f}\n".format(eval_result["eval_accuracy_category"]))
    f.write("Precision for Category: {:.4f}\n".format(eval_result["eval_precision_category"]))
    f.write("Recall for Category: {:.4f}\n".format(eval_result["eval_recall_category"]))
    f.write("F1 Score for Category: {:.4f}\n".format(eval_result["eval_f1_category"]))
    f.write("Accuracy for Sub-Category: {:.4f}\n".format(eval_result["eval_accuracy_sub_category"]))
    f.write("Precision for Sub-Category: {:.4f}\n".format(eval_result["eval_precision_sub_category"]))
    f.write("Recall for Sub-Category: {:.4f}\n".format(eval_result["eval_recall_sub_category"]))
    f.write("F1 Score for Sub-Category: {:.4f}\n".format(eval_result["eval_f1_sub_category"]))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Batch: 0


Epoch,Training Loss,Validation Loss,Accuracy Category,Precision Category,Recall Category,F1 Category,Accuracy Sub Category,Precision Sub Category,Recall Sub Category,F1 Sub Category
1,1.4368,1.33988,0.767455,0.767167,0.767455,0.762064,0.568328,0.526137,0.568328,0.530602
2,1.2257,1.306961,0.770096,0.77588,0.770096,0.771893,0.582338,0.556385,0.582338,0.556934
3,1.0939,1.329188,0.787781,0.77911,0.787781,0.782504,0.587965,0.561462,0.587965,0.567247


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 0.07207023274806043, Batch: 353
Epoch: 0.0722743977133524, Batch: 354
Epoch: 0.07247856267864435, Batch: 355
Epoch: 0.0726827276439363, Batch: 356
Epoch: 0.07288689260922826, Batch: 357
Epoch: 0.07309105757452021, Batch: 358
Epoch: 0.07329522253981217, Batch: 359
Epoch: 0.07349938750510412, Batch: 360, Loss: 2.055
Epoch: 0.07349938750510412, Batch: 360
Epoch: 0.07370355247039607, Batch: 361
Epoch: 0.07390771743568804, Batch: 362
Epoch: 0.07411188240097999, Batch: 363
Epoch: 0.07431604736627195, Batch: 364
Epoch: 0.0745202123315639, Batch: 365
Epoch: 0.07472437729685585, Batch: 366
Epoch: 0.07492854226214782, Batch: 367
Epoch: 0.07513270722743977, Batch: 368
Epoch: 0.07533687219273173, Batch: 369
Epoch: 0.07554103715802368, Batch: 370, Loss: 2.0523
Epoch: 0.07554103715802368, Batch: 370
Epoch: 0.07574520212331563, Batch: 371
Epoch: 0.0759493670886076, Batch: 372
Epoch: 0.07615353205389955, Batch: 373
Epoch: 0.076357

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 1.0720702327480605, Batch: 5251
Epoch: 1.0722743977133524, Batch: 5252
Epoch: 1.0724785626786444, Batch: 5253
Epoch: 1.0726827276439364, Batch: 5254
Epoch: 1.0728868926092283, Batch: 5255
Epoch: 1.0730910575745203, Batch: 5256
Epoch: 1.0732952225398122, Batch: 5257
Epoch: 1.0734993875051042, Batch: 5258
Epoch: 1.0737035524703962, Batch: 5259
Epoch: 1.0739077174356881, Batch: 5260, Loss: 1.3887
Epoch: 1.0739077174356881, Batch: 5260
Epoch: 1.07411188240098, Batch: 5261
Epoch: 1.0743160473662718, Batch: 5262
Epoch: 1.0745202123315638, Batch: 5263
Epoch: 1.0747243772968558, Batch: 5264
Epoch: 1.0749285422621477, Batch: 5265
Epoch: 1.0751327072274397, Batch: 5266
Epoch: 1.0753368721927317, Batch: 5267
Epoch: 1.0755410371580236, Batch: 5268
Epoch: 1.0757452021233156, Batch: 5269
Epoch: 1.0759493670886076, Batch: 5270, Loss: 1.1985
Epoch: 1.0759493670886076, Batch: 5270
Epoch: 1.0761535320538995, Batch: 5271
Epoch: 1.076

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 2.072274397713352, Batch: 10150, Loss: 1.0598
Epoch: 2.072274397713352, Batch: 10150
Epoch: 2.0724785626786444, Batch: 10151
Epoch: 2.072682727643936, Batch: 10152
Epoch: 2.0728868926092283, Batch: 10153
Epoch: 2.07309105757452, Batch: 10154
Epoch: 2.0732952225398122, Batch: 10155
Epoch: 2.073499387505104, Batch: 10156
Epoch: 2.073703552470396, Batch: 10157
Epoch: 2.073907717435688, Batch: 10158
Epoch: 2.07411188240098, Batch: 10159
Epoch: 2.074316047366272, Batch: 10160, Loss: 1.1448
Epoch: 2.074316047366272, Batch: 10160
Epoch: 2.074520212331564, Batch: 10161
Epoch: 2.0747243772968558, Batch: 10162
Epoch: 2.074928542262148, Batch: 10163
Epoch: 2.0751327072274397, Batch: 10164
Epoch: 2.075336872192732, Batch: 10165
Epoch: 2.0755410371580236, Batch: 10166
Epoch: 2.075745202123316, Batch: 10167
Epoch: 2.0759493670886076, Batch: 10168
Epoch: 2.0761535320538997, Batch: 10169
Epoch: 2.0763576970191915, Batch: 10170, Lo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch: 3.0, Batch: 14694, Loss: N/A


Epoch: 3.0, Batch: 14694, Loss: N/A


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
from google.colab import files

# Download the file
files.download("output_train_results.txt")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>