In [26]:
import torch
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [27]:
# Load the text/label dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('datasets/Epics_Main_dataset.csv')

In [28]:
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,text,label
0,I deposited ₹55905 in my savings account,Amount
1,My mobile no. is 8634525228,Phone Number
2,Call me on 7662180703,Phone Number
3,I deposited 95759 INR in my savings account,Amount
4,Call me on 6862714992,Phone Number


In [4]:
# Summarise the label column: row count, number of distinct labels, most frequent label and its frequency.
df['label'].describe()

count      19372
unique         4
top       Amount
freq        4904
Name: label, dtype: object

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(19372, 2)

## Preprocessing

In [29]:
import re
import nltk
import contractions
from word2number import w2n
from num2words import num2words
# from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords for the first time
# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download('punkt_tab')


In [None]:
# Loading stopwords
# stop_words = set(stopwords.words("english"))

In [30]:
# Suffix -> run of zeros appended to the digits in front of it.
# Key order is kept exactly as before because consumers iterate over the items in order.
number_mapping = {
    suffix: "0" * power_of_ten
    for suffix, power_of_ten in (
        ("k", 3),        # 5k -> 5000
        ("m", 6),        # 2m -> 2000000
        ("b", 9),        # 3b -> 3000000000
        ("lakh", 5),     # 10 lakh -> 1000000
        ("crore", 7),    # 2 crore -> 20000000
        ("million", 6),  # 2 million -> 2000000
        ("billion", 9),  # 3 billion -> 3000000000
    )
}

In [31]:
def convert_words_to_numbers(text):
    """
    Convert spelled-out and abbreviated numbers in ``text`` to digit strings.

    Three forms are rewritten; every other token is kept, in its original position:
      * digits glued to a ``number_mapping`` suffix:      '5k'            → '5000'
      * a digit token followed by a multiplier word:      '10 lakh'       → '1000000'
      * a run of consecutive number words (via ``w2n``):  'five thousand' → '5000'

    Two single-digit words in a row ('two three') are treated as separate
    numbers, so digit-by-digit dictation stays '2 3' instead of being combined.
    Plain digit tokens are passed through untouched, so leading zeros survive.

    Args:
        text: Whitespace-separated string (lower-cased by the caller).

    Returns:
        The processed tokens joined by single spaces.
    """
    # Only "<digits><suffix>" counts as an abbreviation; fullmatch keeps ordinary
    # words that merely end in k/m/b ("bank", "from") from being treated as one.
    suffixes = sorted(number_mapping, key=len, reverse=True)
    glued_number = re.compile(r"(\d+)(" + "|".join(map(re.escape, suffixes)) + r")")
    # Word-like multipliers that may follow a digit token as a separate word.
    multiplier_words = {key for key in number_mapping if len(key) > 1}

    def number_word_value(token):
        """Return the numeric value of a single number word, or None if it is not one."""
        # Digit strings are left alone; "point" on its own is treated as a normal word.
        if token.isdigit() or token == "point":
            return None
        try:
            return w2n.word_to_num(token)
        except ValueError:
            return None

    output = []
    run = []  # consecutive (word, value) pairs that form one spelled-out number

    def flush_run():
        """Convert the pending run of number words and append it to the output."""
        if not run:
            return
        words = [word for word, _ in run]
        try:
            output.append(str(w2n.word_to_num(" ".join(words))))
        except (ValueError, IndexError):
            # Malformed combination of number words: keep the words rather than lose them.
            output.extend(words)
        run.clear()

    for token in text.split():
        glued = glued_number.fullmatch(token)
        if glued:
            flush_run()
            output.append(glued.group(1) + number_mapping[glued.group(2)])
            continue

        value = number_word_value(token)
        if value is not None and run:
            # Two single-digit words in a row are separate numbers, not one phrase.
            if value < 10 and run[-1][1] < 10:
                flush_run()
            run.append((token, value))
            continue

        flush_run()
        if token in multiplier_words and output and output[-1].isdigit():
            # "10 lakh" / "5 crore" / "2 million": scale the number just emitted.
            output[-1] += number_mapping[token]
        elif value is not None:
            run.append((token, value))
        else:
            output.append(token)

    flush_run()
    return " ".join(output)

In [32]:
def preprocess_text(text):
    """
    Normalise one utterance before tokenization.

    Pipeline, in order:
    1. Expand contractions (e.g., "I'm" → "I am")
    2. Lowercase
    3. Strip every character that is neither a word character nor whitespace
    4. Normalise numbers through convert_words_to_numbers
    5. Collapse runs of whitespace and trim both ends

    Stopword removal is intentionally not part of the pipeline.
    """
    expanded = contractions.fix(text)
    lowered = expanded.lower()
    without_punctuation = re.sub(r"[^\w\s]", "", lowered)
    with_digits = convert_words_to_numbers(without_punctuation)
    return re.sub(r"\s+", " ", with_digits).strip()

In [33]:
# Run the preprocessing pipeline over every utterance and overwrite the column
cleaned_text = df["text"].apply(preprocess_text)
df["text"] = cleaned_text

# Echo the cleaned column as a sanity check
print(df["text"])


0            i deposited 55905 in my savings account
1                         my mobile no is 8634525228
2                              call me on 7662180703
3        i deposited 95759 inr in my savings account
4                              call me on 6862714992
                            ...                     
19367                             ac no 827137161025
19368                   here is my number 6626638399
19369                   here is my number 9836564261
19370              my account number is 110449552902
19371             i would like to withdraw 38 rupees
Name: text, Length: 19372, dtype: object


In [11]:
# Spot-check the preprocessed text stored under index label 0.
df['text'][0]

'i deposited 55905 in my savings account'

In [34]:
# Total number of cells (rows x columns), not the number of rows.
df.size

38744

## Train/Test Split

In [37]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

# Wrap each split as a Hugging Face Dataset with "text" and "label" columns
train_dataset = Dataset.from_dict(
    dict(text=train_texts.tolist(), label=train_labels.tolist())
)
val_dataset = Dataset.from_dict(
    dict(text=val_texts.tolist(), label=val_labels.tolist())
)

In [38]:
# Class name -> integer id used as the training target
label_mapping = dict(
    zip(("Name", "Phone Number", "Amount", "Account Number"), range(4))
)


def _encode_label(example):
    """Return the example's string label replaced by its integer id."""
    return {"label": label_mapping[example["label"]]}


# Encode the labels of both splits
train_dataset = train_dataset.map(_encode_label)
val_dataset = val_dataset.map(_encode_label)

Map: 100%|██████████| 15497/15497 [00:01<00:00, 8444.18 examples/s]
Map: 100%|██████████| 3875/3875 [00:00<00:00, 9783.65 examples/s]


In [None]:
# Load DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """
    Tokenize a batch of examples for DistilBERT.

    Args:
        examples: Batch passed by ``Dataset.map(..., batched=True)``; its
            "text" entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, padded/truncated to 64 tokens.
    """
    # No return_tensors here: the mapped dataset stores the returned columns
    # itself, so building torch tensors per batch only adds a conversion that is
    # immediately undone. Tensor batches are produced later, at collation time.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )


# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 15497/15497 [00:07<00:00, 2028.77 examples/s]
Map: 100%|██████████| 3875/3875 [00:01<00:00, 2011.91 examples/s]


In [None]:
# Convert labels to integers
# train_dataset = train_dataset.map(lambda x: {"label": int(x["label"])})
# val_dataset = val_dataset.map(lambda x: {"label": int(x["label"])})

Map: 100%|██████████| 15497/15497 [00:04<00:00, 3853.97 examples/s]
Map: 100%|██████████| 3875/3875 [00:00<00:00, 10087.55 examples/s]


In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertConfig

_CHECKPOINT = "distilbert-base-uncased"

# Start from the checkpoint's configuration, overriding the head size and
# raising both dropout rates
_config_overrides = {
    "num_labels": 4,
    "dropout": 0.3,
    "attention_dropout": 0.3,
}
config = DistilBertConfig.from_pretrained(_CHECKPOINT, **_config_overrides)

# Instantiate the classifier from the same checkpoint using that configuration
model = DistilBertForSequenceClassification.from_pretrained(_CHECKPOINT, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
# Define function to compute metrics
def compute_metrics(pred):
    """
    Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        Dict with keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Import TensorBoard
from transformers.integrations import TensorBoardCallback
from torch.utils.tensorboard import SummaryWriter

# Initialize the TensorBoard writer; event files go under ./tensorboard_logs.
# This writer is later handed to TensorBoardCallback when the Trainer is built.
writer = SummaryWriter(log_dir="./tensorboard_logs")

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same schedule so the best checkpoint can be
    # restored at the end of training.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    # TensorBoard logging is done by the explicit TensorBoardCallback(writer)
    # below. Leaving the built-in "tensorboard" integration on as well would
    # register a second TensorBoard callback and write every metric twice.
    report_to="none",
)

# Trainer initialization
from transformers import EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[
        # Stop once the tracked metric has not improved for 3 evaluations
        EarlyStoppingCallback(early_stopping_patience=3),
        # Single source of TensorBoard logs (writes through `writer`)
        TensorBoardCallback(writer),
    ],
)

  trainer = Trainer(


In [44]:
# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0752,3e-05,1.0,1.0
2,0.0,4e-06,1.0,1.0
3,0.0,2e-06,1.0,1.0
4,0.0,2e-06,1.0,1.0
5,0.0,1e-06,1.0,1.0


TrainOutput(global_step=9690, training_loss=0.006636795067261701, metrics={'train_runtime': 39029.7463, 'train_samples_per_second': 1.985, 'train_steps_per_second': 0.248, 'total_flos': 1283075310128640.0, 'train_loss': 0.006636795067261701, 'epoch': 5.0})

In [45]:
# Evaluate on the eval_dataset given to the Trainer and print the metrics dict.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 8.435090421698987e-07, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 468.7786, 'eval_samples_per_second': 8.266, 'eval_steps_per_second': 1.035, 'epoch': 5.0}


In [46]:
def predict(text):
    """
    Classify a single utterance and return its category name.

    The text goes through the same ``preprocess_text`` pipeline that was applied
    to the training data, and the category name is looked up from
    ``label_mapping`` so it cannot drift from the ids used for training.

    Args:
        text: Raw input string.

    Returns:
        The ``label_mapping`` key whose id has the highest logit.
    """
    # Match the training-time normalisation
    cleaned = preprocess_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
    # Inputs must sit on the same device as the model weights
    inputs = inputs.to(model.device)

    # Make sure dropout is disabled regardless of what ran before this call
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    id_to_category = {idx: name for name, idx in label_mapping.items()}
    return id_to_category[prediction]

# Example Prediction
example_text = "Transfer 10000 rupees"
print("Predicted Category:", predict(example_text))

Predicted Category: Amount


In [1]:
# `.summary()` is a Keras API and is not available on this PyTorch model.
# Printing the module shows the layer tree; the parameter count gives its size.
# Requires the model-loading cell to have been run in the current kernel.
print(model)
print("Trainable parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

NameError: name 'model' is not defined