In [1]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model_path = "Models/ClassificationModel"

# Load the trained model
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

  from .autonotebook import tqdm as notebook_tqdm


Model and tokenizer loaded successfully!


In [3]:
import torch
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    categories = ["Name", "Phone Number", "Amount", "Account Number"]
    return categories[prediction]

# Example Prediction
example_text = "Transfer 0 rupees to nitin"
print("Predicted Category:", predict(example_text))

Predicted Category: Amount


In [4]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.3, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.3, inplace=False)


In [6]:
%pip install torchinfo

Collecting torchinfoNote: you may need to restart the kernel to use updated packages.

  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from torchinfo import summary
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Create dummy input
inputs = tokenizer("This is a test sentence", return_tensors="pt")

# Use torchinfo to print summary
summary(model, input_data=(inputs['input_ids'],), depth=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Layer (type:depth-idx)                                  Output Shape              Param #
DistilBertForSequenceClassification                     [1, 2]                    --
├─DistilBertModel: 1-1                                  [1, 7, 768]               --
│    └─Embeddings: 2-1                                  [1, 7, 768]               --
│    │    └─Embedding: 3-1                              [1, 7, 768]               23,440,896
│    │    └─Embedding: 3-2                              [1, 7, 768]               393,216
│    │    └─LayerNorm: 3-3                              [1, 7, 768]               1,536
│    │    └─Dropout: 3-4                                [1, 7, 768]               --
│    └─Transformer: 2-2                                 [1, 7, 768]               --
│    │    └─ModuleList: 3-5                             --                        42,527,232
├─Linear: 1-2                                           [1, 768]                  590,592
├─Dropout: 1-3                 

In [2]:
import re
import spacy


In [3]:
nlp = spacy.load("en_core_web_sm")

In [None]:
# pip install spacy
# python -m spacy download en_core_web_sm



# Common intro patterns (regex) to capture names after phrases
INTRO_PATTERNS = [
    r"(?:this is|this's)\s+([A-Z][\w\-]+(?:\s+[A-Z][\w\-]+)*)",
    r"(?:i am|i'm|im)\s+([A-Z][\w\-]+(?:\s+[A-Z][\w\-]+)*)",
    r"(?:call me|friends call me|my name is)\s+([A-Z][\w\-]+(?:\s+[A-Z][\w\-]+)*)",
    r"introduce myself\s*,?\s*i am\s+([A-Z][\w\-]+(?:\s+[A-Z][\w\-]+)*)"
]
compiled_patterns = [re.compile(p, re.IGNORECASE) for p in INTRO_PATTERNS]

def extract_names_spacy(text):
    names = []

    # 1) Rule-based regex patterns (high precision)
    for pat in compiled_patterns:
        for m in pat.finditer(text):
            candidate = m.group(1).strip()
            # Basic cleanup: strip trailing punctuation
            candidate = re.sub(r'[\.,;:!?\)]*$', '', candidate)
            names.append(candidate)

    # 2) POS-based extraction using spaCy: consecutive PROPN tokens
    doc = nlp(text)
    prop_seq = []
    for token in doc:
        if token.pos_ == "PROPN":
            prop_seq.append(token.text)
        else:
            if prop_seq:
                # join sequence and add if plausible (length > 1 char and more than 1 token or looks like a name)
                candidate = " ".join(prop_seq)
                # avoid adding duplicates
                if candidate not in names:
                    names.append(candidate)
                prop_seq = []
    # flush
    if prop_seq:
        candidate = " ".join(prop_seq)
        if candidate not in names:
            names.append(candidate)

    # 3) Clean results: remove false positives (common words) - optional filter step
    cleaned = []
    for nm in names:
        nm_clean = nm.strip()
        # discard single-letter tokens or digits
        if len(nm_clean) < 2: continue
        if re.search(r'\d', nm_clean): continue
        cleaned.append(nm_clean)
    return cleaned


In [20]:
extract_names_spacy("This is Kavita Reddy speaking")

['Kavita Reddy speaking', 'Kavita Reddy']

In [11]:
def extract_with_regex(text):
    import re

    # Phone (Indian-style 10 digits, optional +91 or 0)
    phones = re.findall(r'(?:(?:\+91|0)?[\s\-]?)?[6-9]\d{9}', text)

    # Account numbers (usually 11 to 18 digits)
    accounts = re.findall(r'\b\d{11,18}\b', text)

    # Amounts (₹, Rs., or plain numbers with commas/decimals)
    amounts = re.findall(r'(?:₹|Rs\.?|INR)?[\s]?[0-9,]+(?:\.\d{1,2})?', text)

    return {
        "phones": phones[0],
        "accounts": accounts[0],
        "amounts": amounts
    }


In [12]:
extract_with_regex("account number 123456789012 Please transfer ₹5,000 to account number 123456789012 and call me at 9876543210")

{'phones': ' 9876543210',
 'accounts': '123456789012',
 'amounts': [' 123456789012', '₹5,000', ' 123456789012', ' 9876543210']}

In [4]:
import spacy

# Load English model
nlp = spacy.load("en_core_web_sm")

def extract_names(sentence):
    doc = nlp(sentence)
    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    return names

# Test sentences
sentences = [
    "My name is Nitin Mishra and I live in Delhi.",
    "Barack Obama was the 44th President of the United States.",
    "Please connect me with Dr. A. P. J. Abdul Kalam."
]

for s in sentences:
    print(f"Sentence: {s}")
    print("Extracted Names:", extract_names(s))
    print("-" * 50)


Sentence: My name is Nitin Mishra and I live in Delhi.
Extracted Names: ['Nitin Mishra']
--------------------------------------------------
Sentence: Barack Obama was the 44th President of the United States.
Extracted Names: ['Barack Obama']
--------------------------------------------------
Sentence: Please connect me with Dr. A. P. J. Abdul Kalam.
Extracted Names: ['A. P. J. Abdul Kalam']
--------------------------------------------------
