In [None]:
import spacy
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
import json
from spacy import displacy
from spacy.language import Language
 
# INTENTS and ENTITIES from your dataset
# Intent labels. The intent classifier in this file sizes its output layer
# with len(INTENTS) and indexes this list with the predicted class id, so the
# number and order of entries matter.
# Every entry needs its own trailing comma: without one, Python silently
# concatenates adjacent string literals into a single label.
INTENTS = [
    "Total (Manpower_Manhour) are present in (discipline)",
    "Total (Manpower_Manhour) are present in (discipline) for (Month)",
    "Total (Manpower_Manhour) are present in (discipline) from (Month) till (Month)",
    "Manhours Plan vs Actuals for a (month) and (discipline)",
    "Difference in Manmonths  Plan vs Actuals for a particular month and discipline",
    "Count of manpower for a particular discipline for a month",
]

# Known entity values grouped by entity type. The entity matcher in this file
# looks each value up in the user query as a case-insensitive substring.
ENTITIES = {
    "Discipline": ["HSE", "CIVIL", "PROCESS", "ALL","CATHODIC PROTECTION","COMMISSIONING","DIGITAL","DOCUMENT CONTROL","HVAC", "INSTRUMENTATION AND ENVIRONMENT","MANUFACTURING","MECHANICAL","METALLURGY","PROCESS","PRODUCTION","PROTECTION","QUALITY","SUBCONTRACT ELECTRICAL","TECHNICAL MANAGEMENT","STATIC","PROJECT MANAGEMENT","PROJECT CONTROL"],
    "Month": ["Nov-2024", "Apr-2024", "JAN-2023","SEP-2023", "JUL-2023","FEB-2023","any date in form of mm-yy or mmm-yyyy", "current month", "CURRENT MONTH","today"],
    "Manhour_Manpower_type": ["manpower", "manhour", "man hour", "man power","manmonths", "man months", "manmonnth", "man month", "Manhours Plan vs Actuals", "Manhour Planned VS Actual"]
}
 
# === INTENT CLASSIFIER COMPONENT ===
@Language.factory("intent_classifier")
class IntentClassifier:
    """spaCy pipeline component that tags each Doc with a predicted intent.

    A RoBERTa tokenizer and sequence-classification model are loaded from
    ``model_name`` with one output label per entry of ``INTENTS``; calling the
    component writes the predicted intent string to ``doc._.intent``.

    NOTE(review): unless ``model_name`` points at a fine-tuned checkpoint the
    classification head is presumably freshly initialised, so predictions
    would be arbitrary -- confirm which checkpoint is intended.
    """

    def __init__(self, nlp, name, model_name="roberta-base"):
        """Load the tokenizer and the classification model.

        Args:
            nlp: Pipeline the component is being added to (unused here).
            name: Name of the component instance in the pipeline (unused here).
            model_name: Model identifier or path handed to ``from_pretrained``.
        """
        print("Loading tokenizer...")
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        print("Loading model...")
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(INTENTS))
        # The component only ever predicts, so keep dropout disabled.
        self.model.eval()
        print("Model loaded successfully!")

    def classify_intent(self, user_input):
        """Return the ``INTENTS`` entry with the highest score for ``user_input``.

        Args:
            user_input: Query text to classify.

        Returns:
            The intent label (a string from ``INTENTS``).
        """
        print(f"Classifying intent for input: {user_input}")
        inputs = self.tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no_grad avoids building an autograd graph per query.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Pick the best label along the class axis of the (single) input row.
        predicted_class_id = torch.argmax(logits, dim=-1)[0].item()
        return INTENTS[predicted_class_id]

    def __call__(self, doc):
        """Classify ``doc.text`` and store the result in ``doc._.intent``."""
        doc._.intent = self.classify_intent(doc.text)
        return doc
 
# === ENTITY MATCHER FUNCTION ===
def match_entities(text, entities=None):
    """Return the known entity values that occur in ``text``.

    Each candidate value is looked up as a case-insensitive substring of
    ``text``. Values that are listed twice, or that differ only by case, are
    reported once per entity type using the first spelling listed.

    NOTE(review): substring matching also fires inside longer words (e.g.
    "ALL" inside "install"); tighten to word boundaries if that is unwanted.

    Args:
        text: Raw user query.
        entities: Optional mapping of entity type to a list of values.
            Defaults to the module-level ``ENTITIES``.

    Returns:
        A list of ``{"entity_name": ..., "value": ...}`` dicts, in vocabulary
        order.
    """
    vocabulary = ENTITIES if entities is None else entities
    lowered_text = text.lower()  # lower the query once, not once per candidate
    matched_entities = []
    seen = set()
    for entity_type, values in vocabulary.items():
        for value in values:
            needle = value.lower()
            key = (entity_type, needle)
            if key in seen or needle not in lowered_text:
                continue
            seen.add(key)
            matched_entities.append({"entity_name": entity_type, "value": value})
    return matched_entities
 
# === CUSTOM COMPONENT TO EXTRACT ENTITIES ===
@Language.component("entity_extractor")
def extract_entities(doc):
    """Pipeline step: store the entities matched in ``doc.text`` on ``doc._.entities``."""
    # The matcher works on the raw text of the Doc.
    doc._.entities = match_entities(doc.text)
    return doc
 
# === DATASET MANAGER CLASS ===
class DatasetManager:
    """Loads a tabular dataset (CSV or Excel) and keeps it in memory."""

    def __init__(self):
        # Holds the loaded DataFrame; stays None until load_dataset() succeeds.
        self.dataset = None

    def load_dataset(self, file_path=None):
        """Read a CSV or .xlsx file into ``self.dataset``.

        Args:
            file_path: Path of the file to load. When omitted, the path is
                requested interactively via ``input()``.

        Raises:
            ValueError: If the path does not end in ``.csv`` or ``.xlsx``.
        """
        if file_path is None:
            # Ask for the file path directly via input() instead of opening a dialog
            file_path = input("Enter the full path to the dataset (CSV or Excel): ")

        # Pasted paths often carry surrounding whitespace or quotes.
        file_path = str(file_path).strip().strip('"\'')
        # Compare the extension case-insensitively so "DATA.CSV" is accepted.
        lowered_path = file_path.lower()

        if lowered_path.endswith('.csv'):
            self.dataset = pd.read_csv(file_path)
        elif lowered_path.endswith('.xlsx'):
            self.dataset = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format")
        print(f"Dataset loaded from: {file_path}")

    def get_dataset(self):
        """Return the loaded DataFrame, or None if nothing has been loaded."""
        return self.dataset
 
# === ANNOTATION FUNCTION FOR VISUALIZING ENTITIES ===
def annotate_user_input(doc):
    """Highlight the matched entities of ``doc`` using spaCy's displacy.

    Each entry of ``doc._.entities`` is located in ``doc.text`` ignoring case,
    because the entity matcher in this file compares lower-cased text and may
    therefore report a value whose casing differs from the query. Entities
    that cannot be located are skipped instead of being given bogus offsets.

    Args:
        doc: Processed Doc carrying the ``entities`` extension attribute.
    """
    import re  # local import: only this function needs it

    text = doc.text
    ents = []
    for entity in doc._.entities:
        # Offsets come from the original text, so they stay valid for rendering.
        found = re.search(re.escape(entity['value']), text, flags=re.IGNORECASE)
        if found is None:
            continue
        ents.append({
            "start": found.start(),
            "end": found.end(),
            "label": entity['entity_name']
        })
    # Emit the spans in reading order.
    ents.sort(key=lambda span: (span["start"], span["end"]))

    # Manual-mode input structure for displacy
    displacy_data = {
        "text": text,
        "ents": ents,
        "title": None
    }

    displacy.render(displacy_data, style="ent", manual=True)
 
# === MAIN FUNCTION TO PROCESS QUERY ===
def process_query(nlp, user_input):
    """Run ``user_input`` through the pipeline and summarise the outcome.

    Args:
        nlp: Callable pipeline that returns a Doc with the ``intent`` and
            ``entities`` extension attributes filled in.
        user_input: Query text.

    Returns:
        A dict with the keys "User Query", "Intent" and "Entities".
    """
    print(f"Processing query: {user_input}")
    doc = nlp(user_input)

    # Collect the pipeline's results before rendering the highlights.
    summary = {
        "User Query": user_input,
        "Intent": doc._.intent,
        "Entities": doc._.entities,
    }

    annotate_user_input(doc)
    return summary
 
# === MAIN WORKFLOW SETUP ===
def main():
    """Build the pipeline, load the dataset, and answer one query from stdin."""
    print("Loading spaCy pipeline...")
    nlp = spacy.blank("en")

    # Register the custom Doc attributes up front. set_extension raises an
    # error when the name already exists (e.g. main() is called again in the
    # same interpreter or notebook kernel), so only add what is missing.
    for attr_name, default in (("intent", None), ("entities", [])):
        if not spacy.tokens.Doc.has_extension(attr_name):
            spacy.tokens.Doc.set_extension(attr_name, default=default)

    # Add custom components to the pipeline
    print("Adding custom components to pipeline.")
    nlp.add_pipe("intent_classifier", last=True)  # intent classifier runs first
    nlp.add_pipe("entity_extractor", last=True)  # entity extractor runs after it
    print("Components added to pipeline.")

    # Load dataset (using DatasetManager)
    dataset_manager = DatasetManager()
    dataset_manager.load_dataset()
    print("Dataset loaded.")

    # Receive user input
    user_query = input("Enter your query: ")
    print(f"User query received: {user_query}")

    # Process the query and print the result
    response = process_query(nlp, user_query)
    print(json.dumps(response, indent=4))


if __name__ == "__main__":
    main()

In [None]:
import spacy

import torch

from transformers import RobertaTokenizer, RobertaForSequenceClassification

import json

from spacy import displacy

from spacy.language import Language

# INTENTS and ENTITIES from your dataset

# Intent labels, one per line; the classifier maps a class id onto this list.
INTENTS = [
    "Total (Manpower_Manhour) are present in (discipline)",
    "Total (Manpower_Manhour) are present in (discipline) for (Month)",
    "Total (Manpower_Manhour) are present in (discipline) from (Month) till (Month)",
    "Manhours Plan vs Actuals for a (month) and (discipline)",
    "Difference in Manmonths Plan vs Actuals for a particular month and discipline",
    "Count of manpower for a particular discipline for a month",
]

# Known entity values, keyed by entity type.
ENTITIES = {
    "Discipline": [
        "All",
        "Cathodic Protection",
        "Civil",
        "Commissioning",
        "Digital",
        "Document Control",
        "HVAC",
        "HSE",
        "Instrumentation and Environment",
        "Manufacturing",
        "Mechanical",
        "Metallurgy",
        "Process",
        "Production",
        "Project Control",
        "Project Management",
        "Protection",
        "Quality",
        "Static",
        "Subcontract Electrical",
        "Technical Management",
    ],
    "Month": [
        "Nov-2024",
        "Apr-2024",
        "JAN-2023",
        "SEP-2023",
        "JUL-2023",
        "FEB-2023",
        "any date in form of mm-yy or mmm-yyyy",
        "current month",
        "CURRENT MONTH",
        "today",
    ],
    "Manhour_Manpower_type": [
        "manpower",
        "manhour",
        "man hour",
        "man power",
        "manmonths",
        "man months",
        "manmonnth",
        "man month",
        "Manhours Plan vs Actuals",
        "Manhour Planned VS Actual",
    ],
}

# === INTENT CLASSIFIER COMPONENT ===

@Language.factory("intent_classifier")
class IntentClassifier:
    """spaCy pipeline component that tags each Doc with a predicted intent.

    A RoBERTa tokenizer and sequence-classification model are loaded from
    ``model_name`` with one output label per entry of ``INTENTS``; calling the
    component writes the predicted intent string to ``doc._.intent``.

    NOTE(review): unless ``model_name`` points at a fine-tuned checkpoint the
    classification head is presumably freshly initialised, so predictions
    would be arbitrary -- confirm which checkpoint is intended.
    """

    def __init__(self, nlp, name, model_name="roberta-base"):
        """Load the tokenizer and the classification model.

        Args:
            nlp: Pipeline the component is being added to (unused here).
            name: Name of the component instance in the pipeline (unused here).
            model_name: Model identifier or path handed to ``from_pretrained``.
        """
        print("Loading tokenizer...")
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        print("Loading model...")
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(INTENTS))
        # The component only ever predicts, so keep dropout disabled.
        self.model.eval()
        print("Model loaded successfully!")

    def classify_intent(self, user_input):
        """Return the ``INTENTS`` entry with the highest score for ``user_input``.

        Args:
            user_input: Query text to classify.

        Returns:
            The intent label (a string from ``INTENTS``).
        """
        print(f"Classifying intent for input: {user_input}")
        inputs = self.tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no_grad avoids building an autograd graph per query.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Pick the best label along the class axis of the (single) input row.
        predicted_class_id = torch.argmax(logits, dim=-1)[0].item()
        return INTENTS[predicted_class_id]

    def __call__(self, doc):
        """Classify ``doc.text`` and store the result in ``doc._.intent``."""
        doc._.intent = self.classify_intent(doc.text)
        return doc

# === ENTITY MATCHER FUNCTION ===

def match_entities(text, entities=None):
    """Return the known entity values that occur in ``text``.

    Each candidate value is looked up as a case-insensitive substring of
    ``text``. Values that are listed twice, or that differ only by case, are
    reported once per entity type using the first spelling listed.

    NOTE(review): substring matching also fires inside longer words (e.g.
    "All" inside "install"); tighten to word boundaries if that is unwanted.

    Args:
        text: Raw user query.
        entities: Optional mapping of entity type to a list of values.
            Defaults to the module-level ``ENTITIES``.

    Returns:
        A list of ``{"entity_name": ..., "value": ...}`` dicts, in vocabulary
        order.
    """
    vocabulary = ENTITIES if entities is None else entities
    lowered_text = text.lower()  # lower the query once, not once per candidate
    matched_entities = []
    seen = set()
    for entity_type, values in vocabulary.items():
        for value in values:
            needle = value.lower()
            key = (entity_type, needle)
            if key in seen or needle not in lowered_text:
                continue
            seen.add(key)
            matched_entities.append({"entity_name": entity_type, "value": value})
    return matched_entities

# === CUSTOM COMPONENT TO EXTRACT ENTITIES ===

@Language.component("entity_extractor")
def extract_entities(doc):
    """Pipeline step: store the entities matched in ``doc.text`` on ``doc._.entities``."""
    # The matcher works on the raw text of the Doc.
    doc._.entities = match_entities(doc.text)
    return doc

# === ANNOTATION FUNCTION FOR VISUALIZING ENTITIES ===

def annotate_user_input(doc):
    """Highlight the matched entities of ``doc`` using spaCy's displacy.

    Each entry of ``doc._.entities`` is located in ``doc.text`` ignoring case,
    because the entity matcher in this file compares lower-cased text and may
    therefore report a value whose casing differs from the query. Entities
    that cannot be located are skipped instead of being given bogus offsets.

    Args:
        doc: Processed Doc carrying the ``entities`` extension attribute.
    """
    import re  # local import: only this function needs it

    text = doc.text
    ents = []
    for entity in doc._.entities:
        # Offsets come from the original text, so they stay valid for rendering.
        found = re.search(re.escape(entity['value']), text, flags=re.IGNORECASE)
        if found is None:
            continue
        ents.append({
            "start": found.start(),
            "end": found.end(),
            "label": entity['entity_name']
        })
    # Emit the spans in reading order.
    ents.sort(key=lambda span: (span["start"], span["end"]))

    # Manual-mode input structure for displacy
    displacy_data = {
        "text": text,
        "ents": ents,
        "title": None
    }

    displacy.render(displacy_data, style="ent", manual=True)

# === MAIN FUNCTION TO PROCESS QUERY ===

def process_query(nlp, user_input):
    """Run ``user_input`` through the pipeline and summarise the outcome.

    Args:
        nlp: Callable pipeline that returns a Doc with the ``intent`` and
            ``entities`` extension attributes filled in.
        user_input: Query text.

    Returns:
        A dict with the keys "User Query", "Intent" and "Entities".
    """
    print(f"Processing query: {user_input}")
    doc = nlp(user_input)

    # Collect the pipeline's results before rendering the highlights.
    summary = {
        "User Query": user_input,
        "Intent": doc._.intent,
        "Entities": doc._.entities,
    }

    annotate_user_input(doc)
    return summary

# === MAIN WORKFLOW SETUP ===

def main():
    """Build the NLP pipeline and answer queries from stdin until 'exit'."""
    print("Loading spaCy pipeline...")
    nlp = spacy.blank("en")

    # Register the custom Doc attributes up front. set_extension raises an
    # error when the name already exists (e.g. main() is called again in the
    # same interpreter or notebook kernel), so only add what is missing.
    for attr_name, default in (("intent", None), ("entities", [])):
        if not spacy.tokens.Doc.has_extension(attr_name):
            spacy.tokens.Doc.set_extension(attr_name, default=default)

    # Add custom components to the pipeline
    print("Adding custom components to pipeline.")
    nlp.add_pipe("intent_classifier", last=True)  # intent classifier runs first
    nlp.add_pipe("entity_extractor", last=True)  # entity extractor runs after it
    print("Components added to pipeline.")

    # Loop for dynamic user input
    while True:
        user_query = input("Enter your query (or type 'exit' to quit): ")
        command = user_query.strip()
        if command.lower() == 'exit':
            break
        if not command:
            continue  # nothing to classify on a blank line

        # Process the query and print the result
        response = process_query(nlp, user_query)
        print(json.dumps(response, indent=4))


if __name__ == "__main__":
    main()
 

In [None]:
import spacy
import torch
import yaml
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import json
from spacy import displacy
from spacy.language import Language
 
# Load the configuration from the YAML file.
# The encoding is given explicitly so the file is read the same way on every
# platform instead of depending on the locale's default encoding.
with open('config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)
 
# INTENTS and ENTITIES from your dataset
# Intent labels, one per line; the classifier maps a class id onto this list.
INTENTS = [
    "Total (Manpower_Manhour) are present in (discipline)",
    "Total (Manpower_Manhour) are present in (discipline) for (Month)",
    "Total (Manpower_Manhour) are present in (discipline) from (Month) till (Month)",
    "Manhours Plan vs Actuals for a (month) and (discipline)",
    "Difference in Manmonths Plan vs Actuals for a particular month and discipline",
    "Count of manpower for a particular discipline for a month",
]

# Known entity values, keyed by entity type.
ENTITIES = {
    "Discipline": [
        "HSE",
        "CIVIL",
        "PROCESS",
        "ALL",
        "CATHODIC PROTECTION",
        "COMMISSIONING",
        "DIGITAL",
        "DOCUMENT CONTROL",
        "HVAC",
        "INSTRUMENTATION AND ENVIRONMENT",
        "MANUFACTURING",
        "MECHANICAL",
        "METALLURGY",
        "PROCESS",
        "PRODUCTION",
        "PROTECTION",
        "QUALITY",
        "SUBCONTRACT ELECTRICAL",
        "TECHNICAL MANAGEMENT",
        "STATIC",
        "PROJECT MANAGEMENT",
        "PROJECT CONTROL",
    ],
    "Month": [
        "Nov-2024",
        "Apr-2024",
        "JAN-2023",
        "SEP-2023",
        "JUL-2023",
        "FEB-2023",
        "any date in form of mm-yy or mmm-yyyy",
        "current month",
        "CURRENT MONTH",
        "today",
    ],
    "Manhour_Manpower_type": [
        "manpower",
        "manhour",
        "man hour",
        "man power",
        "manmonths",
        "man months",
        "manmonnth",
        "man month",
        "Manhours Plan vs Actuals",
        "Manhour Planned VS Actual",
    ],
}
 
# === INTENT CLASSIFIER COMPONENT ===
@Language.factory("intent_classifier")
class IntentClassifier:
    """spaCy pipeline component that tags each Doc with a predicted intent.

    A RoBERTa tokenizer and sequence-classification model are loaded from
    ``model_name`` with one output label per entry of ``INTENTS``; calling the
    component writes the predicted intent string to ``doc._.intent``.

    NOTE(review): unless ``model_name`` points at a fine-tuned checkpoint the
    classification head is presumably freshly initialised, so predictions
    would be arbitrary -- confirm which checkpoint is intended.
    """

    def __init__(self, nlp, name, model_name=config['nlp']['pipeline'][0]['config']['model_name']):
        """Load the tokenizer and the classification model.

        Args:
            nlp: Pipeline the component is being added to (unused here).
            name: Name of the component instance in the pipeline (unused here).
            model_name: Model identifier or path handed to ``from_pretrained``.
                The default is read from the loaded YAML config once, when
                the class is defined.
        """
        print("Loading tokenizer...")
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        print("Loading model...")
        self.model = RobertaForSequenceClassification.from_pretrained(model_name,
                                                                     num_labels=len(INTENTS))
        # The component only ever predicts, so keep dropout disabled.
        self.model.eval()
        print("Model loaded successfully!")

    def classify_intent(self, user_input):
        """Return the ``INTENTS`` entry with the highest score for ``user_input``.

        Args:
            user_input: Query text to classify.

        Returns:
            The intent label (a string from ``INTENTS``).
        """
        print(f"Classifying intent for input: {user_input}")
        inputs = self.tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no_grad avoids building an autograd graph per query.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Pick the best label along the class axis of the (single) input row.
        predicted_class_id = torch.argmax(logits, dim=-1)[0].item()
        return INTENTS[predicted_class_id]

    def __call__(self, doc):
        """Classify ``doc.text`` and store the result in ``doc._.intent``."""
        doc._.intent = self.classify_intent(doc.text)
        return doc
 
# === ENTITY MATCHER FUNCTION ===
def match_entities(text, entities=None):
    """Return the known entity values that occur in ``text``.

    Each candidate value is looked up as a case-insensitive substring of
    ``text``. Values that are listed twice, or that differ only by case, are
    reported once per entity type using the first spelling listed.

    NOTE(review): substring matching also fires inside longer words (e.g.
    "ALL" inside "install"); tighten to word boundaries if that is unwanted.

    Args:
        text: Raw user query.
        entities: Optional mapping of entity type to a list of values.
            Defaults to the module-level ``ENTITIES``.

    Returns:
        A list of ``{"entity_name": ..., "value": ...}`` dicts, in vocabulary
        order.
    """
    vocabulary = ENTITIES if entities is None else entities
    lowered_text = text.lower()  # lower the query once, not once per candidate
    matched_entities = []
    seen = set()
    for entity_type, values in vocabulary.items():
        for value in values:
            needle = value.lower()
            key = (entity_type, needle)
            if key in seen or needle not in lowered_text:
                continue
            seen.add(key)
            matched_entities.append({"entity_name": entity_type, "value": value})
    return matched_entities
 
# === CUSTOM COMPONENT TO EXTRACT ENTITIES ===
@Language.component("entity_extractor")
def extract_entities(doc):
    """Pipeline step: store the entities matched in ``doc.text`` on ``doc._.entities``."""
    # The matcher works on the raw text of the Doc.
    doc._.entities = match_entities(doc.text)
    return doc
 
# === ANNOTATION FUNCTION FOR VISUALIZING ENTITIES ===
def annotate_user_input(doc):
    """Highlight the matched entities of ``doc`` using spaCy's displacy.

    Each entry of ``doc._.entities`` is located in ``doc.text`` ignoring case,
    because the entity matcher in this file compares lower-cased text and may
    therefore report a value whose casing differs from the query. Entities
    that cannot be located are skipped instead of being given bogus offsets.

    Args:
        doc: Processed Doc carrying the ``entities`` extension attribute.
    """
    import re  # local import: only this function needs it

    text = doc.text
    ents = []
    for entity in doc._.entities:
        # Offsets come from the original text, so they stay valid for rendering.
        found = re.search(re.escape(entity['value']), text, flags=re.IGNORECASE)
        if found is None:
            continue
        ents.append({
            "start": found.start(),
            "end": found.end(),
            "label": entity['entity_name']
        })
    # Emit the spans in reading order.
    ents.sort(key=lambda span: (span["start"], span["end"]))

    # Manual-mode input structure for displacy
    displacy_data = {
        "text": text,
        "ents": ents,
        "title": None
    }

    displacy.render(displacy_data, style="ent", manual=True)
 
# === MAIN FUNCTION TO PROCESS QUERY ===
def process_query(nlp, user_input):
    """Run ``user_input`` through the pipeline and summarise the outcome.

    Args:
        nlp: Callable pipeline that returns a Doc with the ``intent`` and
            ``entities`` extension attributes filled in.
        user_input: Query text.

    Returns:
        A dict with the keys "User Query", "Intent" and "Entities".
    """
    print(f"Processing query: {user_input}")
    doc = nlp(user_input)

    # Collect the pipeline's results before rendering the highlights.
    summary = {
        "User Query": user_input,
        "Intent": doc._.intent,
        "Entities": doc._.entities,
    }

    annotate_user_input(doc)
    return summary
 
# === MAIN WORKFLOW SETUP ===
def main():
    """Build the NLP pipeline and answer queries from stdin until 'exit'."""
    print("Loading spaCy pipeline...")
    nlp = spacy.blank("en")

    # Register the custom Doc attributes up front. set_extension raises an
    # error when the name already exists (e.g. main() is called again in the
    # same interpreter or notebook kernel), so only add what is missing.
    for attr_name, default in (("intent", None), ("entities", [])):
        if not spacy.tokens.Doc.has_extension(attr_name):
            spacy.tokens.Doc.set_extension(attr_name, default=default)

    # Add custom components to the pipeline
    print("Adding custom components to pipeline.")
    nlp.add_pipe("intent_classifier", last=True)  # intent classifier runs first
    nlp.add_pipe("entity_extractor", last=True)  # entity extractor runs after it
    print("Components added to pipeline.")

    # Loop for dynamic user input
    while True:
        user_query = input("Enter your query (or type 'exit' to quit): ")
        command = user_query.strip()
        if command.lower() == 'exit':
            break
        if not command:
            continue  # nothing to classify on a blank line

        # Process the query and print the result
        response = process_query(nlp, user_query)
        print(json.dumps(response, indent=4))


if __name__ == "__main__":
    main()

In [None]:
import spacy
import torch
import yaml
import json
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from spacy import displacy
from spacy.language import Language
 
# Load the configuration from the YAML file.
# The encoding is given explicitly so the file is read the same way on every
# platform instead of depending on the locale's default encoding.
with open('config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)
 
# INTENTS and ENTITIES from your dataset
# Intent labels, one per line; the classifier maps a class id onto this list.
INTENTS = [
    "Total (Manpower_Manhour) are present in (discipline)",
    "Total (Manpower_Manhour) are present in (discipline) for (Month)",
    "Total (Manpower_Manhour) are present in (discipline) from (Month) till (Month)",
    "Manhours Plan vs Actuals for a (month) and (discipline)",
    "Difference in Manmonths Plan vs Actuals for a particular month and discipline",
    "Count of manpower for a particular discipline for a month",
]

# Known entity values, keyed by entity type.
ENTITIES = {
    "Discipline": [
        "HSE",
        "CIVIL",
        "PROCESS",
        "ALL",
        "CATHODIC PROTECTION",
        "COMMISSIONING",
        "DIGITAL",
        "DOCUMENT CONTROL",
        "HVAC",
        "INSTRUMENTATION AND ENVIRONMENT",
        "MANUFACTURING",
        "MECHANICAL",
        "METALLURGY",
        "PROCESS",
        "PRODUCTION",
        "PROTECTION",
        "QUALITY",
        "SUBCONTRACT ELECTRICAL",
        "TECHNICAL MANAGEMENT",
        "STATIC",
        "PROJECT MANAGEMENT",
        "PROJECT CONTROL",
    ],
    "Month": [
        "Nov-2024",
        "Apr-2024",
        "JAN-2023",
        "SEP-2023",
        "JUL-2023",
        "FEB-2023",
        "any date in form of mm-yy or mmm-yyyy",
        "current month",
        "CURRENT MONTH",
        "today",
    ],
    "Manhour_Manpower_type": [
        "manpower",
        "manhour",
        "man hour",
        "man power",
        "manmonths",
        "man months",
        "manmonnth",
        "man month",
        "Manhours Plan vs Actuals",
        "Manhour Planned VS Actual",
    ],
}
 
# === INTENT CLASSIFIER COMPONENT ===
@Language.factory("intent_classifier")
class IntentClassifier:
    """spaCy pipeline component that tags each Doc with a predicted intent.

    A RoBERTa tokenizer and sequence-classification model are loaded from
    ``model_name`` with one output label per entry of ``INTENTS``; calling the
    component writes the predicted intent string to ``doc._.intent``.

    NOTE(review): unless ``model_name`` points at a fine-tuned checkpoint the
    classification head is presumably freshly initialised, so predictions
    would be arbitrary -- confirm which checkpoint is intended.
    """

    def __init__(self, nlp, name, model_name="roberta-base"):
        """Load the tokenizer and the classification model.

        Args:
            nlp: Pipeline the component is being added to (unused here).
            name: Name of the component instance in the pipeline (unused here).
            model_name: Model identifier or path handed to ``from_pretrained``.
        """
        print("Loading tokenizer...")
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        print("Loading model...")
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(INTENTS))
        # The component only ever predicts, so keep dropout disabled.
        self.model.eval()
        print("Model loaded successfully!")

    def classify_intent(self, user_input):
        """Return the ``INTENTS`` entry with the highest score for ``user_input``.

        Args:
            user_input: Query text to classify.

        Returns:
            The intent label (a string from ``INTENTS``).
        """
        print(f"Classifying intent for input: {user_input}")
        inputs = self.tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no_grad avoids building an autograd graph per query.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Pick the best label along the class axis of the (single) input row.
        predicted_class_id = torch.argmax(logits, dim=-1)[0].item()
        return INTENTS[predicted_class_id]

    def __call__(self, doc):
        """Classify ``doc.text`` and store the result in ``doc._.intent``."""
        doc._.intent = self.classify_intent(doc.text)
        return doc
 
# === ENTITY MATCHER FUNCTION ===
def match_entities(text, entities=None):
    """Return the known entity values that occur in ``text``.

    Each candidate value is looked up as a case-insensitive substring of
    ``text``. Values that are listed twice, or that differ only by case, are
    reported once per entity type using the first spelling listed.

    NOTE(review): substring matching also fires inside longer words (e.g.
    "ALL" inside "install"); tighten to word boundaries if that is unwanted.

    Args:
        text: Raw user query.
        entities: Optional mapping of entity type to a list of values.
            Defaults to the module-level ``ENTITIES``.

    Returns:
        A list of ``{"entity_name": ..., "value": ...}`` dicts, in vocabulary
        order.
    """
    vocabulary = ENTITIES if entities is None else entities
    lowered_text = text.lower()  # lower the query once, not once per candidate
    matched_entities = []
    seen = set()
    for entity_type, values in vocabulary.items():
        for value in values:
            needle = value.lower()
            key = (entity_type, needle)
            if key in seen or needle not in lowered_text:
                continue
            seen.add(key)
            matched_entities.append({"entity_name": entity_type, "value": value})
    return matched_entities
 
# === CUSTOM COMPONENT TO EXTRACT ENTITIES ===
@Language.component("entity_extractor")
def extract_entities(doc):
    """Pipeline step: store the entities matched in ``doc.text`` on ``doc._.entities``."""
    # The matcher works on the raw text of the Doc.
    doc._.entities = match_entities(doc.text)
    return doc
 
# === ANNOTATION FUNCTION FOR VISUALIZING ENTITIES ===
def annotate_user_input(doc):
    """Highlight the matched entities of ``doc`` using spaCy's displacy.

    Each entry of ``doc._.entities`` is located in ``doc.text`` ignoring case,
    because the entity matcher in this file compares lower-cased text and may
    therefore report a value whose casing differs from the query. Entities
    that cannot be located are skipped instead of being given bogus offsets.

    Args:
        doc: Processed Doc carrying the ``entities`` extension attribute.
    """
    import re  # local import: only this function needs it

    text = doc.text
    ents = []
    for entity in doc._.entities:
        # Offsets come from the original text, so they stay valid for rendering.
        found = re.search(re.escape(entity['value']), text, flags=re.IGNORECASE)
        if found is None:
            continue
        ents.append({
            "start": found.start(),
            "end": found.end(),
            "label": entity['entity_name']
        })
    # Emit the spans in reading order.
    ents.sort(key=lambda span: (span["start"], span["end"]))

    # Manual-mode input structure for displacy
    displacy_data = {
        "text": text,
        "ents": ents,
        "title": None
    }

    displacy.render(displacy_data, style="ent", manual=True)
 
# === MAIN FUNCTION TO PROCESS QUERY ===
def process_query(nlp, user_input):
    """Run ``user_input`` through the pipeline and summarise the outcome.

    Args:
        nlp: Callable pipeline that returns a Doc with the ``intent`` and
            ``entities`` extension attributes filled in.
        user_input: Query text.

    Returns:
        A dict with the keys "User Query", "Intent" and "Entities".
    """
    print(f"Processing query: {user_input}")
    doc = nlp(user_input)

    # Collect the pipeline's results before rendering the highlights.
    summary = {
        "User Query": user_input,
        "Intent": doc._.intent,
        "Entities": doc._.entities,
    }

    annotate_user_input(doc)
    return summary
 
# === MAIN WORKFLOW SETUP ===
def main():
    """Build the NLP pipeline and answer queries from stdin until 'exit'."""
    print("Loading spaCy pipeline...")
    nlp = spacy.blank("en")

    # Register the custom Doc attributes up front. set_extension raises an
    # error when the name already exists (e.g. main() is called again in the
    # same interpreter or notebook kernel), so only add what is missing.
    for attr_name, default in (("intent", None), ("entities", [])):
        if not spacy.tokens.Doc.has_extension(attr_name):
            spacy.tokens.Doc.set_extension(attr_name, default=default)

    # Add custom components to the pipeline
    print("Adding custom components to pipeline.")
    nlp.add_pipe("intent_classifier", last=True)  # intent classifier runs first
    nlp.add_pipe("entity_extractor", last=True)  # entity extractor runs after it
    print("Components added to pipeline.")

    # Loop for dynamic user input
    while True:
        user_query = input("Enter your query (or type 'exit' to quit): ")
        command = user_query.strip()
        if command.lower() == 'exit':
            break
        if not command:
            continue  # nothing to classify on a blank line

        # Process the query and print the result
        response = process_query(nlp, user_query)
        print(json.dumps(response, indent=4))


if __name__ == "__main__":
    main()

In [2]:
import spacy

import torch

import yaml

import json

from torch.utils.data import DataLoader, Dataset

from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW

from transformers import get_scheduler

from spacy import displacy

from spacy.language import Language
 
# Load the configuration from the YAML file

# The encoding is given explicitly so the file is read the same way on every
# platform instead of depending on the locale's default encoding.
with open('config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# INTENTS and ENTITIES come from the 'intents' and 'entities' keys of the
# config. The code in this file indexes INTENTS by position and iterates
# ENTITIES with .items().
INTENTS = config['intents']
ENTITIES = config['entities']
 
# === Dataset Class for Training ===

class IntentDataset(Dataset):
    """Torch dataset over a JSON-lines file of intent-labelled texts.

    Every non-blank line of the file is parsed as one JSON object; items are
    read through its ``'text'`` and ``'intent'`` keys.
    """

    def __init__(self, filepath):
        """Read all records from ``filepath`` and load the tokenizer.

        Args:
            filepath: Path of the JSON-lines training file.
        """
        self.data = []
        # Explicit encoding keeps parsing independent of the platform default.
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank or trailing lines are not records; json.loads
                    # would raise on them.
                    continue
                self.data.append(json.loads(line))
        self.tokenizer = RobertaTokenizer.from_pretrained(config['model_name'])

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its tensors and label.

        Returns:
            A dict with ``'input_ids'``, ``'attention_mask'`` and ``'label'``;
            the label is the position of the record's intent in ``INTENTS``.

        Raises:
            ValueError: If the record's intent is not listed in ``INTENTS``.
        """
        item = self.data[idx]
        inputs = self.tokenizer(item['text'], padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        label = INTENTS.index(item['intent'])
        return {
            # The tokenizer output is flattened to one dimension per item.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }
 
# === INTENT CLASSIFIER COMPONENT ===

@Language.factory("intent_classifier")
class IntentClassifier:
    """spaCy pipeline component that tags each Doc with a predicted intent.

    A RoBERTa tokenizer and sequence-classification model are loaded from
    ``model_name`` with one output label per entry of ``INTENTS``; calling the
    component writes the predicted intent string to ``doc._.intent``.

    NOTE(review): unless ``model_name`` points at a fine-tuned checkpoint the
    classification head is presumably freshly initialised, so predictions
    would be arbitrary -- confirm which checkpoint is intended.
    """

    def __init__(self, nlp, name, model_name=config['model_name']):
        """Load the tokenizer and the classification model.

        Args:
            nlp: Pipeline the component is being added to (unused here).
            name: Name of the component instance in the pipeline (unused here).
            model_name: Model identifier or path handed to ``from_pretrained``.
                The default is read from the loaded YAML config once, when
                the class is defined.
        """
        print("Loading tokenizer...")
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        print("Loading model...")
        self.model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(INTENTS))
        # The component only ever predicts, so keep dropout disabled.
        self.model.eval()
        print("Model loaded successfully!")

    def classify_intent(self, user_input):
        """Return the ``INTENTS`` entry with the highest score for ``user_input``.

        Args:
            user_input: Query text to classify.

        Returns:
            The intent label (an entry of ``INTENTS``).
        """
        print(f"Classifying intent for input: {user_input}")
        inputs = self.tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no_grad avoids building an autograd graph per query.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        # Pick the best label along the class axis of the (single) input row.
        predicted_class_id = torch.argmax(logits, dim=-1)[0].item()
        return INTENTS[predicted_class_id]

    def __call__(self, doc):
        """Classify ``doc.text`` and store the result in ``doc._.intent``."""
        doc._.intent = self.classify_intent(doc.text)
        return doc
 
# === ENTITY MATCHER FUNCTION ===

def match_entities(text, entities=None):
    """Return the known entity values that occur in ``text``.

    Each candidate value is looked up as a case-insensitive substring of
    ``text``. Values that are listed twice, or that differ only by case, are
    reported once per entity type using the first spelling listed.

    NOTE(review): substring matching also fires inside longer words; tighten
    to word boundaries if that is unwanted.

    Args:
        text: Raw user query.
        entities: Optional mapping of entity type to a list of values.
            Defaults to the module-level ``ENTITIES``.

    Returns:
        A list of ``{"entity_name": ..., "value": ...}`` dicts, in vocabulary
        order.
    """
    vocabulary = ENTITIES if entities is None else entities
    lowered_text = text.lower()  # lower the query once, not once per candidate
    matched_entities = []
    seen = set()
    for entity_type, values in vocabulary.items():
        for value in values:
            needle = value.lower()
            key = (entity_type, needle)
            if key in seen or needle not in lowered_text:
                continue
            seen.add(key)
            matched_entities.append({"entity_name": entity_type, "value": value})
    return matched_entities
 
# === CUSTOM COMPONENT TO EXTRACT ENTITIES ===

@Language.component("entity_extractor")
def extract_entities(doc):
    """Pipeline step: store the entities matched in ``doc.text`` on ``doc._.entities``."""
    # The matcher works on the raw text of the Doc.
    doc._.entities = match_entities(doc.text)
    return doc
 
# === ANNOTATION FUNCTION FOR VISUALIZING ENTITIES ===

def annotate_user_input(doc):
    """Highlight the matched entities of ``doc`` using spaCy's displacy.

    Each entry of ``doc._.entities`` is located in ``doc.text`` ignoring case,
    because the entity matcher in this file compares lower-cased text and may
    therefore report a value whose casing differs from the query. Entities
    that cannot be located are skipped instead of being given bogus offsets.

    Args:
        doc: Processed Doc carrying the ``entities`` extension attribute.
    """
    import re  # local import: only this function needs it

    text = doc.text
    ents = []
    for entity in doc._.entities:
        # Offsets come from the original text, so they stay valid for rendering.
        found = re.search(re.escape(entity['value']), text, flags=re.IGNORECASE)
        if found is None:
            continue
        ents.append({
            "start": found.start(),
            "end": found.end(),
            "label": entity['entity_name']
        })
    # Emit the spans in reading order.
    ents.sort(key=lambda span: (span["start"], span["end"]))

    # Manual-mode input structure for displacy
    displacy_data = {
        "text": text,
        "ents": ents,
        "title": None
    }

    displacy.render(displacy_data, style="ent", manual=True)
 
# === TRAINING FUNCTION ===

def train_model(train_dataset):
    """Fine-tune a RoBERTa classifier on ``train_dataset`` and save it.

    Batch size, learning rate and epoch count are read from
    ``config['training']``; the base checkpoint comes from
    ``config['model_name']``. The loss of every step is printed and the
    resulting model is written to the ``intent_model`` directory.

    Args:
        train_dataset: Dataset whose batches provide ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
    """
    settings = config['training']
    loader = DataLoader(train_dataset, batch_size=settings['batch_size'], shuffle=True)
    model = RobertaForSequenceClassification.from_pretrained(config['model_name'], num_labels=len(INTENTS))
    optimizer = AdamW(model.parameters(), lr=settings['learning_rate'])
    num_epochs = settings['num_epochs']

    # Linear schedule without warm-up, spanning every optimisation step.
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_epochs * len(loader),
    )

    model.train()

    for epoch_number in range(1, num_epochs + 1):
        print(f"Epoch {epoch_number}/{num_epochs}")
        for batch in loader:
            optimizer.zero_grad()
            # Passing labels makes the model return the loss itself.
            loss = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['label'],
            ).loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            print(f"Loss: {loss.item()}")

    model.save_pretrained("intent_model")  # Save the trained model
 
# === MAIN FUNCTION TO PROCESS QUERY ===

def process_query(nlp, user_input):
    """Run ``user_input`` through the pipeline and summarise the outcome.

    Args:
        nlp: Callable pipeline that returns a Doc with the ``intent`` and
            ``entities`` extension attributes filled in.
        user_input: Query text.

    Returns:
        A dict with the keys "User Query", "Intent" and "Entities".
    """
    print(f"Processing query: {user_input}")
    doc = nlp(user_input)

    # Collect the pipeline's results before rendering the highlights.
    summary = {
        "User Query": user_input,
        "Intent": doc._.intent,
        "Entities": doc._.entities,
    }

    annotate_user_input(doc)
    return summary
 
# === MAIN WORKFLOW SETUP ===

def main():
    """Build the pipeline, optionally fine-tune the classifier, then answer
    queries from stdin until 'exit'."""
    print("Loading spaCy pipeline...")
    nlp = spacy.blank("en")

    # Register the custom Doc attributes. set_extension raises an error when
    # the name already exists (e.g. main() is called again in the same
    # interpreter or notebook kernel), so only add what is missing.
    for attr_name, default in (("intent", None), ("entities", [])):
        if not spacy.tokens.Doc.has_extension(attr_name):
            spacy.tokens.Doc.set_extension(attr_name, default=default)

    # Add custom components to the pipeline
    print("Adding custom components to pipeline.")
    nlp.add_pipe("intent_classifier", last=True)  # intent classifier runs first
    nlp.add_pipe("entity_extractor", last=True)  # entity extractor runs after it
    print("Components added to pipeline.")

    # Train the model if a data path is provided. A config without a
    # 'training' section simply skips training.
    training_cfg = config.get('training') or {}
    if 'data_path' in training_cfg:
        train_dataset = IntentDataset(training_cfg['data_path'])
        train_model(train_dataset)
        # train_model saves the fine-tuned weights to "intent_model". The
        # pipeline component was built from the base checkpoint before
        # training, so load the saved weights into it; otherwise queries
        # would still be answered by the untrained model.
        classifier = nlp.get_pipe("intent_classifier")
        classifier.model = RobertaForSequenceClassification.from_pretrained(
            "intent_model", num_labels=len(INTENTS)
        )

    # Loop for dynamic user input
    while True:
        user_query = input("Enter your query (or type 'exit' to quit): ")
        command = user_query.strip()
        if command.lower() == 'exit':
            break
        if not command:
            continue  # nothing to classify on a blank line

        # Process the query and print the result
        response = process_query(nlp, user_query)
        print(json.dumps(response, indent=4))


if __name__ == "__main__":
    main()

OSError: source code not available