In [1]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
import yaml


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the YAML configuration and build the Groq chat model from its
# "model" section (model name + API key).
with open("config.yaml", "r") as file:
    config = yaml.safe_load(file)

model_settings = config["model"]
model = model_settings['LLM_MODEL']
api_key = model_settings['GROQ_API_KEY']

# Chat client used by the generation cell further down.
llm = ChatGroq(
    model=model,
    api_key=api_key,
)

In [None]:
# Only include 8 intents:

# - billing_issue       - Problems with invoices or payments
# - internet_issue      - Internet not working, slow, or disconnected
# - calls_issue         - Problems with phone calls
# - bundle_change       - Requests to change plans or bundles
# - balance_inquiry     - Asking about account balance
# - recharge            - Requests to top-up account
# - account_support     - Account info, update personal info
# - complaint           - Formal complaints about services

In [4]:
from langchain_core.prompts import ChatPromptTemplate
import json
import re
import time

# 1️⃣ Prompt template (one request per intent, so the model stays on a single label)
# The template has a single input variable, {intent_name}. The instruction list
# is numbered consecutively (the earlier version skipped from 3 to 5).
prompts = ChatPromptTemplate.from_template(
    template="""
You are an AI assistant helping to create a dataset for a Telecom Customer Support chatbot.
Generate 50 patterns for the following intent ONLY: "{intent_name}"

Instructions:
1. Generate 50 short, realistic customer messages.
2. Focus on the telecom domain.
3. Format the output exactly like this:
tag "{intent_name}"
patterns
0 "message"
...
49 "message"

4. Include various user tones.
5. Ensure each message is unique.
"""
)

# 2️⃣ Intent labels: the only tags accepted when parsing the model output.
intents = [
    "billing_issue", "internet_issue", "calls_issue", "bundle_change",
    "balance_inquiry", "recharge", "account_support", "complaint"
]

# 3️⃣ Updated function with Label Validation
def parse_and_merge(text, dataset_dict, allowed_intents):
    """Parse one model response and merge its patterns into ``dataset_dict``.

    The response is scanned line by line. A line that starts with ``tag``
    opens a section for that tag; every following line that contains
    double-quoted text contributes that text as a pattern, until the next
    ``tag`` line. Parsing by line (instead of splitting on the substring
    ``'tag '``) keeps messages such as "my price tag is wrong" intact, and
    the first message is no longer lost when the ``patterns`` header line is
    missing.

    Args:
        text: Raw model output, expected to follow the layout
            ``tag "<intent>"`` / ``patterns`` / ``0 "message"`` ...
        dataset_dict: Mapping of tag -> list of patterns. Mutated in place;
            an allowed tag gets an entry as soon as its header is seen.
        allowed_intents: Container of accepted tags. Sections with any other
            tag are reported with a warning and skipped.

    Returns:
        None. Results are written into ``dataset_dict``.
    """
    tag_header = re.compile(r'^\s*tag\s+(.+?)\s*$')
    quoted_text = re.compile(r'"(.*)"')

    # Pattern list of the section being read; None while outside any
    # accepted section (preamble text or an unauthorized tag).
    current_patterns = None

    for line in text.splitlines():
        header = tag_header.match(line)
        if header:
            tag_line = header.group(1).strip('"')
            if tag_line in allowed_intents:
                current_patterns = dataset_dict.setdefault(tag_line, [])
            else:
                current_patterns = None
                print(f"⚠️ Warning: AI generated an unauthorized tag: {tag_line}. Skipping.")
            continue

        if current_patterns is None:
            continue

        # Lines without quoted text (e.g. the "patterns" header, blanks,
        # "...") are ignored rather than skipped by position.
        match = quoted_text.search(line)
        if not match:
            continue

        pattern = match.group(1)
        # Drop empty messages and keep only the first occurrence of each one.
        if pattern.strip() and pattern not in current_patterns:
            current_patterns.append(pattern)

# 4️⃣ Generate dataset
# Maps each intent tag to the list of patterns collected for it.
dataset_dict = {}

for intent_name in intents:
    print(f"Generating data for intent: {intent_name}")

    # format_messages() hands the model real chat messages. format() would
    # flatten the prompt to a plain string prefixed with "Human: ", and that
    # prefix would be sent as part of the message text.
    prompt_messages = prompts.format_messages(intent_name=intent_name)
    response = llm.invoke(prompt_messages)

    # Pass 'intents' list as a filter so only known tags are merged.
    parse_and_merge(response.content, dataset_dict, intents)

    # Surface intents whose response could not be parsed instead of letting
    # them go missing silently.
    if not dataset_dict.get(intent_name):
        print(f"⚠️ Warning: no patterns were parsed for intent: {intent_name}")

    # Short pause between requests (presumably to stay under API rate limits).
    time.sleep(1)

# 5️⃣ Final cleanup: force every tag to exactly 50 patterns.
# Longer lists are truncated; shorter lists are padded by repeating their last
# pattern (so padded entries are duplicates). A tag with no patterns at all
# cannot be padded and is reported instead of raising IndexError.
target_count = 50

for tag in dataset_dict:
    patterns = dataset_dict[tag]
    if not patterns:
        print(f"⚠️ Warning: no patterns available for tag: {tag}. Leaving it empty.")
        continue

    if len(patterns) > target_count:
        dataset_dict[tag] = patterns[:target_count]
    elif len(patterns) < target_count:
        dataset_dict[tag].extend([patterns[-1]] * (target_count - len(patterns)))

# 6️⃣ Convert and Save
# Turn the tag -> patterns mapping into a list of records, one per tag,
# preserving the mapping's iteration order.
final_dataset = [
    dict(tag=name, patterns=samples)
    for name, samples in dataset_dict.items()
]

# Serialize first, then write the text out as UTF-8 (non-ASCII characters are
# kept as-is rather than escaped).
serialized = json.dumps(final_dataset, ensure_ascii=False, indent=4)
with open("telecom_intent_dataset.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized)

print("✅ Dataset saved with strictly validated labels.")

Generating data for intent: billing_issue
Generating data for intent: internet_issue
Generating data for intent: calls_issue
Generating data for intent: bundle_change
Generating data for intent: balance_inquiry
Generating data for intent: recharge
Generating data for intent: account_support
Generating data for intent: complaint
✅ Dataset saved with strictly validated labels.
