In [None]:
import ast, json
import pandas as pd

In [None]:
# Load the combined dataset; later cells read its 'service' and 'entities' columns.
df = pd.read_csv("../../data/combine_df.csv")

In [None]:
# Show the column names of the loaded frame.
df.columns

In [None]:
def extract_service_entities(df, row_index=0):
    """Parse one row's ``entities`` cell and key it by that row's ``service``.

    Args:
        df: DataFrame with a ``service`` column and an ``entities`` column whose
            cells hold Python-literal text (parsed with ``ast.literal_eval``).
        row_index: Zero-based *position* of the row to read.

    Returns:
        A one-item dict ``{service: parsed_entities}``. The value is ``None``
        when the cell cannot be parsed; a message is printed in that case.

    Raises:
        IndexError: If ``row_index`` is outside the frame.
    """
    # Positional lookup: callers walk ``range(len(df))``, which only matches
    # label-based ``df[col][i]`` when the index is the default 0..n-1.
    service = df['service'].iloc[row_index]
    json_str = df['entities'].iloc[row_index]

    try:
        json_obj = ast.literal_eval(json_str)
    except (ValueError, SyntaxError, TypeError) as e:
        # ValueError/SyntaxError: text that is not a valid literal (non-string
        # cells are rejected with ValueError too). TypeError: a literal that
        # parses but cannot be built, e.g. a dict with an unhashable key.
        print(f"Error parsing JSON string at row {row_index}: {e}")
        return {service: None}
    return {service: json_obj}

In [None]:
# Try the parser on a single row first (row_index defaults to 0).
result = extract_service_entities(df)

In [None]:
# Parse every row by position; each element is a one-item dict
# {service: parsed entities, or None when parsing failed}.
all_results = [extract_service_entities(df, i) for i in range(len(df))]
# Preview a few records instead of echoing the entire list into the cell output.
all_results[:5]

In [None]:
# Persist the parsed rows as JSON; ensure_ascii=False writes non-ASCII text as-is
# rather than as \uXXXX escapes.
with open('../../data/entities.json', 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=4, ensure_ascii=False)

In [None]:
def build_ner_data(data):
    """Turn ``{"content", "entities"}`` records into character-span NER samples.

    Args:
        data: Iterable of dicts. ``content`` is the sentence text (defaults to
            ``""``); ``entities`` maps a label to the text to locate in that
            sentence (may be missing or ``None``).

    Returns:
        A list of ``{"content": str, "entities": [{"start", "end", "label"}]}``
        with half-open ``[start, end)`` character offsets. A value is dropped
        when it is ``None``, empty, absent from the sentence, or when every
        occurrence overlaps a span that was already assigned.
    """
    ner_data = []
    for item in data:
        sentence = item.get("content", "")
        entity_dict = item.get("entities") or {}

        ents = []
        for label, value in entity_dict.items():
            if value is None:
                continue
            # str.find only accepts strings; numbers are matched by their text form.
            text = value if isinstance(value, str) else str(value)
            if not text:
                continue  # an empty value would produce a zero-length span

            # Take the first occurrence that does not overlap an earlier span,
            # so two labels sharing the same text do not get the same offsets.
            start = sentence.find(text)
            while start != -1 and any(
                start < taken["end"] and taken["start"] < start + len(text)
                for taken in ents
            ):
                start = sentence.find(text, start + 1)

            if start != -1:
                ents.append({"start": start, "end": start + len(text), "label": label})

        ner_data.append({"content": sentence, "entities": ents})
    return ner_data

In [None]:
def convert_to_ner_format(item, intent_key="training_request"):
    """Render one intent's fields as a sentence and mark each value's span.

    The sentence has the form ``"key one: value. key two: value."`` (underscores
    in keys become spaces). Offsets are recorded while the sentence is built,
    so a value whose text also appears in a key label or in an earlier value is
    still labelled at its own position.

    Args:
        item: Dict containing ``intent_key`` mapped to a dict of field values.
        intent_key: Which entry of ``item`` to convert.

    Returns:
        ``{"content": sentence, "entities": [{"start", "end", "label"}]}`` with
        half-open ``[start, end)`` character offsets. Non-string values are
        labelled by their rendered text; ``None`` and empty values stay in the
        sentence but get no span.

    Raises:
        KeyError: If ``intent_key`` is not in ``item``.
    """
    request = item[intent_key]

    pieces = []
    entities = []
    offset = 0  # length of the sentence assembled so far
    for position, (key, value) in enumerate(request.items()):
        if position:
            pieces.append(". ")
            offset += 2
        prefix = f"{key.replace('_', ' ')}: "
        text = f"{value}"
        pieces.append(prefix + text)

        start = offset + len(prefix)
        end = start + len(text)
        if value is not None and text:
            entities.append({"start": start, "end": end, "label": key})
        offset = end

    ner_entry = {}
    ner_entry["content"] = "".join(pieces) + "."
    ner_entry["entities"] = entities
    return ner_entry

In [None]:
# Hand-written record to sanity-check build_ner_data.
# NOTE(review): the name `data` is reassigned in a later cell to the contents of
# entities.json, which has a different record layout.
data = [
    {
        "content": "I want to borrow 500 USD for 6 months.",
        "entities": {
            "amount": "500 USD",
            "duration": "6 months"
        }
    }
]
build_ner_data(data)

In [None]:
# Read the exported file back into a DataFrame for inspection.
entities_json = pd.read_json("../../data/entities.json")

In [None]:
# Show the last rows of the re-loaded export.
entities_json.tail()

In [None]:
# NOTE(review): json is already imported in the notebook's first cell; this
# repeated import is redundant.
import json

# Load the export as plain Python objects. This rebinds `data`, which the
# NER-building cells below iterate over.
with open("../../data/entities.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
def build_ner_samples(intent_name, data):
    """Build NER samples for a single intent from a list of records.

    For every record that maps ``intent_name`` to a dict, the fields are
    rendered as ``"key one: value. key two: value."`` (underscores in keys
    become spaces). Offsets are recorded while the sentence is built, so a
    value whose text also appears in a key label or in an earlier value is
    still labelled at its own position.

    Args:
        intent_name: Key to look up in each record.
        data: Iterable of dict records; records without ``intent_name`` or
            whose entry is not a dict are skipped.

    Returns:
        A list of ``{"content": sentence, "entities": [{"start", "end",
        "label"}]}`` with half-open ``[start, end)`` character offsets.
        Non-string values are labelled by their rendered text; ``None`` and
        empty values stay in the sentence but get no span.
    """
    ner_data = []

    for record in data:
        # Ensure the intent exists and is a dictionary
        if intent_name not in record or not isinstance(record[intent_name], dict):
            continue

        fields = record[intent_name]
        pieces = []
        entities = []
        offset = 0  # length of the sentence assembled so far
        for position, (label, value) in enumerate(fields.items()):
            if position:
                pieces.append(". ")
                offset += 2
            prefix = f"{label.replace('_', ' ')}: "
            text = f"{value}"
            pieces.append(prefix + text)

            start = offset + len(prefix)
            end = start + len(text)
            if value is not None and text:
                entities.append({
                    "start": start,
                    "end": end,
                    "label": label
                })
            offset = end

        ner_data.append({
            "content": "".join(pieces) + ".",
            "entities": entities
        })

    return ner_data

In [None]:
# Distinct values of the 'service' column. Despite the variable name these are
# not DataFrame columns; the next cell iterates them as intent names.
columns = df['service'].unique()

In [None]:
# Write one NER file per intent, named "<intent>_ner.json".
# NOTE(review): the intent value is interpolated into the file path unchanged;
# confirm the service values are safe to use as file names.
for intent in columns:
    ner_dataset = build_ner_samples(intent, data)

    # Optional: Save to file
    with open(f"../../data/{intent}_ner.json", "w", encoding="utf-8") as f:
        json.dump(ner_dataset, f, indent=4, ensure_ascii=False)

In [None]:
def build_ner_samples_all_intents(data):
    """Build NER samples for every intent found in every record.

    NOTE: a later cell defines a function with this same name; once that cell
    runs, it replaces this definition.

    Each dict-valued entry of a record is rendered as
    ``"key one: value. key two: value."`` (underscores in keys become spaces).
    Offsets are recorded while the sentence is built, so a value whose text
    also appears in a key label or in an earlier value is still labelled at
    its own position.

    Args:
        data: Iterable of dict records mapping intent name -> dict of fields.
            Entries whose value is not a dict are skipped.

    Returns:
        A list of ``{"content", "entities", "intent"}`` dicts; ``entities``
        holds ``{"start", "end", "label"}`` with half-open ``[start, end)``
        character offsets. Non-string values are labelled by their rendered
        text; ``None`` and empty values stay in the sentence but get no span.
    """
    merged_ner_data = []

    for record in data:
        for intent_name, fields in record.items():
            if not isinstance(fields, dict):
                continue

            pieces = []
            entities = []
            offset = 0  # length of the sentence assembled so far
            for position, (label, value) in enumerate(fields.items()):
                if position:
                    pieces.append(". ")
                    offset += 2
                prefix = f"{label.replace('_', ' ')}: "
                text = f"{value}"
                pieces.append(prefix + text)

                start = offset + len(prefix)
                end = start + len(text)
                if value is not None and text:
                    entities.append({
                        "start": start,
                        "end": end,
                        "label": label
                    })
                offset = end

            merged_ner_data.append({
                "content": "".join(pieces) + ".",
                "entities": entities,
                "intent": intent_name  # optional: useful if you want to filter by intent later
            })

    return merged_ner_data

In [None]:
def build_ner_samples_all_intents(data):
    """Build NER samples for every intent found in every record.

    Each dict-valued entry of a record is rendered as
    ``"key one: value. key two: value."`` (underscores in keys become spaces).
    Every value's span is computed from the lengths of the pieces as they are
    appended, rather than searched for afterwards. A forward search can still
    land inside a later key label that contains the same text, and it cannot
    handle non-string values.

    Args:
        data: Iterable of dict records mapping intent name -> dict of fields.
            Entries whose value is not a dict are skipped.

    Returns:
        A list of ``{"content", "entities", "intent"}`` dicts; ``entities``
        holds ``{"start", "end", "label"}`` with half-open ``[start, end)``
        character offsets. Non-string values are labelled by their rendered
        text; ``None`` and empty values stay in the sentence but get no span.
    """
    merged_ner_data = []

    for record in data:
        for intent_name, fields in record.items():
            if not isinstance(fields, dict):
                continue

            sentence_parts = []
            entities = []
            cursor = 0  # index in the final sentence where the next piece begins
            for field_no, (label, value) in enumerate(fields.items()):
                if field_no:
                    sentence_parts.append(". ")
                    cursor += 2
                key_text = f"{label.replace('_', ' ')}: "
                value_text = f"{value}"
                sentence_parts.append(key_text + value_text)

                start = cursor + len(key_text)
                end = start + len(value_text)
                if value is not None and value_text:
                    entities.append({
                        "start": start,
                        "end": end,
                        "label": label
                    })
                cursor = end  # Move cursor past this field's value

            merged_ner_data.append({
                "content": "".join(sentence_parts) + ".",
                "entities": entities,
                "intent": intent_name
            })

    return merged_ner_data

In [None]:
# Build the merged samples for every intent in `data` and write them to one file.
with open("../../data/all_intents_ner.json", "w", encoding="utf-8") as f:
    json.dump(build_ner_samples_all_intents(data), f, indent=4, ensure_ascii=False)