In [2]:
import ast, json
import pandas as pd

In [3]:
# Load the combined dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../../data/combine_df.csv")

In [4]:
# Inspect the available column names before parsing.
df.columns

Index(['service', 'HR_message', 'Employee_message', 'entities'], dtype='object')

In [5]:
def extract_service_entities(df, row_index=0):
    """Parse the ``entities`` literal of one row and key it by the row's service.

    Args:
        df: DataFrame providing ``service`` and ``entities`` columns.
        row_index: Zero-based row *position* (default 0). Positional access is
            used so the function also works on frames whose index is not the
            default ``0..n-1`` range (e.g. after filtering).

    Returns:
        ``{service: parsed}`` where ``parsed`` is the object produced by
        ``ast.literal_eval``; ``{service: None}`` when the cell cannot be
        parsed (an error message is printed in that case).

    Raises:
        IndexError: If ``row_index`` is outside the frame.
    """
    # .iloc keeps the lookup positional, matching callers that iterate
    # ``range(len(df))``; ``df[col][i]`` would be a label lookup instead.
    service = df['service'].iloc[row_index]
    json_str = df['entities'].iloc[row_index]

    try:
        json_obj = ast.literal_eval(json_str)
    except (ValueError, SyntaxError, TypeError) as e:
        # literal_eval may also raise TypeError (e.g. an unhashable dict key);
        # treat it like any other unparsable cell instead of aborting the run.
        print(f"Error parsing JSON string at row {row_index}: {e}")
        return {service: None}
    return {service: json_obj}

In [6]:
# Quick check on the first row (row_index defaults to 0); `result` is not
# referenced again in the visible cells.
result = extract_service_entities(df)

In [7]:
# Parse every row by position; rows whose entities cannot be parsed map the
# service to None (see extract_service_entities).
all_results = [extract_service_entities(df, i) for i in range(len(df))]
# NOTE(review): this displays the entire list; consider a slice such as
# all_results[:3] to keep the rendered output short.
all_results

[{'training_request': {'training_topic': 'machine learning',
   'special_requirements': 'no special accommodations are needed',
   'number_of_participants': 'Three',
   'training_duration': 'two days',
   'budget_constraints': '$5000',
   'desired_training_outcomes': 'better research skills',
   'preferred_training_dates': 'June 15th, 2023',
   'preferred_training_format': 'online seminars',
   'current_skill_level': 'intermediate',
   'contact_information': 'hkim@researchlab.com or 555-1234'}},
 {'training_request': {'number_of_participants': 'Three',
   'preferred_training_dates': '2023-06-15',
   'training_topic': 'machine learning research',
   'budget_constraints': '$10,000',
   'contact_information': 'hkim@researchlab.com, 555-1234',
   'special_requirements': 'no',
   'desired_training_outcomes': 'to improve research skills',
   'current_skill_level': 'intermediate',
   'training_duration': '2 days',
   'preferred_training_format': 'online seminars'}},
 {'training_request': {'sp

In [8]:
# Persist the parsed results as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
with open('../../data/entities.json', mode='w', encoding='utf-8') as out_file:
    json.dump(all_results, out_file, ensure_ascii=False, indent=4)

In [9]:
def build_ner_data(data):
    """Convert ``{"content", "entities"}`` records into character-span NER records.

    Args:
        data: Iterable of dicts. ``content`` is the sentence; ``entities`` maps
            a label to the text expected to occur in the sentence. A missing or
            ``None`` ``content``/``entities`` is treated as empty.

    Returns:
        A list with one ``{"content": str, "entities": [...]}`` dict per input
        item, where each entity is ``{"start", "end", "label"}`` with ``end``
        exclusive. Values that are ``None``, empty, or absent from the sentence
        produce no entity.
    """
    ner_data = []
    for item in data:
        sentence = item.get("content") or ""
        entity_dict = item.get("entities") or {}

        ents = []
        for label, value in entity_dict.items():
            if value is None:
                continue
            # Non-string values (e.g. numbers) are matched by their text form.
            text = value if isinstance(value, str) else str(value)
            if not text:
                # str.find("") returns 0, which would yield a zero-length span.
                continue

            first = sentence.find(text)
            if first == -1:
                continue

            # Prefer an occurrence that does not overlap a span already taken
            # by an earlier label, so repeated values get distinct spans.
            start = first
            while start != -1 and any(
                start < ent["end"] and ent["start"] < start + len(text)
                for ent in ents
            ):
                start = sentence.find(text, start + 1)
            if start == -1:
                # No free occurrence: fall back to the first match so the
                # entity is still reported.
                start = first

            ents.append({"start": start, "end": start + len(text), "label": label})

        ner_data.append({"content": sentence, "entities": ents})
    return ner_data

In [10]:
def convert_to_ner_format(item, intent_key="training_request"):
    """Render one intent's slots as a sentence and return its NER annotation.

    The sentence has the form ``"<key words>: <value>. <key words>: <value>."``
    where underscores in each key are replaced by spaces.

    Args:
        item: Dict holding the slot mapping under ``intent_key``.
        intent_key: Key of the slot mapping inside ``item``.

    Returns:
        ``{"content": sentence, "entities": [...]}`` where each entity is
        ``{"start", "end", "label"}`` (``end`` exclusive) covering exactly the
        value text of that slot. ``None`` and empty values appear in the
        sentence but get no entity.

    Raises:
        KeyError: If ``intent_key`` is not present in ``item``.
    """
    # A None mapping (e.g. a row that failed to parse upstream) is treated as
    # having no slots.
    request = item[intent_key] or {}

    separator = ". "
    sentence_parts = []
    entities = []
    cursor = 0  # offset in the final sentence where the next part begins

    for key, value in request.items():
        if sentence_parts:
            cursor += len(separator)
        prefix = f"{key.replace('_', ' ')}: "
        text = f"{value}"

        # Offsets are computed while building the sentence; searching the
        # finished sentence could match the value inside an earlier key label
        # or an earlier value and report the wrong span.
        start = cursor + len(prefix)
        end = start + len(text)
        if value is not None and text:
            entities.append({"start": start, "end": end, "label": key})

        sentence_parts.append(prefix + text)
        cursor = end

    sentence = separator.join(sentence_parts) + "."
    return {"content": sentence, "entities": entities}

In [11]:
# Minimal hand-written record used to sanity-check build_ner_data.
data = [
    dict(
        content="I want to borrow 500 USD for 6 months.",
        entities=dict(amount="500 USD", duration="6 months"),
    )
]
build_ner_data(data)

[{'content': 'I want to borrow 500 USD for 6 months.',
  'entities': [{'start': 17, 'end': 24, 'label': 'amount'},
   {'start': 29, 'end': 37, 'label': 'duration'}]}]

In [12]:
# Read the exported JSON back as a DataFrame. Judging by the tail() output, the
# frame has one column per service and each row is populated only in its own
# service's column.
entities_json = pd.read_json("../../data/entities.json")

In [13]:
# Peek at the last rows of the reloaded frame.
entities_json.tail()

Unnamed: 0,training_request,performance_review,access_request,relocation_request,safety_incident_report,time_off_report,benefits_enrollment,harassment_report,goal_setting,it_issue_report
545,,,,,,,,,,{'steps_taken_to_resolve': 'restarted my compu...
546,,,,,,,,,,{'affected_device_or_application': 'Outlook em...
547,,,,,,,,,,"{'user_location': 'Los Angeles', 'impact_on_wo..."
548,,,,,,,,,,"{'issue_severity': 'things keep crashing', 'fi..."
549,,,,,,,,,,{'specific_problem_description': 'the app has ...


In [14]:
# Re-display the in-memory parsed results.
# NOTE(review): this prints the whole list again; a slice would keep the output short.
all_results

[{'training_request': {'training_topic': 'machine learning',
   'special_requirements': 'no special accommodations are needed',
   'number_of_participants': 'Three',
   'training_duration': 'two days',
   'budget_constraints': '$5000',
   'desired_training_outcomes': 'better research skills',
   'preferred_training_dates': 'June 15th, 2023',
   'preferred_training_format': 'online seminars',
   'current_skill_level': 'intermediate',
   'contact_information': 'hkim@researchlab.com or 555-1234'}},
 {'training_request': {'number_of_participants': 'Three',
   'preferred_training_dates': '2023-06-15',
   'training_topic': 'machine learning research',
   'budget_constraints': '$10,000',
   'contact_information': 'hkim@researchlab.com, 555-1234',
   'special_requirements': 'no',
   'desired_training_outcomes': 'to improve research skills',
   'current_skill_level': 'intermediate',
   'training_duration': '2 days',
   'preferred_training_format': 'online seminars'}},
 {'training_request': {'sp