## Data extraction

In [1]:
import pandas as pd

# Load the cleaned incident records and collect the distinct values of the
# 'Industry Type' column; later cells iterate over `unique_industry_field`.
incident_data = pd.read_csv('data/cleaned.csv')

industry_column = incident_data['Industry Type']
unique_industry_field = industry_column.unique()

industry_type_count = len(unique_industry_field)
print(f'Number of unique industry type : {industry_type_count}')

Number of unique industry type : 45


In [12]:
import json
import random
import os

from pymongo import MongoClient

from api import *

# Connection settings for the incident store.
# The URI (which embeds credentials) can be overridden through the MONGO_URI
# environment variable so real credentials never have to live in the notebook;
# the fallback keeps the previous local-development value.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://root:root@localhost:27017/")
DATABASE_NAME = "incident_db"
COLLECTION_NAME = "incident_collection"

def get_random_documents_by_industry(industry, num_docs=3):
    """Return up to ``num_docs`` randomly chosen documents for the given industry.

    Args:
        industry: Either the literal string ``'all'`` (no filtering), a single
            industry value, or a list/tuple/set of industry values. Documents
            whose ``industry`` field matches any of the given values are
            eligible.
        num_docs: Maximum number of documents to return (default 3).

    Returns:
        A list of document dicts with the ``_id`` key removed. An empty list
        is returned when nothing matches or when any error occurs (the error
        is printed, not raised).
    """
    # Initialised before the try so the finally clause never touches an
    # unbound name when MongoClient() itself raises.
    client = None
    try:
        client = MongoClient(MONGO_URI)
        db = client[DATABASE_NAME]
        collection = db[COLLECTION_NAME]

        if industry == 'all':
            query_filter = {}
        else:
            # Accept a collection of industries as well as a single value:
            # wrapping a list in another list would make "$in" look for the
            # list itself instead of its members.
            if isinstance(industry, (list, tuple, set)):
                wanted = list(industry)
            else:
                wanted = [industry]
            query_filter = {"industry": {"$in": wanted}}

        # The whole result set is materialised so random.sample can pick from it.
        documents = list(collection.find(query_filter))

        if documents:
            random_documents = random.sample(documents, min(num_docs, len(documents)))
        else:
            random_documents = []

        # Drop the "_id" key from every returned document.
        for document in random_documents:
            document.pop("_id", None)

        return random_documents
    except Exception as e:
        print(f"Error retrieving random documents: {e}")
        return []
    finally:
        if client is not None:
            client.close()

def retrieve_random(data):
    """Replace the 'industries' entry of ``data`` with a sampled 'context' string.

    Reads ``data['industries']`` and the optional ``data['num_docs']``
    (default 3), fetches random documents for them, and stores the documents
    under ``data['context']`` as JSON strings separated by blank lines.
    The 'industries' key is removed. ``data`` is modified in place and
    returned.
    """
    sampled_docs = get_random_documents_by_industry(
        data['industries'],
        data.get('num_docs', 3),
    )

    # One JSON string per document, encoded with the project's custom encoder.
    serialized_docs = [
        json.dumps(entry, cls=CustomJSONEncoder) for entry in sampled_docs
    ]
    data['context'] = "\n\n".join(serialized_docs)

    data.pop("industries")
    return data

# Sample payload carrying the keys retrieve_random() reads ('industries' and
# 'num_docs'). It is only defined here; nothing in this cell passes it on.
data_input = {
    "industries": ["Finance", "Healthcare"],
    "num_docs": 3
}

# Ad-hoc check of the lookup for a single industry name; as the last
# expression of the cell, its return value is what the notebook displays.
get_random_documents_by_industry("Waste storage, treatment and disposal")

[]

In [6]:
import json
import os
import time
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a local .env file into the process environment, then
# read the API key from it. os.getenv returns None when API_KEY is not set.
load_dotenv()
API_KEY = os.getenv("API_KEY")

# Chat client used by generate_question(). The OpenAI-compatible client is
# pointed at the Groq endpoint via openai_api_base.
llm = ChatOpenAI(
    openai_api_base="https://api.groq.com/openai/v1/",
    model="llama-3.3-70b-versatile",
    temperature=1,
    api_key=API_KEY
)

def generate_question(industry):
    """Ask the LLM for incident-management questions about ``industry``.

    The industry name is interpolated into a fixed prompt that requests a
    JSON array of questions. The raw text content of the model's reply is
    returned unparsed; callers are responsible for decoding it.
    """
    question_prompt = f"""
    Generate 3 professional questions about how to resolve common incidents in the '{industry}' industry.
    Your questions should contain a minimal description of the incident, then a question on how to manage the incident.
    Format your response as a json array.
    Only include the questions in your answer, nothing else.
    """
    return llm.invoke(question_prompt).content

# Build one {"industry", "questions"} record per industry. Each industry gets
# up to `max_attempts` tries; a reply that fails to parse as JSON counts as a
# failed attempt. Industries that never succeed are skipped.
industry_questions = []
max_attempts = 3

for industry in unique_industry_field:
    for attempt in range(max_attempts):
        try:
            print(f"Generating question for industry: {industry} (Attempt {attempt + 1})")
            raw_reply = generate_question(industry)
            industry_questions.append(
                {"industry": industry, "questions": json.loads(raw_reply)}
            )
            break
        except Exception as e:
            print(f"Error generating question for {industry} on attempt {attempt + 1}: {e}")
            if attempt + 1 < max_attempts:
                print("Retrying...")
                # Short pause before the next try.
                time.sleep(2)
            else:
                print(f"Failed to generate questions for {industry} after {max_attempts} attempts.")

# Persist everything that was collected.
output_file = "data/industry_questions.json"
with open(output_file, "w") as f:
    json.dump(industry_questions, f, indent=4)

print(f"Questions generated and saved to {output_file}")



Generating question for industry: Processing of metals (Attempt 1)
content='[\n  {\n    "question": "During a metal smelting process, the temperature of the furnace exceeds the optimal range, causing a potential risk of equipment damage. How can this incident be managed to prevent equipment failure?"\n  },\n  {\n    "question": "A batch of metal alloy is found to have excessive impurities, affecting its quality and usability. What steps should be taken to rectify the situation and prevent future occurrences?"\n  },\n  {\n    "question": "A mechanical failure occurs in the metal rolling mill, resulting in a production halt. What procedures should be followed to quickly resolve the issue and minimize downtime?"\n  },\n  {\n    "question": "An employee is exposed to toxic fumes during a metal plating process. What emergency response measures should be implemented to ensure the employee\'s safety and prevent similar incidents?"\n  },\n  {\n    "question": "A shipment of raw materials is de

## Dataset building

In [14]:
import json
import requests
import time

# Endpoint and identifiers sent with every request.
API_URL = "http://localhost:8000/invoke"
USER_ID = "test_user"
CHAT_ID = "test_chat"

INPUT_FILE = "data/industry_questions.json"
OUTPUT_FILE = "data/fine_tuning_data.json"

# Retry policy. Without an upper bound a question that always fails would keep
# the cell spinning forever, and without a timeout a hung request would block it.
MAX_ATTEMPTS = 5
REQUEST_TIMEOUT = 120  # seconds
RETRY_DELAY = 2  # seconds between failed attempts
REQUEST_DELAY = 1  # seconds between questions

with open(INPUT_FILE, "r") as f:
    data = json.load(f)

# Collected {"industry", "question", "response"} records.
output_data = {"data": []}


try:
    for industry_entry in data:
        industry = industry_entry["industry"]
        questions = industry_entry["questions"]

        for i, question_entry in enumerate(questions):
            # Entries are normally {"question": "..."} objects; tolerate a
            # bare string in case the generated array was a list of strings.
            if isinstance(question_entry, str):
                question = question_entry
            else:
                question = question_entry["question"]

            success = False

            for attempt in range(1, MAX_ATTEMPTS + 1):
                print(f"Processing question {i+1}/{len(questions)} for industry '{industry}' (attempt {attempt}): {question}\n")

                payload = {
                    "user_id": USER_ID,
                    "chat_id": CHAT_ID,
                    "question": question,
                    "industries": [industry]
                }

                try:
                    response = requests.post(API_URL, json=payload, timeout=REQUEST_TIMEOUT)
                    response.raise_for_status()
                    response_json = response.json()
                except Exception as e:
                    if attempt < MAX_ATTEMPTS:
                        print(f"Error processing question: {e}. Retrying...\n")
                        time.sleep(RETRY_DELAY)
                    else:
                        print(f"Error processing question: {e}.\n")
                    continue

                output_data["data"].append({
                    "industry": industry,
                    "question": question,
                    "response": response_json
                })
                success = True
                break

            if not success:
                print(f"Failed to process question '{question}' after {attempt} attempts.\n")

            # Small pause between questions.
            time.sleep(REQUEST_DELAY)

except KeyboardInterrupt:
    # Manual interruption still falls through to the save below, so partial
    # results are kept.
    print("Interrupting process, saving...")

with open(OUTPUT_FILE, "w") as f:
    json.dump(output_data, f, indent=4)

print(f"Dataset saved to {OUTPUT_FILE}")


Processing question 1/10 for industry 'Processing of metals' (attempt 1): During a metal smelting process, the temperature of the furnace exceeds the optimal range, causing a potential risk of equipment damage. How can this incident be managed to prevent equipment failure?

Processing question 2/10 for industry 'Processing of metals' (attempt 1): A batch of metal alloy is found to have excessive impurities, affecting its quality and usability. What steps should be taken to rectify the situation and prevent future occurrences?

Processing question 3/10 for industry 'Processing of metals' (attempt 1): A mechanical failure occurs in the metal rolling mill, resulting in a production halt. What procedures should be followed to quickly resolve the issue and minimize downtime?

Processing question 4/10 for industry 'Processing of metals' (attempt 1): An employee is exposed to toxic fumes during a metal plating process. What emergency response measures should be implemented to ensure the emplo

In [15]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

INPUT_FILE = "data/fine_tuning_data.json"
OUTPUT_TRAIN_FILE = "data/finetune_train.json"
OUTPUT_VALIDATION_FILE = "data/finetune_validation.json"

# Fixed seed so re-running the cell produces the same train/validation split.
SPLIT_SEED = 42
VALIDATION_FRACTION = 0.2

try:
    with open(INPUT_FILE, "r") as f:
        data = json.load(f)
    df = pd.DataFrame(data["data"])
    print(f"Loaded {len(df)} records from {INPUT_FILE}.")
except Exception as e:
    print(f"Error loading data from {INPUT_FILE}: {e}")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas an exception just stops this cell.
    raise

# Split the records (as plain dicts) into training and validation sets.
train_data, validation_data = train_test_split(
    df.to_dict(orient="records"),
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED
)

try:
    with open(OUTPUT_TRAIN_FILE, "w") as train_file:
        json.dump(train_data, train_file, indent=4)
    print(f"Training data saved to {OUTPUT_TRAIN_FILE}.")

    with open(OUTPUT_VALIDATION_FILE, "w") as validation_file:
        json.dump(validation_data, validation_file, indent=4)
    print(f"Validation data saved to {OUTPUT_VALIDATION_FILE}.")
except Exception as e:
    print(f"Error saving split data: {e}")


Loaded 450 records from data/fine_tuning_data.json.
Training data saved to data/finetune_train.json.
Validation data saved to data/finetune_validation.json.
