In [1]:
import asyncio
import os
import sys
from pathlib import Path
from typing import Optional

import pandas as pd

parent_dir = os.path.abspath('..')
sys.path.append(parent_dir)

from utils import batch_run
from tseval.utils import load_llms, call_llm_func

In [90]:
# Model used for the step-extraction calls; looked up by name in the mapping
# returned by load_llms().
GPT_MODEL_NAME = 'gpt-4o-mini-2024-07-18'
llm_map = load_llms()
gpt = llm_map[GPT_MODEL_NAME]

# Directories that receive the intermediate stage files and the final RAG inputs.
TEMP_DIR = Path('../data/intermediate')
RAG_INPUT_DIR = Path('../data/rag_inputs')

In [3]:
# ===== Process TechQA Stage 1 =====
# Keep only the questions whose answers contain actionable instructions, using GPT + manual inspection.
# Format the actionable instructions (if present) as an ordered list.
# Output: techqa_stage1.json & techqa_stage1_annotated.json

TECHQA_DIR = Path('../raw_data/techqa')

# Training and dev question/answer files are stacked into one frame.
# 910 in total
techqa_splits = [
    pd.read_json(TECHQA_DIR / 'training_Q_A.json'),
    pd.read_json(TECHQA_DIR / 'dev_Q_A.json'),
]
techqa = pd.concat(techqa_splits).reset_index()

In [4]:
# Rows flagged as answerable ('Y' in the ANSWERABLE column).
# 610 answerable
is_answerable = techqa['ANSWERABLE'].eq('Y')
answerable_techqa = techqa[is_answerable]

In [5]:
# Prompt template for the step-extraction call. `{text}` is the slot that
# receives the answer (see the {'text': answer} argument in extract_action).
# NOTE(review): the doubled braces `{{ }}` suggest str.format-style rendering
# inside call_llm_func, which would turn them into literal braces - confirm.
action_extraction_prompt = """
Analyze the response to the technical support question and determine whether it contains actionable instructions. If actionable instructions are present, extract each step in order and organize it into summary and description fields as follows:
- The `summary` field should be concise, staying as consistent as possible with the original wording. 
- The `description` field should provide details of the step. If no details are provided in the original response, leave the description field empty.
Both fields must be closely based on the original text, without adding any interpretation or inference not present in the response. Each step should be self-contained, meaning that any pronouns or vague references must be expanded to ensure clarity.
If no actionable instructions are present, output an empty JSON object.

Response:
{text}

Output Format:
{{
  "steps": [
    {{
      "summary": "",
      "description": ""
    }}
  ]
}}

If no actionable instructions are included in the response, return:
{{}}
"""

# The template wrapped under the 'template' key; this dict is what gets
# passed to call_llm_func.
action_extraction = {'template': action_extraction_prompt}

In [9]:
def format_steps(data: dict) -> Optional[str]:
    """Render extracted steps as a numbered, indented plain-text list.

    Args:
        data: Parsed LLM output; expected to hold a ``'steps'`` list whose
            items have a ``'summary'`` and an optional ``'description'``.

    Returns:
        One ``"<n>. <summary>"`` line per step, joined by newlines, with each
        description line placed underneath and indented by three spaces.
        ``None`` when ``data`` is empty or contains no steps.

    Raises:
        KeyError: If a step has no ``'summary'`` field.
    """
    if not data:
        return None
    steps = data.get('steps')
    if not steps:
        return None

    items = []
    for number, step in enumerate(steps, start=1):
        item = f"{number}. {step['summary'].strip()}"
        # The prompt allows an empty description; a missing or null one is
        # treated the same way so it never leaves a whitespace-only line.
        desc = (step.get('description') or '').strip()
        if desc:
            # Indent every description line so it sits under the summary text.
            item += ('\n' + desc).replace('\n', '\n   ')
        items.append(item)

    return '\n'.join(items)

async def extract_action(answer: str) -> Optional[str]:
    """Extract the actionable steps from one answer and format them as text.

    Args:
        answer: Answer text substituted into the ``text`` slot of the
            ``action_extraction`` prompt.

    Returns:
        The ordered-list string produced by ``format_steps``, or ``None`` when
        ``format_steps`` finds no steps in the LLM result.
    """
    # call_llm_func yields a pair; only the first element is used here.
    # NOTE(review): presumably the parsed JSON response - confirm in tseval.utils.
    res, _ = await call_llm_func(gpt, action_extraction, {'text': answer})
    return format_steps(res)

In [11]:
# One work item per answerable question: its id plus the answer text to analyze.
inputs = [{'question_id': row['QUESTION_ID'], 'answer': row['ANSWER']} for _, row in answerable_techqa.iterrows()]

def extract_action_sync(data: dict):
    """Blocking wrapper around ``extract_action`` for a single work item.

    Runs the coroutine to completion with ``asyncio.run`` and returns its
    result. Only the ``'answer'`` key of ``data`` is read here.
    """
    return asyncio.run(extract_action(data['answer']))
    
# NOTE(review): batch_run's contract is not visible in this notebook; later
# code iterates outputs.items() as (QUESTION_ID, extracted answer) pairs -
# confirm against utils.batch_run.
outputs = batch_run(extract_action_sync, inputs)

Processing: 100%|█████████████████████████████████████████████████████████████████████████████████████| 610/610 [01:48<00:00,  5.60it/s]


In [49]:
# Technote corpus; indexed later as technotes[doc_id]['title'] and
# technotes[doc_id]['text'] when the reference documents are assembled.
technotes = pd.read_json(TECHQA_DIR / 'training_dev_technotes.json')

In [60]:
# Turn the batch results into a two-column frame keyed by QUESTION_ID.
extracted_answers = pd.DataFrame(
    [{'QUESTION_ID': qid, 'extracted_answer': steps} for qid, steps in outputs.items()]
)


def _reference_doc(doc_id):
    """Return a technote's title and text, separated by a blank line."""
    note = technotes[doc_id]
    return note['title'].strip() + '\n\n' + note['text'].strip()


def _stripped(value):
    """Return ``value`` without leading/trailing whitespace."""
    return value.strip()


merged = answerable_techqa.merge(extracted_answers, on='QUESTION_ID', how='left')
merged['question_id'] = 'TECHQA_' + merged['QUESTION_ID']

# Drop the questions for which no actionable steps were extracted.
has_steps = merged['extracted_answer'].notna()
merged = merged[has_steps].reset_index(drop=True)

titles = merged['QUESTION_TITLE'].apply(_stripped)
bodies = merged['QUESTION_TEXT'].apply(_stripped)
merged['question'] = titles + '\n' + bodies
merged['ground_truth'] = merged['extracted_answer']
merged['reference_doc'] = merged['DOCUMENT'].apply(_reference_doc)

stage1_renames = {
    'ANSWER': 'original_ground_truth',
    'DOCUMENT': 'reference_doc_id',
    'START_OFFSET': 'start_offset',
    'END_OFFSET': 'end_offset',
    'DOC_IDS': 'doc_ids',
}
stage1_columns = [
    'question_id', 'question', 'ground_truth', 'reference_doc',
    'original_ground_truth', 'start_offset', 'end_offset',
    'reference_doc_id', 'doc_ids',
]
df = merged.rename(columns=stage1_renames)
df = df[stage1_columns]

In [61]:
# orient='records' never serializes the index, so no index argument is needed;
# passing index=False here raises ValueError on pandas releases before 2.1.
df.to_json(TEMP_DIR / 'techqa_stage1.json', orient='records', indent=2)

In [None]:
# ===== Process TechQA Stage 2 =====
# Highlight the key facts in the ground truth (Human Annotation)

In [62]:
# Load the annotated stage-1 file. The 'valid' and 'ground_truth_refined'
# columns read below are not among the columns written to techqa_stage1.json,
# so they are presumably added during manual annotation - confirm.
df = pd.read_json(TEMP_DIR / 'techqa_stage1_annotated.json')

In [69]:
# Keep only the rows that carry a value in the 'valid' column.
valid_df = df[df['valid'].notna()].reset_index(drop=True)

# The refined ground truth replaces the automatically extracted one from here on.
stage2_columns = [
    'question_id', 'question', 'ground_truth_refined', 'reference_doc',
    'original_ground_truth', 'start_offset', 'end_offset', 'reference_doc_id', 'doc_ids'
]
valid_df = valid_df[stage2_columns].rename(columns={'ground_truth_refined': 'ground_truth'})

# orient='records' never serializes the index, so no index argument is needed;
# passing index=False here raises ValueError on pandas releases before 2.1.
valid_df.to_json(TEMP_DIR / 'techqa_stage2.json', orient='records', indent=2)

In [95]:
# After annotation, write to inputs for RAG systems
df = pd.read_json(TEMP_DIR / 'techqa_stage2_annotated.json')
rag_input_columns = [
    'question_id', 'question', 'reference_doc',
    'ground_truth_refined', 'start_offset', 'end_offset' # These attributes would not be used by RAG systems
]
valid_df = df[rag_input_columns].rename(columns={'ground_truth_refined': 'ground_truth'})

# orient='records' never serializes the index, so no index argument is needed;
# passing index=False here raises ValueError on pandas releases before 2.1.
valid_df.to_json(RAG_INPUT_DIR / 'techqa.json', orient='records', indent=2)

In [None]:
# ===== Process TechQA Stage 3 =====
# Generate answers for each question via 3 RAG systems

In [87]:
import re
import html
import markdown

def markdown_to_html(text):
    """Convert Markdown-style text to HTML, keeping single newlines as line breaks.

    Two spaces are inserted before every newline that directly follows a
    non-newline character (Markdown's hard line break), the result is
    HTML-escaped, and the escaped text is rendered with ``markdown.markdown``.
    """
    with_hard_breaks = re.sub(r'([^\n])\n', r'\1  \n', text)
    escaped = html.escape(with_hard_breaks)
    return markdown.markdown(escaped)

In [88]:
# Reload the annotated stage-2 file as the starting point for stage 3.
df = pd.read_json(TEMP_DIR / 'techqa_stage2_annotated.json')

In [89]:
stage3_columns = [
    'question_id', 'question', 'ground_truth_refined', 'reference_doc',
    'original_ground_truth', 'start_offset', 'end_offset', 'reference_doc_id', 'doc_ids'
]
valid_df = df[stage3_columns].rename(columns={'ground_truth_refined': 'ground_truth'})

# Store an HTML rendering of the ground truth next to the plain-text version.
valid_df['ground_truth_html'] = valid_df['ground_truth'].apply(markdown_to_html)

# orient='records' never serializes the index, so no index argument is needed;
# passing index=False here raises ValueError on pandas releases before 2.1.
valid_df.to_json(TEMP_DIR / 'techqa_stage3.json', orient='records', indent=2)