In [None]:
import boto3
import os
import pandas as pd
import json
from langchain_community.chat_models import BedrockChat


# Resolve the AWS profile name from the environment (None when the variable is unset)
# and open a Bedrock runtime client pinned to us-east-1.
aws_profile_name = os.environ.get('AWS_PROFILE_NAME')
session = boto3.Session(profile_name=aws_profile_name)
bedrock_client = session.client(service_name="bedrock-runtime", region_name="us-east-1")


def parse_metadata(metadata_str):
    """Parse a metadata value into a JSON-serialisable Python object.

    The value is tried, in order, as:
      1. strict JSON;
      2. a Python literal (e.g. the ``repr`` of a dict, which uses single quotes
         and ``True``/``False``/``None``);
      3. JSON after swapping single quotes for double quotes (legacy behaviour,
         kept as a last resort for mixed input such as ``{'a': true}``).

    Args:
        metadata_str: The metadata to parse. Normally a string; a ``dict`` is
            returned unchanged, and any other non-string value (``None``,
            ``NaN``, ...) is treated as unparseable.

    Returns:
        The parsed object, or ``{}`` when nothing could be parsed.
    """
    import ast  # local import: only this function needs it

    if isinstance(metadata_str, dict):
        return metadata_str
    if not isinstance(metadata_str, str):
        return {}

    # 1. Already valid JSON: parse as-is so apostrophes inside values survive.
    try:
        return json.loads(metadata_str)
    except json.JSONDecodeError:
        pass

    # 2. Python literal. The json.dumps probe rejects results that cannot be
    #    re-serialised to JSON (sets, bytes, tuple keys, ...).
    try:
        literal = ast.literal_eval(metadata_str)
        json.dumps(literal)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        literal = None
    if isinstance(literal, (dict, list)):
        return literal

    # 3. Legacy fallback: naive quote replacement.
    try:
        return json.loads(metadata_str.replace("'", "\""))
    except json.JSONDecodeError:
        return {}


def generate_prompt_elements(chunk_text, metadata, source):
    """Send one text chunk to Claude 3 Haiku on Bedrock and package the reply.

    Args:
        chunk_text: The text chunk to build the prompt around.
        metadata: Raw metadata for the chunk; it is passed through
            ``parse_metadata`` and re-serialised with ``json.dumps``.
        source: Identifier of where the chunk came from.

    Returns:
        A dict with keys ``instruction`` (a fixed question), ``input`` (text,
        source and metadata joined by newlines) and ``output`` (the model's
        reply content). If anything raises while calling the model, the error
        is printed and all three values are ``None``.
    """
    chat_model = BedrockChat(
        model_id="anthropic.claude-3-haiku-20240307-v1:0",
        region_name="us-east-1",
        client=bedrock_client,
    )
    # Round-trip through the parser so the prompt embeds a JSON rendering of the metadata.
    metadata_json = json.dumps(parse_metadata(metadata))

    request_text = f"""Below you will find a text, its metadata, and its data source. Based on this information:
    - Formulate an instruction a developer might ask where this text could serve as an answer raw text without markdown.
    - Provide any relevant input that might be needed alongside the instruction for context raw text without markdown.
    - Rewrite the given text and metadata as a concise answer or solution raw text without markdown.
    - return your response as a dictionary with those 3 fields, return the raw dictionary no line jumps, each key and value of the dictionary must be in double quotation marks.
    Text: '{chunk_text}'
    Metadata: '{metadata_json}'
    Source: '{source}'"""

    try:
        reply = chat_model.invoke([{"role": "user", "content": request_text}])
        return dict(
            instruction='What does this information tell us about the system?',
            input=f"Text: {chunk_text}\nSource: {source}\nMetadata: {metadata_json}",
            output=reply.content,
        )
    except Exception as e:
        # Best effort: report the failure and hand back an all-None record.
        print(f"Error during API call: {e}")
        return dict.fromkeys(('instruction', 'input', 'output'))

prompt_response_pairs = []

# Query the model once per row of `df`. NOTE(review): `df` is not defined in the
# visible cells; it is presumably loaded by an earlier cell and must provide
# 'page_content', 'metadata' and 'source' columns.
for index, row in df.iterrows():
    prompt_elements = generate_prompt_elements(row['page_content'], row['metadata'], row['source'])
    if prompt_elements['instruction']:  # failed API calls come back with instruction=None; skip them
        prompt_response_pairs.append(prompt_elements)

# Build the result frame. Naming the columns explicitly keeps the schema stable even
# when every call failed: an empty list would otherwise give a frame with no columns,
# and a later new_df['output'] lookup would raise KeyError.
new_df = pd.DataFrame(prompt_response_pairs, columns=['instruction', 'input', 'output'])



# Save the csv
# new_df.to_csv("structured_instruct_input_output_pairs.csv", index=False)


In [None]:
import pandas as pd
import json

def clean_and_parse_json(data):
    """Parse a model reply that is expected to be a JSON object.

    Newlines are stripped before parsing. If the parsed object is a dict with
    a top-level ``response`` key and no ``answer`` key, that key is renamed to
    ``answer``. Only the key is renamed: the word "response" inside field
    values is left untouched.

    Args:
        data: The raw reply text.

    Returns:
        The parsed JSON value, or ``None`` when ``data`` is not a string or is
        not valid JSON (a message is printed in both cases).
    """
    if not isinstance(data, str):
        print(f"Error decoding JSON: expected a string, got {type(data).__name__}")
        return None

    try:
        parsed = json.loads(data.replace('\n', ''))
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

    # Normalise the key name after parsing so the substitution cannot corrupt values.
    if isinstance(parsed, dict) and 'response' in parsed and 'answer' not in parsed:
        parsed['answer'] = parsed.pop('response')

    return parsed


# Decode every raw model reply and store the result next to it.
new_df['parsed_output'] = new_df['output'].apply(clean_and_parse_json)

# Pull the three training fields out of each decoded reply; rows whose decoded
# value is falsy (e.g. None) get None in every field.
parsed_replies = new_df['parsed_output']
structured_new_df = pd.DataFrame({
    field: parsed_replies.apply(lambda reply, key=field: None if not reply else reply.get(key))
    for field in ('instruction', 'input', 'answer')
})

# Persist the structured pairs without the index column.
structured_new_df.to_csv("../training_data/real_structured_instruct_input_output_pairs.csv", index=False)