In [None]:
from src.load_data import load_dataframe_from_s3
from src.helpers import load_config_from_yaml
import pandas as pd
from dotenv import load_dotenv
import os

from src.text_preprocessor import TextPreprocessor
from src.prompt_builder import PromptBuilder
from src.model_request import ModelRequest
from src.extractor_pipeline import ExtractorPipeline
from src.custom_logging import setup_logging
import config.pipeline_config as conf
import config.validation_config as vconf

In [None]:
# Path to the YAML config file; later cells also pass it to ExtractorPipeline.
conf_file_path = "./config/local.yaml"
# Load config
yaml_conf = load_config_from_yaml(file_path=conf_file_path)

# Fail fast with a clear message if a required key is absent or empty.
# Without this check `.get()` would quietly return None and the S3 loader
# would be called with None arguments, producing a far less helpful error.
missing_keys = [key for key in ("BUCKET_NAME", "BATCH1") if not yaml_conf.get(key)]
if missing_keys:
    raise KeyError(
        f"Missing required config key(s) {missing_keys} in '{conf_file_path}'"
    )

# Get bucket name and data name from config.
bucket_name = yaml_conf.get("BUCKET_NAME")
data_name = yaml_conf.get("BATCH1")

# Load in the record table; only the first row is kept (`.head(1)`).
records = load_dataframe_from_s3(bucket_name, data_name).head(1)

# Load variables from a .env file into the process environment.
# NOTE(review): this runs after the S3 load, so anything the loader needs from
# .env would not be set yet on a fresh kernel -- confirm the ordering is intended.
load_dotenv()

In [None]:
# Render the loaded record table so the input to the model runs can be inspected.
display(records)

## Mistral 7B Instruct:

In [None]:
# Configure logging from the pipeline config before any pipeline object is built.
setup_logging(
    enable_console=conf.log_enable_console,
    enable_file=conf.log_enable_file,
    console_log_level=conf.console_log_level,
    log_dir=conf.log_dir,
)

# Model identifier and the inference arguments sent with each request.
mistral_model_id = "mistral.mistral-7b-instruct-v0:2"
mistral_model_args = {
    "max_tokens": 200,
    "temperature": 0,
    "top_p": 0.9,
    "top_k": 50,
}

preprocessor = TextPreprocessor()

# Prompt is built from the layout and accepted values in the pipeline config.
# Alternative kept for reference (prompt looked up by id instead of a layout):
#   PromptBuilder(prompt_id=os.getenv("CME_PROMPT_ID"), prompt_version=5,
#                 accepted_values=conf.accepted_values)
prompter = PromptBuilder(
    model_id=mistral_model_id,
    prompt_layout=conf.prompt_layout,
    accepted_values=conf.accepted_values,
)

mistral_requester = ModelRequest(mistral_model_id, mistral_model_args, prompter)

# Wire preprocessor, requester and validation schema together, then run the
# pipeline on the record table loaded earlier in the notebook.
extractor_pipeline = ExtractorPipeline(
    config_file_path=conf_file_path,
    preprocessor=preprocessor,
    model_request=mistral_requester,
    valid_structure=vconf.ValidSchema,
)
output_df = extractor_pipeline.run(df=records)

In [None]:
# Show the value returned by the Mistral pipeline run.
output_df

## Llama 3 8B Instruct:

In [None]:
# Configure logging from the pipeline config before any pipeline object is built.
setup_logging(
    enable_console=conf.log_enable_console,
    enable_file=conf.log_enable_file,
    console_log_level=conf.console_log_level,
    log_dir=conf.log_dir,
)

# Model identifier and the inference arguments sent with each request.
llama_model_id = "meta.llama3-8b-instruct-v1:0"
llama_model_args = {
    "max_gen_len": 150,
    "temperature": 0,
    "top_p": 0.5,
}

preprocessor = TextPreprocessor()

# Prompt is built from the layout and accepted values in the pipeline config.
# Alternative kept for reference (prompt looked up by id instead of a layout):
#   PromptBuilder(prompt_id=os.getenv("CME_PROMPT_ID"), prompt_version=5,
#                 accepted_values=conf.accepted_values)
prompter = PromptBuilder(
    model_id=llama_model_id,
    prompt_layout=conf.prompt_layout,
    accepted_values=conf.accepted_values,
)

llama_requester = ModelRequest(
    llama_model_id,
    llama_model_args,
    prompter,
)

# Wire preprocessor, requester and validation schema together, then run the
# pipeline on the record table loaded earlier in the notebook.
extractor_pipeline = ExtractorPipeline(
    config_file_path=conf_file_path,
    preprocessor=preprocessor,
    model_request=llama_requester,
    valid_structure=vconf.ValidSchema,
)
output_df2 = extractor_pipeline.run(df=records)

In [None]:
# Show the value returned by the Llama pipeline run.
output_df2

## Claude 3 Haiku or 3.7 Sonnet:

In [None]:
# Configure logging from the pipeline config before any pipeline object is built.
setup_logging(
    enable_console=conf.log_enable_console,
    enable_file=conf.log_enable_file,
    console_log_level=conf.console_log_level,
    log_dir=conf.log_dir,
)

# Model identifier; swap in the commented line to run Claude 3.7 Sonnet instead.
claude_model_id = "anthropic.claude-3-haiku-20240307-v1:0"
# claude_model_id = "anthropic.claude-3-7-sonnet-20250219-v1:0"

# Inference arguments sent with each request.
claude_model_args = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 512,
    "temperature": 0,
}

preprocessor = TextPreprocessor()

# Prompt is built from the layout and accepted values in the pipeline config.
# Alternative kept for reference (prompt looked up by id instead of a layout):
#   PromptBuilder(prompt_id=os.getenv("CME_PROMPT_ID"), prompt_version=5,
#                 accepted_values=conf.accepted_values)
prompter = PromptBuilder(
    model_id=claude_model_id,
    prompt_layout=conf.prompt_layout,
    accepted_values=conf.accepted_values,
)

claude_requester = ModelRequest(
    claude_model_id,
    claude_model_args,
    prompter,
)

# Wire preprocessor, requester and validation schema together, then run the
# pipeline on the record table loaded earlier in the notebook.
extractor_pipeline = ExtractorPipeline(
    config_file_path=conf_file_path,
    preprocessor=preprocessor,
    model_request=claude_requester,
    valid_structure=vconf.ValidSchema,
)
output_df3 = extractor_pipeline.run(df=records)

In [None]:
# Show the value returned by the Claude pipeline run.
output_df3