In [1]:
###2.21: added parameters to avoid calling the API###

import os
import re
import glob
import fitz  # PyMuPDF
import json
import pandas as pd
from datetime import datetime
from pprint import pprint
from dotenv import load_dotenv
import tiktoken  # pip install tiktoken

# Load environment variables (which should include api_key and grok_api_key)
# from a .env file so the os.getenv() lookups below can find them.
load_dotenv()

# === API Call Control Parameters ===
# Set each to 'Y' to call the API, or 'N' to skip the API call.
# process_pdf_additional compares each flag against the literal "Y"; when a
# model is skipped its summary is stored as an empty string.
CALL_GPT_3_5 = "Y"
CALL_O1_MINI = "N"
CALL_XAI = "N"

# Create OpenAI client for standard models
# (used by call_gpt_3_5_turbo and call_o1_mini).
from openai import OpenAI
api_key = os.getenv("api_key")
client = OpenAI(api_key=api_key)

# Create a separate client for the xAI API
# (same SDK class pointed at the x.ai base URL; used by call_xai).
# NOTE(review): both clients are built even when the matching CALL_* flag is "N".
XAI_API_KEY = os.getenv("grok_api_key")
xai_client = OpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")

# Define the download week as a string and convert it to a datetime object.
# The string also names the input folder and the output files further down;
# the datetime is stored in every result record as "download_week".
date = "2025-02-24"
download_week_dt = datetime.strptime(date, "%Y-%m-%d")

# --- Helper functions for token counting and truncation using tiktoken ---

def count_tokens(text, model="gpt-3.5-turbo"):
    """
    Count how many tiktoken tokens `text` encodes to.

    The encoding registered for `model` is used; if that lookup fails for any
    reason the "cl100k_base" encoding is used instead.
    """
    try:
        codec = tiktoken.encoding_for_model(model)
    except Exception:
        codec = tiktoken.get_encoding("cl100k_base")
    return len(codec.encode(text))

def truncate_text(text, max_tokens, model="gpt-3.5-turbo"):
    """
    Cut `text` down to its first `max_tokens` tokens.

    The text is encoded with the tiktoken encoding for `model` (falling back
    to "cl100k_base" when the lookup fails). Text that already fits is
    returned unchanged; otherwise the leading tokens are decoded back to a
    string.
    """
    try:
        codec = tiktoken.encoding_for_model(model)
    except Exception:
        codec = tiktoken.get_encoding("cl100k_base")
    token_ids = codec.encode(text)
    if len(token_ids) <= max_tokens:
        return text
    return codec.decode(token_ids[:max_tokens])

# --- API call functions ---

def call_gpt_3_5_turbo(full_text):
    """
    Request a rate-filing summary of `full_text` from gpt-3.5-turbo.

    The completion is capped at 300 tokens. The summary is printed and
    returned; on any exception the error is printed and "" is returned.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            max_tokens=300,
            messages=[
                {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
                {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
            ],
        )
        summary = response.choices[0].message.content
        print("GPT-3.5 Summary:", summary)
    except Exception as err:
        print(f"Error calling GPT-3.5-turbo: {err}")
        return ""
    return summary

def call_o1_mini(full_text):
    """
    Request a rate-filing summary of `full_text` from o1-mini.

    Both the analyst instruction and the filing prompt are sent with the
    "user" role. The summary is printed and returned; on any exception the
    error is printed and "" is returned.
    """
    try:
        response = client.chat.completions.create(
            model="o1-mini",
            messages=[
                {"role": "user", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
                {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
            ],
        )
        summary = response.choices[0].message.content
        print("o1-mini Summary:", summary)
    except Exception as err:
        print(f"Error calling o1-mini: {err}")
        return ""
    return summary

def call_xai(full_text):
    """
    Request a rate-filing summary of `full_text` from grok-2-latest via the
    xAI client.

    The summary is printed and returned; on any exception the error is
    printed and "" is returned.
    """
    try:
        response = xai_client.chat.completions.create(
            model="grok-2-latest",
            messages=[
                {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
                {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
            ],
        )
        summary = response.choices[0].message.content
        print("xAI Summary:", summary)
    except Exception as err:
        print(f"Error calling xAI: {err}")
        return ""
    return summary

# --- Extraction functions for non-API parts ---

def extract_full_text(pdf_path, max_pages=None):
    """
    Extracts full text from PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        max_pages: If provided, only the first `max_pages` pages are read;
            None reads every page.

    Returns:
        The text of each page read, each followed by a newline, concatenated
        in page order ("" when no page is read).
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for i, page in enumerate(doc):
            if max_pages is not None and i >= max_pages:
                break
            page_texts.append(page.get_text())
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()
    # Join once instead of growing a string page by page.
    return "".join(page_text + "\n" for page_text in page_texts)

def split_list_objections(text):
    """
    Split a numbered list ("1.", "2)", "(3)" at the start of a line) into
    separate objections.

    Returns a dict mapping "objection<N>" to the stripped text that follows
    marker N. Text before the first marker is discarded, and a repeated
    number overwrites the earlier entry. When no marker is found the whole
    stripped text is returned as "objection1".
    """
    pieces = re.split(r"(?m)^\s*(?:\(?(\d+)\)?[.\)])\s*", text)
    # re.split with one capture group yields [lead, num, body, num, body, ...].
    if len(pieces) < 3:
        return {"objection1": text.strip()}
    numbered = {}
    for number, content in zip(pieces[1::2], pieces[2::2]):
        numbered[f"objection{number.strip()}"] = content.strip()
    return numbered

def parse_objection_letters(text):
    """
    Splits the full text into separate objection letter sections.

    A letter is recognised (case-insensitively) as the sequence
    "Objection Letter" ... "Objection Letter Date <MM/DD/YYYY>" ...
    "Introduction:" ... "Conclusion:". The text between "Introduction:" and
    "Conclusion:" is the letter body.

    The markers are located with successive forward searches rather than one
    lazy DOTALL regex. Each search picks the earliest marker after the
    previous one, which is the same span the single regex selects, but the
    scan stays linear on large documents instead of backtracking.

    Returns:
        A dict keyed "Objection_letter<idx>" (1-based, in document order).
        Each value holds "objection_letter_date" plus one "objection<N>"
        entry per objection found in the body.
    """
    flags = re.IGNORECASE
    head_re = re.compile(r"Objection Letter", flags)
    date_re = re.compile(r"Objection Letter Date\s+(\d{2}/\d{2}/\d{4})", flags)
    intro_re = re.compile(r"Introduction:", flags)
    conclusion_re = re.compile(r"Conclusion:", flags)

    letter_dict = {}
    idx = 0
    pos = 0
    while True:
        # If any marker is missing from here on, no later letter can be
        # complete either, so the scan stops.
        head = head_re.search(text, pos)
        date_m = head and date_re.search(text, head.end())
        intro = date_m and intro_re.search(text, date_m.end())
        conclusion = intro and conclusion_re.search(text, intro.end())
        if not conclusion:
            break
        pos = conclusion.end()
        idx += 1

        letter_date = date_m.group(1).strip()
        body = text[intro.end():conclusion.start()].strip()
        explicit_objs = list(re.finditer(r"Objection\s+(\d+)", body, flags=re.IGNORECASE))
        if len(explicit_objs) > 1:
            # Several "Objection N" headings: cut the body at each heading.
            obj_pattern = r"(Objection\s+(\d+).*?)(?=(Objection\s+\d+|Conclusion:|$))"
            objections = {}
            for om in re.finditer(obj_pattern, body, flags=re.IGNORECASE | re.DOTALL):
                num = om.group(2).strip()
                obj_text = om.group(1).strip()
                objections[f"objection{num}"] = obj_text
        else:
            if re.search(r"(?m)^\s*(\(?\d+\)?[.\)])", body):
                # Numbered list ("1.", "2)", "(3)") without explicit headings.
                objections = split_list_objections(body)
            else:
                objections = {"objection1": body}
        letter_dict[f"Objection_letter{idx}"] = {
            "objection_letter_date": letter_date,
            **objections
        }
    return letter_dict

def extract_filing_company_info(text):
    """
    Pull the company and group codes out of the filing text.

    Returns a tuple (group_code, company_codes): the first "Group Code:"
    number as an int (None when absent) and every "CoCode:" number as a list
    of ints in order of appearance. Labels are matched case-insensitively.
    """
    co_codes = [int(digits) for digits in re.findall(r"CoCode:\s*(\d+)", text, flags=re.IGNORECASE)]
    group_hits = re.findall(r"Group Code:\s*(\d+)", text, flags=re.IGNORECASE)
    if group_hits:
        first_group = int(group_hits[0])
    else:
        first_group = None
    return first_group, co_codes

# --- Extraction functions for additional filing info ---

def parse_filing_at_a_glance(text):
    """
    Read the labelled fields of the "Filing at a Glance" section.

    The section runs from the heading to the next line of the form
    "<Word> Information" (or to the end of the text). Returns a dict with
    keys serff_tr_num, filing_type, date_submitted,
    effective_date_requested_new, effective_date_requested_renewal,
    serff_status and toi; a field that is not found maps to "".
    """
    glance = re.search(r"Filing at a Glance(.*?)(?=\n[A-Z][a-z]+ Information|$)", text, flags=re.IGNORECASE | re.DOTALL)
    section = glance.group(1) if glance else ""

    # (result key, pattern whose first group captures the value)
    field_patterns = (
        ("serff_tr_num", r"SERFF Tr Num:\s*([A-Z0-9\-]+)"),
        ("filing_type", r"Filing Type:\s*([^\n]+)"),
        ("date_submitted", r"Date Submitted:\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_new", r"Effective Date\s*Requested\s*\(New\):\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_renewal", r"Effective Date\s*Requested\s*\(Renewal\):\s*(\d{2}/\d{2}/\d{4})"),
        ("serff_status", r"SERFF Status:\s*([^\n]+)"),
        ("toi", r"TOI:\s*([^\n]+)"),
    )
    info = {}
    for key, pattern in field_patterns:
        hit = re.search(pattern, section, flags=re.IGNORECASE)
        info[key] = hit.group(1).strip() if hit else ""
    return info

def parse_filing_description(text):
    """
    Return the text that follows "Filing Description:" up to the next
    "Filing Contact Information" or "Filing Company Information" heading,
    stripped of surrounding whitespace. Returns "" when no such span exists.
    """
    found = re.search(
        r"Filing Description:\s*(.*?)\s*(Filing Contact Information|Filing Company Information)",
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return found.group(1).strip() if found else ""

# --- API Call Functions (as defined above) ---
# (call_gpt_3_5_turbo, call_o1_mini, call_xai already defined above)

# --- Processing functions for additional filing info ---

def extract_full_text_all(pdf_path):
    """
    Return the text of the PDF's first 3000 pages (all pages for shorter
    files). The page cap bounds the work done on extremely large PDFs.
    """
    page_cap = 3000
    return extract_full_text(pdf_path, max_pages=page_cap)

def process_pdf_additional(pdf_path):
    """
    Processes a single PDF file and extracts additional filing info.
    For non-API extraction, only the first 100 pages are used.
    For API calls, the full text (subject to token constraints) is used.

    GPT-3.5-turbo is called only when its flag is "Y" and the document leaves
    room in the model's context window for the prompt wrapper and the
    completion. o1-mini and xAI receive the text truncated to 75000 tokens.

    Returns a dictionary with keys:
      serff_tr_num, filing_type, date_submitted, effective_date_requested_new,
      effective_date_requested_renewal, serff_status, filing_description, filing_method,
      toi, group_code, company_codes, download_week, gpt_3_5_summary, o1_mini_summary, xai_summary.
    A summary is "" when its model was skipped or the call failed.
    """
    # The 16385-token window is shared by prompt and completion.
    # call_gpt_3_5_turbo requests max_tokens=300 and wraps the text in a short
    # system/user prompt, so that headroom has to stay free or the request is
    # rejected for exceeding the context length.
    gpt_3_5_context_window = 16385
    gpt_3_5_reserved_tokens = 300 + 100  # completion budget + prompt overhead (estimate)
    long_model_token_cap = 75000

    # Use first 100 pages for non-API info
    limited_text = extract_full_text(pdf_path, max_pages=100)
    filing_at_a_glance = parse_filing_at_a_glance(limited_text)
    filing_description = parse_filing_description(limited_text)
    group_code, company_codes = extract_filing_company_info(limited_text)

    # For API calls, use the full text from up to 3000 pages.
    full_text_all = extract_full_text_all(pdf_path)
    token_count = count_tokens(full_text_all, model="gpt-3.5-turbo")
    print(f"Token count for API call in {os.path.basename(pdf_path)}: {token_count}")

    gpt_3_5_summary = ""
    if CALL_GPT_3_5 == "Y" and token_count < gpt_3_5_context_window - gpt_3_5_reserved_tokens:
        gpt_3_5_summary = call_gpt_3_5_turbo(full_text_all)

    o1_mini_summary = ""
    xai_summary = ""
    if CALL_O1_MINI == "Y" or CALL_XAI == "Y":
        # Truncation re-encodes the whole text, so do it only when a model
        # will actually receive it.
        if token_count < long_model_token_cap:
            api_text = full_text_all
        else:
            api_text = truncate_text(full_text_all, max_tokens=long_model_token_cap, model="gpt-3.5-turbo")
        if CALL_O1_MINI == "Y":
            o1_mini_summary = call_o1_mini(api_text)
        if CALL_XAI == "Y":
            xai_summary = call_xai(api_text)

    additional = {
        "serff_tr_num": filing_at_a_glance.get("serff_tr_num", ""),
        "filing_type": filing_at_a_glance.get("filing_type", ""),
        "date_submitted": filing_at_a_glance.get("date_submitted", ""),
        "effective_date_requested_new": filing_at_a_glance.get("effective_date_requested_new", ""),
        "effective_date_requested_renewal": filing_at_a_glance.get("effective_date_requested_renewal", ""),
        "serff_status": filing_at_a_glance.get("serff_status", ""),
        "filing_description": filing_description,
        "filing_method": "",  # Not extracted in this version.
        "toi": filing_at_a_glance.get("toi", ""),
        "group_code": group_code,
        "company_codes": company_codes,
        "download_week": download_week_dt,
        "gpt_3_5_summary": gpt_3_5_summary,
        "o1_mini_summary": o1_mini_summary,
        "xai_summary": xai_summary
    }
    print(f"Processed additional filing info for: {os.path.basename(pdf_path)}")
    return additional

def process_all_pdfs_additional(pdf_folder):
    """
    Run process_pdf_additional on every "*.pdf" directly inside `pdf_folder`.

    Returns a dict keyed by each file's base name without extension.
    """
    pattern = os.path.join(pdf_folder, "*.pdf")
    return {
        os.path.splitext(os.path.basename(path))[0]: process_pdf_additional(path)
        for path in glob.glob(pattern)
    }

def additional_to_dataframe(results):
    """
    Flatten the per-file additional filing info into a DataFrame.

    One row is produced per entry of `results`; the file key itself is not
    written to any column. Missing values default to "" (None for
    "week_date"), and the company code list is stored as its string form.
    """
    # (output column, source key, default when the key is absent), in column order.
    field_map = [
        ("SERFF Tr Num", "serff_tr_num", ""),
        ("Filing Type", "filing_type", ""),
        ("Date Submitted", "date_submitted", ""),
        ("Effective Date Requested (New)", "effective_date_requested_new", ""),
        ("Effective Date Requested (Renewal)", "effective_date_requested_renewal", ""),
        ("SERFF Status", "serff_status", ""),
        ("Filing Description", "filing_description", ""),
        ("Filing Method", "filing_method", ""),
        ("TOI", "toi", ""),
        ("Group_Code", "group_code", ""),
        ("Company_Codes", "company_codes", ""),
        ("week_date", "download_week", None),
        ("gpt_3_5_summary", "gpt_3_5_summary", ""),
        ("o1_mini_summary", "o1_mini_summary", ""),
        ("xai_summary", "xai_summary", ""),
    ]
    records = []
    for info in results.values():
        record = {column: info.get(key, default) for column, key, default in field_map}
        record["Company_Codes"] = str(record["Company_Codes"])
        records.append(record)
    return pd.DataFrame(records, columns=[column for column, _, _ in field_map])

# --- DataFrame for objections (unchanged from before) ---
def results_to_dataframe_objections(results):
    """
    Flatten parsed objection letters into one DataFrame row per objection.

    For each file the shared columns (filing type, dates, status, state
    prefix taken from the file key, group/company codes, download week) are
    repeated on every row. A file with no "objections" entry contributes a
    single row whose objection columns read "No objections".
    """
    columns = [
        "SERFF#", "Filing Type", "Date Submitted", "SERFF Status", "State",
        "Group_Code", "Company_Code", "Company_Code_all", "Objection_letter_date",
        "Objectionletter#", "Objection#", "Objection", "download_week"
    ]
    placeholder = "No objections"
    records = []
    for file_key, file_info in results.items():
        codes = file_info.get("company_codes", [])
        # Columns that are identical for every objection of this file.
        shared = {
            "SERFF#": file_key,
            "Filing Type": file_info.get("filing_type", ""),
            "Date Submitted": file_info.get("date_submitted", ""),
            "SERFF Status": file_info.get("serff_status", ""),
            "State": file_key.split('_')[0] if "_" in file_key else file_key[:2],
            "Group_Code": file_info.get("group_code", placeholder),
            "Company_Code": codes[0] if codes else placeholder,
            "Company_Code_all": str(codes) if codes else placeholder,
            "download_week": file_info.get("download_week", None),
        }
        letters = file_info.get("objections", {})
        if not letters:
            records.append({
                **shared,
                "Objection_letter_date": placeholder,
                "Objectionletter#": placeholder,
                "Objection#": placeholder,
                "Objection": placeholder,
            })
            continue
        for letter_key, letter in letters.items():
            letter_date = letter.get("objection_letter_date", placeholder)
            for obj_key, obj_text in letter.items():
                # The date shares the dict with the objections; it is not one.
                if obj_key != "objection_letter_date":
                    records.append({
                        **shared,
                        "Objection_letter_date": letter_date,
                        "Objectionletter#": letter_key,
                        "Objection#": obj_key,
                        "Objection": obj_text.replace("\n", " ").strip(),
                    })
    return pd.DataFrame(records, columns=columns)

# --- Main Processing ---

pdf_folder = f"Filings/Final/{date}"  # Folder path based on date parameter


def _json_default(o):
    """Serialize datetimes as ISO-8601 strings; reject any other type explicitly."""
    if isinstance(o, datetime):
        return o.isoformat()
    # Returning `o` unchanged would make json re-encode the same object and
    # fail with a misleading "Circular reference detected" error.
    raise TypeError(f"Object of type {type(o).__name__} is not JSON serializable")


# Both outputs are built from process_all_pdfs_additional, so run it once and
# share the result. Running it twice repeats every PDF extraction and every
# enabled API call.
additional_results = process_all_pdfs_additional(pdf_folder)

# Process objections
# NOTE(review): process_all_pdfs_additional never sets an "objections" key, so
# results_to_dataframe_objections emits one "No objections" row per filing and
# parse_objection_letters is never invoked. Confirm whether that is intended.
objection_results = additional_results
json_output_path = f"objections_output_{date}.json"
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(objection_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved objections JSON output to {json_output_path}")

df_objections = results_to_dataframe_objections(objection_results)
csv_output_path = f"objections_output_{date}.csv"
df_objections.to_csv(csv_output_path, index=False)
print(f"Saved objections CSV output to {csv_output_path}")

# Process additional filing info.
additional_json_output_path = f"additional_filing_info_{date}.json"
with open(additional_json_output_path, "w", encoding="utf-8") as f:
    json.dump(additional_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved additional filing info JSON output to {additional_json_output_path}")

df_additional = additional_to_dataframe(additional_results)
additional_csv_output_path = f"additional_filing_info_{date}.csv"
df_additional.to_csv(additional_csv_output_path, index=False)
print(f"Saved additional filing info CSV output to {additional_csv_output_path}")


Token count for API call in AR_LBPM-134402349.pdf: 6626
GPT-3.5 Summary: The rate filing submitted by American Economy Insurance Company for their Homeowners product in Arkansas includes a decrease in overall rates of 1.1%. The filing was submitted on 02/11/2025 and the effective date requested for the new rates is 03/06/2025.

The filing involves proposed revisions to their Homeowners program, specifically related to the Home Rating Manual and the Home Tiering Manual. The proposed changes include introducing a Multi-Policy Discount and a Peril Offset.

The filing was reviewed by Robert Lively and Matthew Rowland, with Becky Harrington being the primary reviewer. The disposition date was on 02/18/2025, and the filing was marked as Filed on that date. The SERFF Status is Closed-Filed. The company tracking number for this filing is 25-AR-S-HO-R-HMR 2.5-ASR.

The filing required a fee of $100.00, and it was processed on 02/11/2025. The objection letter was submitted on 02/12/2025, and the

In [1]:
### Updated 3.3 so that the objection files are no longer the same as the additional info files. ###
### Note (3.4): this code is not usable yet because the objections output step takes too long to run.

In [2]:
import os
import re
import glob
import fitz  # PyMuPDF
import json
import pandas as pd
from datetime import datetime
from pprint import pprint
from dotenv import load_dotenv
import tiktoken  # pip install tiktoken

# Load environment variables (which should include api_key and grok_api_key)
# from a .env file so the os.getenv() lookups below can find them.
load_dotenv()

# === API Call Control Parameters for additional filing info (not used for objections) ===
# Set each to 'Y' to call the API, or 'N' to skip the API call.
# process_pdf_additional compares each flag against the literal "Y"; when a
# model is skipped its summary is stored as an empty string.
CALL_GPT_3_5 = "Y"
CALL_O1_MINI = "N"
CALL_XAI = "N"

# Create OpenAI client for standard models
# (used by call_gpt_3_5_turbo and call_o1_mini).
from openai import OpenAI
api_key = os.getenv("api_key")
client = OpenAI(api_key=api_key)

# Create a separate client for the xAI API
# (same SDK class pointed at the x.ai base URL; used by call_xai).
# NOTE(review): both clients are built even when the matching CALL_* flag is "N".
XAI_API_KEY = os.getenv("grok_api_key")
xai_client = OpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")

# Define the download week as a string and convert it to a datetime object.
# The datetime is stored in every result record as "download_week".
date = "2025-02-24"
download_week_dt = datetime.strptime(date, "%Y-%m-%d")

# --- Helper functions for token counting and truncation using tiktoken ---
def count_tokens(text, model="gpt-3.5-turbo"):
    """
    Count how many tiktoken tokens `text` encodes to.

    The encoding registered for `model` is used; if that lookup fails for any
    reason the "cl100k_base" encoding is used instead.
    """
    try:
        codec = tiktoken.encoding_for_model(model)
    except Exception:
        codec = tiktoken.get_encoding("cl100k_base")
    return len(codec.encode(text))

def truncate_text(text, max_tokens, model="gpt-3.5-turbo"):
    """
    Cut `text` down to its first `max_tokens` tokens.

    The text is encoded with the tiktoken encoding for `model` (falling back
    to "cl100k_base" when the lookup fails). Text that already fits is
    returned unchanged; otherwise the leading tokens are decoded back to a
    string.
    """
    try:
        codec = tiktoken.encoding_for_model(model)
    except Exception:
        codec = tiktoken.get_encoding("cl100k_base")
    token_ids = codec.encode(text)
    if len(token_ids) <= max_tokens:
        return text
    return codec.decode(token_ids[:max_tokens])

# --- API call functions (used in additional filing info only) ---
def call_gpt_3_5_turbo(full_text):
    """
    Request a rate-filing summary of `full_text` from gpt-3.5-turbo.

    The completion is capped at 300 tokens. The summary is printed and
    returned; on any exception the error is printed and "" is returned.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            max_tokens=300,
            messages=[
                {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
                {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
            ],
        )
        summary = response.choices[0].message.content
        print("GPT-3.5 Summary:", summary)
    except Exception as err:
        print(f"Error calling GPT-3.5-turbo: {err}")
        return ""
    return summary

def call_o1_mini(full_text):
    """
    Request a rate-filing summary of `full_text` from o1-mini.

    Both the analyst instruction and the filing prompt are sent with the
    "user" role. The summary is printed and returned; on any exception the
    error is printed and "" is returned.
    """
    try:
        response = client.chat.completions.create(
            model="o1-mini",
            messages=[
                {"role": "user", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
                {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
            ],
        )
        summary = response.choices[0].message.content
        print("o1-mini Summary:", summary)
    except Exception as err:
        print(f"Error calling o1-mini: {err}")
        return ""
    return summary

def call_xai(full_text):
    """
    Request a rate-filing summary of `full_text` from grok-2-latest via the
    xAI client.

    The summary is printed and returned; on any exception the error is
    printed and "" is returned.
    """
    try:
        response = xai_client.chat.completions.create(
            model="grok-2-latest",
            messages=[
                {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
                {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
            ],
        )
        summary = response.choices[0].message.content
        print("xAI Summary:", summary)
    except Exception as err:
        print(f"Error calling xAI: {err}")
        return ""
    return summary

# --- Extraction functions for non-API parts ---
def extract_full_text(pdf_path, max_pages=None):
    """
    Extracts full text from PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        max_pages: If provided, only the first `max_pages` pages are read;
            None reads every page.

    Returns:
        The text of each page read, each followed by a newline, concatenated
        in page order ("" when no page is read).
    """
    doc = fitz.open(pdf_path)
    try:
        page_texts = []
        for i, page in enumerate(doc):
            if max_pages is not None and i >= max_pages:
                break
            page_texts.append(page.get_text())
    finally:
        # Release the file handle even when a page fails to extract.
        doc.close()
    # Join once instead of growing a string page by page.
    return "".join(page_text + "\n" for page_text in page_texts)

def split_list_objections(text):
    """
    Split a numbered list ("1.", "2)", "(3)" at the start of a line) into
    separate objections.

    Returns a dict mapping "objection<N>" to the stripped text that follows
    marker N. Text before the first marker is discarded, and a repeated
    number overwrites the earlier entry. When no marker is found the whole
    stripped text is returned as "objection1".
    """
    pieces = re.split(r"(?m)^\s*(?:\(?(\d+)\)?[.\)])\s*", text)
    # re.split with one capture group yields [lead, num, body, num, body, ...].
    if len(pieces) < 3:
        return {"objection1": text.strip()}
    numbered = {}
    for number, content in zip(pieces[1::2], pieces[2::2]):
        numbered[f"objection{number.strip()}"] = content.strip()
    return numbered

def parse_objection_letters(text):
    """
    Splits the full text into separate objection letter sections.
    A valid objection letter must contain "Objection Letter Date", "Introduction:" and "Conclusion:".
    Returns a dictionary where each key is "Objection_letterX" with its parsed objections.

    A letter is recognised (case-insensitively) as the sequence
    "Objection Letter" ... "Objection Letter Date <MM/DD/YYYY>" ...
    "Introduction:" ... "Conclusion:". The text between "Introduction:" and
    "Conclusion:" is the letter body.

    The markers are located with successive forward searches rather than one
    lazy DOTALL regex. Each search picks the earliest marker after the
    previous one, which is the same span the single regex selects, but the
    scan stays linear on large documents instead of backtracking.
    """
    flags = re.IGNORECASE
    head_re = re.compile(r"Objection Letter", flags)
    date_re = re.compile(r"Objection Letter Date\s+(\d{2}/\d{2}/\d{4})", flags)
    intro_re = re.compile(r"Introduction:", flags)
    conclusion_re = re.compile(r"Conclusion:", flags)

    letter_dict = {}
    idx = 0
    pos = 0
    while True:
        # If any marker is missing from here on, no later letter can be
        # complete either, so the scan stops.
        head = head_re.search(text, pos)
        date_m = head and date_re.search(text, head.end())
        intro = date_m and intro_re.search(text, date_m.end())
        conclusion = intro and conclusion_re.search(text, intro.end())
        if not conclusion:
            break
        pos = conclusion.end()
        idx += 1

        letter_date = date_m.group(1).strip()
        body = text[intro.end():conclusion.start()].strip()
        explicit_objs = list(re.finditer(r"Objection\s+(\d+)", body, flags=re.IGNORECASE))
        if len(explicit_objs) > 1:
            # Several "Objection N" headings: cut the body at each heading.
            obj_pattern = r"(Objection\s+(\d+).*?)(?=(Objection\s+\d+|Conclusion:|$))"
            objections = {}
            for om in re.finditer(obj_pattern, body, flags=re.IGNORECASE | re.DOTALL):
                num = om.group(2).strip()
                obj_text = om.group(1).strip()
                objections[f"objection{num}"] = obj_text
        else:
            if re.search(r"(?m)^\s*(\(?\d+\)?[.\)])", body):
                # Numbered list ("1.", "2)", "(3)") without explicit headings.
                objections = split_list_objections(body)
            else:
                objections = {"objection1": body}
        letter_dict[f"Objection_letter{idx}"] = {
            "objection_letter_date": letter_date,
            **objections
        }
    return letter_dict

def extract_filing_company_info(text):
    """
    Pull the company and group codes out of the filing text.

    Returns a tuple (group_code, company_codes): the first "Group Code:"
    number as an int (None when absent) and every "CoCode:" number as a list
    of ints in order of appearance. Labels are matched case-insensitively.
    """
    co_codes = [int(digits) for digits in re.findall(r"CoCode:\s*(\d+)", text, flags=re.IGNORECASE)]
    group_hits = re.findall(r"Group Code:\s*(\d+)", text, flags=re.IGNORECASE)
    if group_hits:
        first_group = int(group_hits[0])
    else:
        first_group = None
    return first_group, co_codes

# --- Extraction functions for additional filing info ---
def parse_filing_at_a_glance(text):
    """
    Read the labelled fields of the "Filing at a Glance" section.

    The section runs from the heading to the next line of the form
    "<Word> Information" (or to the end of the text). Returns a dict with
    keys serff_tr_num, filing_type, date_submitted,
    effective_date_requested_new, effective_date_requested_renewal,
    serff_status and toi; a field that is not found maps to "".
    """
    glance = re.search(r"Filing at a Glance(.*?)(?=\n[A-Z][a-z]+ Information|$)", text, flags=re.IGNORECASE | re.DOTALL)
    section = glance.group(1) if glance else ""

    # (result key, pattern whose first group captures the value)
    field_patterns = (
        ("serff_tr_num", r"SERFF Tr Num:\s*([A-Z0-9\-]+)"),
        ("filing_type", r"Filing Type:\s*([^\n]+)"),
        ("date_submitted", r"Date Submitted:\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_new", r"Effective Date\s*Requested\s*\(New\):\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_renewal", r"Effective Date\s*Requested\s*\(Renewal\):\s*(\d{2}/\d{2}/\d{4})"),
        ("serff_status", r"SERFF Status:\s*([^\n]+)"),
        ("toi", r"TOI:\s*([^\n]+)"),
    )
    info = {}
    for key, pattern in field_patterns:
        hit = re.search(pattern, section, flags=re.IGNORECASE)
        info[key] = hit.group(1).strip() if hit else ""
    return info

def parse_filing_description(text):
    """
    Return the text that follows "Filing Description:" up to the next
    "Filing Contact Information" or "Filing Company Information" heading,
    stripped of surrounding whitespace. Returns "" when no such span exists.
    """
    found = re.search(
        r"Filing Description:\s*(.*?)\s*(Filing Contact Information|Filing Company Information)",
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return found.group(1).strip() if found else ""

# --- Processing functions for objections (old version) ---
def process_pdf_file(pdf_path):
    """
    Build the objections record for one PDF.

    All pages of the PDF are read. The result is a one-entry dict keyed by
    the file's base name (without extension) whose value holds:
      - "objections": the parsed objection letters,
      - "group_code": the extracted group code,
      - "company_codes": the list of CoCodes,
      - "download_week": the module-level download_week datetime.
    """
    text = extract_full_text(pdf_path)
    letters = parse_objection_letters(text)
    group_code, company_codes = extract_filing_company_info(text)
    stem, _ext = os.path.splitext(os.path.basename(pdf_path))
    record = {
        "objections": letters,
        "group_code": group_code,
        "company_codes": company_codes,
        "download_week": download_week_dt,
    }
    return {stem: record}

def process_all_pdfs_in_folder(pdf_folder):
    """
    Run process_pdf_file on every "*.pdf" directly inside `pdf_folder` and
    merge the per-file dicts into one nested dictionary.

    The path and the current time are printed before each file as a
    progress trace.
    """
    merged = {}
    for path in glob.glob(os.path.join(pdf_folder, "*.pdf")):
        print(path)
        print(datetime.now())
        for file_key, record in process_pdf_file(path).items():
            merged[file_key] = record
    return merged

# --- Processing functions for additional filing info (with API calls) ---
def extract_full_text_all(pdf_path):
    """
    Return the text of the PDF's first 3000 pages (all pages for shorter
    files).
    """
    page_cap = 3000
    return extract_full_text(pdf_path, max_pages=page_cap)

def process_pdf_additional(pdf_path):
    """
    Processes a single PDF file and extracts additional filing info.
    For non-API info, uses the first 100 pages.
    For API calls, uses the full text (up to 3000 pages) subject to token constraints.

    GPT-3.5-turbo is called only when its flag is "Y" and the document leaves
    room in the model's context window for the prompt wrapper and the
    completion. o1-mini and xAI receive the text truncated to 75000 tokens.

    Returns a dictionary with keys including API summaries. A summary is ""
    when its model was skipped or the call failed.
    """
    # The 16385-token window is shared by prompt and completion.
    # call_gpt_3_5_turbo requests max_tokens=300 and wraps the text in a short
    # system/user prompt, so that headroom has to stay free or the request is
    # rejected for exceeding the context length.
    gpt_3_5_context_window = 16385
    gpt_3_5_reserved_tokens = 300 + 100  # completion budget + prompt overhead (estimate)
    long_model_token_cap = 75000

    # Non-API extraction (first 100 pages)
    limited_text = extract_full_text(pdf_path, max_pages=100)
    filing_at_a_glance = parse_filing_at_a_glance(limited_text)
    filing_description = parse_filing_description(limited_text)
    group_code, company_codes = extract_filing_company_info(limited_text)

    # API extraction using full text (up to 3000 pages)
    full_text_all = extract_full_text_all(pdf_path)
    token_count = count_tokens(full_text_all, model="gpt-3.5-turbo")
    print(f"Token count for API call in {os.path.basename(pdf_path)}: {token_count}")

    gpt_3_5_summary = ""
    if CALL_GPT_3_5 == "Y" and token_count < gpt_3_5_context_window - gpt_3_5_reserved_tokens:
        gpt_3_5_summary = call_gpt_3_5_turbo(full_text_all)

    o1_mini_summary = ""
    xai_summary = ""
    if CALL_O1_MINI == "Y" or CALL_XAI == "Y":
        # Truncation re-encodes the whole text, so do it only when a model
        # will actually receive it.
        if token_count < long_model_token_cap:
            api_text = full_text_all
        else:
            api_text = truncate_text(full_text_all, max_tokens=long_model_token_cap, model="gpt-3.5-turbo")
        if CALL_O1_MINI == "Y":
            o1_mini_summary = call_o1_mini(api_text)
        if CALL_XAI == "Y":
            xai_summary = call_xai(api_text)

    additional = {
        "serff_tr_num": filing_at_a_glance.get("serff_tr_num", ""),
        "filing_type": filing_at_a_glance.get("filing_type", ""),
        "date_submitted": filing_at_a_glance.get("date_submitted", ""),
        "effective_date_requested_new": filing_at_a_glance.get("effective_date_requested_new", ""),
        "effective_date_requested_renewal": filing_at_a_glance.get("effective_date_requested_renewal", ""),
        "serff_status": filing_at_a_glance.get("serff_status", ""),
        "filing_description": filing_description,
        "filing_method": "",  # Not extracted
        "toi": filing_at_a_glance.get("toi", ""),
        "group_code": group_code,
        "company_codes": company_codes,
        "download_week": download_week_dt,
        "gpt_3_5_summary": gpt_3_5_summary,
        "o1_mini_summary": o1_mini_summary,
        "xai_summary": xai_summary
    }
    print(f"Processed additional filing info for: {os.path.basename(pdf_path)}")
    return additional

def process_all_pdfs_additional(pdf_folder):
    """
    Runs process_pdf_additional on every "*.pdf" directly inside `pdf_folder`.
    Prints each path and a timestamp as a progress trace.
    Returns a dictionary mapping the file name (without extension) to that
    file's additional filing info.
    """
    collected = {}
    search_pattern = os.path.join(pdf_folder, "*.pdf")
    for path in glob.glob(search_pattern):
        # Progress trace: which file is starting, and when.
        print(path)
        print(datetime.now())
        stem, _extension = os.path.splitext(os.path.basename(path))
        collected[stem] = process_pdf_additional(path)
    return collected

def additional_to_dataframe(results):
    """
    Flattens the additional filing info dictionary into a DataFrame with one
    row per file. The dictionary keys (file names) are not written to the frame.
    Missing text fields become "", company codes are stringified, and a missing
    download week becomes None.
    """
    # (output column, key in the per-file info dict) for fields copied as-is.
    plain_fields = (
        ("SERFF Tr Num", "serff_tr_num"),
        ("Filing Type", "filing_type"),
        ("Date Submitted", "date_submitted"),
        ("Effective Date Requested (New)", "effective_date_requested_new"),
        ("Effective Date Requested (Renewal)", "effective_date_requested_renewal"),
        ("SERFF Status", "serff_status"),
        ("Filing Description", "filing_description"),
        ("Filing Method", "filing_method"),
        ("TOI", "toi"),
        ("Group_Code", "group_code"),
        ("gpt_3_5_summary", "gpt_3_5_summary"),
        ("o1_mini_summary", "o1_mini_summary"),
        ("xai_summary", "xai_summary"),
    )
    column_order = [
        "SERFF Tr Num", "Filing Type", "Date Submitted",
        "Effective Date Requested (New)", "Effective Date Requested (Renewal)",
        "SERFF Status", "Filing Description", "Filing Method", "TOI",
        "Group_Code", "Company_Codes", "week_date",
        "gpt_3_5_summary", "o1_mini_summary", "xai_summary"
    ]

    records = []
    for info in results.values():
        record = {column: info.get(key, "") for column, key in plain_fields}
        # The list of codes is stored as its string representation.
        record["Company_Codes"] = str(info.get("company_codes", ""))
        record["week_date"] = info.get("download_week", None)
        records.append(record)
    return pd.DataFrame(records, columns=column_order)

# --- DataFrame for objections ---
def results_to_dataframe_objections(results):
    """
    Flattens the objections dictionary (from process_all_pdfs_in_folder) into a
    DataFrame with one row per objection. A file without any objection letters
    contributes a single row whose objection fields read "No objections".
    """
    placeholder = "No objections"
    column_order = [
        "SERFF#", "State", "Group_Code", "Company_Code", "Company_Code_all",
        "Objection_letter_date", "Objectionletter#", "Objection#", "Objection", "download_week"
    ]

    records = []
    for serff_id, details in results.items():
        # State is the prefix before the first underscore, else the first two characters.
        if "_" in serff_id:
            state = serff_id.split('_')[0]
        else:
            state = serff_id[:2]

        codes = details.get("company_codes", [])
        # Fields repeated on every row produced for this file.
        shared = {
            "SERFF#": serff_id,
            "State": state,
            "Group_Code": details.get("group_code", placeholder),
            "Company_Code": codes[0] if codes else placeholder,
            "Company_Code_all": str(codes) if codes else placeholder,
            "download_week": details.get("download_week", None),
        }

        letters = details.get("objections", {})
        if not letters:
            records.append({
                **shared,
                "Objection_letter_date": placeholder,
                "Objectionletter#": placeholder,
                "Objection#": placeholder,
                "Objection": placeholder,
            })
            continue

        for letter_id, letter in letters.items():
            sent_on = letter.get("objection_letter_date", placeholder)
            for name, body in letter.items():
                # The date entry is metadata about the letter, not an objection.
                if name != "objection_letter_date":
                    records.append({
                        **shared,
                        "Objection_letter_date": sent_on,
                        "Objectionletter#": letter_id,
                        "Objection#": name,
                        "Objection": body.replace("\n", " ").strip(),
                    })
    return pd.DataFrame(records, columns=column_order)

# --- Main Processing ---
def _json_default(obj):
    """
    json.dump fallback: datetimes are written as ISO-8601 strings.
    Anything else json cannot encode raises TypeError. Returning the object
    unchanged would make json re-encode it and fail with a misleading
    "Circular reference detected" ValueError.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


pdf_folder = f"Filings/Final/{date}"  # Folder path based on date parameter

# Process objections using the old functions.
objection_results = process_all_pdfs_in_folder(pdf_folder)
json_output_path = f"objections_output_{date}.json"
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(objection_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved objections JSON output to {json_output_path}")

# Same objection data, flattened to one CSV row per objection.
df_objections = results_to_dataframe_objections(objection_results)
csv_output_path = f"objections_output_{date}.csv"
df_objections.to_csv(csv_output_path, index=False)
print(f"Saved objections CSV output to {csv_output_path}")

# Process additional filing info using the new function.
# This second pass re-reads every PDF and may call the summary APIs.
additional_results = process_all_pdfs_additional(pdf_folder)
additional_json_output_path = f"additional_filing_info_{date}.json"
with open(additional_json_output_path, "w", encoding="utf-8") as f:
    json.dump(additional_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved additional filing info JSON output to {additional_json_output_path}")

# Same filing info, flattened to one CSV row per file.
df_additional = additional_to_dataframe(additional_results)
additional_csv_output_path = f"additional_filing_info_{date}.csv"
df_additional.to_csv(additional_csv_output_path, index=False)
print(f"Saved additional filing info CSV output to {additional_csv_output_path}")


Filings/Final/2025-02-24\AR_LBPM-134402349.pdf
2025-03-04 07:03:10.655069
Filings/Final/2025-02-24\AR_LBPM-134415366.pdf
2025-03-04 07:03:10.776022
Filings/Final/2025-02-24\AR_SFMA-134403457.pdf
2025-03-04 07:03:10.875763
Filings/Final/2025-02-24\AZ_ALSE-134423427.pdf
2025-03-04 07:03:11.035603
Filings/Final/2025-02-24\AZ_ALSE-134423455.pdf
2025-03-04 07:03:11.436744
Filings/Final/2025-02-24\AZ_ALSE-134423481.pdf
2025-03-04 07:03:11.934508
Filings/Final/2025-02-24\AZ_ALSE-134424632.pdf
2025-03-04 07:03:12.725094
Filings/Final/2025-02-24\AZ_ALSE-134424807.pdf
2025-03-04 07:03:12.990511
Filings/Final/2025-02-24\AZ_ALSE-134424886.pdf
2025-03-04 07:03:13.215586
Filings/Final/2025-02-24\AZ_ALSE-134424967.pdf
2025-03-04 07:03:13.413454
Filings/Final/2025-02-24\AZ_FARM-134267371.pdf
2025-03-04 07:03:13.708456
Filings/Final/2025-02-24\AZ_GMMX-134417367.pdf
2025-03-04 07:03:14.026352
Filings/Final/2025-02-24\CA_ALSE-134306035.pdf
2025-03-04 07:03:14.363579
Filings/Final/2025-02-24\CA_AMSI-13439

KeyboardInterrupt: 

In [None]:
### 3.4: Update to truncate each PDF to its first 100 pages for the objection-extraction step ###

In [3]:
import os
import re
import glob
import fitz  # PyMuPDF
import json
import pandas as pd
from datetime import datetime
from pprint import pprint
from dotenv import load_dotenv
import tiktoken  # pip install tiktoken

# Load environment variables (which should include api_key and grok_api_key)
load_dotenv()

# === API Call Control Parameters for additional filing info (not used for objections) ===
# Set each to 'Y' to call the API, or 'N' to skip the API call.
# process_pdf_additional compares each flag to the exact string "Y"; any other
# value (including lowercase "y") skips that model.
CALL_GPT_3_5 = "Y"
CALL_O1_MINI = "N"
CALL_XAI = "N"

# Create OpenAI client for standard models
# (used by call_gpt_3_5_turbo and call_o1_mini). The key is read from the
# "api_key" environment variable after load_dotenv() has run.
from openai import OpenAI
api_key = os.getenv("api_key")
client = OpenAI(api_key=api_key)

# Create a separate client for the xAI API
# (used by call_xai): the same OpenAI client class, pointed at the xAI base URL
# and authenticated with the "grok_api_key" environment variable.
XAI_API_KEY = os.getenv("grok_api_key")
xai_client = OpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")

# Define the download week as a string and convert it to a datetime object.
# `date` also selects the input folder (Filings/Final/{date}) and is embedded in
# every output file name; `download_week_dt` is stamped onto each result record.
date = "2025-02-24"
download_week_dt = datetime.strptime(date, "%Y-%m-%d")

# --- Helper functions for token counting and truncation using tiktoken ---
def count_tokens(text, model="gpt-3.5-turbo"):
    """
    Returns the number of tokens `text` occupies under the tiktoken encoding
    for `model`. If the model lookup raises, the "cl100k_base" encoding is
    used instead.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except Exception:
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def truncate_text(text, max_tokens, model="gpt-3.5-turbo"):
    """
    Returns `text` cut down to at most `max_tokens` tokens under the tiktoken
    encoding for `model` (falling back to "cl100k_base" if the lookup raises).
    Text that already fits is returned unchanged.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except Exception:
        enc = tiktoken.get_encoding("cl100k_base")
    token_ids = enc.encode(text)
    if len(token_ids) <= max_tokens:
        return text
    # Keep the leading tokens only and turn them back into text.
    return enc.decode(token_ids[:max_tokens])

# --- API call functions (used in additional filing info only) ---
def call_gpt_3_5_turbo(full_text):
    """
    Asks gpt-3.5-turbo (via the module-level `client`) to summarize the filing
    text, capping the reply at 300 tokens. Prints and returns the summary.
    Any exception is printed and turned into an empty string.
    """
    try:
        conversation = [
            {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
            {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
        ]
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            max_tokens=300,
        )
        summary = response.choices[0].message.content
        print("GPT-3.5 Summary:", summary)
    except Exception as err:
        print(f"Error calling GPT-3.5-turbo: {err}")
        return ""
    return summary

def call_o1_mini(full_text):
    """
    Asks o1-mini (via the module-level `client`) to summarize the filing text.
    The analyst instruction is sent as a "user" message rather than a system one.
    Prints and returns the summary. Any exception is printed and turned into
    an empty string.
    """
    try:
        conversation = [
            {"role": "user", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
            {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
        ]
        response = client.chat.completions.create(model="o1-mini", messages=conversation)
        summary = response.choices[0].message.content
        print("o1-mini Summary:", summary)
    except Exception as err:
        print(f"Error calling o1-mini: {err}")
        return ""
    return summary

def call_xai(full_text):
    """
    Asks grok-2-latest (via the module-level `xai_client`) to summarize the
    filing text. Prints and returns the summary. Any exception is printed and
    turned into an empty string.
    """
    try:
        conversation = [
            {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
            {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
        ]
        response = xai_client.chat.completions.create(model="grok-2-latest", messages=conversation)
        summary = response.choices[0].message.content
        print("xAI Summary:", summary)
    except Exception as err:
        print(f"Error calling xAI: {err}")
        return ""
    return summary

# --- Extraction functions for non-API parts ---
def extract_full_text(pdf_path, max_pages=None):
    """
    Extracts text from a PDF, one page after another, with a newline appended
    after each page's text.

    pdf_path:  path of the PDF to open with PyMuPDF.
    max_pages: if provided, only the first max_pages pages are read;
               None reads every page.

    Returns the concatenated text as a single string.
    Exceptions raised while opening or reading the PDF propagate to the caller;
    the document is closed either way.
    """
    doc = fitz.open(pdf_path)
    page_texts = []
    try:
        for i, page in enumerate(doc):
            if max_pages is not None and i >= max_pages:
                break
            page_texts.append(page.get_text())
    finally:
        # Release the file handle even if extraction fails part-way through.
        doc.close()
    # Join once instead of growing a string page by page.
    return "".join(f"{page_text}\n" for page_text in page_texts)

def split_list_objections(text):
    """
    Splits a numbered list ("1.", "2)", "(3)" ... at the start of a line) into
    a dictionary {"objection<N>": item text}. Text before the first marker is
    discarded; a repeated number keeps the later item. When no marker is
    found, the whole stripped text is returned as "objection1".
    """
    # With one capture group, re.split yields [preamble, num, item, num, item, ...].
    pieces = re.split(r"(?m)^\s*(?:\(?(\d+)\)?[.\)])\s*", text)
    if len(pieces) < 3:
        return {"objection1": text.strip()}
    numbers = pieces[1::2]
    items = pieces[2::2]
    return {
        f"objection{number.strip()}": item.strip()
        for number, item in zip(numbers, items)
    }

def parse_objection_letters(text):
    """
    Finds every objection letter in `text` and breaks it into objections.
    A letter is recognised only when "Objection Letter", an
    "Objection Letter Date" (MM/DD/YYYY), "Introduction:" and "Conclusion:"
    appear in that order; the text between the last two is the letter body.

    The body is split by "Objection <N>" headings when more than one is
    present, otherwise by a numbered list if one exists, otherwise it is kept
    whole as "objection1".

    Returns {"Objection_letterX": {"objection_letter_date": ..., "objectionN": ...}}.
    """
    letter_re = r"(Objection Letter.*?Objection Letter Date\s+(\d{2}/\d{2}/\d{4}).*?Introduction:\s*(.*?)\s*Conclusion:)"
    letters = {}
    found_letters = re.finditer(letter_re, text, flags=re.IGNORECASE | re.DOTALL)
    for position, found in enumerate(found_letters, start=1):
        sent_on = found.group(2).strip()
        intro = found.group(3).strip()

        headings = re.findall(r"Objection\s+(\d+)", intro, flags=re.IGNORECASE)
        if len(headings) > 1:
            # Each objection runs from its heading up to the next heading,
            # a "Conclusion:" marker, or the end of the body.
            heading_re = r"(Objection\s+(\d+).*?)(?=(Objection\s+\d+|Conclusion:|$))"
            parsed = {}
            for hit in re.finditer(heading_re, intro, flags=re.IGNORECASE | re.DOTALL):
                parsed[f"objection{hit.group(2).strip()}"] = hit.group(1).strip()
        elif re.search(r"(?m)^\s*(\(?\d+\)?[.\)])", intro):
            parsed = split_list_objections(intro)
        else:
            parsed = {"objection1": intro}

        entry = {"objection_letter_date": sent_on}
        entry.update(parsed)
        letters[f"Objection_letter{position}"] = entry
    return letters

def extract_filing_company_info(text):
    """
    Pulls company identifiers out of the filing text.
    Returns a tuple (group_code, company_codes): the first "Group Code:" number
    as an int (None when absent) and every "CoCode:" number as a list of ints,
    in order of appearance.
    """
    cocode_hits = re.findall(r"CoCode:\s*(\d+)", text, flags=re.IGNORECASE)
    group_hits = re.findall(r"Group Code:\s*(\d+)", text, flags=re.IGNORECASE)

    first_group = next(iter(group_hits), None)
    group_code = None if first_group is None else int(first_group)
    return group_code, list(map(int, cocode_hits))

# --- Extraction functions for additional filing info ---
def parse_filing_at_a_glance(text):
    """
    Reads the key fields of the "Filing at a Glance" section.
    The section runs from that heading to the next line of the form
    "<Word> Information" (or to the end of the text).
    Returns a dictionary with keys: serff_tr_num, filing_type, date_submitted,
    effective_date_requested_new, effective_date_requested_renewal,
    serff_status, toi. A field that cannot be found is "".
    """
    located = re.search(r"Filing at a Glance(.*?)(?=\n[A-Z][a-z]+ Information|$)", text, flags=re.IGNORECASE | re.DOTALL)
    section = located.group(1) if located else ""

    # (result key, pattern whose first group is the value), in output order.
    field_patterns = (
        ("serff_tr_num", r"SERFF Tr Num:\s*([A-Z0-9\-]+)"),
        ("filing_type", r"Filing Type:\s*([^\n]+)"),
        ("date_submitted", r"Date Submitted:\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_new", r"Effective Date\s*Requested\s*\(New\):\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_renewal", r"Effective Date\s*Requested\s*\(Renewal\):\s*(\d{2}/\d{2}/\d{4})"),
        ("serff_status", r"SERFF Status:\s*([^\n]+)"),
        ("toi", r"TOI:\s*([^\n]+)"),
    )

    info = {}
    for key, pattern in field_patterns:
        hit = re.search(pattern, section, flags=re.IGNORECASE)
        info[key] = hit.group(1).strip() if hit else ""
    return info

def parse_filing_description(text):
    """
    Returns the text that follows "Filing Description:" up to the next
    "Filing Contact Information" or "Filing Company Information" heading,
    stripped of surrounding whitespace. Returns "" when no such span exists.
    """
    found = re.search(
        r"Filing Description:\s*(.*?)\s*(Filing Contact Information|Filing Company Information)",
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return found.group(1).strip() if found else ""

# --- Processing functions for objections (old version) ---
def process_pdf_file(pdf_path):
    """
    Builds the objections record for a single PDF.
    Only the text of the first 100 pages is examined.
    Returns a one-entry dictionary keyed by the file name without extension;
    its value holds:
      - "objections": parsed objection letters,
      - "group_code": extracted group code,
      - "company_codes": list of CoCodes,
      - "download_week": the download_week datetime.
    """
    # Objection letters are searched for in the first 100 pages only.
    text = extract_full_text(pdf_path, max_pages=100)
    letters = parse_objection_letters(text)
    group_code, company_codes = extract_filing_company_info(text)

    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    record = {
        "objections": letters,
        "group_code": group_code,
        "company_codes": company_codes,
        "download_week": download_week_dt,
    }
    return {stem: record}

def process_all_pdfs_in_folder(pdf_folder):
    """
    Runs process_pdf_file on every "*.pdf" directly inside `pdf_folder`,
    printing each path and a timestamp as a progress trace.
    Returns one dictionary combining the per-file results, keyed by file name
    without extension.
    """
    merged = {}
    search_pattern = os.path.join(pdf_folder, "*.pdf")
    for path in glob.glob(search_pattern):
        # Progress trace: which file is starting, and when.
        print(path)
        print(datetime.now())
        for file_key, record in process_pdf_file(path).items():
            merged[file_key] = record
    return merged

# --- Processing functions for additional filing info (with API calls) ---
def extract_full_text_all(pdf_path):
    """
    Returns the PDF's text for the API summaries: delegates to
    extract_full_text with a cap of 3000 pages.
    """
    page_cap = 3000
    return extract_full_text(pdf_path, max_pages=page_cap)

def process_pdf_additional(pdf_path):
    """
    Processes a single PDF file and extracts additional filing info.
    For non-API info, uses the first 100 pages.
    For API calls, uses the full text (up to 3000 pages) subject to token constraints:
      - gpt-3.5-turbo is called only when CALL_GPT_3_5 == "Y" and the text,
        plus prompt overhead and the 300-token reply, fits in the model's
        16,385-token context window; larger documents get "".
      - o1-mini / xAI (when enabled with "Y") receive the full text, truncated
        to 75,000 tokens if it is longer.
    Returns a dictionary with the parsed filing fields, group/company codes,
    the download week, and the three summaries ("" for any model not called
    or whose call failed).
    """
    # The context window must hold the prompt AND the completion, so gating on
    # the raw window size lets near-limit documents through only to be rejected
    # by the API. Reserve room for the reply (max_tokens=300 in
    # call_gpt_3_5_turbo) and for the system/user wrapper text (approximate).
    gpt_3_5_context_window = 16385
    gpt_3_5_completion_tokens = 300
    prompt_overhead_tokens = 100
    gpt_3_5_budget = gpt_3_5_context_window - gpt_3_5_completion_tokens - prompt_overhead_tokens
    large_model_budget = 75000

    # Non-API extraction (first 100 pages)
    limited_text = extract_full_text(pdf_path, max_pages=100)
    filing_at_a_glance = parse_filing_at_a_glance(limited_text)
    filing_description = parse_filing_description(limited_text)
    group_code, company_codes = extract_filing_company_info(limited_text)

    # API extraction using full text (up to 3000 pages)
    full_text_all = extract_full_text_all(pdf_path)
    token_count = count_tokens(full_text_all, model="gpt-3.5-turbo")
    print(f"Token count for API call in {os.path.basename(pdf_path)}: {token_count}")

    gpt_3_5_summary = ""
    if CALL_GPT_3_5 == "Y" and token_count < gpt_3_5_budget:
        gpt_3_5_summary = call_gpt_3_5_turbo(full_text_all)

    o1_mini_summary = ""
    xai_summary = ""
    if CALL_O1_MINI == "Y" or CALL_XAI == "Y":
        # Truncation re-tokenizes the whole document, so only do it when a
        # model that consumes the result is actually enabled.
        if token_count < large_model_budget:
            api_text = full_text_all
        else:
            api_text = truncate_text(full_text_all, max_tokens=large_model_budget, model="gpt-3.5-turbo")
        if CALL_O1_MINI == "Y":
            o1_mini_summary = call_o1_mini(api_text)
        if CALL_XAI == "Y":
            xai_summary = call_xai(api_text)

    additional = {
        "serff_tr_num": filing_at_a_glance.get("serff_tr_num", ""),
        "filing_type": filing_at_a_glance.get("filing_type", ""),
        "date_submitted": filing_at_a_glance.get("date_submitted", ""),
        "effective_date_requested_new": filing_at_a_glance.get("effective_date_requested_new", ""),
        "effective_date_requested_renewal": filing_at_a_glance.get("effective_date_requested_renewal", ""),
        "serff_status": filing_at_a_glance.get("serff_status", ""),
        "filing_description": filing_description,
        "filing_method": "",  # Not extracted
        "toi": filing_at_a_glance.get("toi", ""),
        "group_code": group_code,
        "company_codes": company_codes,
        "download_week": download_week_dt,
        "gpt_3_5_summary": gpt_3_5_summary,
        "o1_mini_summary": o1_mini_summary,
        "xai_summary": xai_summary
    }
    print(f"Processed additional filing info for: {os.path.basename(pdf_path)}")
    return additional

def process_all_pdfs_additional(pdf_folder):
    """
    Runs process_pdf_additional on every "*.pdf" directly inside `pdf_folder`.
    Prints each path and a timestamp as a progress trace.
    Returns a dictionary mapping the file name (without extension) to that
    file's additional filing info.
    """
    collected = {}
    search_pattern = os.path.join(pdf_folder, "*.pdf")
    for path in glob.glob(search_pattern):
        # Progress trace: which file is starting, and when.
        print(path)
        print(datetime.now())
        stem, _extension = os.path.splitext(os.path.basename(path))
        collected[stem] = process_pdf_additional(path)
    return collected

def additional_to_dataframe(results):
    """
    Flattens the additional filing info dictionary into a DataFrame with one
    row per file. The dictionary keys (file names) are not written to the frame.
    Missing text fields become "", company codes are stringified, and a missing
    download week becomes None.
    """
    # (output column, key in the per-file info dict) for fields copied as-is.
    plain_fields = (
        ("SERFF Tr Num", "serff_tr_num"),
        ("Filing Type", "filing_type"),
        ("Date Submitted", "date_submitted"),
        ("Effective Date Requested (New)", "effective_date_requested_new"),
        ("Effective Date Requested (Renewal)", "effective_date_requested_renewal"),
        ("SERFF Status", "serff_status"),
        ("Filing Description", "filing_description"),
        ("Filing Method", "filing_method"),
        ("TOI", "toi"),
        ("Group_Code", "group_code"),
        ("gpt_3_5_summary", "gpt_3_5_summary"),
        ("o1_mini_summary", "o1_mini_summary"),
        ("xai_summary", "xai_summary"),
    )
    column_order = [
        "SERFF Tr Num", "Filing Type", "Date Submitted",
        "Effective Date Requested (New)", "Effective Date Requested (Renewal)",
        "SERFF Status", "Filing Description", "Filing Method", "TOI",
        "Group_Code", "Company_Codes", "week_date",
        "gpt_3_5_summary", "o1_mini_summary", "xai_summary"
    ]

    records = []
    for info in results.values():
        record = {column: info.get(key, "") for column, key in plain_fields}
        # The list of codes is stored as its string representation.
        record["Company_Codes"] = str(info.get("company_codes", ""))
        record["week_date"] = info.get("download_week", None)
        records.append(record)
    return pd.DataFrame(records, columns=column_order)

# --- DataFrame for objections ---
def results_to_dataframe_objections(results):
    """
    Flattens the objections dictionary (from process_all_pdfs_in_folder) into a
    DataFrame with one row per objection. A file without any objection letters
    contributes a single row whose objection fields read "No objections".
    """
    placeholder = "No objections"
    column_order = [
        "SERFF#", "State", "Group_Code", "Company_Code", "Company_Code_all",
        "Objection_letter_date", "Objectionletter#", "Objection#", "Objection", "download_week"
    ]

    records = []
    for serff_id, details in results.items():
        # State is the prefix before the first underscore, else the first two characters.
        if "_" in serff_id:
            state = serff_id.split('_')[0]
        else:
            state = serff_id[:2]

        codes = details.get("company_codes", [])
        # Fields repeated on every row produced for this file.
        shared = {
            "SERFF#": serff_id,
            "State": state,
            "Group_Code": details.get("group_code", placeholder),
            "Company_Code": codes[0] if codes else placeholder,
            "Company_Code_all": str(codes) if codes else placeholder,
            "download_week": details.get("download_week", None),
        }

        letters = details.get("objections", {})
        if not letters:
            records.append({
                **shared,
                "Objection_letter_date": placeholder,
                "Objectionletter#": placeholder,
                "Objection#": placeholder,
                "Objection": placeholder,
            })
            continue

        for letter_id, letter in letters.items():
            sent_on = letter.get("objection_letter_date", placeholder)
            for name, body in letter.items():
                # The date entry is metadata about the letter, not an objection.
                if name != "objection_letter_date":
                    records.append({
                        **shared,
                        "Objection_letter_date": sent_on,
                        "Objectionletter#": letter_id,
                        "Objection#": name,
                        "Objection": body.replace("\n", " ").strip(),
                    })
    return pd.DataFrame(records, columns=column_order)

# --- Main Processing ---
def _json_default(obj):
    """
    json.dump fallback: datetimes are written as ISO-8601 strings.
    Anything else json cannot encode raises TypeError. Returning the object
    unchanged would make json re-encode it and fail with a misleading
    "Circular reference detected" ValueError.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


pdf_folder = f"Filings/Final/{date}"  # Folder path based on date parameter

# Process objections using the old functions.
objection_results = process_all_pdfs_in_folder(pdf_folder)
json_output_path = f"objections_output_{date}.json"
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(objection_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved objections JSON output to {json_output_path}")

# Same objection data, flattened to one CSV row per objection.
df_objections = results_to_dataframe_objections(objection_results)
csv_output_path = f"objections_output_{date}.csv"
df_objections.to_csv(csv_output_path, index=False)
print(f"Saved objections CSV output to {csv_output_path}")

# Process additional filing info using the new function.
# This second pass re-reads every PDF and may call the summary APIs.
additional_results = process_all_pdfs_additional(pdf_folder)
additional_json_output_path = f"additional_filing_info_{date}.json"
with open(additional_json_output_path, "w", encoding="utf-8") as f:
    json.dump(additional_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved additional filing info JSON output to {additional_json_output_path}")

# Same filing info, flattened to one CSV row per file.
df_additional = additional_to_dataframe(additional_results)
additional_csv_output_path = f"additional_filing_info_{date}.csv"
df_additional.to_csv(additional_csv_output_path, index=False)
print(f"Saved additional filing info CSV output to {additional_csv_output_path}")


Filings/Final/2025-02-24\AR_LBPM-134402349.pdf
2025-03-04 07:08:54.302835
Filings/Final/2025-02-24\AR_LBPM-134415366.pdf
2025-03-04 07:08:54.399484
Filings/Final/2025-02-24\AR_SFMA-134403457.pdf
2025-03-04 07:08:54.451183
Filings/Final/2025-02-24\AZ_ALSE-134423427.pdf
2025-03-04 07:08:54.535942
Filings/Final/2025-02-24\AZ_ALSE-134423455.pdf
2025-03-04 07:08:54.755703
Filings/Final/2025-02-24\AZ_ALSE-134423481.pdf
2025-03-04 07:08:54.893773
Filings/Final/2025-02-24\AZ_ALSE-134424632.pdf
2025-03-04 07:08:55.168920
Filings/Final/2025-02-24\AZ_ALSE-134424807.pdf
2025-03-04 07:08:55.332506
Filings/Final/2025-02-24\AZ_ALSE-134424886.pdf
2025-03-04 07:08:55.468411
Filings/Final/2025-02-24\AZ_ALSE-134424967.pdf
2025-03-04 07:08:55.575539
Filings/Final/2025-02-24\AZ_FARM-134267371.pdf
2025-03-04 07:08:55.758705
Filings/Final/2025-02-24\AZ_GMMX-134417367.pdf
2025-03-04 07:08:55.938940
Filings/Final/2025-02-24\CA_ALSE-134306035.pdf
2025-03-04 07:08:56.121827
Filings/Final/2025-02-24\CA_AMSI-13439

In [None]:
### 3.4: Update to add a maximum-token-count parameter for each API call ###

In [5]:
import os
import re
import glob
import fitz  # PyMuPDF
import json
import pandas as pd
from datetime import datetime
from pprint import pprint
from dotenv import load_dotenv
import tiktoken  # pip install tiktoken

# Load environment variables (which should include api_key and grok_api_key)
load_dotenv()

# === API Call Control Parameters for additional filing info (not used for objections) ===
# Set each to 'Y' to call the API, or 'N' to skip the API call.
# NOTE(review): the code that reads these flags in this cell is further down
# the file; presumably, as in the earlier cells, only the exact string "Y"
# enables a call — confirm.
CALL_GPT_3_5 = "Y"
CALL_O1_MINI = "N"  # Example value; set to 'Y' or 'N'
CALL_XAI = "N"      # Example value; set to 'Y' or 'N'

# === User-defined Maximum Token Parameters for API calls ===
# These define the maximum number of tokens to send to each model.
# You can change these values or prompt the user for them.
# NOTE(review): presumably applied by truncating the filing text before each
# call; the consuming code is not in this part of the file — confirm.
MAX_TOKENS_GPT3_5 = 15000
MAX_TOKENS_O1_MINI = 12000
MAX_TOKENS_XAI = 12000

# Create OpenAI client for standard models
# (used by call_gpt_3_5_turbo and call_o1_mini). The key is read from the
# "api_key" environment variable after load_dotenv() has run.
from openai import OpenAI
api_key = os.getenv("api_key")
client = OpenAI(api_key=api_key)

# Create a separate client for the xAI API
# (used by call_xai): the same OpenAI client class, pointed at the xAI base URL
# and authenticated with the "grok_api_key" environment variable.
XAI_API_KEY = os.getenv("grok_api_key")
xai_client = OpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")

# Define the download week as a string and convert it to a datetime object.
# This cell targets a different week ("2025-03-03") than the cells before it.
date = "2025-03-03"
download_week_dt = datetime.strptime(date, "%Y-%m-%d")

# --- Helper functions for token counting and truncation using tiktoken ---
def count_tokens(text, model="gpt-3.5-turbo"):
    """
    Returns the number of tokens `text` occupies under the tiktoken encoding
    for `model`. If the model lookup raises, the "cl100k_base" encoding is
    used instead.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except Exception:
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def truncate_text(text, max_tokens, model="gpt-3.5-turbo"):
    """
    Returns `text` cut down to at most `max_tokens` tokens under the tiktoken
    encoding for `model` (falling back to "cl100k_base" if the lookup raises).
    Text that already fits is returned unchanged.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except Exception:
        enc = tiktoken.get_encoding("cl100k_base")
    token_ids = enc.encode(text)
    if len(token_ids) <= max_tokens:
        return text
    # Keep the leading tokens only and turn them back into text.
    return enc.decode(token_ids[:max_tokens])

# --- API call functions (used in additional filing info only) ---
def call_gpt_3_5_turbo(full_text):
    """
    Asks gpt-3.5-turbo (via the module-level `client`) to summarize the filing
    text, capping the reply at 300 tokens. Prints and returns the summary.
    Any exception is printed and turned into an empty string.
    """
    try:
        conversation = [
            {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
            {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
        ]
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=conversation,
            max_tokens=300,
        )
        summary = response.choices[0].message.content
        print("GPT-3.5 Summary:", summary)
    except Exception as err:
        print(f"Error calling GPT-3.5-turbo: {err}")
        return ""
    return summary

def call_o1_mini(full_text):
    """
    Asks o1-mini (via the module-level `client`) to summarize the filing text.
    The analyst instruction is sent as a "user" message rather than a system one.
    Prints and returns the summary. Any exception is printed and turned into
    an empty string.
    """
    try:
        conversation = [
            {"role": "user", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
            {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
        ]
        response = client.chat.completions.create(model="o1-mini", messages=conversation)
        summary = response.choices[0].message.content
        print("o1-mini Summary:", summary)
    except Exception as err:
        print(f"Error calling o1-mini: {err}")
        return ""
    return summary

def call_xai(full_text):
    """
    Asks grok-2-latest (via the module-level `xai_client`) to summarize the
    filing text. Prints and returns the summary. Any exception is printed and
    turned into an empty string.
    """
    try:
        conversation = [
            {"role": "system", "content": "You are an analyst doing competitive intelligence research on insurance companies."},
            {"role": "user", "content": f"The following is a filing with a DOI from an insurance company: '{full_text}'. Give a summary of the rate filing."},
        ]
        response = xai_client.chat.completions.create(model="grok-2-latest", messages=conversation)
        summary = response.choices[0].message.content
        print("xAI Summary:", summary)
    except Exception as err:
        print(f"Error calling xAI: {err}")
        return ""
    return summary

# --- Extraction functions for non-API parts ---
def extract_full_text(pdf_path, max_pages=None):
    """
    Extracts text from a PDF, one page after another, with a newline appended
    after each page's text.

    pdf_path:  path of the PDF to open with PyMuPDF.
    max_pages: if provided, only the first max_pages pages are read;
               None reads every page.

    Returns the concatenated text as a single string.
    Exceptions raised while opening or reading the PDF propagate to the caller;
    the document is closed either way.
    """
    doc = fitz.open(pdf_path)
    page_texts = []
    try:
        for i, page in enumerate(doc):
            if max_pages is not None and i >= max_pages:
                break
            page_texts.append(page.get_text())
    finally:
        # Release the file handle even if extraction fails part-way through.
        doc.close()
    # Join once instead of growing a string page by page.
    return "".join(f"{page_text}\n" for page_text in page_texts)

def split_list_objections(text):
    """
    Splits a numbered list ("1.", "2)", "(3)" at the start of a line) into
    a dictionary keyed "objection<number>".
    Text before the first list marker is discarded, and a repeated number
    overwrites the earlier entry.
    If no list marker is found, the whole stripped text is returned as "objection1".
    """
    # With one capture group, re.split yields [preamble, num, body, num, body, ...].
    pieces = re.split(r"(?m)^\s*(?:\(?(\d+)\)?[.\)])\s*", text)
    numbers = pieces[1::2]
    bodies = pieces[2::2]
    found = {
        f"objection{number.strip()}": body.strip()
        for number, body in zip(numbers, bodies)
    }
    if found:
        return found
    return {"objection1": text.strip()}

def parse_objection_letters(text):
    """
    Finds every objection letter in the text and breaks it into objections.
    A letter is the span from "Objection Letter" through "Objection Letter Date <MM/DD/YYYY>"
    and "Introduction:" up to "Conclusion:"; the part between the last two is its body.
    Returns a dictionary keyed "Objection_letterX" (X counts from 1), each holding
    "objection_letter_date" plus one "objection<number>" entry per objection.
    """
    letter_re = re.compile(
        r"(Objection Letter.*?Objection Letter Date\s+(\d{2}/\d{2}/\d{4}).*?Introduction:\s*(.*?)\s*Conclusion:)",
        flags=re.IGNORECASE | re.DOTALL,
    )
    heading_re = re.compile(r"Objection\s+(\d+)", flags=re.IGNORECASE)
    block_re = re.compile(
        r"(Objection\s+(\d+).*?)(?=(Objection\s+\d+|Conclusion:|$))",
        flags=re.IGNORECASE | re.DOTALL,
    )
    list_marker_re = re.compile(r"(?m)^\s*(\(?\d+\)?[.\)])")

    letters = {}
    for position, letter in enumerate(letter_re.finditer(text), start=1):
        sent_on = letter.group(2).strip()
        body = letter.group(3).strip()
        if len(heading_re.findall(body)) > 1:
            # Several "Objection N" headings: cut the body at each heading.
            parsed = {
                f"objection{hit.group(2).strip()}": hit.group(1).strip()
                for hit in block_re.finditer(body)
            }
        elif list_marker_re.search(body):
            # No explicit headings, but the body looks like a numbered list.
            parsed = split_list_objections(body)
        else:
            parsed = {"objection1": body}
        letters[f"Objection_letter{position}"] = {
            "objection_letter_date": sent_on,
            **parsed,
        }
    return letters

def extract_filing_company_info(text):
    """
    Reads the company identifiers out of the text.
    Returns a tuple (group_code, company_codes): group_code is the first
    "Group Code:" number as an int (None when absent) and company_codes is
    the list of every "CoCode:" number as ints (empty when absent).
    """
    first_group = re.search(r"Group Code:\s*(\d+)", text, flags=re.IGNORECASE)
    group_code = None if first_group is None else int(first_group.group(1))
    company_codes = [
        int(hit.group(1))
        for hit in re.finditer(r"CoCode:\s*(\d+)", text, flags=re.IGNORECASE)
    ]
    return group_code, company_codes

# --- Extraction functions for additional filing info ---
def parse_filing_at_a_glance(text):
    """
    Reads the key fields out of the "Filing at a Glance" section.
    The section runs from that heading to the next "<Word> Information" line
    (or the end of the text). Returns a dictionary with keys: serff_tr_num,
    filing_type, date_submitted, effective_date_requested_new,
    effective_date_requested_renewal, serff_status, toi.
    A field that is not found is returned as an empty string.
    """
    # (output key, pattern whose first group is the value), in output order.
    field_patterns = (
        ("serff_tr_num", r"SERFF Tr Num:\s*([A-Z0-9\-]+)"),
        ("filing_type", r"Filing Type:\s*([^\n]+)"),
        ("date_submitted", r"Date Submitted:\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_new", r"Effective Date\s*Requested\s*\(New\):\s*(\d{2}/\d{2}/\d{4})"),
        ("effective_date_requested_renewal", r"Effective Date\s*Requested\s*\(Renewal\):\s*(\d{2}/\d{2}/\d{4})"),
        ("serff_status", r"SERFF Status:\s*([^\n]+)"),
        ("toi", r"TOI:\s*([^\n]+)"),
    )

    section_hit = re.search(
        r"Filing at a Glance(.*?)(?=\n[A-Z][a-z]+ Information|$)",
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )
    section = "" if section_hit is None else section_hit.group(1)

    info = {}
    for key, pattern in field_patterns:
        hit = re.search(pattern, section, flags=re.IGNORECASE)
        info[key] = "" if hit is None else hit.group(1).strip()
    return info

def parse_filing_description(text):
    """
    Returns the text that follows "Filing Description:" up to the next
    "Filing Contact Information" or "Filing Company Information" heading,
    with surrounding whitespace removed.
    Returns an empty string when that span is not found.
    """
    description_re = re.compile(
        r"Filing Description:\s*(.*?)\s*(Filing Contact Information|Filing Company Information)",
        flags=re.IGNORECASE | re.DOTALL,
    )
    hit = description_re.search(text)
    return hit.group(1).strip() if hit else ""

# --- Processing functions for objections (old version) ---
def process_pdf_file(pdf_path):
    """
    Builds the objections record for one PDF, reading only its first 100 pages.
    Returns a one-entry dictionary keyed by the file name without extension,
    whose value holds:
      - "objections": parsed objection letters,
      - "group_code": extracted group code,
      - "company_codes": list of CoCodes,
      - "download_week": the download_week datetime.
    """
    text = extract_full_text(pdf_path, max_pages=100)
    letters = parse_objection_letters(text)
    group_code, company_codes = extract_filing_company_info(text)
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))
    record = {
        "objections": letters,
        "group_code": group_code,
        "company_codes": company_codes,
        "download_week": download_week_dt,
    }
    return {stem: record}

def process_all_pdfs_in_folder(pdf_folder):
    """
    Runs the objections processing on every "*.pdf" file directly inside the folder.
    Prints each file path and the current time as progress output.
    Returns one dictionary combining the per-file results.
    """
    combined = {}
    for path in glob.glob(os.path.join(pdf_folder, "*.pdf")):
        print(path)
        print(datetime.now())
        for file_key, record in process_pdf_file(path).items():
            combined[file_key] = record
    return combined

# --- Processing functions for additional filing info (with API calls) ---
def extract_full_text_all(pdf_path):
    """
    Returns the text of the PDF, reading no more than its first 3000 pages.
    """
    page_cap = 3000
    return extract_full_text(pdf_path, max_pages=page_cap)

def process_pdf_additional(pdf_path):
    """
    Collects the additional filing info for one PDF.
    The parsed fields (at-a-glance values, description, group/company codes)
    come from the first 100 pages. The model summaries use the text of up to
    3000 pages, truncated to the token limit configured for each model.
    A summary is only requested when its CALL_* switch is "Y"; otherwise it is "".
    Returns a flat dictionary of the parsed fields plus the three summaries.
    """
    # Parsed (non-API) fields come from the first 100 pages only.
    head_text = extract_full_text(pdf_path, max_pages=100)
    glance = parse_filing_at_a_glance(head_text)
    description = parse_filing_description(head_text)
    group_code, company_codes = extract_filing_company_info(head_text)

    # Summaries work from the longer text (up to 3000 pages).
    api_text = extract_full_text_all(pdf_path)
    pdf_name = os.path.basename(pdf_path)
    token_total = count_tokens(api_text, model='gpt-3.5-turbo')
    print(f"Token count for API call in {pdf_name}: {token_total}")

    # Each token limit is only looked up when its model is switched on.
    gpt_3_5_summary = (
        call_gpt_3_5_turbo(truncate_text(api_text, max_tokens=MAX_TOKENS_GPT3_5, model="gpt-3.5-turbo"))
        if CALL_GPT_3_5 == "Y"
        else ""
    )
    o1_mini_summary = (
        call_o1_mini(truncate_text(api_text, max_tokens=MAX_TOKENS_O1_MINI, model="o1-mini"))
        if CALL_O1_MINI == "Y"
        else ""
    )
    xai_summary = (
        call_xai(truncate_text(api_text, max_tokens=MAX_TOKENS_XAI, model="grok-2-latest"))
        if CALL_XAI == "Y"
        else ""
    )

    leading_keys = (
        "serff_tr_num",
        "filing_type",
        "date_submitted",
        "effective_date_requested_new",
        "effective_date_requested_renewal",
        "serff_status",
    )
    additional = {key: glance.get(key, "") for key in leading_keys}
    additional["filing_description"] = description
    additional["filing_method"] = ""  # Not extracted
    additional["toi"] = glance.get("toi", "")
    additional["group_code"] = group_code
    additional["company_codes"] = company_codes
    additional["download_week"] = download_week_dt
    additional["gpt_3_5_summary"] = gpt_3_5_summary
    additional["o1_mini_summary"] = o1_mini_summary
    additional["xai_summary"] = xai_summary

    print(f"Processed additional filing info for: {os.path.basename(pdf_path)}")
    return additional

def process_all_pdfs_additional(pdf_folder):
    """
    Collects the additional filing info for every "*.pdf" file directly inside the folder.
    Prints each file path and the current time as progress output.
    Returns a dictionary keyed by file name without extension.
    """
    collected = {}
    for path in glob.glob(os.path.join(pdf_folder, "*.pdf")):
        print(path)
        print(datetime.now())
        stem, _extension = os.path.splitext(os.path.basename(path))
        collected[stem] = process_pdf_additional(path)
    return collected

def additional_to_dataframe(results):
    """
    Turns the additional filing info dictionary into a DataFrame, one row per file.
    Missing values become "" (None for week_date); the company code list is
    stored as its string representation.
    """
    # (output column, key in the per-file info) for fields copied as-is.
    plain_fields = (
        ("SERFF Tr Num", "serff_tr_num"),
        ("Filing Type", "filing_type"),
        ("Date Submitted", "date_submitted"),
        ("Effective Date Requested (New)", "effective_date_requested_new"),
        ("Effective Date Requested (Renewal)", "effective_date_requested_renewal"),
        ("SERFF Status", "serff_status"),
        ("Filing Description", "filing_description"),
        ("Filing Method", "filing_method"),
        ("TOI", "toi"),
        ("Group_Code", "group_code"),
        ("gpt_3_5_summary", "gpt_3_5_summary"),
        ("o1_mini_summary", "o1_mini_summary"),
        ("xai_summary", "xai_summary"),
    )
    column_order = [
        "SERFF Tr Num", "Filing Type", "Date Submitted",
        "Effective Date Requested (New)", "Effective Date Requested (Renewal)",
        "SERFF Status", "Filing Description", "Filing Method", "TOI",
        "Group_Code", "Company_Codes", "week_date",
        "gpt_3_5_summary", "o1_mini_summary", "xai_summary",
    ]

    records = []
    for info in results.values():
        record = {column: info.get(key, "") for column, key in plain_fields}
        record["Company_Codes"] = str(info.get("company_codes", ""))
        record["week_date"] = info.get("download_week", None)
        records.append(record)
    return pd.DataFrame(records, columns=column_order)

# --- DataFrame for objections ---
def results_to_dataframe_objections(results):
    """
    Flattens the objections dictionary (from process_all_pdfs_in_folder) into a DataFrame.
    Each objection becomes one row; a file without objection letters gets a
    single row whose objection columns read "No objections".
    The state is the part of the file key before the first "_" (or its first
    two characters when there is no "_"). Newlines inside objection text are
    replaced by spaces.
    """
    column_order = [
        "SERFF#", "State", "Group_Code", "Company_Code", "Company_Code_all",
        "Objection_letter_date", "Objectionletter#", "Objection#", "Objection", "download_week",
    ]
    records = []
    for serff_id, details in results.items():
        codes = details.get("company_codes", [])
        # Columns that are identical on every row of this file.
        shared = {
            "SERFF#": serff_id,
            "State": serff_id.split('_')[0] if "_" in serff_id else serff_id[:2],
            "Group_Code": details.get("group_code", "No objections"),
            "Company_Code": codes[0] if codes else "No objections",
            "Company_Code_all": str(codes) if codes else "No objections",
            "download_week": details.get("download_week", None),
        }
        letters = details.get("objections", {})
        if not letters:
            records.append({
                **shared,
                "Objection_letter_date": "No objections",
                "Objectionletter#": "No objections",
                "Objection#": "No objections",
                "Objection": "No objections",
            })
            continue
        for letter_name, letter_body in letters.items():
            sent_on = letter_body.get("objection_letter_date", "No objections")
            records.extend(
                {
                    **shared,
                    "Objection_letter_date": sent_on,
                    "Objectionletter#": letter_name,
                    "Objection#": label,
                    "Objection": content.replace("\n", " ").strip(),
                }
                for label, content in letter_body.items()
                if label != "objection_letter_date"
            )
    return pd.DataFrame(records, columns=column_order)

# --- Main Processing ---
pdf_folder = f"Filings/Final/{date}"  # Folder path based on date parameter


def _json_default(obj):
    """
    Fallback serializer for json.dump: datetimes are written as ISO 8601 strings.
    Any other unsupported type raises TypeError, as json expects; returning the
    object unchanged would instead surface as a misleading
    "Circular reference detected" error.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Process objections using the old functions.
objection_results = process_all_pdfs_in_folder(pdf_folder)
json_output_path = f"objections_output_{date}.json"
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(objection_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved objections JSON output to {json_output_path}")

# Flatten the objections to one row per objection and save as CSV.
df_objections = results_to_dataframe_objections(objection_results)
csv_output_path = f"objections_output_{date}.csv"
df_objections.to_csv(csv_output_path, index=False)
print(f"Saved objections CSV output to {csv_output_path}")

# Process additional filing info using the new function.
additional_results = process_all_pdfs_additional(pdf_folder)
additional_json_output_path = f"additional_filing_info_{date}.json"
with open(additional_json_output_path, "w", encoding="utf-8") as f:
    json.dump(additional_results, f, default=_json_default, ensure_ascii=False, indent=4)
print(f"Saved additional filing info JSON output to {additional_json_output_path}")

# One row per filing, saved as CSV.
df_additional = additional_to_dataframe(additional_results)
additional_csv_output_path = f"additional_filing_info_{date}.csv"
df_additional.to_csv(additional_csv_output_path, index=False)
print(f"Saved additional filing info CSV output to {additional_csv_output_path}")


Filings/Final/2025-03-03\AL_ALSE-134300517.pdf
2025-03-05 07:38:47.721856
Filings/Final/2025-03-03\AL_LBPM-134436753.pdf
2025-03-05 07:38:47.797844
Filings/Final/2025-03-03\AR_TRVD-G134425501.pdf
2025-03-05 07:38:53.047600
Filings/Final/2025-03-03\AZ_TRVD-134038097.pdf
2025-03-05 07:38:53.282250
Filings/Final/2025-03-03\AZ_TRVD-G134416866.pdf
2025-03-05 07:38:53.698131
Filings/Final/2025-03-03\AZ_TRVD-G134417169.pdf
2025-03-05 07:38:54.092514
Filings/Final/2025-03-03\CA_ALSE-134306035.pdf
2025-03-05 07:38:54.322003
Filings/Final/2025-03-03\CA_LBRM-134233558.pdf
2025-03-05 07:38:55.003396
Filings/Final/2025-03-03\CA_LBRM-134354159.pdf
2025-03-05 07:38:55.611450
Filings/Final/2025-03-03\CO_LBPM-134412448.pdf
2025-03-05 07:38:56.352865
Filings/Final/2025-03-03\CO_LEMO-134207600.pdf
2025-03-05 07:38:56.534432
Filings/Final/2025-03-03\CO_USAA-134337652.pdf
2025-03-05 07:38:57.086351
Filings/Final/2025-03-03\CT_TRVD-G134412788.pdf
2025-03-05 07:38:57.443973
Filings/Final/2025-03-03\GA_ALSE-1