In [4]:
import os
import pandas as pd
import re
import fitz
from openai import OpenAI
import json
import time
from pdfminer.high_level import extract_text
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.errors import PdfReadError

In [5]:
def convert_pdf_to_txt(pdf_path, txt_path):
    """Extract the text of a PDF and store it in a UTF-8 text file.

    Any exception raised during extraction or writing is caught and reported
    on stdout, so the function always returns ``None``.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to create or overwrite.
    """
    try:
        # Extract first: if this fails, no (empty) text file is left behind.
        extracted = extract_text(pdf_path)
        with open(txt_path, mode='w', encoding='utf-8') as out_handle:
            out_handle.write(extracted)
    except Exception as err:
        print(f"Failed to convert {pdf_path}: {err}")
    else:
        print(f"Successfully converted {pdf_path} to {txt_path}")

In [6]:
def find_contents_text(list_of_page):
    """Pick the page that most likely holds the table of contents.

    Pages whose text contains "content" (case-insensitive) are candidates.
    If exactly one page matches, it is returned directly. Otherwise the
    candidates (or every page, when none matched) are scored by how many
    distinct "page-number-like" values they contain, and the best one wins.

    A value is page-number-like when it is a standalone 1-3 digit number that
    is not part of a percentage, decimal, thousands group, signed value or
    currency amount.

    Args:
        list_of_page: List of page texts, in document order.

    Returns:
        A ``(page_index, page_text)`` tuple. When no page qualifies the
        result is ``(-1, None)``.
    """
    pattern_contents = re.compile(r'content', re.IGNORECASE)
    # 1-3 digit standalone numbers that are not followed by a percent sign.
    pattern_numbers = re.compile(r'\b\d{1,3}\b(?!%)')

    def is_valid_match(match, text):
        # Inspect the characters around the position the regex actually
        # matched. (Looking the digits up again with text.find() would examine
        # the first occurrence of the substring, e.g. the "12" inside "2012".)
        start_idx, end_idx = match.span()

        if start_idx > 0 and text[start_idx - 1].isdigit():
            return False
        if end_idx < len(text) and text[end_idx] in '.%,':
            return False
        if start_idx > 0 and text[start_idx - 1] in '-*+¥$€£.,Y':
            return False

        return True

    def count_valid_numbers(text):
        # Number of distinct page-number-like values on the page.
        return len({
            match.group()
            for match in pattern_numbers.finditer(text)
            if is_valid_match(match, text)
        })

    contents_pages = [index for index, page in enumerate(list_of_page) if pattern_contents.search(page)]

    print("Pages with 'contents':", contents_pages)

    if len(contents_pages) == 1:
        contents_page_number = contents_pages[0]
        return contents_page_number, list_of_page[contents_page_number]

    # Several candidates: choose among them. No candidate: consider every page.
    candidate_indexes = contents_pages if contents_pages else range(len(list_of_page))

    max_count = 0
    best_page = None
    best_page_number = -1

    for index in candidate_indexes:
        page = list_of_page[index]
        valid_count = count_valid_numbers(page)

        # A page must beat the current best by more than 5 distinct numbers
        # (and the first winner needs more than 5), as in the original logic.
        if valid_count - 5 > max_count:
            max_count = valid_count
            best_page = page
            best_page_number = index

    return best_page_number, best_page

In [7]:
def extract_page_from_pdf(pdf_path, page_number, output_path):
    """Copy a single page of a PDF into a new one-page PDF file.

    Args:
        pdf_path: Path of the source PDF.
        page_number: Page index passed to PyMuPDF (``load_page`` and
            ``insert_pdf``).
        output_path: Path of the PDF file to write.
    """
    doc = fitz.open(pdf_path)
    try:
        # Kept from the original implementation; presumably this makes an
        # invalid page index fail early, before any output is produced.
        doc.load_page(page_number)

        new_doc = fitz.open()
        try:
            new_doc.insert_pdf(doc, from_page=page_number, to_page=page_number)
            new_doc.save(output_path)
        finally:
            new_doc.close()
    finally:
        # The source document used to be left open; release it explicitly.
        doc.close()

In [8]:
def show_json(obj):
    """Display ``obj`` in the notebook as parsed JSON.

    ``obj`` must expose ``model_dump_json()``; its JSON text is parsed and the
    resulting Python structure is handed to the notebook's ``display``.
    """
    serialized = obj.model_dump_json()
    parsed = json.loads(serialized)
    display(parsed)

In [9]:
def TOC_to_dict(TOC_path):
    """Ask an OpenAI assistant to read a table-of-contents PDF.

    The PDF is uploaded, attached to a temporary vector store, and a temporary
    assistant with file search is asked to return the table of contents as a
    Python dictionary. The assistant, vector store and thread are deleted
    before returning, even if a step fails.

    Args:
        TOC_path: Path of the PDF page that contains the table of contents.

    Returns:
        The message list object returned by
        ``client.beta.threads.messages.list`` for the conversation thread.
    """
    # SECURITY: the API key is no longer embedded in the notebook. OpenAI()
    # reads it from the OPENAI_API_KEY environment variable. The key that was
    # previously committed here should be treated as leaked and revoked.
    client = OpenAI()

    # Close the local file handle as soon as the upload has finished.
    with open(TOC_path, "rb") as toc_file:
        file = client.files.create(
            file=toc_file,
            purpose="assistants"
        )

    vs = client.beta.vector_stores.create(
        file_ids=[file.id]
    )

    def wait_on_run(run, thread):
        # Poll until the run leaves the queued / in-progress states.
        while run.status == "queued" or run.status == "in_progress":
            time.sleep(0.5)
            run = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id,
            )
        return run

    assistant = None
    thread = None
    try:
        assistant = client.beta.assistants.create(
            name="TOC",
            instructions="You can identify the structure of the table of contents from the pdf that contains it and represent it with a Python dictionary, and your output should contain only the dictionary",
            model="gpt-4o-mini",
            tools=[{"type": "file_search"}],
            tool_resources={"file_search": {"vector_store_ids":[vs.id] }},
            temperature = 0.2
        )

        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content="Give me the Table of Contents of that PDF with dictionary format{title:{subtitle1:page1,subtitle2:page2,...}}",
        )

        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=assistant.id,
        )

        run = wait_on_run(run, thread)
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        return messages
    finally:
        # Always remove the temporary remote resources, including on failure.
        if assistant is not None:
            client.beta.assistants.delete(assistant_id=assistant.id)
        client.beta.vector_stores.delete(vs.id)
        if thread is not None:
            client.beta.threads.delete(thread_id=thread.id)

In [10]:
def convert_to_page_ranges(dict_gpt):
    """Turn a table-of-contents dict of start pages into page ranges.

    Every leaf value (a start page) is replaced by the list of pages from that
    start page up to, but not including, the next larger start page found
    anywhere in the dictionary. The largest start page maps to a one-page
    list. Nested dictionaries are processed recursively and keep their shape.

    If the outer dictionary has exactly one entry whose value is itself a
    dictionary (a wrapper such as ``{"Contents": {...}}``), that inner
    dictionary is used instead.

    Args:
        dict_gpt: Possibly nested dict mapping titles to start pages. Leaf
            values that cannot be converted with ``int()`` are treated as 0.

    Returns:
        A dict with the same nesting whose leaves are lists of ints.
    """
    def to_page(value):
        # Unparseable values (bad strings, None, lists, ...) count as page 0.
        try:
            return int(value)
        except (ValueError, TypeError):
            return 0

    # Unwrap a single top-level wrapper, but only when it really wraps a dict;
    # a lone {"Title": 5} entry must be kept as an ordinary section.
    if len(dict_gpt) == 1:
        only_value = next(iter(dict_gpt.values()))
        if isinstance(only_value, dict):
            dict_gpt = only_value

    # Collect every start page that appears at any nesting depth.
    all_pages = set()

    def extract_pages(section_dict):
        for value in section_dict.values():
            if isinstance(value, dict):
                extract_pages(value)
            else:
                all_pages.add(to_page(value))

    extract_pages(dict_gpt)
    ordered_pages = sorted(all_pages)
    # start page -> next larger start page (the last page has no entry).
    next_start = dict(zip(ordered_pages, ordered_pages[1:]))

    # Replace each start page by the range that runs up to the next section.
    def process_section(section_dict):
        processed_section = {}
        for key, value in section_dict.items():
            if isinstance(value, dict):
                processed_section[key] = process_section(value)
            else:
                start_page = to_page(value)
                end_page = next_start.get(start_page, start_page + 1)
                processed_section[key] = list(range(start_page, end_page))
        return processed_section

    return process_section(dict_gpt)


In [11]:
def get_map(slide_a_page,slide_b_page,slide_c_page,slide_a_path,slide_b_path,slide_c_path):
    """Ask an OpenAI assistant for the page-number -> slide-number mapping.

    Three single-slide PDFs are uploaded and the assistant is asked to read
    the printed page numbers on them, relate them to the given slide numbers
    and answer with a Python function named ``page_to_slide``. The temporary
    assistant, vector store and thread are deleted before returning, even if
    a step fails.

    Args:
        slide_a_page, slide_b_page, slide_c_page: Slide numbers of the three
            sample slides, inserted into the prompt.
        slide_a_path, slide_b_path, slide_c_path: Paths of the PDFs that each
            contain one of those slides.

    Returns:
        The message list object returned by
        ``client.beta.threads.messages.list`` for the conversation thread.
    """
    # SECURITY: the API key is no longer embedded in the notebook. OpenAI()
    # reads it from the OPENAI_API_KEY environment variable. The key that was
    # previously committed here should be treated as leaked and revoked.
    client = OpenAI()

    def upload(path):
        # Upload one PDF and close the local file handle right afterwards.
        with open(path, "rb") as handle:
            return client.files.create(file=handle, purpose="assistants")

    file_a = upload(slide_a_path)
    file_b = upload(slide_b_path)
    file_c = upload(slide_c_path)

    vs = client.beta.vector_stores.create(
        file_ids=[file_a.id, file_b.id,file_c.id]
    )

    def wait_on_run(run, thread):
        # Poll until the run leaves the queued / in-progress states.
        while run.status == "queued" or run.status == "in_progress":
            time.sleep(2)
            run = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id,
            )
        return run

    assistant = None
    thread = None
    try:
        assistant = client.beta.assistants.create(
            name="Page ot slide mapping",
            instructions="You are a helpful assistance",
            model="gpt-4o-mini",
            tools=[{"type": "file_search"}],
            tool_resources={"file_search": {"vector_store_ids":[vs.id]}}
        )

        thread = client.beta.threads.create()
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=f"You have three pdf files in your document library, each of one is the page from a company's annual report. \n\nFirst,Identify how many pages (one or two) are in a pdf slide. Then, directly extract the 'page number' from the content of pdfs in your library and remember them as 'page_a', 'page_b', 'page_c'(dont just think they are 1,2,3).(1.Extracted page numbers of {file_a.id} and {file_b.id} should be consecutive and not repeated because the pdf is two consecutive pages\n 2.a landscape pdf may correspond to two rather than one page numbers, u can remember as 'page_a_1', 'page_a_2', 'page_b1','page_b2', 'page_c1', 'page_c2'.)\n\n The extracted ['page_a', 'page_b', 'page_c'](['page_a_1', 'page_a_2', 'page_b1','page_b2', 'page_c1', 'page_c2'] if a landscape pdf slide has two pages) correspond to the three slide numbers: [{slide_a_page},{slide_b_page},{slide_c_page}]. Please find out the same mathematical relationship between the page numbers and the slide numbers. What is the mathematical operation of the page number to get the corresponding slide number? (you should do a check to make sure your answer is correct). Your output must has a Python function for only i (function name 'page_to_slide'), where i is the page number displayed./n/n When two pages in one slide, u must use function like 'int()'or'%'or'//' ",
        )
        # Disabled prompt addition, kept for reference:
        # "The function cannot be a simple mapping like below: def page_to_slide(page_number):
        #  if page_number == 93: return 93 elif page_number == 94: return 94
        #  elif page_number == 372: return 372 else: raise ValueError('Page number not recognized.')"

        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=assistant.id,
        )

        run = wait_on_run(run, thread)
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        if len(messages.data) == 1:
            # Only the user's own message is present so far: give the run a
            # little more time and fetch the messages again. Previously the
            # stale one-message list was returned after the second wait.
            time.sleep(2)
            run = wait_on_run(run, thread)
            messages = client.beta.threads.messages.list(thread_id=thread.id)
        return messages
    finally:
        # Always remove the temporary remote resources, including on failure.
        if assistant is not None:
            client.beta.assistants.delete(assistant_id=assistant.id)
        client.beta.vector_stores.delete(vs.id)
        if thread is not None:
            client.beta.threads.delete(thread_id=thread.id)

In [12]:
def apply_to_all_values(d, func):
    """Apply ``func`` to every leaf of a nested dict/list structure.

    Dictionaries and lists are rebuilt recursively with the same keys and
    order; any other value is replaced by ``int(func(value))``.

    Args:
        d: A dict, a list, or a leaf value.
        func: Callable applied to each leaf; its result is passed to ``int``.

    Returns:
        A new structure of the same shape with converted leaves.
    """
    if isinstance(d, dict):
        converted = {}
        for key, value in d.items():
            converted[key] = apply_to_all_values(value, func)
        return converted

    if isinstance(d, list):
        return list(map(lambda element: apply_to_all_values(element, func), d))

    return int(func(d))

In [21]:
def Governance_slides_numbers(dic,by="gpt"):
    """Ask GPT which slides of the table of contents cover governance topics.

    Args:
        dic: Table-of-contents structure; it is interpolated into the prompt.
        by: Selection strategy. Only ``"gpt"`` is implemented.

    Returns:
        A ``(response, slides)`` tuple: the raw text of the model's answer and
        the sorted, de-duplicated list parsed from its fenced python block.

    Raises:
        NotImplementedError: If ``by`` is not ``"gpt"`` (this used to return
            ``None`` silently, which broke tuple unpacking in callers).
        ValueError: If the answer has no fenced python block or the block is
            not a plain literal.
    """
    import ast  # local import: only needed to parse the model's answer

    if by != "gpt":
        raise NotImplementedError(f"Unsupported selection strategy: {by!r}")

    # SECURITY: the API key is no longer embedded in the notebook. OpenAI()
    # reads it from the OPENAI_API_KEY environment variable. The key that was
    # previously committed here should be treated as leaked and revoked.
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role":"system","content":"You are a helpful assistance"},
            {"role":"user","content":f"""{dic}\n\nGiven the table of contents of an annual report, identify the page numbers that cover topics related to governance within the Environmental, Social, and Governance (ESG) review and the Corporate Governance Report. The specific governance issues of interest are:

                Ownership & Control: Evaluation of the company's equity ownership structure and its impact on shareholder rights.
                Board: Effectiveness of the board in overseeing management and corporate strategy, protecting investor value, and representing shareholder interests.
                Pay: Alignment between executive pay and corporate strategy.
                Accounting: Transparency, independence, and effectiveness of audit and financial reporting practices.
                Business Ethics: Oversight and management of business ethics issues such as fraud, executive misconduct, corrupt practices, money laundering, or anti-trust violations.
                Tax Transparency: Evaluation of the company's corporate tax practices and transparency.
                Include relevant pages from the sections on Governance within ESG and Corporate Governance Report, specifically from 'Board Committees' and 'Directors’ Remuneration Report'. If these sections contain more pages than needed, only include the first few pages of each section such that the total number of pages does not exceed 50. 
                
                The output should contain a flatten python list including all pages you think related, like python code\n[pages numbers(not need to be continous)] dont use any function."""
            }
        ]
    )
    response = completion.choices[0].message.content

    pattern = r"```python\n(.+?)\n```"
    fenced = re.search(pattern, response, re.DOTALL)
    if fenced is None:
        raise ValueError("GPT response did not contain a ```python fenced list")

    # The model's output is untrusted text: parse it as a literal instead of
    # executing it with eval(). Answers that use function calls are rejected
    # with ValueError, which the caller's retry loop already handles.
    parsed = ast.literal_eval(fenced.group(1).strip())
    slides = sorted(set(parsed))
    return response, slides

In [22]:
def extend_segments(nums, margin=2):
    """Widen every run of consecutive numbers by ``margin`` on both sides.

    Each run ``start..end`` of consecutive values in ``nums`` contributes the
    range ``start - margin .. end + margin``. That is the same as adding
    ``value - margin .. value + margin`` for every single value, which is how
    the result is computed here.

    Args:
        nums: List of integers (e.g. slide numbers). May be empty.
        margin: How many neighbours to add on each side. Defaults to 2, the
            previously hard-coded amount.

    Returns:
        Sorted list of unique integers. The result is not clamped, so values
        below zero can occur when ``nums`` contains values smaller than
        ``margin``.
    """
    # An empty selection used to raise IndexError on nums[0].
    if not nums:
        return []

    widened = {
        value + offset
        for value in nums
        for offset in range(-margin, margin + 1)
    }
    return sorted(widened)

In [23]:
def extract_pages_from_pdf(pdf_path, output_path, pages, password=None):
    """Write the selected pages of a PDF to a new PDF file.

    Errors are reported on stdout instead of being raised; the function
    always returns ``None``.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Path of the PDF to create.
        pages: Iterable of 1-based page numbers. Numbers outside the document
            are reported and skipped.
        password: Password used when the source PDF is encrypted. ``None``
            means no password was supplied.
    """
    try:
        # Open the source PDF.
        reader = PdfReader(pdf_path)

        # Decrypt the PDF when it is encrypted and a password was provided.
        if reader.is_encrypted:
            # Compare against None so an empty-string password is still tried.
            if password is None:
                print("The PDF file is encrypted. Please provide a password.")
                return
            try:
                decrypt_result = reader.decrypt(password)
            except PdfReadError as e:
                print(f"Failed to decrypt the PDF file: {e}")
                return
            # decrypt() signals a wrong password through a falsy return value
            # rather than an exception, so that case must be checked too.
            if not decrypt_result:
                print("Failed to decrypt the PDF file: incorrect password.")
                return

        writer = PdfWriter()
        page_count = len(reader.pages)

        # Copy the requested pages, converting 1-based numbers to 0-based indexes.
        for page in pages:
            page_index = page - 1
            if 0 <= page_index < page_count:
                writer.add_page(reader.pages[page_index])
            else:
                print(f"Page {page} is out of range.")

        # Write the collected pages to the new PDF file.
        with open(output_path, 'wb') as output_pdf_file:
            writer.write(output_pdf_file)

        print(f"Successfully created {output_path} with specified pages.")
    except Exception as e:
        print(f"Failed to extract pages from {pdf_path}: {e}")


In [24]:
# Build the list of company folders for each report set. Paths are made
# absolute from the current working directory; hidden entries (names starting
# with ".") are skipped.
items = os.listdir('50_Annual_Reports')
companies_50 = [
    f"{os.getcwd()}/50_Annual_Reports/{entry}"
    for entry in items
    if not entry.startswith('.')
]
items = os.listdir('8_Annual_Reports')
companies_8 = [
    f"{os.getcwd()}/8_Annual_Reports/{entry}"
    for entry in items
    if not entry.startswith('.')
]

In [25]:
def main(company_folder_path,output_information):
    """Run the governance-extraction pipeline for one company folder.

    Steps: convert the annual report to text (once), find the table of
    contents page, let GPT turn it into a dictionary, let GPT derive the
    printed-page -> slide mapping from three sample slides, map the table of
    contents to slide numbers, let GPT pick the governance slides, and write
    the matching text and PDF excerpts next to the report.

    Args:
        company_folder_path: Folder that contains ``annual Report.pdf``.
        output_information: Dict that is filled with intermediate results.
            It is mutated in place, also when a later step raises.

    Returns:
        The same ``output_information`` dict.

    Raises:
        ValueError: If GPT's mapping answer has no fenced python block or
            does not define ``page_to_slide``. Other exceptions from the
            helper functions propagate unchanged.
    """
    name = os.path.basename(company_folder_path)
    output_information['name'] = name
    print(name)

    pdf_path = company_folder_path + '/annual Report.pdf'
    txt_path = company_folder_path + '/annual Report.txt'
    TC_path = company_folder_path + '/'  + 'TC.pdf'
    slide1_path = company_folder_path + '/' + 'slide_a.pdf'
    slide2_path = company_folder_path + '/' + 'slide_b.pdf'
    slide3_path = company_folder_path + '/' + 'slide_c.pdf'
    output_txt_path = company_folder_path + '/' + 'Governance.txt'
    output_pdf_path = company_folder_path + '/' + 'Governance.pdf'

    if os.path.exists(txt_path):
        print(f"{pdf_path}  already been converted. Skipping...")
    else:
        print('Coverting the Annual report')
        convert_pdf_to_txt(pdf_path, txt_path)

    with open(txt_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # Pages in the extracted text are separated by form feeds.
    split_content = content.split('\f')
    total_pages = len(split_content)
    first7pages = split_content[:7]
    # Find the page that holds the table of contents.
    # NOTE(review): find_contents_text can return (-1, None) when nothing
    # qualifies; that case is passed on unchanged here - confirm it is wanted.
    table_page_number, table_page_content = find_contents_text(first7pages)
    output_information['table_page_number'] = table_page_number
    # Three sample slides (0-based indexes) from which GPT derives the
    # relation between the printed page number and the slide number.
    slide1_page = total_pages//5
    slide2_page = slide1_page+1
    slide3_page = (total_pages//5)*4

    extract_page_from_pdf(pdf_path, table_page_number, output_path = TC_path)
    extract_page_from_pdf(pdf_path, slide1_page, output_path = slide1_path)
    extract_page_from_pdf(pdf_path, slide2_page, output_path = slide2_path)
    extract_page_from_pdf(pdf_path, slide3_page, output_path = slide3_path)

    print("Extrarting TOC by GPT......")
    # GPT reads the table of contents from the PDF and answers with a dictionary.
    dict_res = TOC_to_dict(TC_path)
    output_information['dict_res'] = dict_res
    # Strip the code fence and all whitespace (this also removes the spaces
    # inside titles), then parse the remaining text as JSON.
    dict_str = dict_res.data[0].content[0].text.value.replace('python\n','').replace('\n','').replace(' ','').replace('```','')
    dict_gpt = json.loads(dict_str)
    output_information['dict_gpt'] = dict_gpt
    # Turn each start page into the range of pages of its section.
    dic = convert_to_page_ranges(dict_gpt)
    output_information['range_dict'] = dic
    print("-"*100,"\n","TOC Pages number dictionary:",dic,"\n","-"*100)

    print("Getting the map by GPT......")
    # GPT derives the mapping and answers with a Python function definition.
    code_res = get_map(slide1_page,slide2_page,slide3_page,slide1_path,slide2_path,slide3_path)
    output_information['code_res'] = code_res
    pattern = r"```python\n(.+?)\n```"
    code_match = re.search(pattern, code_res.data[0].content[0].text.value, re.DOTALL)
    if code_match is None:
        raise ValueError("GPT response did not contain a ```python code block")
    relation_code = code_match.group(1)
    print("-"*100,"\n","GPT find the relation as function:\n",relation_code,"\n","-"*100)
    # SECURITY: this executes code written by the model. It runs in its own
    # namespace so it cannot rebind this function's variables, and the result
    # is read from that namespace instead of relying on exec() writing into
    # locals(), which is not guaranteed to work.
    relation_namespace = {}
    exec(relation_code, relation_namespace)
    if 'page_to_slide' not in relation_namespace:
        raise ValueError("GPT code did not define a function named 'page_to_slide'")
    page_to_slide = relation_namespace['page_to_slide']
    globals()['page_to_slide'] = page_to_slide  # keep the function available to other cells

    print("Mapping the dictionary......")
    # Map the page ranges of the table of contents to slide numbers.
    dict_slide = apply_to_all_values(dic,page_to_slide)
    output_information['dict_slide'] = dict_slide
    print("-"*100,"\n","TOC Slides number dictionary:",dict_slide,"\n","-"*100)

    print("Getting the Governance text by GPT......")
    # GPT picks the slides whose titles suggest governance content.
    gov_res, governance_slides = Governance_slides_numbers(dict_slide)
    output_information['governance_slides'] = governance_slides
    output_information['gov_res'] = gov_res
    # To be safe, widen the selection a little on both sides, then drop
    # indexes outside the document: negative ones would silently wrap around
    # and too-large ones would raise IndexError.
    governance_slides = [i for i in extend_segments(governance_slides) if 0 <= i < total_pages]
    # Text excerpt of the governance slides.
    governance_text = '\f'.join([split_content[i] for i in governance_slides])
    print("-"*100,"\n","Text looks like:",governance_text[:500],'.'*50,governance_text[-500:],"\n","-"*100)
    # Write with the same encoding the text was read with.
    with open(output_txt_path, "w", encoding='utf-8') as file:
        file.write(governance_text)
    # PDF excerpt of the governance slides. governance_slides are 0-based
    # indexes (they index split_content above) while extract_pages_from_pdf
    # expects 1-based page numbers, so shift by one to select the same pages.
    extract_pages_from_pdf(pdf_path, output_path = output_pdf_path, pages = [i + 1 for i in governance_slides])
    output_information['Pages_number'] = len(governance_slides)
    return output_information

In [26]:
# Resume support: when a results file from an earlier run is present in the
# working directory, load it so new results can be appended to it later.
if os.path.lexists(os.path.join(os.getcwd(), 'governance_text.csv')):
    current_df = pd.read_csv('governance_text.csv')
    print('loaded current output')

loaded current output


In [27]:
# Accumulates one result dict per successfully processed company.
output_informations = list()

In [28]:
# Process every company folder, giving each one up to five attempts.
for company_folder_path in companies_8:
    # Disabled: skipping of companies already present in a loaded result set.
    # if 'current_df' in vars():
    #     if os.path.basename(company_folder_path) in current_df['name'].tolist():
    #         print(os.path.basename(company_folder_path)+'already been processed')
    #         continue

    output_information = {}
    success = False
    for attempt_number in range(1, 6):
        try:
            output_information = main(company_folder_path, output_information)
        except Exception as e:
            print(f"Attempt {attempt_number} failed for {company_folder_path}: {e}")
            time.sleep(1)  # optional: short pause to avoid rapid consecutive retries
        else:
            success = True
            break

    if not success:
        print(f"Failed to process {company_folder_path} after 5 attempts.")
        continue

    output_informations.append(output_information)
        

Barclays PLC (UK)
/Users/mason_yu/Study/ATFC-MSc/Project/Model/Final/8_Annual_Reports/Barclays PLC (UK)/annual Report.pdf  already been converted. Skipping...
Pages with 'contents': [3]
Extrarting TOC by GPT......
---------------------------------------------------------------------------------------------------- 
 TOC Pages number dictionary: {'Strategicreport': {'WelcometoBarclays': [1], 'TheGroupataglance': [2], "Inthisyear'sreport": [3], 'Chairman’sintroduction': [4, 5, 6], 'ChiefExecutive’sreview': [7, 8, 9], 'Ourbusinessmodel': [10], 'Ourstrategy': [11], 'Ourbusinessenvironment': [12], 'Ourplanandtargets': [13], 'Ournewdivisionalstructure': [14], '2023divisionalreview': [15], 'AboutBarclays': [16, 17, 18, 19, 20, 21, 22, 23]}, 'Ourstakeholders': {'Customersandclients': [24, 25, 26], 'Colleagues': [27, 28, 29], 'Society': [30, 31, 32, 33], 'Investors': [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48]}, 'Climateandsustainabilityreport': {'Implementingourclimatestrategy'

In [229]:
# Combine the results of this run with previously saved results, if any.
try:
    output_df = pd.concat([current_df,pd.DataFrame(output_informations)])
except NameError:
    # current_df only exists when a previous governance_text.csv was loaded.
    # Only that case falls back to the new results alone; any other error is
    # no longer swallowed by a bare except.
    output_df = pd.DataFrame(output_informations)

In [230]:
# Save the combined results to governance_text.csv without the index column.
output_df.to_csv('governance_text.csv',index=False)

In [231]:
# Show the combined results table as this cell's output.
output_df

Unnamed: 0,name,table_page_number,dict_res,dict_gpt,range_dict,code_res,dict_slide,governance_slides,gov_res,Pages_number
0,Barclays PLC (UK),3,SyncCursorPage[Message](data=[Message(id='msg_...,"{'Strategicreport': {'WelcometoBarclays': 1, '...","{'Strategicreport': {'WelcometoBarclays': [1],...",SyncCursorPage[Message](data=[Message(id='msg_...,"{'Strategicreport': {'WelcometoBarclays': [1],...","[48, 144, 145, 146, 147, 148, 149, 150, 151, 1...",Here is a flattened list of the relevant page ...,58
1,Volkswagen AG (Europe),2,SyncCursorPage[Message](data=[Message(id='msg_...,{'ToourShareholders': {'LettertoourShareholder...,{'ToourShareholders': {'LettertoourShareholder...,SyncCursorPage[Message](data=[Message(id='msg_...,{'ToourShareholders': {'LettertoourShareholder...,"[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 5...","```python\n[\n41, 42, 43, 44, 45, 46, 47, 48, ...",52
2,Aroundtown SA (Europe),2,SyncCursorPage[Message](data=[Message(id='msg_...,"{""BOARDOFDIRECTORS'REPORT"": {'AUDITOR’SREPORT'...","{""BOARDOFDIRECTORS'REPORT"": {'AUDITOR’SREPORT'...",SyncCursorPage[Message](data=[Message(id='msg_...,"{""BOARDOFDIRECTORS'REPORT"": {'AUDITOR’SREPORT'...","[101, 102, 103, 104, 105, 106, 107, 108, 109, ...","```python\n[101, 102, 103, 104, 105, 106, 107,...",19
3,Natwest Group PLC (UK),2,SyncCursorPage[Message](data=[Message(id='msg_...,"{'Strategicreport': {'Our2023performance': 3, ...",{'Strategicreport': {'Our2023performance': [3]...,SyncCursorPage[Message](data=[Message(id='msg_...,{'Strategicreport': {'Our2023performance': [4]...,"[84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 9...","```python\n[84, 85, 86, 87, 88, 89, 90, 91, 92...",66
4,HSBC Holdings PLC (UK),1,SyncCursorPage[Message](data=[Message(id='msg_...,{'Contents': {'Strategicreport': {'Performance...,"{'Strategicreport': {'Performancein2023': [1],...",SyncCursorPage[Message](data=[Message(id='msg_...,"{'Strategicreport': {'Performancein2023': [1],...","[87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 9...","```python\n[87, 88, 89, 90, 91, 92, 93, 94, 95...",56
5,Societe Generale (Europe),1,SyncCursorPage[Message](data=[Message(id='msg_...,"{'TableofContents': {'History': 7, 'ProfileofS...","{'History': [7], 'ProfileofSocieteGenerale': [...",SyncCursorPage[Message](data=[Message(id='msg_...,"{'History': [7], 'ProfileofSocieteGenerale': [...","[70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 8...","```python\n[70, 71, 72, 73, 74, 75, 76, 77, 78...",58
6,Deutsche Bank (Europe),2,SyncCursorPage[Message](data=[Message(id='msg_...,{'DeutscheBankGroup': {'LetterfromtheChiefExec...,{'DeutscheBankGroup': {'LetterfromtheChiefExec...,SyncCursorPage[Message](data=[Message(id='msg_...,{'DeutscheBankGroup': {'LetterfromtheChiefExec...,"[49, 50, 51, 232, 480, 481, 482, 483, 484, 485...","```python\n[49, 50, 51, 232, 480, 481, 482, 48...",30
7,Vodafone Group PLC (UK),1,SyncCursorPage[Message](data=[Message(id='msg_...,{'Contents': {'Strategicreport': {'Anewroadmap...,{'Strategicreport': {'AnewroadmapforVodafone':...,SyncCursorPage[Message](data=[Message(id='msg_...,{'Strategicreport': {'AnewroadmapforVodafone':...,"[50, 51, 200]","```python\n[200, 200, 200, 200, 200, 200, 200,...",11
8,Severn Trent Plc,2,SyncCursorPage[Message](data=[Message(id='msg_...,"{'STRATEGICREPORT': {'GroupHighlights': 1, 'Ou...","{'STRATEGICREPORT': {'GroupHighlights': [1], '...",SyncCursorPage[Message](data=[Message(id='msg_...,"{'STRATEGICREPORT': {'GroupHighlights': [2], '...","[105, 106, 107, 108, 109, 110, 111, 112, 113, ...",Here are the relevant pages based on the topic...,45
9,Admiral Group,1,SyncCursorPage[Message](data=[Message(id='msg_...,{'Contents': {'2023FinancialandStrategicHighli...,"{'2023FinancialandStrategicHighlights': [6, 7]...",SyncCursorPage[Message](data=[Message(id='msg_...,"{'2023FinancialandStrategicHighlights': [3, 3]...","[57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 6...","```python\n[57, 58, 59, 60, 61, 62, 63, 64, 65...",50


In [232]:
# set(os.path.basename(item) for item in companies[:20])-set(output_df['name'])

In [31]:
# Print the source text of a sample page_to_slide function (page number + 2).
print("""


def page_to_slide(i):
    return i+2
    
    
""")




def page_to_slide(i):
    return i+2
    
    

