In [5]:
# -*- coding: utf-8 -*-
from openai import OpenAI
import os
import pandas as pd
import numpy as np
# import timeout_decorator
import time
import globals
import re
import ast
import traceback
COMPLETION_MODEL = globals.COMPLETION_MODEL
API_KEY = globals.API_KEY


# @timeout_decorator.timeout(100)
def chat_gpt_turbo(message_our,COMPLETION_MODEL,n=1,max_tokens=3000):

    client = OpenAI(api_key=API_KEY)

    completion = client.chat.completions.create(
        model=COMPLETION_MODEL,
        max_tokens = max_tokens,
        n = n,
        temperature = 0,
        messages=message_our
        )

    return completion.choices[0].message


def entity_extraction_func(text):
    client = OpenAI(api_key=API_KEY)

    system_prompt = """
        As an Operations Management Researcher and Natural Language Processing (NLP) Engineer, your primary task is to extract key information about suppliers from the provided text. Please follow these steps strictly:
        1 Entity Recognition: Identify and extract all supplier names present in the text. These are the names of companies or organizations mentioned as distinct entities in the text.
        2 Text Summary: Summarize the main activities or characteristics of each identified supplier based on the content of the text. Ensure that the summary accurately reflects the information in the text.
        Adhere to this format:
        {
            'Supplier Company Name': 'Summary of content about current Supplier',
            ...
        }.
        Do not add or change any content; simply fill in the relevant information.

        Example input text:
        "...Solar Solutions recently completed a large-scale solar energy project, which successfully increased the region's renewable energy supply..."
        Example output:
        { 'Solar Solutions': 'Completed a large-scale solar energy project, increasing the region's renewable energy supply'. }

    """

    user_prompt = f"Here is the text: \n {text}" 

    response = client.chat.completions.create(
    model="gpt-3.5-turbo-1106",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    )

    out_put = response.choices[0].message.content
    # print(out_put)

    try:

        # 使用正则表达式匹配可能的字典格式
        dict_match = re.search(r'\{.*?\}', out_put, re.DOTALL)
        # # 将匹配的文本中的单引号转换为双引号，并清理内部的双引号
        # dict_str = dict_match.group().replace("'", '"')
        # dict_str = re.sub(r'(?<!\\)"', '', dict_str)  # 移除非转义的双引号

        # 将匹配的文本处理为符合字典格式的字符串
        dict_str = dict_match.group()
        dict_str = re.sub(r"[^a-zA-Z0-9{}:,. '\"\[\]]", '', dict_str)  # 移除不合法的字符
        dict_str = re.sub(r'(?<!")\b\w+\b(?=":)', lambda x: '"' + x.group() + '"', dict_str)  # 确保键被双引号包围
        # 尝试解析字典
        data = ast.literal_eval(dict_str)
    
    except Exception as e:
        return {}
        # 可以在此处添加其他错误处理逻辑，如返回默认值或继续执行其他任务



    return data


def extract_data(text, entity_extraction_func):
    try:
        # 使用正则表达式匹配可能的字典格式
        dict_match = re.search(r'\{.*?\}', text, re.DOTALL)
        if dict_match:
            # 将匹配的文本中的单引号转换为双引号，并清理内部的双引号
            # dict_str = dict_match.group().replace("'", '"')
            # dict_str = re.sub(r'(?<!\\)"', '', dict_str)  # 移除非转义的双引号

            # 将匹配的文本处理为符合字典格式的字符串
            dict_str = dict_match.group()
            dict_str = re.sub(r"[^a-zA-Z0-9{}:,. '\"\[\]]", '', dict_str)  # 移除不合法的字符
            dict_str = re.sub(r'(?<!")\b\w+\b(?=":)', lambda x: '"' + x.group() + '"', dict_str)  # 确保键被双引号包围

            try:
                # 尝试解析字典
                data = ast.literal_eval(dict_str)
            except (SyntaxError, ValueError):
                # 如果解析失败，使用命名实体识别函数处理文本
                data = entity_extraction_func(text)
        else:
            # 如果没有匹配到字典格式，同样使用命名实体识别函数
            data = entity_extraction_func(text)

        return data
    except Exception as e:
        traceback.print_exc()
        return None 

def save_data(data, path, company_name, ):
    
        # 创建 DataFrame
        df = pd.DataFrame(list(data.items()), columns=['Supplier', 'Content of Supplier'])

        # # 添加编号列
        # df.insert(0, 'Number', range(1, len(df) + 1))

        # # 构建文件路径
        # file_path = f'{path}/{company_name}_suppliers.csv'
        # 构建文件路径
        file_path = f'{path}/{company_name}.csv'

        # 检查路径是否存在，如果不存在，则创建它
        directory = os.path.dirname(file_path)
        if not os.path.exists(directory):
            os.makedirs(directory)

        # 检查文件是否存在，根据情况选择写入模式
        if os.path.exists(file_path):
            # 文件存在，使用追加模式
            df.to_csv(file_path, mode='a', header=False, index=False, sep="\t")
        else:
            # 文件不存在，使用写入模式
            df.to_csv(file_path, index=False, sep="\t")

def read_and_replace_newlines(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        # 读取文件内容
        text = file.read()

    # 删除额外的空白字符和无用符号
    text = re.sub(r'\n+', '\n', text)  # 替换多个连续换行符为单个换行符
    text = re.sub(r'\s+\n', '\n', text)  # 删除行末的空白字符
    text = re.sub(r'\n\s+', '\n', text)  # 删除行首的空白字符
    text = re.sub(r'[^A-Za-z0-9.,;:\'"\(\)\[\]\n]+', ' ', text)  # 删除除基本标点符号和数字字母外的所有字符

    return text


def split_text_into_parts(text_org, num_parts=10):
    extracted_text = text_org
    # 计算每部分的大致长度
    part_length = len(extracted_text) // num_parts

    # 初始化结果列表和当前部分的开始索引
    parts = []
    start = 0

    for _ in range(num_parts - 1):
        # 计算这部分应当结束的大致位置
        end = start + part_length

        # 如果不是在文本末尾，尽量在句子或单词结束处分割
        if end < len(extracted_text):
            while end < len(extracted_text) and extracted_text[end] not in " .,;?!":
                end += 1

        # 截取当前部分
        parts.append(extracted_text[start:end])

        # 更新下一部分的开始位置
        start = end

    # 添加最后一部分
    parts.append(extracted_text[start:])

    return parts

def extract_10k_from_text(text_org):
    
    # 10k 特有的截取过程
    pattern = r"Item\s1\..*Item\s1B\."
    result = re.search(pattern, text_org, re.DOTALL)
    if result:
        extracted_text = result.group(0)
        len_extracted_text = len(extracted_text)
        print(f"{len_extracted_text=} {extracted_text}")
    else:
        extracted_text = None
        print("No match found.")

    return extracted_text

In [6]:
is_10k = True # TODO 配置当前代码运行什么txt的llm程序
            # True运行10k 
            # False 运行 sustainable report

if is_10k:
    folder_path = '10k'  # 10k 部分
else:
    folder_path = 'PDF_Convert_Txts' # 年报部分

txt_path_list = os.listdir(f"{folder_path}")
txt_path_list = sorted(txt_path_list)



if is_10k:
    existed_report_path = f"./Supply_10K_After_GPT/{COMPLETION_MODEL}"
else:
    existed_report_path = f"./Supply_sustainable_report_After_GPT/{COMPLETION_MODEL}"


print(f"总共待处理文档{len(txt_path_list)}个\n文档列表如下：\n{txt_path_list= }")

总共待处理文档74个
文档列表如下：
txt_path_list= ['AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2013_12_31_aagaa10k-20131231.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2014_12_31_d829913d10k.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2015_12_31_d78287d10k.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2016_12_31_d286458d10k.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2017_12_31_a10k123117.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2018_12_31_a10k123118.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2019_12_31_a10k123119.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2020_12_31_aal-20201231.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2021_12_31_aal-20211231.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2022_12_31_aal-20221231.txt', 'AAPL_320193-AAPL_APPLE_INC_2013_09_28_d590790d10k.txt', 'AAPL_320193-AAPL_APPLE_INC_2014_09_27_d783162d10k.txt', 'AAPL_320193-AAPL_APPLE_INC_2015_09_26_d17062d10k.txt', 'AAPL_320193-AAPL_APPLE_INC_2016_09_24_a201610-k9242016.txt', 'AAPL_320193-AAPL_

In [7]:
text_file_num = 5   # TODO 配置你想要测试的文档数量
org_txt_path_list = txt_path_list.copy()
txt_path_list = txt_path_list[:text_file_num] 

# 如果不想要配置文档数量并测试，而是开始全部爬取，则取下面的注释 TODO
# txt_path_list = org_txt_path_list


print(f"选取处理文档{len(txt_path_list)}个\n 文档列表如下：\n{txt_path_list= }")

选取处理文档5个
 文档列表如下：
txt_path_list= ['AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2013_12_31_aagaa10k-20131231.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2014_12_31_d829913d10k.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2015_12_31_d78287d10k.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2016_12_31_d286458d10k.txt', 'AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2017_12_31_a10k123117.txt']


In [11]:
# 断点重续
try:
    existed_reports = set()
    existed_report_list = os.listdir(existed_report_path)
    for existed_report in existed_report_list[:-1]:
        # existed_report = existed_report.replace("_suppliers", "")
        existed_report = existed_report.split(".")[0]
        existed_reports.add(existed_report)
    print(existed_reports)

    if len(existed_report_list) >= 1:
        last_report = existed_report_list[-1] # 删除，重新生成，以保证完整性
        print(f"从文档{last_report}开始")
        import os
        last_file_path = f"{existed_report_path}/{last_report}"# 替换为要删除的文件的路径
        # print(last_file_path)
        # 检查文件是否存在
        if os.path.exists(last_file_path):
            os.remove(last_file_path)
            print(f"File {last_file_path} has been deleted.")
        else:
            print(f"File {last_file_path} does not exist.")

except:
    pass


for txt_path in txt_path_list:
    company_name = txt_path.replace(".txt", "")
    print("开始处理 company_name", txt_path)
    if company_name in existed_reports:
        continue
    modified_text = read_and_replace_newlines(f"{folder_path}/{txt_path}")

    extract_text = extract_10k_from_text(modified_text) # TODO 10k 特有步骤，非10k 可以删除
    modified_text_parts = split_text_into_parts(extract_text)

    system_prompt = """
        As an operations management researcher, \
        your task is to extract the names of suppliers from a company's provided responsibility report and summarize the overall situation of the supplier as mentioned in the report.\
        Strictly adhere to the following example format, without adding or altering any content
        If it does not exist any supplier companies, an empty {} is returned.\
        If it does not exist any supplier companies, following format string is returned.    :\
        {
            'Supplier Company Name': 'Summary of content about current Supplier',
            ...
        }.\
        Here is an output example:\
        {
            'Umicore Finland Oy': 'Conformant refiner sourcing cobalt for Gigafactory Nevada and Fremont external cell sourcing, located in Finland.',
            'Murrin Murrin Nickel Cobalt Plant': 'Conformant refiner sourcing cobalt for Gigafactory Nevada and Fremont external cell sourcing, located in Australia.',
            ...
        }.\
        Any deviation from this format will not be accepted. \
        Any information beyond this format will be disregarded. \
        Please ensure your response strictly follows this structure. \
        Extract the names of the suppliers accurately based on the report's content, and output them in the specified format.

"""
    to_save_data=dict()
    for modified_text_item in modified_text_parts:

        message_our = [
            {"role": "system", "content": system_prompt}
        ]


        user_prompt = f"{modified_text_item}" 
        user_prompt_dic = {"role": "user", "content": user_prompt}
        message_our.append(user_prompt_dic)

        attempts = 0
        success = False

        while attempts < 5 and not success:
            try:
                output = chat_gpt_turbo(message_our, COMPLETION_MODEL).content
                success = True
                print("output", output)
            except Exception as ex:
                traceback.print_exc()
                print('no response from gpt')
                time.sleep(5)
                attempts += 1
                if attempts == 3:
                    break
            

        import re
        if output and re.match(r'^\s*\{\s*\}\s*$', output) is None:
            data = extract_data(output, entity_extraction_func)

        print(f"from output extract {data=}")
        if data and isinstance(data, dict):
            to_save_data.update(data)
        else:
            print(f"data is not instance dict some error happend {data}")

    save_data(to_save_data, existed_report_path, company_name)
    



set()
开始处理 company_name AAL_6201-AAL_AMERICAN_AIRLINES_GROUP_INC_2013_12_31_aagaa10k-20131231.txt
len_extracted_text=45 Item 1.Business5Item1A.Risk Factors25Item 1B.


Traceback (most recent call last):
  File "C:\Users\junchengshen\AppData\Local\Temp\ipykernel_28884\4224630594.py", line 77, in <cell line: 28>
    output = chat_gpt_turbo(message_our, COMPLETION_MODEL).content
  File "C:\Users\junchengshen\AppData\Local\Temp\ipykernel_28884\1418857773.py", line 21, in chat_gpt_turbo
    completion = client.chat.completions.create(
  File "d:\anaconda_0622\lib\site-packages\openai\_utils\_utils.py", line 303, in wrapper
    return func(*args, **kwargs)
  File "d:\anaconda_0622\lib\site-packages\openai\resources\chat\completions.py", line 598, in create
    return self._post(
  File "d:\anaconda_0622\lib\site-packages\openai\_base_client.py", line 1086, in post
    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
  File "d:\anaconda_0622\lib\site-packages\openai\_base_client.py", line 846, in request
    return self._request(
  File "d:\anaconda_0622\lib\site-packages\openai\_base_client.py", line 898, in _reques

no response from gpt


KeyboardInterrupt: 

In [None]:
import pandas as pd
import os
import re



# def standardize_supplier_name(name):
#     """ 标准化供应商名称，可根据需要调整正则表达式 """
#     # 示例：将简写转换为完整名称，如“Tesla”转换为“Tesla Inc.”
#     # 注意：这里的规则需要根据具体情况来定制
#     name = re.sub(r'\bTesla\b', 'Tesla Inc.', name)
#     return name

def process_supplier_data(df):

    # # 标准化供应商名称
    # df['Supplier'] = df['Supplier'].apply(standardize_supplier_name)

    # 确保所有的内容都是字符串类型
    df['Content of Supplier'] = df['Content of Supplier'].astype(str)


    # 合并相同的供应商行，拼接内容
    df = df.groupby('Supplier')['Content of Supplier'].apply(lambda x: ' '.join(x)).reset_index()

    # 在结尾添加一行编号
    df['Number'] = range(1, len(df) + 1)

    return df



def process_all(folder_path):

    # 遍历文件夹中的所有文件
    for filename in os.listdir(folder_path):
        print(f"{filename=}")
        if filename.endswith('.csv'):  # 确保文件是CSV格式

            if not is_10k and filename.split(".")[0].endswith('CSR'):
            
                company_name = filename.split(".")[0].split("_")[1]
                year = filename.split(".")[0].split("_")[2]
                report_type = filename.split(".")[0].split("_")[-1]
                # print(company_name)
                # print(year)
                # print(report_type)
            elif is_10k:
                
                # # 使用正则表达式匹配日期和年份部分
                pattern = r"(\d{4})_(\d{2}_\d{2})"
                result = re.search(pattern, filename)

                if result:
                    year = result.group(1)
                    date = result.group(2)
                    print("Year:", year)
                    print("Date:", date)
                else:
                    print("No match found.")
                result = re.search(pattern, filename)
                tic = filename.split("-")[0].split("_")[0]
                tic_company_name_pattern = r"(\w+)_(\d{4}_\d{2}_\d{2})"
                tic_company_name_res = re.search(tic_company_name_pattern, filename)
                if tic_company_name_res:
                    company_name = result.group(1).strip(tic).strip("_")
                    print("TIC:", tic)
                    print("Company Name:", company_name)
                else:
                    print("No match found.")
                company_name = company_name
                report_type = "10k" if is_10k else "sustainable report"
                # print(company_name)
                # print(year)
                # print(report_type)


            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path, sep="\t")

            # 合并相同的
            result_df = process_supplier_data(df)

            # 把company_name, year, type加到result_df后，每一行的值都相同

            # 向结果DataFrame中添加公司名、年份和类型的列
            result_df['Company'] = company_name
            result_df['Year'] = year
            result_df['Report_type'] = report_type

            # 需要代码补全

            # 可以选择将结果保存回文件
            new_folder_path = f"{folder_path}_post"
            new_file_path = os.path.join(new_folder_path, f"{filename}")
            # 检查路径是否存在，如果不存在，则创建它
            directory = os.path.dirname(new_file_path)
            if not os.path.exists(directory):
                os.makedirs(directory)
            result_df.to_csv(new_file_path, index=False,sep="\t")


process_all("./Supply_After_GPT/gpt-3.5-turbo-1106")

UnboundLocalError: cannot access local variable 'company_name' where it is not associated with a value