In [57]:
import re
import pandas as pd

In [58]:
# 使用标准库读取 Markdown 文件
with open('Knowledge_all.md', 'r', encoding='utf-8') as file:
    text = file.read()

In [59]:
def parse_text(text):
    lines = iter(text.strip().split('\n'))
    data = []

    group = ""
    key_metric_name = ""
    score_definition = ""
    measurement_method = ""
    typical_scoring_contribution = ""
    in_table = False  # 用于标记是否在处理表格内容
    table_lines = []  # 用于存储表格内容

    for line in lines:
        line = line.strip()

        if line.startswith("#### "):  # Group (四级标题)
            # 在更改 group 之前，先保存前一个 group 的最后一个 Key Metric
            if key_metric_name:
                if table_lines:
                    measurement_method += "\n" + "\n".join(table_lines)
                    table_lines = []  # 清空表格行
                data.append([group, key_metric_name, score_definition, measurement_method, typical_scoring_contribution])
                key_metric_name = ""  # 清空当前 Key Metric 信息
            group = line[5:].strip()

        elif line.startswith("##### "):  # Key Metric Name
            if key_metric_name:  # 如果前一个 metric 已经被填充，存储它
                if table_lines:
                    measurement_method += "\n" + "\n".join(table_lines)
                    table_lines = []  # 清空表格行
                data.append([group, key_metric_name, score_definition, measurement_method, typical_scoring_contribution])
            
            key_metric_name = line[6:].strip()
            score_definition = ""
            measurement_method = ""
            typical_scoring_contribution = ""
            in_table = False  # 重置表格状态

        elif line.startswith("**Score Definition:**"):  # Score Definition
            score_definition = next(lines).strip()

        elif line.startswith("**Measurement Method:**"):  # Measurement Method
            measurement_method = next(lines).strip()
            in_table = True  # 标记进入表格解析模式

        elif in_table and line.startswith("|"):  # 表格内容
            table_lines.append(line)

        elif line.startswith("**Typical Scoring Contribution:**"):  # Typical Scoring Contribution
            if table_lines:  # 如果之前有表格内容，先处理表格
                measurement_method += "\n" + "\n".join(table_lines)
                table_lines = []  # 清空表格行
            typical_scoring_contribution = next(lines).strip()
            in_table = False  # 退出表格解析模式

    # Append the last metric if it exists
    if key_metric_name:
        if table_lines:
            measurement_method += "\n" + "\n".join(table_lines)
        data.append([group, key_metric_name, score_definition, measurement_method, typical_scoring_contribution])

    # Convert the collected data into a DataFrame, with an additional "Group" column
    df = pd.DataFrame(data, columns=["Group", "Key Metrics Name", "Score Definition", "Measurement Method", "Typical Scoring Contribution"])
    return df

df = parse_text(text)

In [60]:
df.to_csv("KM_DF.csv", index=False)

In [61]:
range(0,len(df)*2-1,2)

range(0, 209, 2)

In [83]:
str10 = str()
for _, row in df.iloc[0:10].iterrows():
    str10 += 'Key Issues Name:' + row["Group"]  +'\n'
    str10 += 'Key Metrics Name: ' + row['Key Metrics Name'] +'\n\n'
    str10 += 'Score Definition:\n' + row['Score Definition'] + '\n\n'
    str10 += 'Measurement Method:\n' + row['Measurement Method'] + '\n\n'
    str10 += 'Typical Scoring Contribution:\n' + row['Typical Scoring Contribution'] + '\n'
    str10 += '------\n\n'

In [84]:
print(str10)

Key Issues Name:Board
Key Metrics Name: 1. Independent Chair Key Metric

Score Definition:
Is the non-executive chair classified as not independent of management or not independent of other interests (links to employees, government, or major owners) based on MSCI ESG Research criteria?

Measurement Method:
Flagged if yes.

Typical Scoring Contribution:
0.1
------

Key Issues Name:Board
Key Metrics Name: 2. Combined CEO/Chair Key Metric

Score Definition:
Does the company have a combined CEO/chair?

Measurement Method:
Flagged if yes.

Typical Scoring Contribution:
0.2
------

Key Issues Name:Board
Key Metrics Name: 3. Leadership Concerns Key Metric

Score Definition:
Are there concerns that there is a leader with excessive influence, including where there is an executive chair serving alongside a CEO, a former CEO as chair, or unregistered directors providing leadership direction?

Measurement Method:
Flagged if yes.

Typical Scoring Contribution:
0.3
------

Key Issues Name:Board
Key 

In [67]:
df

Unnamed: 0,Group,Key Metrics Name,Score Definition,Measurement Method,Typical Scoring Contribution
0,Board,1. Independent Chair Key Metric,Is the non-executive chair classified as not i...,Flagged if yes.,0.1
1,Board,2. Combined CEO/Chair Key Metric,Does the company have a combined CEO/chair?,Flagged if yes.,0.2
2,Board,3. Leadership Concerns Key Metric,Are there concerns that there is a leader with...,Flagged if yes.,0.3
3,Board,4. Chair Not Independent & No Independent Lead...,"For companies with a non-independent chair, ha...",Flagged if yes.,0.1
4,Board,5. Board Majority Independent of Management Ke...,Are less than a majority of directors independ...,Flagged if yes.,"Variable, based on percentage of independent (..."
...,...,...,...,...,...
100,Business Ethics,6. Regular Audits of Ethical Standards Key Metric,Indicates the frequency and scope of audits re...,"Variable, based on the disclosure of managemen...","[0.0,-0.7,-1.0,-1.4]"
101,Business Ethics,7. Anti-Money Laundering (AML) Policy Key Metric,Indicates the scope of a company's policy on a...,"Variable, based on the disclosure of managemen...","[0.0,-0.7,-1.0,-1.4]"
102,Business Ethics,8. Corruption Risk Exposure & Controversies Ke...,Indicates the company’s exposure to business a...,\n| % of operations in medium- and high-risk g...,"[0.0,-0.6.-1.2,-2.0,-2.6,-4.0]"
103,Business Ethics,9. Business Ethics Controversies Key Metric,Indicates the company’s involvement in anticom...,\n| Assessment | Type | Deduction |...,"[0.0,-0.6,-1.2,-1.8,-2.4,-3.6,-4.0,-7.0]"
