# Setup

In [126]:
import json
from openai import OpenAI
from sentence_transformers import SentenceTransformer, util
import math
import numpy as np
import ast

from tqdm import tqdm

In [35]:
def GPT_4V_depict(img_url):
    client = OpenAI(api_key = '')

    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe image preciously in Chinese within 20 words."},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": img_url,
                            "detail": "low",
                        },
                    },
                ],
            }
        ],
        max_tokens=300,
        
    )

    return response

In [16]:
def extract_response(response):
    # Parse the JSON string
    parsed_json = json.loads(response.json())

    # Extract the content and total_tokens
    content = parsed_json['choices'][0]['message']['content']
    total_tokens = parsed_json['usage']['total_tokens']

    return content, total_tokens

In [131]:
def SentTrans(target_text, texts, model_type='all-MiniLM-L6-v2'):
    """
    Default model of Sentence Transformer: all-MiniLM-L6-v2.
    - target_text takes in the objective, the description of image.
    - texts takes in a list of the whole paragraphs in the same page.
    """
    
    # 加载预训练模型
    model = SentenceTransformer(model_type)

    # 计算目标文本的嵌入
    target_embedding = model.encode(target_text, convert_to_tensor=True)

    # 计算一组文本的嵌入
    text_embeddings = model.encode(texts, convert_to_tensor=True)

    # 计算相似度
    cosine_scores = util.pytorch_cos_sim(target_embedding, text_embeddings)

    # 找到最相似的文本
    most_similar_idx = cosine_scores.argmax()

    print(cosine_scores)
    print("最相关的文本是:\n\n", texts[most_similar_idx])
    return texts[most_similar_idx]

In [82]:
def distance(img_cord, text_cord):
    """
    Takes in both 'img_cord' and 'text_cord' in tuple of (x, y).
    Return the distance between two cord.
    """
    img_x, img_y = img_cord
    text_x, text_y = text_cord
    
    return math.sqrt((text_x - img_x)**2 + (text_y - img_y)**2)

# Exhibit a
华润三九，2022，SUS，Page21，img_1

In [3]:
img_url = "https://raw.githubusercontent.com/Lorre-Ramon/Image-Hosting-Service/main/img/ESG_AI/000999.SZ-华润三九-华润三九%202022年度可持续发展报告-2023-03-29.pdf_page_21_img_1.png"

In [4]:
response = GPT_4V_depict(img_url)

In [8]:
content, total_tokens = extract_response(response)
content, total_tokens

/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())


('多人在会议室里开会，使用笔记本电脑，前方有投影屏幕。', 476)

In [21]:
texts = ["我们严格遵守《上市公司信息披露管理办法》《上市公司投资者关系管理工作指引》等相关规定，制定或修订《投资者关系管理制度》《信息披露事务管理制度》等公司制度，持续提升信息披露质量及投资者关系管理工作水平，促进公司",
             "我们积极构建合规管理体系，不断强化法律风险管理，积极开展法律合规培训，提升公司合规管理水平，保障公司稳健", 
             "不断完善法律风险防范机制，持续关注医药行业重点领域和关键环节的法律风险合规管理工作。持续开展重大风险评估工作，通过风险信息收集、风险识别、风险评估、风险应对流程，制定风险应对策略，做好事前法律风险管理。实现规章制度、经济合同、重要决策的100%法律审核。",
             "强化法律风险管理",
             "推动多层级多形式的合规文化建设，开展普法宣传活动，增强全员守法合规意识，提高公司全体员工法治意识和法治观念。2022年，公司员工法律意识和行为合规受训41080人次。",
             "培育守法合规文化",
             "开展廉洁合规体系培训",
             "完善合规制度体系，强化合规信息化建设，搭建智慧合规管理平台，定期开展合规检查考核工作，公布合规投诉专用邮箱：CR999-HGTS@999.com.cn，对获取到的举报信息和举报人的身份保密，有效推行合规管理与监督。",
             "完善法律合规建设"]

In [22]:
simi_text = SentTrans(content, texts)

tensor([[0.3346, 0.3956, 0.4620, 0.5540, 0.3638, 0.5326, 0.5792, 0.5252, 0.5359]])
最相关的文本是:

 开展廉洁合规体系培训


# Exhibit b

In [25]:
img_url="https://raw.githubusercontent.com/Lorre-Ramon/Image-Hosting-Service/main/img/ESG_AI/00941.HK-中国移动-中国移动%202022年度可持续发展报告-2023-03-24.pdf_page_3_img_1.png"

In [26]:
response = GPT_4V_depict(img_url)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
content, total_tokens = extract_response(response)
content, total_tokens

/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_5989/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())


('这是一位身穿深色西装、打着领带的男士，手臂交叉站立，给人严肃专业的印象。', 1172)

In [28]:
texts = ["董事长致辞把握信息文明规律释放信息服务效能赋能社会持续发展",
             "2022年是极不平凡、极为重要的一年，也是中国移动实施“十四五”规划的关键一年。在社会各界的关心和支持下，我们锚定“世界一流信息服务科技创新公司”发展定位，践行创世界一流“力量大厦”发展战略，坚持稳中求进工作总基调，全力推进新基建、融合新要素、激发新动能，有效克服多重挑战，取得了良好成绩。2022年，中国移动的营业收入再创新高，利润在高基数基础上继续保持良好增长，收入结构更加稳健，整体抗风险能力显著提升，企业对经济、社会、环境的价值贡献持续扩大。放眼大局，洞察信息文明发展规律。我们持续深入思考“如何实现社会和企业的可持续发展”这一重大命题，从大历史观、大战略观的高度，深刻洞察了能量和信息驱动人类文明进步的内在规律，并用公式表示为：C=∑[E+I+f(E×I)]。其中，C代表人类文明程度，E代表人类获取和利用能量的水平，I代表人类生成和运用信息的水平，E×I代表能量和信息的融合创新，增长型函数f()代表能量和信息融合创新的多样性和无限可能性，求和符号∑代表文明进步是能量和信息不断累积、发展的结果。随着人类文明演进，从最初的语言、符号、文字，到工业时代的电报、电话，再到信息时代的数据和信息技术，信息的载体不断发展。当前，信息深度融入能量转化和运用的全过程，引发“瓦特×比特”的融合聚变，人类社会加速进入以信息为主导，信息和能量深度融合发展的全新阶段，两者融合创新函数f(E×I)，成为推动人类文明进步的主要动力。数智创新，全力释放信息服务效能。基于对信息文明发展规律的认识，我们系统打造以5G、算力网络、能力中台为重点的新型信息基础设施，着力构建“连接+算力+能力”新型信息服务体系，全面赋能全社会的生产方式、生活方式、治理方式数智化转型。我们坚持创新驱动发展，体系化推进科技创新，牵头5G国际标准197个，输出5G专利超4,100件，稳居全球运营商第一阵营。我们全力保障网络安全、数据安全、通信安全、内容安全，圆满完成重大活动通信保障任务，第一时间开展泸定地震等突发事件的应急保障，切实保护个人数据隐私，维护用户合法权益，持续筑牢安全屏障。截至2022年12月底，中国移动累计开通5G基站达128.5万个，有线宽带接入数2.72亿户，算力规模达8.0EFLOPS，累计打造5G行业商用案例超1.8万个。包容成长，与全社会共享发展成果。我们深入贯彻以人民为中心的发展思想，着力实现与员工、客户等利益相关方共同发展。我们长期坚持“人才强企”战略，高效育才、引才、聚才、用才。积极完善民主管理、职业发展、权益保障体系机制，深入实施“五小”暖", 
             "心工程、“幸福1+1”、员工困难帮扶等项目。我们着力消除残障人士、银发族、文化差异等特殊群体数字应用鸿沟，共享信息化发展成果，覆盖人群超3,716.7万人。我们着力将信息技术能力广泛延伸，持续做好“七大帮扶举措”，对口帮扶1,442个县、乡、村巩固拓展脱贫成果，深入实施“七大乡村数智化工程”，在超过35万个偏远农村推进数字乡村建设，接续全面推进乡村振兴，持续深耕公益慈善，受益群众超1,766万人。我们主动融入国家重大发展战略，争当区域协调发展的“创新引擎”，“一带一路”的“履责先锋”，为畅通国内国际双循环提供有力支撑。绿色发展，着力赋能增长方式转换。我们积极践行绿水青山就是金山银山的理念，努力促进人与自然和谐共生。我们制定碳达峰行动方案，持续开展“C²三能——中国移动碳达峰碳中和行动计划”，稳步降低传统用能，全年节电64.3亿度。我们积极打造绿色供应链，设备采购设置节能技术测评比例超过90%。我们充分发挥信息化技术降碳杠杆作用，推动煤炭、钢铁等传统行业转型升级，推广线上会议、办公、医疗、信息消费等应用，助力城市绿色智慧发展和居民绿色生活。我们广泛支持生态环境数据的采集、监测、挖掘与分析，提高政府的环境监测能力。我们积极参与长江“十年禁渔”等生态保护项目，助力提高生态系统的稳定性、持续性，与大家携手共建美丽地球家园。卓越治理，持续建设可信赖的企业。我们按照权责法定、权责透明、协调运转、有效制衡的要求，积极推动建立现代企业制度，充分发挥董事会、经理层作用，不断健全法人治理结构，夯实公司治理根基。全面推进深化改革，完善市场化经营机制，全面防范和化解各类风险，强化商业道德与反腐败，建设受各方信任的负责任企业。深入推进ESG管理融入公司治理，形成经济价值与社会价值互为助力、相互促进的企业发展新格局。母公司连续18个年度和6个任期获得中央企业负责人经营业绩考核A级。扬帆沧海迎激浪，勇立潮头启新航。展望2023年，我们要紧紧围绕做强做优做大国有资本和国有企业、做强做优做大数字经济的要求，加快建设网络强国、数字中国和智慧社会，加快建设世界一流企业，起高坐稳，携手利益相关方共同求解信息和能量融合创新的“最佳函数”、创造更广泛的社会价值，为中国式现代化创造信息文明时代可持续发展的新范式作出更大贡献！",
             "2023年3月",
             "中国移动有限公司董事长",
             "杨杰"]

In [29]:
simi_text = SentTrans(content, texts)

tensor([[0.3777, 0.3606, 0.4233, 0.1380, 0.4342, 0.3766]])
最相关的文本是:

 中国移动有限公司董事长


# Exhibit c

In [30]:
img_url="https://raw.githubusercontent.com/Lorre-Ramon/Image-Hosting-Service/main/img/ESG_AI/00941.HK-中国移动-中国移动%202022年度可持续发展报告-2023-03-24.pdf_page_7_img_1.png"

In [31]:
response = GPT_4V_depict(img_url)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
content, total_tokens = extract_response(response)
content, total_tokens

/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_5989/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())


('工人正站在高处的基站天线旁，进行维护或安装工作，背景是晴朗的天空和山景。', 316)

In [97]:
texts = ["▶优化网络体验",
        "公司以用户感知为中心，持续提升网络质量，积极开展质量攻坚专项行动，构建数智化网络运维体系，精准定位、攻克网络问题，精准建设提升网络资源效能，为丰富多样的业务应用提供高品质网络服务支撑。", 
        "面向个人用户",
        "移动网络质量满意度和5G上网质量满意度行业双领先。在全球率先开通VoNR高清通话业务，支持720P高清视频通话，让沟通更优质、更精彩。",
        "面向家庭用户",
        "全网装维服务及时率保持在99%以上。为客户提供“一户一档”质量优化服务，完成15.88万个小区的网络优化，网络稳定性和健壮性持续提升。家宽业务自动开通成功率由95.57%提升至97.90%，家宽业务开通时长由22.72小时缩短至20.53小时。",
        "面向政企用户",
        "专线网络满意度由93.53%提升至96.57%，企宽网络满意度由91.22%提升至94.86%。5G专网标准化场景开通时长大幅缩短，物联网开通及时率提升55PP。",
        "中国移动积极响应海洋强国战略，发挥5G网络广覆盖、高可靠等优势，在近、中、远海区域展开了5G覆盖工程，推动5G网络向海域延伸，有效助力加强海洋监管、发展海洋经济发展、便利渔民出海和游客出行。截至2022年12月底，中国移动海域5G工程覆盖辽宁、山东、江苏、浙江、福建、广东、广西、海南等沿海省份，实现对近海的广泛覆盖。",
        "案例中国移动5G服务海洋",
        "◎在“海上社区”建设5G网络◎渔民通过VoNR与家人视频报平安",
        "提升泛在融合“算力服务”",
        "中国移动贯彻落实国家“东数西算”工程部署，以“算力泛在、算网共生、智能编排、一体服务”为发展目标，优化数据中心布局和高速直连网络规划，加快构建泛在融合的算力网络，打造一点接入、即取即用的“算力服务”，建设全国20毫秒、省域5毫秒左右、地市1毫秒的三级低时延算力服务圈，促进算力成为像水、电一样的社会级服务。",
        "加快打造算力网络，发力算网原创技术算力，指数据的处理能力。算力网络是中国移动2021年提出的原创型技术理念，指提供算力和网络深度融合、一体化服务的新型基础设施。算力越强，处理数据的能力就越强，能够得到的有用数据就越大越精准，算力网络必将成为支撑全社会数智化发展的重要底座。中国移动站在科技创新前沿，融合“ABCDNETS”八大核心要素，持续推进算力网络建设。其中，云、边、端（Cloud/Edge/Terminal）共同构成多层立体的泛在算力架构；网络（Network）作为连接用户、数据和算力的桥梁，通过与算力的深度融合，共同构成算力网络的新型基础设施；算力网络需要通过融数（Data）注智（AI），构建算网大脑，打造统一、敏捷、高效的算网资源供给体系；区块链（Blockchain）是实现算力可信交易的基石；安全（Security）是保障算力网络可靠运行的根本，需要融入到算力网络体系中，形成内生安全防护机制。",
        "大数据（Data）",
        "云计算（Cloud）",
        "区块链（Blockchain）",
        "人工智能（AI）",
        "边缘计算（Edge）",
        "终端（Terminal）",
        "网络（Network）",
        "安全（Security）",
        "网络（Network）",
        "三个阶段目标三个融通",
        "网络无所不达",
        "算力无所不在",
        "智能无所不及",
        "1泛在协同算力跨地域",
        "算力跨层级",
        "算力跨场景",
        "2融合统一",
        "3一体内生",
        "物理空间",
        "逻辑空间",
        "异构空间"]

In [36]:
simi_text = SentTrans(content, texts)

tensor([[0.3299, 0.4789, 0.5655, 0.6310, 0.5341, 0.4248, 0.5328, 0.2565, 0.4660,
         0.5145, 0.6199, 0.4264, 0.4668, 0.1876, 0.3692, 0.4075, 0.4432, 0.4332,
         0.4318, 0.4304, 0.3955, 0.4571, 0.3955, 0.5495, 0.4248, 0.4556, 0.4730,
         0.5855, 0.4539, 0.4539, 0.4467, 0.4319]])
最相关的文本是:

 移动网络质量满意度和5G上网质量满意度行业双领先。在全球率先开通VoNR高清通话业务，支持720P高清视频通话，让沟通更优质、更精彩。


# Exhibit 4

In [38]:
img_url="https://raw.githubusercontent.com/Lorre-Ramon/Image-Hosting-Service/main/img/ESG_AI/00941.HK-中国移动-中国移动%202022年度可持续发展报告-2023-03-24.pdf_page_7_img_2.png"

In [39]:
response = GPT_4V_depict(img_url)
content, total_tokens = extract_response(response)
content, total_tokens

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_5989/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())


('一名戴安全帽的男子和一位小女孩在码头前拿着东西合影，背后是停泊的渔船。', 319)

In [40]:
simi_text = SentTrans(content, texts)

tensor([[0.3528, 0.4469, 0.5307, 0.4675, 0.5545, 0.4642, 0.5284, 0.1849, 0.3779,
         0.4278, 0.5735, 0.4840, 0.4858, 0.1656, 0.3993, 0.4441, 0.4959, 0.3870,
         0.4447, 0.4078, 0.4018, 0.4005, 0.4018, 0.5444, 0.4430, 0.4990, 0.4365,
         0.5388, 0.5042, 0.5042, 0.5388, 0.5585]])
最相关的文本是:

 ◎在“海上社区”建设5G网络◎渔民通过VoNR与家人视频报平安


In [42]:
simi_textim = SentTrans(content, texts)

tensor([[0.3528, 0.4469, 0.5307, 0.4675, 0.5545, 0.4642, 0.5284, 0.1849, 0.3779,
         0.4278, 0.5735, 0.4840, 0.4858, 0.1656, 0.3993, 0.4441, 0.4959, 0.3870,
         0.4447, 0.4078, 0.4018, 0.4005, 0.4018, 0.5444, 0.4430, 0.4990, 0.4365,
         0.5388, 0.5042, 0.5042, 0.5388, 0.5585, 0.5283, 0.5283, 0.5283]])
最相关的文本是:

 ◎在“海上社区”建设5G网络◎渔民通过VoNR与家人视频报平安


# Exhibit e

In [18]:
img_url="https://raw.githubusercontent.com/Lorre-Ramon/ESGImage/main/Test01_resized/00941.HK-中国移动-中国移动%202022年度可持续发展报告-2023-03-24.pdf_page_7_img_2.png"

In [23]:
response = GPT_4V_depict(img_url)
content, total_tokens = extract_response(response)
content, total_tokens

/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())


('这是一张照片，里面有一位身穿工作服和头盔的男士和一个拿着文件的小女孩，背景是渔港。', 154)

In [26]:
simi_text = SentTrans(content, texts)

tensor([[0.3640, 0.4817, 0.5739, 0.5059, 0.5458, 0.4335, 0.5455, 0.2232, 0.3147,
         0.3897, 0.4796, 0.4326, 0.4603, 0.1920, 0.3741, 0.4348, 0.4631, 0.4177,
         0.4318, 0.4588, 0.4554, 0.3368, 0.4554, 0.4608, 0.4838, 0.5106, 0.5085,
         0.4717, 0.4720, 0.4720, 0.4755, 0.5248, 0.4892, 0.4892, 0.4892]])
最相关的文本是:

 面向个人用户


In [27]:
response

ChatCompletion(id='chatcmpl-8JHAm3BompzI0KZN6pKo62xwftuPC', choices=[Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content='这是一张照片，里面有一位身穿工作服和头盔的男士和一个拿着文件的小女孩，背景是渔港。', role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'stop', 'stop': '<|fim_suffix|>'})], created=1699604936, model='gpt-4-1106-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=49, prompt_tokens=105, total_tokens=154))

# Exhibit f

In [41]:
img_url="https://raw.githubusercontent.com/Lorre-Ramon/ESGImage/main/Test01_resized/00941.HK-中国移动-中国移动%202022年度可持续发展报告-2023-03-24.pdf_page_7_img_1.png"

In [42]:
response = GPT_4V_depict(img_url)
content, total_tokens = extract_response(response)
content, total_tokens, response

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())


('工作人员在晴朗天气下，登高维护山顶的通信基站和天线。',
 135,
 ChatCompletion(id='chatcmpl-8JHKq3qGi9bQmIlofll8GoYRP7V4y', choices=[Choice(finish_reason=None, index=0, message=ChatCompletionMessage(content='工作人员在晴朗天气下，登高维护山顶的通信基站和天线。', role='assistant', function_call=None, tool_calls=None), finish_details={'type': 'stop', 'stop': '<|fim_suffix|>'})], created=1699605560, model='gpt-4-1106-vision-preview', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=32, prompt_tokens=103, total_tokens=135)))

In [43]:
simi_text = SentTrans(content, texts)

tensor([[0.3442, 0.4297, 0.5937, 0.5867, 0.5159, 0.4246, 0.4951, 0.2858, 0.4384,
         0.5049, 0.6023, 0.3700, 0.4458, 0.1243, 0.3844, 0.4556, 0.4648, 0.4736,
         0.4125, 0.4440, 0.4275, 0.3555, 0.4275, 0.5442, 0.4757, 0.4779, 0.5007,
         0.5687, 0.4479, 0.4479, 0.4038, 0.4551, 0.5775, 0.5775, 0.5775]])
最相关的文本是:

 ◎在“海上社区”建设5G网络◎渔民通过VoNR与家人视频报平安


# 中国移动

In [47]:
import pandas as pd 
import requests

## 从Github中取得图片url

In [48]:
# 仓库所有者的名字
owner = 'Lorre-Ramon'
# 仓库的名字
repo = 'ESGImage'
# 文件夹的路径
path = 'Test01_resized'

# GitHub API 的 URL
api_url = f'https://api.github.com/repos/{owner}/{repo}/contents/{path}'

# 发送请求
response = requests.get(api_url)
data = response.json()

# 创建一个列表，用于存储文件信息
files_data = []

# 遍历响应数据，提取图片文件的名称和URL
for file in data:
    if file['name'].endswith(('.png', '.jpg', '.jpeg', '.gif')):
        files_data.append({'file_name': file['name'], 'url': file['download_url']})


## 向图片坐标DataFrame添加对应url

In [58]:
df = pd.read_excel("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/Test01/03 PyMu_img01_coordinate/coordinate_final.xlsx")
df = pd.DataFrame(df)

df_files_data = pd.DataFrame(files_data)

df_merged = pd.merge(df, df_files_data, on='file_name', how='left')
df_merged.drop("Unnamed: 0", axis=1, inplace=True)

# 保存合并后的DataFrame回到新的Excel文件
df_merged.to_excel("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/Test01/03 PyMu_img01_coordinate/coord_url.xlsx", index=False)

# 打印结果，确认合并是否成功
df_merged

Unnamed: 0,file_name,x0,y0,x1,y1,centre_coordinate,url
0,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,187.718994,702.485046,411.766998,722.046021,"(299.7429962158203, 712.2655334472656)",
1,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,246.468994,741.485046,358.854980,761.046021,"(302.6619873046875, 751.2655334472656)",
2,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,73.080002,507.600006,320.279999,"(298.8000030517578, 196.68000030517578)",
3,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,100.319977,505.320007,489.239990,"(297.6600036621094, 294.7799835205078)",
4,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,247.440002,505.679993,395.040009,"(297.8399963378906, 321.24000549316406)",
...,...,...,...,...,...,...,...
11500,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,201.309631,581.452515,516.938354,769.132202,"(359.1239929199219, 675.2923583984375)",
11501,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,968.806030,139.261810,1110.485718,224.077393,"(1039.6458740234375, 181.6696014404297)",
11502,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,969.845520,227.706161,1110.485718,313.471558,"(1040.1656188964844, 270.58885955810547)",
11503,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,-1.349877,0.000244,596.325378,842.894775,"(297.4877505302429, 421.447509765625)",


## 从GPT-4V处获得图片描述

In [66]:
df_merged[184:252]['file_name'], df_merged[184:252]['url']

(184    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 185    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 186    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 187    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 188    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
                              ...                        
 247    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 248    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 249    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 250    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 251    00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...
 Name: file_name, Length: 68, dtype: object,
 184    https://raw.githubusercontent.com/Lorre-Ramon/...
 185    https://raw.githubusercontent.com/Lorre-Ramon/...
 186    https://raw.githubusercontent.com/Lorre-Ramon/...
 187    https://raw.githubusercontent.com/Lorre-Ramon/...
 188    https://raw.githubusercontent.com/Lorre-Ramon/...
                           

In [72]:
img_dipt = []
total_length = len(df_merged[184:252]['file_name'])

for file_name, img_url in tqdm(zip(df_merged[184:252]['file_name'], df_merged[184:252]['url']), total=total_length):
    response = GPT_4V_depict(img_url)
    content, total_tokens = extract_response(response)
    img_dipt.append({'file_name': file_name,
                     'dipt': content,
                     'token': total_tokens})

  0%|                                                    | 0/68 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())
  1%|▋                                           | 1/68 [00:03<03:25,  3.06s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explic

 16%|██████▉                                    | 11/68 [01:02<06:44,  7.10s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())
 18%|███████▌                                   | 12/68 [01:10<06:58,  7.47s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explic

 32%|█████████████▉                             | 22/68 [02:05<03:56,  5.14s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())
 34%|██████████████▌                            | 23/68 [02:09<03:30,  4.67s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explic

 49%|████████████████████▊                      | 33/68 [02:55<03:07,  5.35s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())
 50%|█████████████████████▌                     | 34/68 [02:59<02:47,  4.93s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explic

 65%|███████████████████████████▊               | 44/68 [03:47<01:57,  4.88s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())
 66%|████████████████████████████▍              | 45/68 [03:51<01:50,  4.82s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explic

 81%|██████████████████████████████████▊        | 55/68 [04:51<01:01,  4.72s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())
 82%|███████████████████████████████████▍       | 56/68 [04:54<00:50,  4.23s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explic

 97%|█████████████████████████████████████████▋ | 66/68 [06:01<00:12,  6.04s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/var/folders/kd/t2wnd4h911v09vtv543hpkg80000gn/T/ipykernel_31519/4266333353.py:3: PydanticDeprecatedSince20: The `json` method is deprecated; use `model_dump_json` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  parsed_json = json.loads(response.json())
 99%|██████████████████████████████████████████▎| 67/68 [06:05<00:05,  5.16s/it]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explic

In [78]:
df_dipt = pd.DataFrame(img_dipt)
df_merged_2 = pd.merge(df_merged, df_dipt, on='file_name', how='left')
df_merged_2

Unnamed: 0,file_name,x0,y0,x1,y1,centre_coordinate,url,dipt,token
0,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,187.718994,702.485046,411.766998,722.046021,"(299.7429962158203, 712.2655334472656)",,,
1,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,246.468994,741.485046,358.854980,761.046021,"(302.6619873046875, 751.2655334472656)",,,
2,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,73.080002,507.600006,320.279999,"(298.8000030517578, 196.68000030517578)",,,
3,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,100.319977,505.320007,489.239990,"(297.6600036621094, 294.7799835205078)",,,
4,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,247.440002,505.679993,395.040009,"(297.8399963378906, 321.24000549316406)",,,
...,...,...,...,...,...,...,...,...,...
11500,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,201.309631,581.452515,516.938354,769.132202,"(359.1239929199219, 675.2923583984375)",,,
11501,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,968.806030,139.261810,1110.485718,224.077393,"(1039.6458740234375, 181.6696014404297)",,,
11502,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,969.845520,227.706161,1110.485718,313.471558,"(1040.1656188964844, 270.58885955810547)",,,
11503,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,-1.349877,0.000244,596.325378,842.894775,"(297.4877505302429, 421.447509765625)",,,


## 获取页码

In [77]:
img_info = []

for file_name in tqdm(df_dipt['file_name']):
    _, _, page, _, index  = file_name.split("_")
    index = int(int("".join([i for i in index if i not in ".png"])))
    page = int(page)
    img_info.append({'file_name': file_name,
                    'page': page,
                    'p_index': index})
    
df_img_info = pd.DataFrame(img_info)
df_img_info.head(15)

100%|███████████████████████████████████████| 68/68 [00:00<00:00, 271114.71it/s]


Unnamed: 0,file_name,page,p_index
0,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,1,1
1,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,3,1
2,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,6,1
3,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,7,1
4,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,7,2
5,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,8,1
6,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,9,1
7,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,9,2
8,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,10,1
9,00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pd...,11,1


In [79]:
df_merged_3 = pd.merge(df_merged_2, df_img_info, on='file_name', how='left')
df_merged_3

Unnamed: 0,file_name,x0,y0,x1,y1,centre_coordinate,url,dipt,token,page,p_index
0,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,187.718994,702.485046,411.766998,722.046021,"(299.7429962158203, 712.2655334472656)",,,,,
1,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,246.468994,741.485046,358.854980,761.046021,"(302.6619873046875, 751.2655334472656)",,,,,
2,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,73.080002,507.600006,320.279999,"(298.8000030517578, 196.68000030517578)",,,,,
3,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,100.319977,505.320007,489.239990,"(297.6600036621094, 294.7799835205078)",,,,,
4,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,247.440002,505.679993,395.040009,"(297.8399963378906, 321.24000549316406)",,,,,
...,...,...,...,...,...,...,...,...,...,...,...
11500,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,201.309631,581.452515,516.938354,769.132202,"(359.1239929199219, 675.2923583984375)",,,,,
11501,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,968.806030,139.261810,1110.485718,224.077393,"(1039.6458740234375, 181.6696014404297)",,,,,
11502,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,969.845520,227.706161,1110.485718,313.471558,"(1040.1656188964844, 270.58885955810547)",,,,,
11503,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,-1.349877,0.000244,596.325378,842.894775,"(297.4877505302429, 421.447509765625)",,,,,


In [80]:
# 保存合并后的DataFrame回到新的Excel文件
df_merged_3.to_excel("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/Test01/03 PyMu_img01_coordinate/coord_dipt.xlsx", index=False)

## 语义匹配

In [83]:
df_text = pd.read_excel("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/Test01/01 text/text_df_2.xlsx")

In [139]:
dis_info = []

for file_name, page, content, img_cord in tqdm(zip(df_merged_3[184:252]['file_name'], df_merged_3[184:252]['page'], df_merged_3[184:252]['dipt'], df_merged_3[184:252]['centre_coordinate'])):
    PDF_name, _, _, _, _ = file_name.split("_")
    texts = list(df_text.loc[(df_text["PDF_name"] == PDF_name) & (df_text['page'] == page)]['content'])
    if texts != []:
        simi_text = SentTrans(content, texts)
        [text_cord] = df_text.loc[(df_text['PDF_name'] == PDF_name)&(df_text['content'] == simi_text)][['center_x', 'center_y']].values.tolist()
    
        if type(img_cord) == str:
            img_cord = ast.literal_eval(img_cord)

        dist = distance(img_cord, text_cord)

        dis_info.append({'file_name': file_name,
                        'simi_text': simi_text,
                        'dist': dist})
    else:
        print("\nFile: {},\nNo match on page {}, proceeding to the next page\n\n".format(PDF_name, int(page)))

0it [00:00, ?it/s]


File: 00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pdf,
No match on page 1, proceeding to the next page



2it [00:00,  4.55it/s]

tensor([[0.0764, 0.4051, 0.4048, 0.4516, 0.1331, 0.4539, 0.0764, 0.3604]])
最相关的文本是:

 中国移动有限公司董事长


3it [00:01,  2.66it/s]

tensor([[0.0395, 0.3421, 0.4277, 0.4755, 0.2650, 0.4277, 0.3294, 0.5384, 0.5946,
         0.3502, 0.3939, 0.4407, 0.2699, 0.6049, 0.2237, 0.3941, 0.3735, 0.4416,
         0.4103, 0.3619, 0.3719]])
最相关的文本是:

 全网VoLTE高清通话覆盖率达到99%


4it [00:01,  1.78it/s]

tensor([[0.0179, 0.3140, 0.4455, 0.4944, 0.5858, 0.5226, 0.3983, 0.5021, 0.2155,
         0.4673, 0.5641, 0.5798, 0.4167, 0.4628, 0.1953, 0.3562, 0.4412, 0.4456,
         0.3754, 0.4207, 0.4189, 0.4298, 0.4298, 0.3880, 0.5337, 0.4173, 0.4294,
         0.4624, 0.5642, 0.4155, 0.4155, 0.4972, 0.4284, 0.7014, 0.7014, 0.7014]])
最相关的文本是:

 物理空间


5it [00:02,  1.50it/s]

tensor([[0.0241, 0.4343, 0.4696, 0.5604, 0.4454, 0.5661, 0.4262, 0.5124, 0.2449,
         0.2601, 0.3624, 0.4450, 0.5047, 0.3657, 0.1922, 0.3798, 0.4049, 0.4393,
         0.3596, 0.4710, 0.3950, 0.4354, 0.4354, 0.3015, 0.4727, 0.5227, 0.5505,
         0.4268, 0.5109, 0.5447, 0.5447, 0.4018, 0.4193, 0.5069, 0.5069, 0.5069]])
最相关的文本是:

 面向家庭用户


6it [00:03,  1.37it/s]

tensor([[ 0.1356,  0.3881,  0.3078,  0.4877,  0.1124,  0.4812,  0.4398,  0.5214,
          0.2850,  0.2617,  0.5198,  0.2896,  0.5076,  0.5390,  0.5531,  0.4854,
          0.5566,  0.5051,  0.3675,  0.5203,  0.5700,  0.4927,  0.6954,  0.5273,
          0.4167,  0.5670,  0.7102,  0.4709,  0.6227,  0.5456,  0.5822,  0.5033,
          0.3621, -0.0497,  0.4698,  0.5443,  0.4165]])
最相关的文本是:

 ◎2022年中国移动举行创客马拉松大赛


7it [00:04,  1.30it/s]

tensor([[0.0052, 0.4482, 0.4590, 0.5415, 0.4603, 0.3833, 0.4466, 0.5035, 0.3147,
         0.4373, 0.4407, 0.3717, 0.4436, 0.4725, 0.4445, 0.4172, 0.2105, 0.3392,
         0.2177, 0.3349, 0.4658, 0.4838, 0.3286, 0.3147, 0.3117, 0.3575, 0.4248,
         0.3906, 0.2876, 0.4843, 0.3900, 0.2683, 0.3904, 0.1852, 0.3519, 0.4237]])
最相关的文本是:

 推进数智化生产


8it [00:05,  1.27it/s]

tensor([[0.0141, 0.5391, 0.6319, 0.6578, 0.5188, 0.4203, 0.5034, 0.5764, 0.3912,
         0.5445, 0.4788, 0.4606, 0.5873, 0.5493, 0.5491, 0.4956, 0.3167, 0.3878,
         0.2313, 0.3232, 0.5617, 0.5728, 0.3941, 0.3912, 0.4302, 0.4163, 0.5135,
         0.4507, 0.3388, 0.5684, 0.5452, 0.3838, 0.4612, 0.1366, 0.4381, 0.5390]])
最相关的文本是:

 推进数智化生产


9it [00:06,  1.25it/s]

tensor([[0.0942, 0.2341, 0.3291, 0.5039, 0.3416, 0.1863, 0.2750, 0.6478, 0.4383,
         0.2716, 0.3818, 0.3818, 0.3262, 0.5211, 0.8471, 0.4784, 0.3376, 0.3352,
         0.3246, 0.4538, 0.7470, 0.4535, 0.4555, 0.3936, 0.3577, 0.7450, 0.7513,
         0.4312, 0.4903, 0.5166, 0.3101]])
最相关的文本是:

 公司为文旅、商贸及建筑地产等相关行业领域的行业主管部门、企事业单位、个人用户打造智慧化的解决方案及产品，支撑行业主管部门加强行业管控，助力企业提升服务效能，为人民群众提供便利。


10it [00:07,  1.21it/s]

tensor([[ 0.0682,  0.3578,  0.5320,  0.4159,  0.4079,  0.5999,  0.6157,  0.5909,
          0.2630,  0.4676,  0.4233,  0.3249,  0.4665,  0.4736,  0.4646,  0.4155,
          0.4973,  0.5307,  0.3515,  0.3734,  0.4243,  0.5368,  0.3277,  0.1969,
         -0.0122,  0.3438,  0.4349,  0.4714,  0.5127,  0.5597,  0.4439,  0.3220,
          0.3265,  0.5862,  0.5505,  0.0173,  0.0078,  0.0310,  0.4341,  0.4742,
          0.4509,  0.4873,  0.4566]])
最相关的文本是:

 冬奥冰雪元宇宙


11it [00:07,  1.20it/s]

tensor([[ 0.0476,  0.3262,  0.4136,  0.3513,  0.3335,  0.4656,  0.5282,  0.4754,
          0.3440,  0.4366,  0.4268,  0.4293,  0.4814,  0.4365,  0.4444,  0.2807,
          0.4613,  0.5568,  0.3683,  0.3940,  0.4602,  0.4774,  0.2893,  0.1748,
          0.0285,  0.4104,  0.4575,  0.4302,  0.4516,  0.4753,  0.3928,  0.3253,
          0.4546,  0.3964,  0.5242, -0.0910,  0.0480,  0.0750,  0.4013,  0.4728,
          0.4985,  0.5030,  0.6140]])
最相关的文本是:

 橙络络，中国移动动感地带与年轻用户共创打造的AI少女，也是动感地带全新品牌代言人


12it [00:08,  1.20it/s]

tensor([[0.0628, 0.3161, 0.3825, 0.3349, 0.3361, 0.5289, 0.5395, 0.5808, 0.2558,
         0.4695, 0.3414, 0.3696, 0.4650, 0.5224, 0.4675, 0.4349, 0.4479, 0.3799,
         0.3729, 0.3703, 0.4876, 0.5006, 0.2709, 0.1602, 0.0905, 0.3719, 0.4715,
         0.4716, 0.4982, 0.4575, 0.4182, 0.3749, 0.3678, 0.4621, 0.4990, 0.0439,
         0.0806, 0.0691, 0.4161, 0.4988, 0.4477, 0.3853, 0.5171]])
最相关的文本是:

 世界杯元宇宙互动文旅元宇宙元宇宙音乐盛典中国移动积极推进厦门元宇宙建设，打造国内文旅元宇宙标杆示范项目。在第22届中国国际投资贸易洽谈会期间，中国移动推出鼓浪屿元宇宙AR夜景首秀，带来海陆空间交互、数实空间交织的超时空体验。


13it [00:09,  1.22it/s]

tensor([[0.0209, 0.5021, 0.4815, 0.4557, 0.4192, 0.3955, 0.3455, 0.5824, 0.5166,
         0.5652, 0.3302, 0.5199, 0.4702, 0.4643, 0.6031, 0.5724, 0.6013, 0.5447,
         0.5474, 0.5720, 0.5030, 0.5481, 0.4369, 0.4634, 0.5958, 0.4547, 0.4785,
         0.4453]])
最相关的文本是:

 重点推动提升网络、产品和触点“三大质量”投诉问题整改，实现手机、家宽和政企客户满意度持续提升。


14it [00:10,  1.24it/s]

tensor([[0.0064, 0.4343, 0.3772, 0.3173, 0.4294, 0.4255, 0.2636, 0.5415, 0.5505,
         0.5269, 0.4183, 0.4647, 0.4457, 0.4495, 0.5289, 0.4963, 0.4871, 0.5882,
         0.6501, 0.5301, 0.5250, 0.4640, 0.5129, 0.5324, 0.5728, 0.4026, 0.3064,
         0.4028]])
最相关的文本是:

 用内容打动客户


15it [00:10,  1.29it/s]

tensor([[0.0201, 0.4459, 0.6340, 0.5964, 0.5126, 0.2678, 0.5600, 0.5612, 0.4680,
         0.5034, 0.4690, 0.6232, 0.3813, 0.4003, 0.4095, 0.5336, 0.5661, 0.7066,
         0.5811, 0.6255, 0.4424, 0.4337, 0.3392, 0.6925]])
最相关的文本是:

 数字沈阳项目按照创建“东北数字第一城”目标，推进数字政府核心底座、中枢搭建、数据中台等5大中台建设，汇集政务、社会、城市、产业等领域数据，实现数字底座“一体贯通”。


16it [00:11,  1.31it/s]

tensor([[0.1450, 0.4978, 0.4354, 0.4182, 0.6169, 0.5784, 0.8209, 0.4965, 0.6530,
         0.4474, 0.5129, 0.4266, 0.8175, 0.5807, 0.4665, 0.2641, 0.3370, 0.5400,
         0.4591, 0.5339, 0.5231, 0.4908, 0.2819, 0.5219]])
最相关的文本是:

 案例全程护航，保障央视高清直播北京冬奥会火炬传递


17it [00:12,  1.35it/s]

tensor([[0.0596, 0.3913, 0.4525, 0.4002, 0.5138, 0.3659, 0.4822, 0.4063, 0.4892,
         0.3638, 0.4524, 0.4828, 0.4403, 0.3675, 0.3402, 0.3371, 0.3113, 0.5100,
         0.3487, 0.4676, 0.4201, 0.3812, 0.2579, 0.5051]])
最相关的文本是:

 ▶助力数智化治理能力全面提升，让城市生活更美好


18it [00:12,  1.49it/s]

tensor([[0.0252, 0.4646, 0.5814, 0.6146, 0.5321, 0.4203, 0.5639, 0.5302, 0.4026,
         0.5694, 0.4882, 0.5306, 0.4968, 0.5092, 0.1687, 0.2839, 0.4066, 0.4646,
         0.5255, 0.2803]])
最相关的文本是:

 ▶推进提速降费，广泛惠企利民


19it [00:13,  1.41it/s]

tensor([[0.0340, 0.5818, 0.7011, 0.7230, 0.4314, 0.6022, 0.4676, 0.6539, 0.5343,
         0.7169, 0.7124, 0.5625, 0.5176, 0.5176, 0.5045, 0.5858, 0.4477, 0.5255,
         0.5913, 0.7344, 0.6891, 0.4295, 0.5664, 0.6537, 0.5663, 0.3890, 0.3669]])
最相关的文本是:

 北京冬奥会期间，中国移动自研实时智能字幕，解决了听障人士在直播中“听不到，听不清”难题，受到中国聋人协会高度认可。


20it [00:14,  1.39it/s]

tensor([[0.0608, 0.5940, 0.5664, 0.4712, 0.3227, 0.6228, 0.3781, 0.4002, 0.5003,
         0.6514, 0.7206, 0.5717, 0.5250, 0.5055, 0.4905, 0.5420, 0.2826, 0.4750,
         0.4874, 0.5409, 0.3531, 0.3770, 0.5095, 0.4607, 0.5849, 0.4201, 0.4265]])
最相关的文本是:

 让老年客户享受到“语速再慢一些”“多介绍一遍”等更优质周全的服务体验。


21it [00:15,  1.37it/s]

tensor([[0.0956, 0.5742, 0.5238, 0.3542, 0.3808, 0.5041, 0.2745, 0.3393, 0.7468,
         0.5554, 0.4705, 0.5178, 0.6887, 0.5630, 0.3885, 0.5357, 0.1866, 0.5735,
         0.5413, 0.5480, 0.3520, 0.4361, 0.5011, 0.4498, 0.6362, 0.4510, 0.5642]])
最相关的文本是:

 ◎老人通过AI体感互动设备体验健身课


22it [00:15,  1.36it/s]

tensor([[0.1470, 0.6043, 0.5240, 0.4243, 0.3655, 0.5283, 0.2737, 0.4057, 0.5349,
         0.6536, 0.6531, 0.4491, 0.5896, 0.5185, 0.4109, 0.5421, 0.1977, 0.5508,
         0.5622, 0.5734, 0.3989, 0.4124, 0.5459, 0.4753, 0.5987, 0.4124, 0.4064]])
最相关的文本是:

 发挥集中运营优势，针对全国65岁以上老年客户拨打10086客服热线，增设方便快捷的“一键进入”客服人工坐席，免去客户多层按键等待环节。


23it [00:16,  1.35it/s]

tensor([[0.1381, 0.6093, 0.5105, 0.3886, 0.3417, 0.5409, 0.3272, 0.4355, 0.6467,
         0.5820, 0.4867, 0.5200, 0.5438, 0.5476, 0.4071, 0.5529, 0.2780, 0.5785,
         0.5474, 0.5392, 0.3817, 0.4588, 0.5316, 0.4965, 0.5910, 0.3898, 0.4699]])
最相关的文本是:

 ◎老人通过AI体感互动设备体验健身课


24it [00:17,  1.36it/s]

tensor([[0.0009, 0.4122, 0.4304, 0.2899, 0.5794, 0.5544, 0.4781, 0.5451, 0.5615,
         0.5063, 0.4396, 0.5235, 0.4282, 0.4197, 0.4772, 0.4319, 0.4621, 0.4292,
         0.4060, 0.4580, 0.3030, 0.3330, 0.2757, 0.2757, 0.3608, 0.2680, 0.4993,
         0.2680, 0.4271, 0.4346, 0.5154, 0.4137, 0.4271, 0.3606, 0.3361, 0.4012,
         0.3696, 0.2858, 0.2858, 0.3763, 0.3805, 0.4922, 0.4467, 0.4172, 0.3805,
         0.3899, 0.4679, 0.4871, 0.5276, 0.4443, 0.3963, 0.4535, 0.4085, 0.4666,
         0.3493, 0.4348, 0.3814, 0.3486, 0.5357, 0.2850]])
最相关的文本是:

 ◎中国移动联合北京急救中心升级改造5G急救车，助力紧急医疗救援效能提升


25it [00:18,  1.36it/s]

tensor([[0.0547, 0.5492, 0.4988, 0.4687, 0.5340, 0.5410, 0.5588, 0.3980, 0.5211,
         0.5051, 0.5757, 0.4512, 0.5606, 0.4391, 0.5060, 0.4580, 0.5471, 0.4834,
         0.4300, 0.5691, 0.5039, 0.5044, 0.4954, 0.4954, 0.5302, 0.4438, 0.5697,
         0.4438, 0.5595, 0.5564, 0.5688, 0.5459, 0.5595, 0.6336, 0.4719, 0.6013,
         0.4471, 0.4471, 0.4471, 0.4967, 0.5001, 0.5525, 0.5149, 0.5232, 0.5001,
         0.4840, 0.4794, 0.5952, 0.5954, 0.5128, 0.4667, 0.4931, 0.4027, 0.5006,
         0.6615, 0.4885, 0.4613, 0.4857, 0.5325, 0.4671]])
最相关的文本是:

 聚焦帮扶四不摘


26it [00:18,  1.36it/s]

tensor([[0.0034, 0.5835, 0.5904, 0.3984, 0.5394, 0.5368, 0.5538, 0.4172, 0.5975,
         0.5571, 0.5731, 0.4285, 0.5488, 0.5040, 0.5517, 0.4582, 0.6280, 0.4904,
         0.4571, 0.5211, 0.4521, 0.4963, 0.4255, 0.4255, 0.5048, 0.3598, 0.5488,
         0.3598, 0.5514, 0.5441, 0.6593, 0.5436, 0.5514, 0.4049, 0.4343, 0.5076,
         0.5036, 0.3682, 0.3682, 0.5867, 0.5291, 0.6451, 0.5588, 0.5312, 0.5291,
         0.4989, 0.5287, 0.6040, 0.6365, 0.5127, 0.4780, 0.6901, 0.4855, 0.5181,
         0.4500, 0.4924, 0.3516, 0.5826, 0.5749, 0.3923]])
最相关的文本是:

 开展全公司乡村振兴先进评选表彰、帮扶示范点和数智乡村最佳实践案例评选活动，以激励先进、树立标杆、总结经验，促进工作更高质量开展。


27it [00:19,  1.36it/s]

tensor([[0.0785, 0.4963, 0.6255, 0.3436, 0.7326, 0.7542, 0.5475, 0.6337, 0.7484,
         0.6247, 0.5184, 0.5264, 0.4446, 0.3837, 0.4550, 0.4649, 0.5121, 0.4157,
         0.4121, 0.5698, 0.5393, 0.4344, 0.4125, 0.4125, 0.4726, 0.3141, 0.4973,
         0.3141, 0.5019, 0.5028, 0.5494, 0.4421, 0.5019, 0.4091, 0.5005, 0.4766,
         0.4107, 0.3232, 0.3232, 0.4610, 0.4708, 0.5448, 0.5160, 0.4595, 0.4708,
         0.5416, 0.6686, 0.5347, 0.4684, 0.4605, 0.5942, 0.4886, 0.4676, 0.4746,
         0.4652, 0.4201, 0.3092, 0.4674, 0.4631, 0.3392]])
最相关的文本是:

 ◎中国移动为广州卫健委120应急指挥中心建设5G+多网融合应急调度平台，服务珠三角区域发展


28it [00:20,  1.37it/s]

tensor([[-0.0357,  0.6151,  0.5236,  0.4106,  0.5325,  0.5254,  0.4949,  0.3724,
          0.4880,  0.5147,  0.5057,  0.3461,  0.5247,  0.4816,  0.5022,  0.3445,
          0.5101,  0.4513,  0.4175,  0.4815,  0.5338,  0.5431,  0.5549,  0.5549,
          0.5143,  0.3729,  0.5322,  0.3729,  0.4890,  0.6037,  0.6027,  0.4917,
          0.4890,  0.4326,  0.4397,  0.6903,  0.4204,  0.3707,  0.3707,  0.4905,
          0.4719,  0.5975,  0.4796,  0.5107,  0.4719,  0.4413,  0.4699,  0.6633,
          0.4427,  0.3948,  0.3298,  0.4279,  0.4312,  0.4726,  0.5266,  0.4556,
          0.4047,  0.5896,  0.3935,  0.4081]])
最相关的文本是:

 人才帮扶


29it [00:21,  1.38it/s]

tensor([[0.1213, 0.4846, 0.5120, 0.5224, 0.5879, 0.4172, 0.4617, 0.5377, 0.4643,
         0.3379, 0.4655, 0.5450, 0.6408, 0.5439, 0.6305, 0.5228, 0.4293, 0.4520,
         0.3578, 0.4071, 0.5057, 0.4052, 0.5006, 0.3383]])
最相关的文本是:

 云视讯乡村高清视频会场落地数达6.1万个，软终端乡村用户60.7万个


30it [00:21,  1.39it/s]

tensor([[0.1080, 0.5383, 0.4368, 0.4411, 0.5148, 0.4177, 0.4789, 0.4625, 0.4915,
         0.4409, 0.5221, 0.5745, 0.5981, 0.4740, 0.5945, 0.4724, 0.4441, 0.4984,
         0.3488, 0.4423, 0.5256, 0.3639, 0.4768, 0.3693]])
最相关的文本是:

 云视讯乡村高清视频会场落地数达6.1万个，软终端乡村用户60.7万个


31it [00:22,  1.40it/s]

tensor([[0.0968, 0.5305, 0.5367, 0.4371, 0.5960, 0.4559, 0.5121, 0.4554, 0.5827,
         0.5011, 0.5510, 0.5830, 0.5524, 0.4556, 0.5663, 0.5594, 0.4662, 0.5492,
         0.4199, 0.4850, 0.5635, 0.3810, 0.4591, 0.4104]])
最相关的文本是:

 ◎山东移动为山东省枣庄市西岗镇搭建数字乡村平台，打通3.6万户居民信息，连接72个村居视联网、386处电子天眼，实现“一屏管全镇”，90％的民生事务“扫码直达”，村民办事效率提升30%以上


32it [00:23,  1.41it/s]

tensor([[0.0364, 0.4351, 0.4573, 0.3306, 0.3966, 0.3133, 0.4141, 0.4513, 0.4750,
         0.3054, 0.4006, 0.4427, 0.4408, 0.3098, 0.4373, 0.4103, 0.3729, 0.4609,
         0.3694, 0.3668, 0.4129, 0.2815, 0.4200, 0.3121]])
最相关的文本是:

 和对讲乡村用户达148.49万个


33it [00:23,  1.41it/s]

tensor([[0.0707, 0.4926, 0.4878, 0.3512, 0.5310, 0.4856, 0.4553, 0.4385, 0.5284,
         0.4111, 0.5581, 0.5477, 0.5479, 0.4329, 0.5061, 0.6020, 0.4946, 0.4762,
         0.4249, 0.4574, 0.5846, 0.3734, 0.4310, 0.3960]])
最相关的文本是:

 打造乡村金融示范项目38个、大数据业务调用8,809万次


34it [00:24,  1.41it/s]

tensor([[0.0701, 0.5689, 0.5702, 0.5119, 0.6491, 0.4560, 0.5237, 0.5728, 0.5445,
         0.3488, 0.5433, 0.6485, 0.6498, 0.6997, 0.7575, 0.5186, 0.5203, 0.5149,
         0.3723, 0.5296, 0.6332, 0.4204, 0.3926, 0.4636]])
最相关的文本是:

 防返贫平台落地10省、40余个地市


35it [00:25,  1.39it/s]

tensor([[0.1498, 0.5624, 0.6097, 0.6746, 0.5537, 0.5245, 0.5553, 0.4979, 0.4800,
         0.5432, 0.6084, 0.7698, 0.6247, 0.3944, 0.5224, 0.5224, 0.5266, 0.5224,
         0.3750, 0.5031, 0.5958, 0.5599, 0.4843, 0.4134, 0.4917, 0.5754]])
最相关的文本是:

 ◎贵州移动工作人员前往稻田安装“黔移庄园”平台直播溯源标识


36it [00:26,  1.37it/s]

tensor([[0.0495, 0.5293, 0.6113, 0.4295, 0.3711, 0.3428, 0.4640, 0.3627, 0.4352,
         0.4352, 0.4988, 0.4878, 0.4167, 0.3608, 0.4293, 0.4293, 0.4420, 0.4293,
         0.3876, 0.5027, 0.5563, 0.5603, 0.4286, 0.4691, 0.4932, 0.5306]])
最相关的文本是:

 中国移动紧密围绕构建现代农业生产经营体系，推进“互联网+”农业建设，促进信息技术与农业生产、经营、管理、服务全面深度融合。


37it [00:26,  1.36it/s]

tensor([[0.1355, 0.5846, 0.5249, 0.5346, 0.5444, 0.5344, 0.5119, 0.4976, 0.4198,
         0.4756, 0.5576, 0.6550, 0.5262, 0.4380, 0.5524, 0.5524, 0.5310, 0.5524,
         0.3049, 0.5661, 0.5650, 0.5676, 0.4016, 0.3710, 0.4804, 0.5716]])
最相关的文本是:

 ◎贵州移动工作人员前往稻田安装“黔移庄园”平台直播溯源标识


38it [00:27,  1.35it/s]

tensor([[0.1413, 0.6053, 0.5556, 0.5305, 0.5041, 0.4904, 0.6350, 0.5609, 0.4366,
         0.4827, 0.6008, 0.6682, 0.5555, 0.3881, 0.5464, 0.5464, 0.5566, 0.5464,
         0.4101, 0.4914, 0.6563, 0.6042, 0.4612, 0.4727, 0.5778, 0.5733]])
最相关的文本是:

 ◎贵州移动工作人员前往稻田安装“黔移庄园”平台直播溯源标识


39it [00:28,  1.34it/s]

tensor([[0.1308, 0.5897, 0.5233, 0.5592, 0.5090, 0.4898, 0.5183, 0.4821, 0.4671,
         0.4388, 0.5351, 0.6812, 0.5879, 0.3812, 0.5347, 0.5347, 0.5186, 0.5347,
         0.2725, 0.5025, 0.5705, 0.5800, 0.4772, 0.3352, 0.5256, 0.5445]])
最相关的文本是:

 ◎贵州移动工作人员前往稻田安装“黔移庄园”平台直播溯源标识


40it [00:29,  1.33it/s]

tensor([[0.0380, 0.5751, 0.5522, 0.4646, 0.4035, 0.4000, 0.4580, 0.4215, 0.4382,
         0.4255, 0.5707, 0.5490, 0.4658, 0.4716, 0.5243, 0.5243, 0.5391, 0.5243,
         0.2587, 0.4286, 0.5275, 0.5963, 0.4368, 0.3811, 0.5646, 0.5555]])
最相关的文本是:

 ▶创新运营公益平台


41it [00:29,  1.32it/s]

tensor([[0.0573, 0.4185, 0.4398, 0.4408, 0.3550, 0.3773, 0.3567, 0.2990, 0.4370,
         0.3978, 0.4351, 0.5177, 0.4724, 0.3400, 0.3861, 0.3861, 0.4022, 0.3861,
         0.2840, 0.4501, 0.4261, 0.4930, 0.3933, 0.3137, 0.4520, 0.4143]])
最相关的文本是:

 ◎贵州移动工作人员前往稻田安装“黔移庄园”平台直播溯源标识


42it [00:30,  1.32it/s]

tensor([[0.0567, 0.5064, 0.4870, 0.4181, 0.3585, 0.3379, 0.3774, 0.3889, 0.4297,
         0.4251, 0.4412, 0.5200, 0.3727, 0.2878, 0.4135, 0.4135, 0.4171, 0.4135,
         0.2780, 0.6009, 0.5668, 0.5012, 0.4877, 0.4162, 0.4609, 0.5262]])
最相关的文本是:

 案例科技内容赋能，浇灌汤原培根铸魂“文化树”


43it [00:31,  1.38it/s]

tensor([[0.0299, 0.3498, 0.6214, 0.4936, 0.5152, 0.5014, 0.5453, 0.3943, 0.4156,
         0.5079, 0.3077, 0.4718, 0.5151, 0.5549, 0.4199, 0.5146, 0.5255, 0.4880,
         0.4986, 0.6499, 0.4906, 0.4350, 0.5751, 0.6089, 0.4156, 0.5473, 0.3679,
         0.4976, 0.4543, 0.4568, 0.3437, 0.4504, 0.5674, 0.4879, 0.4572, 0.4992,
         0.4711, 0.3668]])
最相关的文本是:

 立体支援泸定抗震救灾，打造信息服务“生命线”


44it [00:31,  1.42it/s]

tensor([[0.0014, 0.3563, 0.5045, 0.5438, 0.4435, 0.4892, 0.6015, 0.4187, 0.4173,
         0.5520, 0.2980, 0.5642, 0.4333, 0.5756, 0.4197, 0.6357, 0.5942, 0.3717,
         0.4038, 0.5138, 0.5517, 0.4128, 0.4903, 0.4738, 0.4780, 0.4718, 0.3675,
         0.5375, 0.4864, 0.4281, 0.3707, 0.4274, 0.5335, 0.4297, 0.4066, 0.5923,
         0.5609, 0.3998]])
最相关的文本是:

 投身志愿公益13年，累计参加各项志愿者活动1,500余小时。牵头成立“郑州管城分公司党员志愿者服务队”，开展“预防电信诈骗”公益授课30余场，为贫困留守儿童捐赠手机20余部、爱心图书1,000多本。


45it [00:32,  1.45it/s]

tensor([[0.0850, 0.4330, 0.5389, 0.5030, 0.5339, 0.5634, 0.4410, 0.3649, 0.4257,
         0.4502, 0.3692, 0.3868, 0.4488, 0.4790, 0.4567, 0.4987, 0.4014, 0.3244,
         0.3990, 0.5227, 0.4780, 0.5569, 0.4606, 0.5100, 0.4169, 0.5459, 0.4270,
         0.5161, 0.5304, 0.4747, 0.6224, 0.4608, 0.6640, 0.5268, 0.4300, 0.4462,
         0.5271, 0.3670]])
最相关的文本是:

 地震发生后，中国移动公益平台及时行动，主动作为，联系慈善组织联合策划开展专题募捐活动，募集善款超过91.3万元，累计2.36万人次参与捐赠。


46it [00:33,  1.48it/s]

tensor([[0.0088, 0.5653, 0.5411, 0.5020, 0.5522, 0.5252, 0.5600, 0.5241, 0.4558,
         0.5812, 0.3460, 0.5193, 0.4955, 0.5495, 0.5539, 0.5883, 0.5469, 0.3725,
         0.3778, 0.5170, 0.7005, 0.5288, 0.5070, 0.6821, 0.5049, 0.4834, 0.4328,
         0.6362, 0.5804, 0.4342, 0.4019, 0.4499, 0.6347, 0.4745, 0.5455, 0.4887,
         0.5369, 0.6981]])
最相关的文本是:

 ◎中国移动设置10086救灾爱心专席


47it [00:34,  1.42it/s]

tensor([[-0.0093,  0.4965,  0.4048,  0.4573,  0.4506,  0.4055,  0.2898,  0.4281,
          0.4158,  0.5955,  0.4425,  0.3923,  0.2274,  0.5178,  0.4649,  0.5336,
          0.6201,  0.4044,  0.3155,  0.5100,  0.5370,  0.3996,  0.5848,  0.3287,
          0.4440,  0.4406,  0.4426,  0.3775]])
最相关的文本是:

 ▶稳定位，强化区域协调发展


48it [00:34,  1.38it/s]

tensor([[-0.0170,  0.5456,  0.5283,  0.5454,  0.5145,  0.5948,  0.3546,  0.6237,
          0.5274,  0.6374,  0.6009,  0.3936,  0.3875,  0.5074,  0.5735,  0.6718,
          0.6896,  0.5177,  0.3933,  0.7141,  0.5437,  0.4872,  0.5983,  0.4309,
          0.4954,  0.5640,  0.5357,  0.5910]])
最相关的文本是:

 责任透视窗：推动区域协调，中国移动在行动


49it [00:35,  1.37it/s]

tensor([[0.0408, 0.5057, 0.7433, 0.2744, 0.6190, 0.4548, 0.4036, 0.4785, 0.5911,
         0.4408, 0.5186, 0.4339, 0.5159, 0.7375, 0.8196, 0.6085, 0.5772, 0.4192,
         0.4013, 0.7914, 0.8292, 0.8225, 0.6403, 0.5127, 0.6028, 0.7783]])
最相关的文本是:

 中国移动国际公司印尼子公司在印尼西爪哇省展玉地区SantoYusup孤儿院开展“送温暖献爱心”活动。


50it [00:36,  1.36it/s]

tensor([[0.0596, 0.7080, 0.5955, 0.2963, 0.5068, 0.6542, 0.3827, 0.4794, 0.5539,
         0.4183, 0.5054, 0.3953, 0.3532, 0.6348, 0.4961, 0.4616, 0.3410, 0.4935,
         0.4205, 0.4858, 0.4303, 0.4033, 0.3792, 0.5687, 0.4302, 0.3167]])
最相关的文本是:

 深化海外履责


51it [00:37,  1.34it/s]

tensor([[0.0578, 0.4365, 0.4903, 0.5998, 0.5026, 0.5496, 0.3616, 0.4346, 0.5993,
         0.3981, 0.6716, 0.5304, 0.6210, 0.4921, 0.5362, 0.5197, 0.5230, 0.6253,
         0.6145, 0.5093, 0.5293, 0.5765, 0.5452, 0.5828, 0.5928, 0.5217, 0.6231,
         0.5474]])
最相关的文本是:

 增加对退休干部、身患重大疾病员工、受灾员工及其他特殊群体员工的帮扶力度。


52it [00:37,  1.33it/s]

tensor([[-0.0104,  0.4343,  0.5172,  0.7046,  0.4163,  0.4328,  0.2426,  0.4357,
          0.4526,  0.3373,  0.6032,  0.5109,  0.4956,  0.4869,  0.5438,  0.5473,
          0.5409,  0.5560,  0.6517,  0.5005,  0.5803,  0.5249,  0.4868,  0.6468,
          0.6279,  0.5449,  0.5061,  0.5668]])
最相关的文本是:

 践行职工人文关怀


53it [00:38,  1.32it/s]

tensor([[-0.0268,  0.4613,  0.3941,  0.6550,  0.3779,  0.3880,  0.2617,  0.4629,
          0.4233,  0.3104,  0.6026,  0.5565,  0.5017,  0.5135,  0.5232,  0.4205,
          0.5169,  0.5641,  0.6223,  0.4917,  0.5708,  0.4664,  0.4112,  0.6342,
          0.6667,  0.4102,  0.5652,  0.5315]])
最相关的文本是:

 参加第二届人民健康大会、健康企业圆桌论坛、职业健康管理人员研讨培训等，进行职业健康工作的经验分享和亮点介绍，对标学习先进单位职业健康工作的优秀实践。


54it [00:39,  1.36it/s]

tensor([[0.1294, 0.3346, 0.5795, 0.5773, 0.4776, 0.5773, 0.5623, 0.6252, 0.4594,
         0.6181, 0.5981, 0.3746, 0.2731, 0.6364, 0.4047, 0.4769, 0.3847, 0.4646,
         0.3516, 0.5245, 0.3390, 0.3376, 0.3346, 0.3346, 0.3346, 0.3346, 0.4959,
         0.5449, 0.3628, 0.3736, 0.3346, 0.4865, 0.3346, 0.3346, 0.5411, 0.5369,
         0.5078, 0.3346, 0.5760, 0.2731, 0.6141, 0.3847, 0.3516, 0.3847]])
最相关的文本是:

 2三能中国移动碳达峰碳中和行动计划


55it [00:40,  1.35it/s]

tensor([[0.1538, 0.4824, 0.5336, 0.3330, 0.4775, 0.3430, 0.4290, 0.4193, 0.3286,
         0.4678, 0.4711, 0.4214, 0.4997, 0.4729, 0.4654, 0.4266, 0.5685, 0.6404,
         0.5087, 0.4688, 0.4742, 0.4742, 0.3198, 0.5486, 0.4604, 0.3207]])
最相关的文本是:

 “5G+”赋能绿色数据中心建设


56it [00:40,  1.35it/s]

tensor([[0.0201, 0.5009, 0.6076, 0.3748, 0.4959, 0.3799, 0.3587, 0.4044, 0.3157,
         0.5085, 0.4648, 0.4397, 0.5172, 0.4185, 0.4015, 0.4466, 0.5442, 0.3511,
         0.3839, 0.4866, 0.4581, 0.5028, 0.3819, 0.5605, 0.5107, 0.3759]])
最相关的文本是:

 ◎安徽移动打造低碳基站


57it [00:41,  1.35it/s]

tensor([[0.1059, 0.5387, 0.5792, 0.4671, 0.4422, 0.4691, 0.5108, 0.6613, 0.4273,
         0.4394, 0.5482, 0.7798, 0.4824, 0.7195, 0.6830, 0.7206, 0.6070, 0.6080,
         0.3542, 0.4823, 0.4355, 0.5237, 0.4480, 0.4987, 0.5404, 0.4613]])
最相关的文本是:

 案例业界首例新型绿色数据中心成套技术，引领绿色低碳发展


58it [00:42,  1.35it/s]

tensor([[-0.0412,  0.4963,  0.4069,  0.3360,  0.3621,  0.3359,  0.4133,  0.6177,
          0.3111,  0.4176,  0.5056,  0.6625,  0.3973,  0.6511,  0.6122,  0.6295,
          0.5941,  0.5308,  0.3359,  0.4251,  0.4182,  0.4764,  0.3394,  0.4242,
          0.5040,  0.3411]])
最相关的文本是:

 案例业界首例新型绿色数据中心成套技术，引领绿色低碳发展


59it [00:42,  1.36it/s]

tensor([[0.0154, 0.3740, 0.5435, 0.3397, 0.4600, 0.3587, 0.4757, 0.3664, 0.4524,
         0.5069, 0.4963, 0.4281, 0.5090, 0.5185, 0.5247, 0.4517, 0.4104, 0.3613,
         0.5461, 0.5229, 0.1994, 0.2200, 0.3985, 0.3306, 0.3985]])
最相关的文本是:

 “绿色奥运”是2022年北京冬奥会“绿色、共享、开放、廉洁”的四大理念之首。北京移动积极研究和创新绿色低碳基站技术，打造规划、建设、运行和维护全生命周期“低碳”基站体系，用实际行动助力绿色冬奥。


60it [00:43,  1.37it/s]

tensor([[0.1136, 0.3203, 0.5269, 0.4060, 0.5762, 0.3196, 0.4792, 0.5256, 0.5267,
         0.5865, 0.5543, 0.5899, 0.5703, 0.6183, 0.6094, 0.5464, 0.6205, 0.6001,
         0.5706, 0.5052, 0.4119, 0.2489, 0.4730, 0.4990, 0.4730]])
最相关的文本是:

 中国移动节能宣传周迄今已成功举办十四届，更多2022年中国移动节能宣传周活动详细情况，欢迎扫描下方二维码了解。


61it [00:44,  1.33it/s]

tensor([[0.0837, 0.6242, 0.6487, 0.4145, 0.5040, 0.4325, 0.4519, 0.5449, 0.5254,
         0.5880, 0.4922, 0.5849, 0.6032, 0.5577, 0.4328, 0.5704, 0.5058, 0.4290,
         0.4421, 0.4718, 0.4356, 0.5219, 0.6688, 0.5315, 0.6118, 0.5661, 0.4803,
         0.6188, 0.5806, 0.6482]])
最相关的文本是:

 推广污染防治领域信息化应用


62it [00:45,  1.32it/s]

tensor([[0.0181, 0.4367, 0.3762, 0.3375, 0.6116, 0.5076, 0.6229, 0.5157, 0.6366,
         0.4450, 0.4201, 0.4701, 0.4720, 0.5555, 0.4143, 0.4406, 0.4628, 0.3807,
         0.3203, 0.3614, 0.3905, 0.4114, 0.5319, 0.3666, 0.2813, 0.2813, 0.2813]])
最相关的文本是:

 自主研发5G胶囊机器人，面向地处偏远、无光纤覆盖污水泵站，5G广域专网实现多角度全面巡查和仪表数据高清回传，提升水务全时空监测预警能力。调度中心与应急指挥车通过5G专网实现超高清视频联动，提升水务应急指挥能力。


63it [00:46,  1.32it/s]

tensor([[0.0920, 0.5450, 0.4448, 0.4524, 0.5211, 0.4510, 0.4723, 0.5127, 0.5382,
         0.5510, 0.4542, 0.5198, 0.5045, 0.5911, 0.4248, 0.5401, 0.5763, 0.3462,
         0.2959, 0.6164, 0.4353, 0.4801, 0.5857, 0.4074, 0.3410, 0.3410, 0.3410]])
最相关的文本是:

 ◎5G无人机巡田


64it [00:46,  1.32it/s]

tensor([[0.0602, 0.4710, 0.3612, 0.3945, 0.6766, 0.5437, 0.6657, 0.5421, 0.6907,
         0.4872, 0.4012, 0.4909, 0.5829, 0.5476, 0.4696, 0.4657, 0.4930, 0.3467,
         0.3188, 0.4003, 0.4182, 0.4212, 0.6184, 0.3648, 0.3161, 0.3161, 0.3161]])
最相关的文本是:

 自主研发5G胶囊机器人，面向地处偏远、无光纤覆盖污水泵站，5G广域专网实现多角度全面巡查和仪表数据高清回传，提升水务全时空监测预警能力。调度中心与应急指挥车通过5G专网实现超高清视频联动，提升水务应急指挥能力。


65it [00:47,  1.44it/s]

tensor([[-0.0035,  0.4467,  0.4230,  0.3609,  0.3062,  0.4607,  0.4457,  0.3322,
          0.5959,  0.3169,  0.5215,  0.4862,  0.4230,  0.5545,  0.5368,  0.3931]])
最相关的文本是:

 ▶开展多元沟通


66it [00:47,  1.69it/s]

tensor([[0.0316, 0.4965, 0.4108, 0.4669, 0.3133, 0.3841]])
最相关的文本是:

 独立鉴证报告


68it [00:48,  1.42it/s]

tensor([[0.0753, 0.5082, 0.1810, 0.5555, 0.3152, 0.1101]])
最相关的文本是:

 验证声明

File: 00941.HK-中国移动-中国移动 2022年度可持续发展报告-2023-03-24.pdf,
No match on page 50, proceeding to the next page






In [141]:
df_distance = pd.DataFrame(dis_info)
df_merged_4 = pd.merge(df_merged_3, df_distance, on='file_name', how = 'left')
df_merged_4

Unnamed: 0,file_name,x0,y0,x1,y1,centre_coordinate,url,dipt,token,page,p_index,simi_text,dist
0,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,187.718994,702.485046,411.766998,722.046021,"(299.7429962158203, 712.2655334472656)",,,,,,,
1,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,246.468994,741.485046,358.854980,761.046021,"(302.6619873046875, 751.2655334472656)",,,,,,,
2,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,73.080002,507.600006,320.279999,"(298.8000030517578, 196.68000030517578)",,,,,,,
3,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,100.319977,505.320007,489.239990,"(297.6600036621094, 294.7799835205078)",,,,,,,
4,01138.HK-中远海能-中远海能 中远海能2022年可持续发展报告-2023-03-31...,90.000000,247.440002,505.679993,395.040009,"(297.8399963378906, 321.24000549316406)",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11500,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,201.309631,581.452515,516.938354,769.132202,"(359.1239929199219, 675.2923583984375)",,,,,,,
11501,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,968.806030,139.261810,1110.485718,224.077393,"(1039.6458740234375, 181.6696014404297)",,,,,,,
11502,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,969.845520,227.706161,1110.485718,313.471558,"(1040.1656188964844, 270.58885955810547)",,,,,,,
11503,600356.SH-恒丰纸业-恒丰纸业 恒丰纸业2022可持续发展报告-2023-03-15...,-1.349877,0.000244,596.325378,842.894775,"(297.4877505302429, 421.447509765625)",,,,,,,


In [143]:
df_merged_4.to_excel("/Users/improvise/Desktop/保研/实证论文/ESG/Playground/01_Extraction/Test01/03 PyMu_img01_coordinate/distance.xlsx", index=False)