<a href="https://colab.research.google.com/github/Hijuli66/33/blob/master/Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Mount Google Drive so files under MyDrive are reachable from the Colab VM
from google.colab import drive
drive.mount('/content/drive')

# 2. Root directory of the mounted Drive
import os
BASE_DIR = '/content/drive/MyDrive'

# 3. Read the two report CSV files and stack them into a single frame
import pandas as pd

path1 = os.path.join(BASE_DIR, '33/Data/reports1.csv')
path2 = os.path.join(BASE_DIR, '33/Data/reports2.csv')

df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)
df_merged = pd.concat([df1, df2], ignore_index=True)

# Rows without an image id or without report text cannot be matched to an
# image or processed as text, so remove them before the merged file is saved
# (the source files are known to contain at least one such incomplete row).
rows_before = len(df_merged)
df_merged = df_merged.dropna(subset=['image_id', 'report']).reset_index(drop=True)
rows_dropped = rows_before - len(df_merged)
if rows_dropped:
    print(f"已删除 {rows_dropped} 行缺失 image_id 或 report 的记录")

# 4. Inspect the merged result
print(f"合并后总行数: {len(df_merged)}")
print(df_merged.head())
print(df_merged.tail())

# 5. Save to a new file (utf-8-sig so spreadsheet tools detect the encoding)
output_path = os.path.join(BASE_DIR, '33/Data/reports_merged.csv')
df_merged.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"已保存到: {output_path}")

Mounted at /content/drive
合并后总行数: 9603
         image_id                                             report  label
0   covid_424.png  患者基本信息：姓名李华，女，38岁，门诊号2024051512，联系方式139XXXX890...      1
1  covid_4161.png  患者基本信息，姓名：张三，性别：男，年龄：58岁，门诊号：M2025061208，联系方式：...      1
2  covid_4188.png  姓名：刘伟，男，42 岁，门诊号 2024051309，联系方式 139XXXX1234，就...      1
3  covid_4245.png  姓名：陈刚，男，46 岁，门诊号 2024051310，联系方式 137XXXX5678，就...      1
4  covid_4202.png  患者基本信息：姓名张明，男，45岁，门诊号2024051308，联系方式138XXXX567...      1
             image_id                                             report  \
9598  normal_1838.png  患者方雨桐，主诉 “发热伴肌肉酸痛 2 天，咳少量白痰”。最高体温 38.5℃，服对乙酰氨基...   
9599  normal_1906.png  患者林皓宇，主诉 “咳嗽伴咽痛 3 天，夜间咳醒”。无发热，咳少量白痰，自行服咽炎片无效，无...   
9600  normal_1827.png  患者郑驽，主诉 “发热伴咳黄痰 2 天，伴轻微胸闷”。最高体温 38.7℃，服对乙酰氨基酚后...   
9601  normal_1955.png  患者白百合，主诉 “干咳伴乏力 4 天，咽痛加重 1 天”。无发热，自行服止咳糖浆无效，无疫...   
9602              NaN                                                NaN   

      label  
9598      0  
9599      0  
9600  

In [None]:
# Mount Google Drive at /content/drive; the cells below read and write files
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:


!pip install -q transformers keybert sentence-transformers pandas numpy torch

import torch
# Device index handed to the transformers pipeline below: 0 when CUDA is
# available, otherwise -1 (the print on the next line labels 0 as GPU and
# anything else as CPU).
device = 0 if torch.cuda.is_available() else -1
print(f"使用设备: {'GPU' if device==0 else 'CPU'}")

# ========================================
# Step 2: load the data
# ========================================
import pandas as pd
import os


data_path = '/content/drive/MyDrive/33/Data/reports_merged.csv'

df = pd.read_csv(data_path)

print("列数:", len(df.columns))
print("列名:", list(df.columns))
print("\n前5行：")
print(df.head(5))

# Keep at most the first 9604 rows (all rows when the file has fewer)
df = df.head(9604).reset_index(drop=True)
total_rows = len(df)
print(f"将处理 {total_rows} 条记录（每批 100 条）")

# ========================================
# Step 3: load the models
# ========================================
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from transformers import pipeline

# Sentence-embedding model, also used as the backend of KeyBERT
embedder = SentenceTransformer('shibing624/text2vec-base-chinese')
kw_model = KeyBERT(embedder)

# Named-entity-recognition pipeline.
# NOTE(review): ckiplab/bert-base-chinese-ner looks like a general-purpose
# Chinese NER model rather than a medical one — confirm it suits clinical text.
ner_model = pipeline(
    "ner",
    model="ckiplab/bert-base-chinese-ner",
    tokenizer="ckiplab/bert-base-chinese-ner",
    aggregation_strategy="simple",
    device=device
)
print("模型加载完成！")

# ========================================
# 步骤4：文本向量化函数
# ========================================
def extract_text_embedding(text):
    """Encode a report text into a plain Python list of floats.

    A non-empty string is encoded with the module-level ``embedder``.
    Anything else (empty string, ``None``, NaN, non-string values) yields a
    768-element zero vector instead.
    """
    if text and isinstance(text, str):
        vector = embedder.encode(
            text,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        return vector.tolist()
    # Placeholder for missing or unusable text.
    return [0.0] * 768

# ========================================
# 步骤5：【核心】关键词提取函数
# ========================================
import re

# Rule-based patterns, applied in this order to the report text with ASCII
# spaces removed.  A pattern with no group or one group yields one keyword per
# match; a pattern with two groups yields the two groups concatenated.
# NOTE(review): a lazy ``[^。；,]*?`` at the very END of a pattern always matches
# the empty string, so those patterns capture only their literal part.
_RULE_PATTERNS = tuple(re.compile(p) for p in (
    # fever
    r'发热',
    # cough
    r'(干咳|有痰|咳黄痰|咳白痰|痰量)',
    # accompanying symptoms
    r'(乏力|肌肉酸痛|咽痛|胸闷|呼吸困难|气促|头痛|恶心|呕吐|腹泻)',
    # epidemiological history
    r'(疫区旅居史|新冠患者接触史|密切接触史)',
    r'接种新冠疫苗\s*\d+剂',
    # smoking / drinking
    r'吸烟',
    r'饮酒',
    # physical signs
    r'(咽部充血|扁桃体肿大|淋巴结肿大|肺部啰音|呼吸音减弱)',
    # laboratory tests
    r'白细胞计数\s*\d+\.\d+\s*[×x]\s*10⁹/L',
    r'淋巴细胞比例\s*\d+\.\d+%?',
    r'C反应蛋白\s*\d+\.\d+\s*mg/L',
    r'降钙素原\s*\d+\.\d+\s*ng/mL',
    # nucleic acid / antibody results
    r'(新冠病毒核酸|核酸)\s*(阳性|阴性)',
    r'(新冠病毒抗体|抗体)\s*(阳性|阴性)',
    # imaging
    r'胸部CT[^。；,]*?双肺[^。；,]*?磨玻璃影[^。；,]*?',
    r'[左右][上下]肺[^。；,]*?炎症性改变[^。；,]*?',
    r'右肺下叶实变[^。；,]*?',
    r'肺野清晰[^。；,]*?心影正常[^。；,]*?无异常密度影',
    r'X光胸片[^。；,]*?双肺纹理正常[^。；,]*?肺门影未见增大',
    r'肺部通气良好[^。；,]*?无炎症影',
    # diagnosis
    r'新型冠状病毒肺炎[^。；,]*?',
    r'社区获得性肺炎[^。；,]*?',
    r'病毒性肺炎[^。；,]*?',
    r'新冠肺炎合并细菌性肺炎',
    r'轻型|普通型|重型|危重型',
))


def extract_keywords(text, top_n=12):
    """Extract up to ``top_n`` keywords from a report and join them with ", ".

    Three sources are combined: KeyBERT key-phrases, NER entities with a score
    above 0.8, and the rule-based patterns in ``_RULE_PATTERNS``.  Duplicates
    are removed while keeping first-seen order, and when more than ``top_n``
    candidates exist the result is filled in a fixed priority — rule matches
    first, then NER entities, then KeyBERT phrases — so the same text always
    produces the same output (the previous set-based truncation kept an
    arbitrary, run-dependent subset).

    Args:
        text: Report text. Empty or non-string values are treated as missing.
        top_n: Maximum number of keywords returned; also the number of
            phrases requested from KeyBERT.

    Returns:
        The keywords joined by ", ", or "无关键信息" when the text is missing
        or nothing was extracted.
    """
    if not text or not isinstance(text, str):
        return "无关键信息"

    # 1. KeyBERT (best effort: any failure is reported and skipped)
    model_keywords = []
    try:
        kws = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 3),
            stop_words=None,
            top_n=top_n,
            use_mmr=True,
            diversity=0.5
        )
        model_keywords.extend(kw[0] for kw in kws)
    except Exception as e:
        print(f"KeyBERT 出错（跳过）: {e}")

    # 2. NER (best effort as well)
    entity_keywords = []
    try:
        for ent in ner_model(text):
            if ent['score'] > 0.8:
                # The aggregated word of a BERT tokenizer is rebuilt from
                # word pieces joined by spaces, so strip both the "##"
                # continuation marker and the spaces between characters.
                entity_keywords.append(ent['word'].replace('##', '').replace(' ', ''))
    except Exception as e:
        print(f"NER 出错（跳过）: {e}")

    # 3. Rule-based matches on the text with ASCII spaces removed
    text_clean = text.replace(' ', '')
    rule_keywords = []
    for pattern in _RULE_PATTERNS:
        for hit in pattern.findall(text_clean):
            # findall returns a tuple for multi-group patterns: join the parts
            rule_keywords.append(hit if isinstance(hit, str) else ''.join(hit))

    # Order-preserving de-duplication; empty strings are discarded.
    candidates = rule_keywords + entity_keywords + model_keywords
    ordered = list(dict.fromkeys(kw for kw in candidates if kw))

    result = ordered[:top_n]
    return ', '.join(result) if result else "无关键信息"

# =================
# 步骤6：分批处理
# =================
import os
import pandas as pd

# 输出路径
keywords_path = '/content/drive/MyDrive/33/Text/keywords.csv'
text_features_path = '/content/drive/MyDrive/33/Text/text_features.csv'

# Make sure the parent folder of every output file exists before writing.
for _out_file in (keywords_path, text_features_path):
    os.makedirs(os.path.dirname(_out_file), exist_ok=True)

# ========== 自动检测已处理数量 ==========
def get_processed_count(csv_path):
    """Return the number of data rows already stored in ``csv_path``.

    Args:
        csv_path: Path of a CSV file with a header row.

    Returns:
        The row count (header excluded), or 0 when the file does not exist,
        is empty, or cannot be read. An unreadable file is reported with a
        printed warning rather than silently treated as empty.
    """
    if not os.path.exists(csv_path):
        return 0
    try:
        return len(pd.read_csv(csv_path))
    except pd.errors.EmptyDataError:
        # The file exists but holds no header or rows yet.
        return 0
    except (pd.errors.ParserError, UnicodeDecodeError, OSError) as err:
        # Stay best-effort, but say why the count fell back to 0.  Unlike a
        # bare ``except``, KeyboardInterrupt/SystemExit are not swallowed.
        print(f"无法读取 {csv_path}（按 0 条处理）: {err}")
        return 0

# Rows already present in each output file.
processed_keywords = get_processed_count(keywords_path)
processed_features = get_processed_count(text_features_path)

# Resume from the SMALLER count.  If the two files are out of sync (e.g. the
# run stopped between the two appends, or one file was removed), starting from
# the larger count would leave the shorter file permanently missing rows.
start_idx = min(processed_keywords, processed_features)

print(f"检测到已处理 {start_idx} 条记录，将从第 {start_idx + 1} 条开始...")
if processed_keywords != processed_features:
    print(f"注意: 两个输出文件行数不一致（keywords={processed_keywords}, "
          f"text_features={processed_features}），将只为各文件补写缺失的行。")


def _append_rows(frame, csv_path):
    """Append ``frame`` to ``csv_path``; write the header only into a new or empty file."""
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    frame.to_csv(csv_path, mode='a', header=needs_header, index=False)


# ========== Process whatever is still missing ==========
if start_idx >= total_rows:
    print("全部已处理完成！无需运行！")
else:
    batch_size = 100
    total_processed = start_idx

    print(f"开始处理剩余 {total_rows - start_idx} 条记录（从 {start_idx} 开始）\n")

    for i in range(start_idx, total_rows, batch_size):
        end_idx = min(i + batch_size, total_rows)

        # Each file only receives the rows of this batch it does not hold yet;
        # a slice whose start is past ``end_idx`` is simply empty.
        kw_rows = df.iloc[max(i, processed_keywords):end_idx]
        feat_rows = df.iloc[max(i, processed_features):end_idx]

        kw_batch = None
        if len(kw_rows) > 0:
            kw_batch = kw_rows[['image_id', 'label']].copy()
            kw_batch.insert(
                1, 'keywords',
                kw_rows['report'].apply(lambda x: extract_keywords(x, top_n=12))
            )
            _append_rows(kw_batch, keywords_path)

        if len(feat_rows) > 0:
            feat_batch = feat_rows[['image_id', 'label']].copy()
            # The embedding is stored as one comma-separated string per row.
            feat_batch.insert(
                1, 'feature_vector',
                feat_rows['report'].apply(
                    lambda x: ','.join(map(str, extract_text_embedding(x)))
                )
            )
            _append_rows(feat_batch, text_features_path)

        total_processed = end_idx

        # Progress report
        print(f"已成功处理 {total_processed}/{total_rows} 行 "
              f"({total_processed/total_rows*100:.2f}%)")
        print(f"   追加 {len(kw_rows)} 行 → keywords.csv")
        print(f"   追加 {len(feat_rows)} 行 → text_features.csv")
        if kw_batch is not None:
            print(f"   示例: {kw_batch.iloc[0]['image_id']} → {kw_batch.iloc[0]['keywords'][:50]}...")
        print("-" * 70)

    print("\n续跑完成！")
    print(f"最终文件: {keywords_path}")
    print(f"共处理 {total_processed} 条记录。")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h使用设备: CPU
列数: 3
列名: ['image_id', 'report', 'label']

前5行：
         image_id                                             report  label
0   covid_424.png  患者基本信息：姓名李华，女，38岁，门诊号2024051512，联系方式139XXXX890...      1
1  covid_4161.png  患者基本信息，姓名：张三，性别：男，年龄：58岁，门诊号：M2025061208，联系方式：...      1
2  covid_4188.png  姓名：刘伟，男，42 岁，门诊号 2024051309，联系方式 139XXXX1234，就...      1
3  covid_4245.png  姓名：陈刚，男，46 岁，门诊号 2024051310，联系方式 137XXXX5678，就...      1
4  covid_4202.png  患者基本信息：姓名张明，男，45岁，门诊号2024051308，联系方式138XXXX567...      1
将处理 9603 条记录（每批 100 条）


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/409M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/407M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/407M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


模型加载完成！
检测到已处理 9603 条记录，将从第 9604 条开始...
全部已处理完成！无需运行！
