# Read MRCONSO.RRF file
## Target subset of UMLS

In [1]:
import csv

# MRCONSO.RRF location (alternative data root: /DATA1/llm-research/2022AA-full/)
# mrconso_path = '../resource/Radiology/2024AA/META/MRCONSO.RRF'
mrconso_path = '/Users/hanbin/Downloads/2024AA-full/0827/0827/2024AA/META/MRCONSO.RRF'

# CUI -> concept name. A CUI appears on many rows; later rows overwrite
# earlier ones, so each CUI ends up with the last STR read from the file.
concepts = {}

# newline='' is what the csv module expects for files handed to csv.reader.
with open(mrconso_path, 'r', encoding='utf-8', newline='') as f:
    # QUOTE_NONE: the file is plain pipe-delimited text, so a '"' inside a
    # term must be treated as ordinary data. With the default quoting a term
    # that starts with '"' would swallow delimiters (and even line breaks)
    # until the next quote, silently merging rows.
    reader = csv.reader(f, delimiter='|', quoting=csv.QUOTE_NONE)
    for row in reader:
        # Skip blank or truncated lines instead of failing with IndexError.
        if len(row) <= 14:
            continue
        cui = row[0]            # column 0: concept identifier
        concept_name = row[14]  # column 14: term string
        concepts[cui] = concept_name

print(len(concepts))

1411747


In [3]:
# Preview the first ten (CUI, name) pairs in dictionary insertion order
first_ten = list(concepts.items())[:10]
for cui, name in first_ten:
    print(f"{cui} {name}")

C0000696 a fiber
C0000726 Abdominal structure (body structure)
C0000727 acute abdomen (diagnosis)
C0000729 [D]Abdominal cramps (situation)
C0000731 abdominal distention (physical finding)
C0000734 Abdominal mass (disorder)
C0000735 ABDOMINAL NEOPL
C0000737 Unspecified abdominal pain
C0000739 Musculature of abdomen
C0000741 Abducens nerve structure (body structure)


In [2]:
import pandas as pd
from rank_bm25 import BM25Okapi
import numpy as np
from collections import OrderedDict

# Load the CUI (column 0) and STR (column 14) fields of MRCONSO.RRF
mrconso_path = '/Users/hanbin/Downloads/2024AA-full/0827/0827/2024AA/META/MRCONSO.RRF'
df = pd.read_csv(
    mrconso_path,
    sep='|',
    header=None,
    usecols=[0, 14],
    names=['CUI', 'STR'],
    dtype=str,              # every field is text; never infer numbers
    keep_default_na=False,  # otherwise literal strings such as "NA" or "null" become NaN
    quoting=3,              # csv.QUOTE_NONE: a '"' inside a term is data, not a quote character
)

# Drop exact (CUI, STR) duplicates and renumber rows so that positional and
# label indices agree.
df = df.drop_duplicates().reset_index(drop=True)

# CUI -> STR lookup. A CUI with several strings keeps only the last one.
cui_to_str = df.set_index('CUI')['STR'].to_dict()

# One document per remaining row, always as str
corpus = df['STR'].astype(str).tolist()

# Tokeniser shared by the corpus and the queries; tolerates non-string input
def tokenize(doc):
    """Return the lower-cased, whitespace-separated tokens of ``doc``.

    ``doc`` is converted with ``str()`` first, so any value is accepted.
    """
    lowered = str(doc).lower()
    tokens = lowered.split()
    return tokens

# Tokenise every document, then fit the BM25 index on the token lists
tokenized_corpus = list(map(tokenize, corpus))
bm25 = BM25Okapi(tokenized_corpus)

def get_top_n_candidates(query, n=5):
    """Return up to ``n`` distinct concepts ranked by BM25 score for ``query``.

    Args:
        query: Free-text query; tokenised with ``tokenize``.
        n: Maximum number of distinct CUIs to return. ``n <= 0`` yields ``[]``.

    Returns:
        A list of ``(cui, name)`` tuples, best score first. ``name`` is looked
        up in ``cui_to_str``, which holds a single string per CUI, so it is
        not necessarily the string that produced the match.
    """
    tokenized_query = tokenize(query)
    doc_scores = bm25.get_scores(tokenized_query)

    # Positional view of the CUI column; avoids building a row Series
    # (``df.iloc[idx]``) for every ranked document.
    cuis = df['CUI'].to_numpy()

    # OrderedDict de-duplicates CUIs while preserving rank order.
    results = OrderedDict()
    for idx in np.argsort(doc_scores)[::-1]:
        # Stop as soon as enough concepts are collected instead of walking
        # the remainder of the (potentially multi-million row) ranking.
        if len(results) >= n:
            break
        cui = cuis[idx]
        if cui not in results:
            results[cui] = cui_to_str[cui]

    return list(results.items())

# Sample queries (this list is replaced by the shorter one just below)
test_queries = ["abdominal pain", "heart failure", "lung cancer", "diabetes"]

# test 'lung'
test_queries = ["lung", "Normal"]

# Print the ranked candidates for each query, followed by a blank line
for query in test_queries:
    print(f"Query: {query}")
    candidates = get_top_n_candidates(query)
    for rank, (cui, label) in enumerate(candidates, start=1):
        print(f"{rank}. {cui}: {label}")
    print()

Query: lung
1. C0746117: mass left lung
2. C1278908: Entire lung (body structure)
3. C0024109: Structure of lungs, unspecified
4. C4037972: Lung
5. C2200156: lung tissue swab of left lung (lab test)

Query: Normal
1. C1553399: normal
2. C1704701: Normality-Based Dosing Unit
3. C1550457: Normal
4. C0205307: Normal (qualifier value)
5. C1553406: EntityStatusNormal



In [None]:
import pandas as pd
from rank_bm25 import BM25Okapi
import numpy as np
from collections import OrderedDict
from openai import OpenAI
import json
import os
from tqdm import tqdm

# OpenAI API key. A key that is already exported in the environment is kept;
# the placeholder is only a fallback so the notebook never overwrites a real
# key with a dummy value. Replace it (or export OPENAI_API_KEY) before running.
os.environ.setdefault("OPENAI_API_KEY", "your-api-key-here")

# Load the CUI (column 0) and STR (column 14) fields of MRCONSO.RRF
df = pd.read_csv(
    'MRCONSO.RRF',
    sep='|',
    header=None,
    usecols=[0, 14],
    names=['CUI', 'STR'],
    dtype=str,              # every field is text; never infer numbers
    keep_default_na=False,  # otherwise literal strings such as "NA" or "null" become NaN
    quoting=3,              # csv.QUOTE_NONE: a '"' inside a term is data, not a quote character
)

# Drop exact (CUI, STR) duplicates and renumber rows so that positional and
# label indices agree.
df = df.drop_duplicates().reset_index(drop=True)

# CUI -> STR lookup. A CUI with several strings keeps only the last one.
cui_to_str = df.set_index('CUI')['STR'].to_dict()

# One document per remaining row, always as str
corpus = df['STR'].astype(str).tolist()

# Tokeniser shared by the corpus and the queries; tolerates non-string input
def tokenize(doc):
    """Lower-case ``str(doc)`` and split it on runs of whitespace."""
    as_text = str(doc)
    return as_text.lower().split()

# Tokenise every document, then fit the BM25 index on the token lists
tokenized_corpus = list(map(tokenize, corpus))
bm25 = BM25Okapi(tokenized_corpus)

def get_top_n_candidates(query, n=5):
    """Return up to ``n`` distinct concepts ranked by BM25 score for ``query``.

    Args:
        query: Free-text query; tokenised with ``tokenize``.
        n: Maximum number of distinct CUIs to return. ``n <= 0`` yields ``[]``.

    Returns:
        A list of ``(cui, name)`` tuples, best score first. ``name`` is looked
        up in ``cui_to_str``, which holds a single string per CUI, so it is
        not necessarily the string that produced the match.
    """
    tokenized_query = tokenize(query)
    doc_scores = bm25.get_scores(tokenized_query)

    # Positional view of the CUI column; avoids building a row Series
    # (``df.iloc[idx]``) for every ranked document.
    cuis = df['CUI'].to_numpy()

    # OrderedDict de-duplicates CUIs while preserving rank order.
    results = OrderedDict()
    for idx in np.argsort(doc_scores)[::-1]:
        # Stop as soon as enough concepts are collected instead of walking
        # the remainder of the (potentially multi-million row) ranking.
        if len(results) >= n:
            break
        cui = cuis[idx]
        if cui not in results:
            results[cui] = cui_to_str[cui]

    return list(results.items())

def _parse_cui_name(response):
    """Extract a ``(cui, name)`` pair from the model reply, or return ``None``."""
    import ast
    import re

    text = (response or "").strip()

    # First choice: the reply is a Python literal such as ('C0000737', 'Pain').
    # literal_eval only accepts literals, so model output is never executed.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if (
        isinstance(parsed, (tuple, list))
        and len(parsed) == 2
        and all(isinstance(part, str) for part in parsed)
    ):
        return parsed[0], parsed[1]

    # Fallback: the prompt asks for "(cui, name)", which models often return
    # unquoted, e.g. (C0205307, Normal (qualifier value)). The name part is
    # anchored to the final ')' so that parentheses inside it survive.
    match = re.search(r"\(\s*([^,()]+?)\s*,\s*(.+?)\s*\)\s*\.?\s*$", text, flags=re.DOTALL)
    if match is None:
        return None
    cui, name = (part.strip().strip("'\"").strip() for part in match.groups())
    if not cui or not name:
        return None
    return cui, name


def normalize_entity(entity, results, report_context="", myModel="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to pick the best UMLS concept for ``entity``.

    Args:
        entity: The surface form to normalise.
        results: Candidate concepts (anything ``json.dumps`` can serialise),
            shown to the model as search results.
        report_context: Optional surrounding report text added to the prompt.
        myModel: Name of the chat-completion model to call.

    Returns:
        ``(cui, name)`` as parsed from the model reply, or
        ``("unnormalizable", "unnormalizable")`` when the reply is empty or
        cannot be parsed.
    """
    client = OpenAI()

    results_str = json.dumps(results, indent=2)

    messages = [
        {"role": "system", "content": "You are an expert in named entity normalization for medical terms using the UMLS ontology. Your task is to analyze the given entity and search results, then select the most appropriate normalized form or the most likely UMLS concept."},
        {"role": "user", "content": f"Entity: {entity}\n\nSearch Results:\n{results_str}\n\nReport Context: {report_context}\n\nBased on these results and the report context, provide the most appropriate CUI and name for this entity. If there's no exact match, provide the most likely UMLS concept. If the entity is unlikely to be normalized, return ('unnormalizable', 'unnormalizable'). Respond in the format: (cui, name)"}
    ]

    completion = client.chat.completions.create(
        model=myModel,
        messages=messages
    )

    # The reply is untrusted text: parse it, never eval() it. A missing
    # (None) content is treated the same as an unparseable reply.
    parsed = _parse_cui_name(completion.choices[0].message.content)
    if parsed is None:
        return "unnormalizable", "unnormalizable"
    return parsed

# Example usage
test_queries = ["abdominal pain", "heart failure", "lung cancer", "diabetes"]

# Simulated report context handed to the model with every query
report_context = "Patient presents with severe abdominal pain and has a history of heart disease."

# Retrieve BM25 candidates for each query, then let the model pick one
for query in tqdm(test_queries):
    print(f"Query: {query}")
    candidates = get_top_n_candidates(query)
    cui, name = normalize_entity(query, candidates, report_context)
    print(f"Normalized: {cui}: {name}\n")

# Batch processing of entities
def process_entities(entities, report_context=""):
    """Normalise each entity and return ``(entity, cui, name)`` tuples in input order.

    For every entity the BM25 candidates are fetched with
    ``get_top_n_candidates`` and passed, together with ``report_context``,
    to ``normalize_entity``. Progress is shown through ``tqdm``.
    """
    def _normalize_one(entity):
        candidates = get_top_n_candidates(entity)
        cui, name = normalize_entity(entity, candidates, report_context)
        return (entity, cui, name)

    return [_normalize_one(entity) for entity in tqdm(entities)]

# Example batch run
entities = ["hypertension", "myocardial infarction", "asthma", "pneumonia"]
batch_results = process_entities(entities, report_context)

# Show the batch results, one entity per paragraph
for entity, cui, name in batch_results:
    print(f"Entity: {entity}")
    print(f"Normalized: {cui}: {name}\n")

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from rank_bm25 import BM25Okapi
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from Levenshtein import distance as levenshtein_distance

# Download the NLTK stop-word list
import nltk
nltk.download('stopwords')

# Read MRCONSO.RRF
# Assumed layout: CUI|LAT|TS|LUI|STT|SUI|ISPREF|AUI|SAUI|SCUI|SDUI|SAB|TTY|CODE|STR|SRL|SUPPRESS|CVF
# LAT (column 1) is loaded as well: it is the column the language filter
# below has to test. Filtering on the STR column would compare term strings
# to 'ENG' and discard almost every row.
df = pd.read_csv(
    'MRCONSO.RRF',
    sep='|',
    header=None,
    usecols=[0, 1, 14],
    names=['cui', 'lat', 'term'],
    dtype=str,              # every field is text; never infer numbers
    keep_default_na=False,  # otherwise literal strings such as "NA" or "null" become NaN
    quoting=3,              # csv.QUOTE_NONE: a '"' inside a term is data, not a quote character
)

# Keep English terms only, then return to the two-column (cui, term) shape
# with a fresh positional index.
df = df.loc[df['lat'] == 'ENG', ['cui', 'term']].reset_index(drop=True)

# Text preprocessing
# Resources shared by every preprocess_text call, built on first use so the
# stop-word list is not reloaded (and the stemmer not rebuilt) once per term.
_STOP_WORDS = None
_STEMMER = None


def preprocess_text(text):
    """Normalise a term for BM25 indexing or querying.

    Steps: lower-case, delete every character that is not a letter, digit
    or whitespace, drop English stop words, stem each remaining word with
    the Porter stemmer.

    Args:
        text: The raw term. ``None`` and NaN give ``''``; any other
            non-string value is converted with ``str()``.

    Returns:
        The processed words joined by single spaces.
    """
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()

    # Missing values (None, or float NaN which is the only value != itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lower-case
    text = str(text).lower()
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Drop stop words, then stem
    words = [_STEMMER.stem(word) for word in text.split() if word not in _STOP_WORDS]
    return ' '.join(words)

# Apply the preprocessing to every term
df['processed_term'] = df['term'].apply(preprocess_text)

# One preprocessed document per row
corpus = df['processed_term'].tolist()

# Bag-of-words vectoriser over the whitespace-separated tokens
vectorizer = CountVectorizer(tokenizer=lambda x: x.split())

# Term-frequency matrix of the corpus (kept for inspection; BM25 below does
# not consume it)
term_matrix = vectorizer.fit_transform(corpus)

# BM25Okapi takes the corpus as a list of token lists, not a sparse count
# matrix, so it is built from the same whitespace tokens instead.
bm25 = BM25Okapi([doc.split() for doc in corpus])

def fuzzy_match(query, term, max_distance=2):
    """Fuzzy comparison of two phrases using the Levenshtein distance.

    Both strings are split on whitespace. They match when they contain the
    same number of words and every pair of words at the same position is at
    most ``max_distance`` edits apart.
    """
    left_words = query.split()
    right_words = term.split()

    if len(left_words) == len(right_words):
        return all(
            levenshtein_distance(left, right) <= max_distance
            for left, right in zip(left_words, right_words)
        )
    return False

def search_umls(query, top_k=5, fuzzy=True):
    """Search the indexed terms for ``query`` and return up to ``top_k`` hits.

    The query is preprocessed and scored against every document with BM25.
    The ``top_k * 2`` best-scoring rows are then scanned in rank order; with
    ``fuzzy`` enabled a row is kept only if ``fuzzy_match`` accepts the raw
    query against the raw term. Twice as many rows as requested are scanned
    so that enough candidates remain after the fuzzy filter.

    Returns:
        A list of dicts with keys ``'cui'``, ``'term'`` and ``'score'``.
    """
    # Score every document against the preprocessed query
    query_tokens = preprocess_text(query).split()
    scores = bm25.get_scores(query_tokens)

    # Best-first positions, oversampled by a factor of two
    ranked_positions = np.argsort(scores)[::-1][:top_k * 2]

    hits = []
    for position in ranked_positions:
        row = df.iloc[position]
        candidate = row['term']
        if (not fuzzy) or fuzzy_match(query, candidate):
            hits.append(dict(cui=row['cui'], term=candidate, score=scores[position]))
        # Stop once the requested number of hits has been collected
        if len(hits) == top_k:
            break

    return hits

# Try the search function on a single query
test_query = "lung"
results = search_umls(test_query)

print(f"Search results for '{test_query}':")
for result in results:
    print("CUI: {cui}, Term: {term}, Score: {score:.4f}".format(**result))

ModuleNotFoundError: No module named 'Levenshtein'