In [20]:
import pandas as pd

In [21]:
# Load the two comparison result sets produced by the entity-curation step.
differences_data = pd.read_json(
    path_or_buf='data/Entity_curation/comparison_results/differences.json',
)
matches_data = pd.read_json(
    path_or_buf='data/Entity_curation/comparison_results/matches.json',
)

In [22]:
import re

def parse_entities(entity_string):
    """Extract every ``("name", type)`` pair from an entity annotation string.

    Args:
        entity_string: Annotation text such as
            ``("cancer", Disease), ("colon", Organ)``. Any text surrounding
            the pairs is ignored. Non-string values (``None``, float ``NaN``,
            ``pd.NA``, ...) are converted with ``str()`` first, so missing
            values yield an empty result instead of raising.

    Returns:
        set[tuple[str, str]]: The distinct ``(name, type)`` pairs with
        surrounding whitespace stripped from both parts. Empty when the input
        is missing, is one of the placeholders ``"null"``, ``""`` or
        ``"None"``, or contains no pair.
    """
    # Coerce before calling .strip(): a missing value may arrive as float NaN,
    # which has no .strip() method (str(None) == "None" is caught below).
    text = str(entity_string).strip()
    if text in ("null", "", "None"):
        return set()

    pattern = r'\("([^"]+)",\s*([^)]+)\)'
    return {
        (entity.strip(), entity_type.strip())
        for entity, entity_type in re.findall(pattern, text)
    }

def merge_and_deduplicate(row):
    """Merge the entities of several annotation fields of a row, without duplicates.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the fields
            ``OUTPUT``, ``gt_r``, ``checked`` and ``check_myself``; each one
            is parsed with ``parse_entities``.

    Returns:
        str: The union of all parsed pairs rendered as
        ``("name", type), ("name", type), ...``, or the string ``"null"``
        when no field contains an entity.
    """
    all_entities = set()
    for field in ('OUTPUT', 'gt_r', 'checked', 'check_myself'):
        all_entities |= parse_entities(row[field])

    if not all_entities:
        return "null"

    # Sort case-insensitively by name, then by the exact name, then by type.
    # The exact name is a tie-breaker: names differing only by case (e.g.
    # "anser" / "Anser") would otherwise keep the set's iteration order, which
    # changes between interpreter runs because string hashing is randomised.
    sorted_entities = sorted(
        all_entities,
        key=lambda pair: (pair[0].lower(), pair[0], pair[1]),
    )
    return ", ".join(f'("{entity}", {entity_type})' for entity, entity_type in sorted_entities)

# Add the merged, de-duplicated entity string to both data sets (in place).
for _frame in (differences_data, matches_data):
    _frame['all_checks'] = _frame.apply(merge_and_deduplicate, axis=1)

In [23]:
# Write both frames, now carrying the all_checks column, as indented JSON records.
for _frame, _target in (
    (differences_data, 'data/Entity_curation/comparison_results/differences_updated.json'),
    (matches_data, 'data/Entity_curation/comparison_results/matches_updated.json'),
):
    _frame.to_json(_target, orient='records', lines=False, indent=4)

In [24]:
# Preview the differences frame, including the newly added all_checks column.
differences_data

Unnamed: 0,INPUT,type,ref,OUTPUT,gt_r,checked,check_myself,all_checks
0,"Results: In total, 549 common differentially e...",paper,title: OncoTargets and Therapy@Yang Yang,"(""cancer"", Disease), (""colon cancer"", Disease)...","(""cancer"", Disease), (""colon cancer"", Disease)...","(""cancer"", Disease), (""colon cancer"", Disease)...","(""cancer"", Disease), (""colon cancer"", Disease)...","(""cancer"", Disease), (""colon"", Organ), (""colon..."
1,LncRNA Colorectal Neoplasia Differentially Exp...,paper,title: Long non-coding RNA CRNDE may be associ...,"(""cervical cancer"", Disease), (""cancer"", Disea...","(""cervical cancer"", Disease), (""cancer"", Disea...","(""cervical cancer"", Disease), (""cancer"", Disea...","checked: (""Colorectal Neoplasia Differentially...","(""cancer"", Disease), (""cancers"", Disease), (""c..."
2,"The lncRNAs CCAT2, NEAT1, FOXO1, and PVT1 have...",paper,title: Upregulation of lncRNA CASC2 Suppresses...,"(""CCAT2"", Gene Symbol), (""PVT1"", Gene Symbol),...","(""CCAT2"", Gene Symbol), (""PVT1"", Gene Symbol),...","(""CCAT2"", Gene Symbol), (""PVT1"", Gene Symbol),...","(""CCAT2"", Gene Symbol), (""NEAT1"", Gene Symbol)...","(""BC"", Disease), (""Cancer"", Disease), (""cancer..."
3,TNBC is a subtype of breast cancer with high r...,paper,title: Long noncoding RNA SOX21-AS1 regulates ...,"(""cancer"", Disease), (""breast cancer"", Disease...","(""cancer"", Disease), (""breast cancer"", Disease...","(""cancer"", Disease), (""breast cancer"", Disease...","(""breast"", Organ), (""cancer"", Disease), (""brea...","(""breast"", Organ), (""breast cancer"", Disease),..."
4,The absolute limit of detection $\mathrm{{LOD}...,standard,title: Molecular biomarker analysis – Detectio...,"(""goose"", Species), (""pigeon"", Species), (""duc...","(""goose"", Species), (""pigeon"", Species), (""duc...","(""goose"", Species), (""pigeon"", Species), (""duc...","(""DNA"", Target), (""pigeon"", Species), (""goose""...","(""anser"", Species), (""Anser"", Species), (""DNA""..."
...,...,...,...,...,...,...,...,...
118,"Until now, it has been found that more than 70...",paper,title: Downregulation of lncRNA CCAT1 enhances...,"(""tumor"", Disease)","(""tumor"", Disease)","(""tumor"", Disease)","(""lncRNAs"", Target), (""tumor"", Disease)","(""lncRNAs"", Target), (""tumor"", Disease)"
119,"Therefore, it is of great importance to improv...",paper,title: A C C E P T E D M A N U S C R I P T Exp...,"(""cervical cancer"", Disease), (""therapeutic ta...","(""cervical cancer"", Disease), (""therapeutic ta...","(""cervical cancer"", Disease), (""therapeutic ta...","(""cervical cancer"", Disease), (""cervical"", Org...","(""cancer"", Disease), (""cervical"", Organ), (""ce..."
120,Lung cancer is the leading cause of cancer dea...,paper,title: MetaLnc9 Facilitates Lung Cancer Metast...,"(""Lung cancer"", Disease), (""cancer"", Disease)","(""Lung cancer"", Disease), (""cancer"", Disease)","(""Lung cancer"", Disease), (""cancer"", Disease)","(""Lung cancer"", Disease), (""Lung"", Organ), (""c...","(""cancer"", Disease), (""Lung"", Organ), (""Lung c..."
121,Total RNA was extracted from tissues or cultur...,paper,title: LINC00665 Induces Acquired Resistance t...,,,,"(""tissues"", Tissue), (""cultured cells"", Cell),...","(""cells"", Cell), (""cultured cells"", Cell), (""t..."


In [None]:
# Reduce both frames to the curation columns. The original `checked` column is
# kept under the name `checked_raw`, next to the merged `all_checks` column.
_CURATED_COLUMNS = ['INPUT', 'type', 'ref', 'OUTPUT', 'gt_r', 'all_checks', 'checked_raw']


def _select_curated_columns(frame):
    """Return `frame` limited to the curation columns, with `checked` renamed.

    Renaming happens before the selection, and `DataFrame.rename` ignores
    labels that are absent, so a frame that already went through this step
    (and therefore has `checked_raw` instead of `checked`) passes through
    unchanged. That makes the cell safe to re-run.
    """
    return frame.rename(columns={'checked': 'checked_raw'})[_CURATED_COLUMNS]


differences_data = _select_curated_columns(differences_data)
matches_data = _select_curated_columns(matches_data)

In [26]:
# Stack the differences and matches frames into one table with a fresh 0..n-1 index.
curated_data = pd.concat(
    objs=[differences_data, matches_data],
    ignore_index=True,
)
# curated_data.to_json('data/Entity_curation/comparison_results/curated_results.json

In [27]:
def compare_entity_fields(row):
    """Tell whether a row's `checked_raw` and `all_checks` differ in substance.

    Both fields are parsed with `parse_entities` and compared as sets, so a
    different ordering of the same entities does not count as a difference.

    Returns:
        True when the two parsed entity sets are not equal, False otherwise.
    """
    reviewed, merged = (
        parse_entities(row[column]) for column in ('checked_raw', 'all_checks')
    )
    return reviewed != merged

# Flag every row whose checked_raw entities differ in substance from all_checks.
curated_data['has_difference'] = curated_data.apply(compare_entity_fields, axis=1)

# Tally the rows with and without a substantive difference.
total_rows = curated_data.shape[0]
rows_with_differences = curated_data['has_difference'].sum()
rows_without_differences = total_rows - rows_with_differences

print(f"总行数: {total_rows}")
print(f"checked_raw和all_checks有实质性差异的行数: {rows_with_differences}")
print(f"checked_raw和all_checks没有实质性差异的行数（仅顺序不同或完全相同）: {rows_without_differences}")
print(f"差异比例: {rows_with_differences/total_rows*100:.2f}%")

# Count the rows where checked_raw is a null placeholder string while
# all_checks is not.
null_checked_raw_count = len(
    curated_data[
        curated_data['checked_raw'].isin(['null', '', 'None'])
        & ~curated_data['all_checks'].isin(['null', '', 'None'])
    ]
)
print(f'checked_raw是字符串的“null”的但是“all_checks不是“null”的行数: {null_checked_raw_count}')
# # 查看有差异的样本
# print("\n有实质性差异的样本（前5个）:")
# diff_samples = curated_data[curated_data['has_difference']][['INPUT', 'checked_raw', 'all_checks']].head()
# for idx, row in diff_samples.iterrows():
#     print(f"\n行 {idx}:")
#     print(f"INPUT: {row['INPUT'][:100]}...")
#     print(f"checked_raw: {row['checked_raw']}")
#     print(f"all_checks: {row['all_checks']}")

KeyError: 'checked_raw'

In [17]:
# Count the rows where checked_raw is a null placeholder string ("null", "" or
# "None") while all_checks is not.
null_checked_raw_count = int(
    (
        curated_data['checked_raw'].isin(['null', '', 'None'])
        & ~curated_data['all_checks'].isin(['null', '', 'None'])
    ).sum()
)
print(f'checked_raw是字符串的“null”的但是“all_checks不是“null”的行数: {null_checked_raw_count}')

checked_raw是字符串的“null”的但是“all_checks不是“null”的行数: 10


In [18]:
# Count the rows whose checked_raw is a null placeholder string although
# all_checks holds something else.
null_checked_raw_count = len(
    curated_data.loc[
        curated_data['checked_raw'].isin(['null', '', 'None'])
        & ~curated_data['all_checks'].isin(['null', '', 'None'])
    ]
)
print(f'checked_raw是字符串的“null”的但是“all_checks不是“null”的行数: {null_checked_raw_count}')

checked_raw是字符串的“null”的但是“all_checks不是“null”的行数: 10
