In [8]:
import os
import re
import numpy as np
from summa import keywords
from nltk.stem import PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# ========== 配置参数 ==========
DOMAIN_TERMS = {
    'laser', 'quantum', 'modulus', 'polymer', 'nanoparticle', 'spectroscopy',
    'synthesis', 'alloy', 'composite', 'asphalt', 'cullet', 'stiffness', 
    'bitumen', 'asphaltic', 'glass', 'particle'
}

EXTENDED_STOPWORDS = {
    'however', 'furthermore', 'conclusion', 'experiment', 'methodology',
    'result', 'study', 'research', 'data', 'analysis', 'table', 'figure', 
    'sample', 'test', 'show', 'based', 'using'
}

# ========== 核心函数 ==========
def load_data(data_path):
    """增强型数据加载（带词干化和格式验证）"""
    texts = []
    keywords_list = []
    stemmer = PorterStemmer()
    
    for filename in sorted(os.listdir(data_path)):
        if not filename.endswith(".txt"):
            continue
            
        text_path = os.path.join(data_path, filename)
        ann_path = os.path.join(data_path, filename.replace(".txt", ".ann"))
        
        try:
            with open(text_path, "r", encoding="utf-8", errors="ignore") as f:
                texts.append(f.read().strip())
        except Exception as e:
            print(f"Error reading {text_path}: {str(e)}")
            continue
        
        doc_keywords = []
        if os.path.exists(ann_path):
            with open(ann_path, "r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    parts = line.strip().split("\t")
                    if len(parts) >= 3:
                        annot_parts = parts[1].split()
                        if len(annot_parts) == 0:
                            continue
                            
                        annot_type = annot_parts[0]
                        if annot_type in ["Term", "Material", "Process", "T"]:
                            keyword = parts[2].lower().strip()
                            if 2 < len(keyword) < 50 and not keyword.isnumeric():
                                doc_keywords.append(keyword)
        
        # 保留原始关键词但基于词干去重
        seen_stems = set()
        unique_kws = []
        for kw in doc_keywords:
            stem = stemmer.stem(kw)
            if stem not in seen_stems:
                seen_stems.add(stem)
                unique_kws.append(kw)
        keywords_list.append(unique_kws)
    
    return texts, keywords_list

def clean_text(text):
    """工业级文本清洗管道"""
    text = re.sub(r'\[\d+(-\d+)?\]', '', text)
    text = re.sub(r'&[a-z]+;', lambda m: {'&alpha;':'alpha','&beta;':'beta'}.get(m.group(), ''), text)
    text = re.sub(r'(\w+)-(\w+)', r'\1_\2', text)
    text = re.sub(r'\|.*?\|', ' ', text)
    text = re.sub(r'\b(\d+)([a-zA-Z]+)\b', r'\1 \2', text)
    text = re.sub(r'[^\w\s_]', '', text)
    return re.sub(r'\s+', ' ', text).lower().strip()

def extract_keywords(text, ratio=0.3):
    """领域自适应关键词提取（返回前5个关键词）"""
    try:
        cleaned = clean_text(text)
        if len(cleaned.split()) < 10:
            return []
        
        kw = keywords.keywords(
            cleaned,
            ratio=ratio,
            words=True,
            split=True,
            scores=False,
            language="english"
        )
        
        processed_kws = []
        stemmer = PorterStemmer()
        for k in kw:
            k_clean = k.strip().lower()
            if (len(k_clean) < 3 or 
                k_clean in EXTENDED_STOPWORDS or
                not any(c.isalpha() for c in k_clean)):
                continue
                
            stemmed = stemmer.stem(k_clean)
            if stemmed in DOMAIN_TERMS or k_clean in DOMAIN_TERMS:
                processed_kws.insert(0, k_clean)
            else:
                processed_kws.append(k_clean)
        
        # 基于词干去重并取前5
        seen_stems = set()
        unique_kws = []
        for kw in processed_kws:
            stem = stemmer.stem(kw)
            if stem not in seen_stems:
                seen_stems.add(stem)
                unique_kws.append(kw)
        return unique_kws[:5]
        
    except Exception as e:
        print(f"提取错误: {str(e)}")
        return []

def evaluate(y_true, y_pred):
    """评估函数（使用TF-IDF计算相似度）"""
    stemmer = PorterStemmer()
    
    tp = pred = true = 0
    cos_sim_total = 0.0
    valid_sim_docs = 0
    
    # 预处理所有关键词
    all_true = [' '.join([stemmer.stem(kw) for kw in doc]) for doc in y_true]
    all_pred = [' '.join([stemmer.stem(kw) for kw in doc]) for doc in y_pred]
    
    # 计算TF-IDF
    vectorizer = TfidfVectorizer()
    try:
        X_true = vectorizer.fit_transform(all_true)
        X_pred = vectorizer.transform(all_pred)
        cos_sims = cosine_similarity(X_true, X_pred).diagonal()
        avg_cos = np.mean(cos_sims)
    except:
        avg_cos = 0.0
    
    # 原始评估逻辑
    for true_kws, pred_kws in zip(y_true, y_pred):
        true_set = set(stemmer.stem(kw) for kw in true_kws)
        pred_set = set(stemmer.stem(kw) for kw in pred_kws)
        
        if not true_set:
            continue
            
        common = true_set & pred_set
        tp += len(common)
        pred += len(pred_set)
        true += len(true_set)
    
    precision = tp / pred if pred > 0 else 0
    recall = tp / true if true > 0 else 0
    f1 = 2*precision*recall/(precision+recall) if (precision+recall) >0 else 0
    
    return precision, recall, f1, avg_cos

# ========== 主程序 ==========
def main():
    train_path = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\scienceie2017_train\train2"
    test_path = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\semeval_articles_test"

    print("Loading training data...")
    X_train, y_train = load_data(train_path)
    print(f"Loaded {len(X_train)} training samples | Avg keywords: {np.mean([len(x) for x in y_train]):.1f}")

    print("\nLoading test data...")
    X_test, y_test = load_data(test_path)
    print(f"Loaded {len(X_test)} test samples | Avg keywords: {np.mean([len(x) for x in y_test]):.1f}")

    print("\nExtracting keywords...")
    y_pred = []
    for i, text in enumerate(X_test):
        kws = extract_keywords(text)
        y_pred.append(kws)
        if i < 3:
            print(f"Doc {i+1} Pred: {kws[:5]}...")

    precision, recall, f1, cos_sim = evaluate(y_test, y_pred)

    print("\n评估结果:")
    print(f"Precision@5: {precision:.2%}")
    print(f"Recall@5:    {recall:.2%}")
    print(f"F1@5:        {f1:.2%}")
    print(f"TF-IDF Cosine Similarity: {cos_sim:.4f}")

    valid_docs = sum(1 for x in y_test if x)
    print(f"\n有效标注文档: {valid_docs}/{len(y_test)}")
    print(f"预测关键词示例: {y_pred[0][:5] if y_pred else 'None'}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading training data...
Loaded 350 training samples | Avg keywords: 12.8

Loading test data...
Loaded 100 test samples | Avg keywords: 14.1

Extracting keywords...
Doc 1 Pred: ['surface']...
Doc 2 Pred: ['surface']...
Doc 3 Pred: ['alloys']...

评估结果:
Precision@5: 17.44%
Recall@5:    1.07%
F1@5:        2.01%
TF-IDF Cosine Similarity: 0.1236

有效标注文档: 100/100
预测关键词示例: ['surface']


In [3]:
import os
import re
import numpy as np
from summa import keywords
from nltk.stem import PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

# ========== 配置参数 ==========
DOMAIN_TERMS = {
    'laser', 'quantum', 'modulus', 'polymer', 'nanoparticle', 'spectroscopy',
    'synthesis', 'alloy', 'composite', 'asphalt', 'cullet', 'stiffness', 
    'bitumen', 'asphaltic', 'glass', 'particle'
}

EXTENDED_STOPWORDS = {
    'however', 'furthermore', 'conclusion', 'experiment', 'methodology',
    'result', 'study', 'research', 'data', 'analysis', 'table', 'figure', 
    'sample', 'test', 'show', 'based', 'using'
}

# ========== 核心函数 ==========
def load_data(data_path):
    texts = []
    keywords_list = []
    stemmer = PorterStemmer()
    
    for filename in sorted(os.listdir(data_path)):
        if not filename.endswith(".txt"):
            continue
            
        text_path = os.path.join(data_path, filename)
        ann_path = os.path.join(data_path, filename.replace(".txt", ".ann"))
        
        try:
            with open(text_path, "r", encoding="utf-8", errors="ignore") as f:
                texts.append(f.read().strip())
        except Exception as e:
            print(f"Error reading {text_path}: {str(e)}")
            continue
        
        doc_keywords = []
        if os.path.exists(ann_path):
            with open(ann_path, "r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    parts = line.strip().split("\t")
                    if len(parts) >= 3:
                        annot_parts = parts[1].split()
                        if len(annot_parts) == 0:
                            continue
                            
                        annot_type = annot_parts[0]
                        if annot_type in ["Term", "Material", "Process", "T"]:
                            keyword = parts[2].lower().strip()
                            if 2 < len(keyword) < 50 and not keyword.isnumeric():
                                doc_keywords.append(keyword)
        
        seen_stems = set()
        unique_kws = []
        for kw in doc_keywords:
            stem = stemmer.stem(kw)
            if stem not in seen_stems:
                seen_stems.add(stem)
                unique_kws.append(kw)
        keywords_list.append(unique_kws)
    
    return texts, keywords_list

def clean_text(text):
    text = re.sub(r'\[\d+(-\d+)?\]', '', text)
    text = re.sub(r'&[a-z]+;', lambda m: {'&alpha;':'alpha','&beta;':'beta'}.get(m.group(), ''), text)
    text = re.sub(r'(\w+)-(\w+)', r'\1_\2', text)
    text = re.sub(r'\|.*?\|', ' ', text)
    text = re.sub(r'\b(\d+)([a-zA-Z]+)\b', r'\1 \2', text)
    text = re.sub(r'[^\w\s_]', '', text)
    return re.sub(r'\s+', ' ', text).lower().strip()

def extract_keywords(text):
    """关键修改点1：强制提取至少25个候选词"""
    try:
        cleaned = clean_text(text)
        if len(cleaned.split()) < 10:
            return []
        
        # 修改点：使用words=30提供足够候选词
        kw = keywords.keywords(
            cleaned,
            words=30,          # 提取30个候选词用于后续过滤
            split=True,
            scores=False,
            language="english"
        )
        
        processed_kws = []
        stemmer = PorterStemmer()
        for k in kw:
            k_clean = k.strip().lower()
            # 放宽过滤条件（关键修改点2）
            if len(k_clean) < 3 or k_clean in EXTENDED_STOPWORDS:
                continue
                
            stemmed = stemmer.stem(k_clean)
            # 优先保留领域术语
            if stemmed in DOMAIN_TERMS or k_clean in DOMAIN_TERMS:
                processed_kws.insert(0, k_clean)
            else:
                processed_kws.append(k_clean)
        
        # 关键修改点3：动态调整最终数量
        seen_stems = set()
        unique_kws = []
        for kw in processed_kws:
            stem = stemmer.stem(kw)
            if stem not in seen_stems:
                seen_stems.add(stem)
                unique_kws.append(kw)
        return unique_kws[:25]  # 确保最终输出25个
        
    except Exception as e:
        print(f"提取错误: {str(e)}")
        return []

def evaluate(y_true, y_pred):
    stemmer = PorterStemmer()
    
    tp = pred = true = 0
    cos_sim_total = 0.0
    
    all_true = [' '.join([stemmer.stem(kw) for kw in doc]) for doc in y_true]
    all_pred = [' '.join([stemmer.stem(kw) for kw in doc]) for doc in y_pred]
    
    vectorizer = TfidfVectorizer()
    try:
        X_true = vectorizer.fit_transform(all_true)
        X_pred = vectorizer.transform(all_pred)
        cos_sims = cosine_similarity(X_true, X_pred).diagonal()
        avg_cos = np.mean(cos_sims)
    except:
        avg_cos = 0.0
    
    # 关键修改点4：严格确保评估前25个
    for true_kws, pred_kws in zip(y_true, y_pred):
        pred_set = set(stemmer.stem(kw) for kw in pred_kws[:25])  # 显式截断
        true_set = set(stemmer.stem(kw) for kw in true_kws)
        
        if not true_set:
            continue
            
        common = true_set & pred_set
        tp += len(common)
        pred += len(pred_set)
        true += len(true_set)
    
    precision = tp / pred if pred > 0 else 0
    recall = tp / true if true > 0 else 0
    f1 = 2*precision*recall/(precision+recall) if (precision+recall) >0 else 0
    
    return precision, recall, f1, avg_cos

# ========== 主程序 ==========
def main():
    train_path = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\scienceie2017_train\train2"
    test_path = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\semeval_articles_test"

    print("Loading training data...")
    X_train, y_train = load_data(train_path)
    print(f"Loaded {len(X_train)} training samples | Avg keywords: {np.mean([len(x) for x in y_train]):.1f}")

    print("\nLoading test data...")
    X_test, y_test = load_data(test_path)
    print(f"Loaded {len(X_test)} test samples | Avg keywords: {np.mean([len(x) for x in y_test]):.1f}")

    print("\nExtracting keywords...")
    y_pred = []
    for i, text in enumerate(X_test):
        kws = extract_keywords(text)
        y_pred.append(kws)
        if i < 3:
            print(f"Doc {i+1} Pred: {kws[:25]}")  # 显示完整25个

    precision, recall, f1, cos_sim = evaluate(y_test, y_pred)

    print("\n评估结果:")
    print(f"Precision@25: {precision:.4f}")
    print(f"Recall@25:    {recall:.4f}")
    print(f"F1@25:        {f1:.4f}")
    print(f"TF-IDF Cosine Similarity: {cos_sim:.4f}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading training data...
Loaded 350 training samples | Avg keywords: 12.8

Loading test data...
Loaded 100 test samples | Avg keywords: 14.1

Extracting keywords...
Doc 1 Pred: ['followed', 'vessel', 'stl surface', 'removing', 'cerebral', 'd model', 'triangulation', 'resin vero following', 'printed', 'd_dsa', 'triangular meshes', 'computational volumetric mesh', 'digital subtraction angiographic', 'prairie mn', 'pro', 'software amira version', 'file', 'ma usa']
Doc 2 Pred: ['surface', 'fig', 'time hydrozincite', 'surfaces investigated dp', 'formation rates', 'main corrosion products', 'absorbance units', 'allows comparisons', 'initial spreading ability', 'nacl_containing droplets', 'diamond polished', 'overall', 'rate', 'compared', 'reduces', 'preformed']
Doc 3 Pred: ['particle', 'alloys', 'corrosion', 'phases', 'relatively', 'thermomechanical', 'resulting', 'applications', 'potentials', 'particularly', 'requires surface', 'preferential cathodic sites', 'damage', 'high', 'favourable me