In [2]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

# Make sure the NLTK resources requested by this notebook are available
# locally; nltk.download reports when a package is already up to date.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

# Dataset locations. The defaults are the original author's local paths; set
# the SCIENCEIE_TRAIN_DIR / SCIENCEIE_TEST_DIR environment variables to run the
# notebook on another machine without editing this cell.
TRAIN_DIR = os.environ.get(
    "SCIENCEIE_TRAIN_DIR",
    r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\scienceie2017_train\train2",
)
TEST_DIR = os.environ.get(
    "SCIENCEIE_TEST_DIR",
    r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\semeval_articles_test",
)

# Extra stop words on top of NLTK's English list. Science terms that might be
# genuine keywords were removed from this set on purpose; only words that are
# clearly not keywords are kept.
SCIENCE_STOPWORDS = {'figure', 'equation', 'table'}

def load_scienceie_data(data_dir):
    """Load every annotated document found directly inside ``data_dir``.

    A document is a ``<name>.txt`` file that has a sibling ``<name>.ann``
    file; ``.txt`` files without an annotation file are skipped.

    Args:
        data_dir: Directory containing the ``.txt`` / ``.ann`` pairs.

    Returns:
        A ``(documents, true_keywords)`` tuple of equal-length lists:
        ``documents[i]`` is the raw text of the i-th file and
        ``true_keywords[i]`` is what ``parse_annotation`` returned for its
        annotation file. Files are visited in sorted file-name order.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a text file cannot be
            opened.
    """
    suffix = ".txt"
    documents = []
    true_keywords = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and anything derived from it) reproducible across runs.
    for txt_file in sorted(os.listdir(data_dir)):
        if not txt_file.endswith(suffix):
            continue

        # Swap only the trailing extension. str.replace would also rewrite
        # any earlier ".txt" occurring inside the file name.
        ann_path = os.path.join(data_dir, txt_file[:-len(suffix)] + ".ann")
        if not os.path.exists(ann_path):
            continue

        with open(os.path.join(data_dir, txt_file), 'r', encoding='utf-8') as f:
            text = f.read()

        keywords = parse_annotation(ann_path)

        documents.append(text)
        true_keywords.append(keywords)

    return documents, true_keywords

def parse_annotation(ann_path):
    """Collect the unique keyword strings from an annotation file.

    Only lines starting with ``T`` are considered. Each such line is split on
    tabs and its third field is taken as the keyword text; lines with fewer
    than three tab-separated fields are ignored.
    (Presumably this is the brat standoff format used by ScienceIE - confirm.)

    Args:
        ann_path: Path to the ``.ann`` file.

    Returns:
        The distinct keywords in order of first appearance. If the file
        cannot be read or decoded, the error is printed and whatever was
        collected up to that point (possibly nothing) is returned.
    """
    keywords = []
    try:
        with open(ann_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.startswith('T'):
                    continue
                parts = line.strip().split('\t')
                if len(parts) >= 3:
                    # Field 0 is the annotation id, field 1 the type/offsets,
                    # field 2 the annotated text itself.
                    keywords.append(parts[2].strip())
    except (OSError, UnicodeError) as e:
        # Best effort: a single unreadable annotation file should not abort
        # loading the whole corpus.
        print(f"Annotation error: {str(e)}")
    # dict.fromkeys de-duplicates while preserving insertion order, unlike
    # list(set(...)) whose order can change between runs.
    return list(dict.fromkeys(keywords))

def science_preprocess(text):
    """Tokenize ``text`` and return the kept tokens joined by single spaces.

    Tokens come from ``word_tokenize`` and are lower-cased. A token is
    dropped when it is an NLTK English stop word or a member of
    ``SCIENCE_STOPWORDS``, when it has two characters or fewer, or when it
    consists solely of ASCII punctuation characters.

    Args:
        text: Raw document text.

    Returns:
        A single string of the surviving lower-cased tokens separated by
        spaces (empty if nothing survives).
    """
    stop_words = set(stopwords.words('english')) | SCIENCE_STOPWORDS
    punctuation = set(string.punctuation)

    kept = []
    for token in word_tokenize(text):
        token = token.lower()
        # Drop very short tokens and stop words.
        if len(token) <= 2 or token in stop_words:
            continue
        # `token in string.punctuation` would be a substring test on a str,
        # so runs such as "..." or "---" slipped through. Check per character
        # so any token made only of punctuation is removed.
        if all(ch in punctuation for ch in token):
            continue
        kept.append(token)
    return ' '.join(kept)

def evaluate_science_v2(true_list, pred_list):
    """Macro-averaged exact-match precision, recall and F1 over documents.

    For each document the gold and predicted keywords are lower-cased and
    turned into sets; a true positive is a string present in both sets.
    Per-document scores are then averaged with equal weight per document.

    Args:
        true_list: One iterable of gold keyword strings per document.
        pred_list: One iterable of predicted keyword strings per document,
            aligned with ``true_list``. Pairs are formed with ``zip``, so
            surplus entries in the longer argument are ignored.

    Returns:
        A dict with keys ``'precision'``, ``'recall'`` and ``'f1'``. A
        document with no predictions scores precision 0, one with no gold
        keywords scores recall 0. When there are no documents at all every
        metric is 0.0.
    """
    precisions = []
    recalls = []
    f1s = []

    for true_kws, pred_kws in zip(true_list, pred_list):
        true_set = {kw.lower() for kw in true_kws}
        pred_set = {kw.lower() for kw in pred_kws}

        tp = len(true_set & pred_set)
        precision = tp / len(pred_set) if pred_set else 0
        recall = tp / len(true_set) if true_set else 0
        denominator = precision + recall
        f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    if not f1s:
        # np.mean of an empty list is nan (with a RuntimeWarning); report
        # zeros so callers can always format and compare the result.
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    return {
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1': np.mean(f1s)
    }

if __name__ == "__main__":
    from sklearn.metrics.pairwise import cosine_similarity

    # Number of keywords predicted per document; also used in the labels.
    TOP_K = 5

    # Load data
    train_docs, train_keywords = load_scienceie_data(TRAIN_DIR)
    test_docs, test_keywords = load_scienceie_data(TEST_DIR)

    # TF-IDF over unigrams and bigrams of the preprocessed text. The token
    # pattern keeps hyphens inside tokens; terms occurring in more than 75%
    # of the training documents are ignored.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_df=0.75,
        min_df=1,
        token_pattern=r'(?u)\b[\w-]+\b',
        stop_words=list(SCIENCE_STOPWORDS))
    preprocessed_train = [science_preprocess(doc) for doc in train_docs]
    vectorizer.fit(preprocessed_train)

    # Preprocess test data
    preprocessed_test = [science_preprocess(doc) for doc in test_docs]
    tfidf_test = vectorizer.transform(preprocessed_test)

    # Extract up to TOP_K keywords per document
    feature_names = vectorizer.get_feature_names_out()
    test_pred_keywords = []
    for i in range(tfidf_test.shape[0]):
        row = tfidf_test[i].toarray().flatten()
        # Indices of the TOP_K highest TF-IDF scores, best first.
        top_indices = row.argsort()[-TOP_K:][::-1]
        # A zero score means the term does not occur in this document, so it
        # must not be reported as a keyword; short documents may therefore
        # yield fewer than TOP_K predictions.
        keywords = [feature_names[idx] for idx in top_indices if row[idx] > 0]
        test_pred_keywords.append(keywords)

    # NOTE(review): gold keywords are only lower-cased before matching, while
    # predictions are 1-2 token n-grams of the stop-word-filtered text, so
    # gold phrases that are longer or contain filtered tokens can never match.
    metrics = evaluate_science_v2(test_keywords, test_pred_keywords)
    print(f"\nResults @{TOP_K}:")
    print(f"Precision@{TOP_K}: {metrics['precision']:.4f}")
    print(f"Recall@{TOP_K}:    {metrics['recall']:.4f}")
    print(f"F1@{TOP_K}:        {metrics['f1']:.4f}")

    # Average cosine similarity between binary vocabulary-indicator vectors
    # of the gold and the predicted keywords.
    cos_sim_total = 0.0
    valid_docs = 0  # counts every document, including zero-similarity ones

    for true_kws, pred_kws in zip(test_keywords, test_pred_keywords):
        # Sets give constant-time membership inside the vocabulary-wide scan.
        true_kws_lower = {kw.lower() for kw in true_kws}
        pred_kws_set = set(pred_kws)
        # Create binary vectors
        true_vec = np.array([1 if word in true_kws_lower else 0 for word in feature_names], dtype=np.float32)
        pred_vec = np.array([1 if word in pred_kws_set else 0 for word in feature_names], dtype=np.float32)

        # Cosine similarity is undefined for a zero vector; score it as 0.
        true_norm = np.linalg.norm(true_vec)
        pred_norm = np.linalg.norm(pred_vec)
        if true_norm == 0 or pred_norm == 0:
            sim = 0.0
        else:
            sim = cosine_similarity([true_vec], [pred_vec])[0][0]
        cos_sim_total += sim
        valid_docs += 1

    average_cosine = cos_sim_total / valid_docs if valid_docs > 0 else 0.0
    print(f"\nAverage Cosine Similarity: {average_cosine:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Results @5:
Precision@5: 0.1440
Recall@5:    0.0433
F1@5:        0.0648

Average Cosine Similarity: 0.1827


In [3]:
if __name__ == "__main__":
    # Number of keywords predicted per document; also used in the labels.
    TOP_K = 25

    # Load data
    train_docs, train_keywords = load_scienceie_data(TRAIN_DIR)
    test_docs, test_keywords = load_scienceie_data(TEST_DIR)

    # TF-IDF over unigrams and bigrams of the preprocessed text. The token
    # pattern keeps hyphens inside tokens; terms occurring in more than 75%
    # of the training documents are ignored.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_df=0.75,
        min_df=1,
        token_pattern=r'(?u)\b[\w-]+\b',
        stop_words=list(SCIENCE_STOPWORDS))
    preprocessed_train = [science_preprocess(doc) for doc in train_docs]
    vectorizer.fit(preprocessed_train)

    # Preprocess test data
    preprocessed_test = [science_preprocess(doc) for doc in test_docs]
    tfidf_test = vectorizer.transform(preprocessed_test)

    # Extract up to TOP_K (25) keywords per document
    feature_names = vectorizer.get_feature_names_out()
    test_pred_keywords = []
    for i in range(tfidf_test.shape[0]):
        row = tfidf_test[i].toarray().flatten()
        # Indices of the TOP_K highest TF-IDF scores, best first.
        top_indices = row.argsort()[-TOP_K:][::-1]
        # A zero score means the term does not occur in this document, so it
        # must not be reported as a keyword; short documents may therefore
        # yield fewer than TOP_K predictions.
        keywords = [feature_names[idx] for idx in top_indices if row[idx] > 0]
        test_pred_keywords.append(keywords)

    # Evaluate precision, recall and F1 at TOP_K
    metrics = evaluate_science_v2(test_keywords, test_pred_keywords)
    print(f"\nResults @{TOP_K}:")
    print(f"Precision@{TOP_K}: {metrics['precision']:.4f}")
    print(f"Recall@{TOP_K}:    {metrics['recall']:.4f}")
    print(f"F1@{TOP_K}:        {metrics['f1']:.4f}")

    # NOTE(review): the cosine-similarity setup below is never used within
    # this cell (no loop follows it). It is kept because the names are
    # notebook globals that a later cell may read - confirm and remove.
    from sklearn.metrics.pairwise import cosine_similarity

    cos_sim_total = 0.0
    valid_docs = 0
    feature_names = vectorizer.get_feature_names_out()



Results @25:
Precision@25: 0.0624
Recall@25:    0.0933
F1@25:        0.0718
