In [5]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')

# ------------------
# 路径配置（根据实际情况修改）
# ------------------
TRAIN_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\scienceie2017_train\train2"
TEST_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\semeval_articles_test"

# ------------------
# 改进版数据加载
# ------------------
def load_scienceie_data(data_dir):
    """只加载有标注文件的样本"""
    documents = []
    true_keywords = []
    
    # 遍历所有txt文件
    txt_files = [f for f in os.listdir(data_dir) if f.endswith(".txt")]
    print(f"发现 {len(txt_files)} 个文本文件")
    
    for txt_file in txt_files:
        ann_file = txt_file.replace('.txt', '.ann')
        ann_path = os.path.join(data_dir, ann_file)
        
        # 跳过无标注文件
        if not os.path.exists(ann_path):
            print(f"跳过无标注文件: {txt_file}")
            continue
            
        # 读取文本
        txt_path = os.path.join(data_dir, txt_file)
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()
            
        # 解析标注
        keywords = parse_annotation(ann_path)
        
        documents.append(text)
        true_keywords.append(keywords)
    
    print(f"成功加载有效样本：{len(documents)} 个")
    return documents, true_keywords

def parse_annotation(ann_path):
    """更健壮的标注解析"""
    keywords = []
    try:
        with open(ann_path, 'r', encoding='utf-8') as f:
            for line in f:
                # 兼容不同标注格式：T1 KeyPhrase 0 5 example 或 T1\tKeyPhrase 0 5\texample
                if line.startswith('T'):
                    parts = line.strip().split('\t')
                    if len(parts) >= 2:
                        # 提取标注内容（如："KeyPhrase 0 5 example"）
                        content = parts[1]
                        keyword = content.split(' ')[-1]  # 获取最后一个元素作为关键词
                        keywords.append(keyword.strip())
    except Exception as e:
        print(f"解析标注文件 {ann_path} 失败：{str(e)}")
    return list(set(keywords))  # 去重

# ------------------
# 其他函数保持不变
# ------------------
# （保持与之前相同的预处理、TF-IDF提取和评估函数）

if __name__ == "__main__":
    # 加载训练集
    print("加载训练集...")
    train_docs, train_keywords = load_scienceie_data(TRAIN_DIR)
    
    # 加载测试集
    print("\n加载测试集...")
    test_docs, test_keywords = load_scienceie_data(TEST_DIR)
    
    # 验证数据一致性
    assert len(test_docs) == len(test_keywords), "测试集文本与标注数量不匹配！"
    
    # 后续处理保持不变...

加载训练集...
发现 350 个文本文件
成功加载有效样本：350 个

加载测试集...
发现 100 个文本文件
成功加载有效样本：100 个


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import os
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# 下载NLTK资源
nltk.download('punkt')
nltk.download('stopwords')

# ------------------
# 自定义路径配置
# ------------------
TRAIN_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\scienceie2017_train\train2"
TEST_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\semeval_articles_test"

# ------------------
# 科学领域专用停用词
# ------------------
SCIENCE_STOPWORDS = set([
    'figure', 'equation', 'table', 'method', 'result', 'study', 
    'data', 'research', 'model', 'analysis', 'experiment'
])

# ------------------
# 数据集加载函数
# ------------------
def load_scienceie_data(data_dir):
    """
    加载ScienceIE数据集
    返回：(documents, true_keywords)
    """
    documents = []
    true_keywords = []
    
    # 遍历所有txt文件
    for filename in os.listdir(data_dir):
        if not filename.endswith(".txt"):
            continue
            
        # 读取文本内容
        txt_path = os.path.join(data_dir, filename)
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()
            
        # 读取对应的标注文件（假设为.ann格式）
        ann_path = os.path.join(data_dir, filename.replace('.txt', '.ann'))
        keywords = parse_annotation(ann_path)
        
        documents.append(text)
        true_keywords.append(keywords)
    
    return documents, true_keywords

def parse_annotation(ann_path):
    """
    解析标注文件获取关键词
    格式示例：T1	KeyPhrase 0 5	example
    """
    keywords = []
    try:
        with open(ann_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('T'):
                    parts = line.split('\t')
                    if len(parts) > 1:
                        keywords.append(parts[1].split(' ')[0])  # 提取第一个标注字段
    except FileNotFoundError:
        pass
    return list(set(keywords))  # 去重

# ------------------
# 文本预处理
# ------------------
def science_preprocess(text):
    """
    针对科学文献的预处理：
    1. 保留大小写（科学术语区分大小写）
    2. 移除通用停用词和科学专用停用词
    3. 处理特殊符号（保留化学式如H2O）
    """
    # 分词
    tokens = word_tokenize(text)
    
    # 过滤停用词
    stop_words = set(stopwords.words('english')) | SCIENCE_STOPWORDS
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # 处理标点（保留包含数字和字母的组合）
    tokens = [
        token for token in tokens 
        if not all(c in string.punctuation for c in token)
    ]
    
    return ' '.join(tokens)

# ------------------
# TF-IDF关键词提取
# ------------------
def extract_science_keywords(docs, top_n=10):
    """
    针对科学文献优化的TF-IDF
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),  # 支持长术语（如"machine learning"）
        max_df=0.85,         # 过滤高频词
        min_df=2,            # 过滤低频词
        stop_words='english',
        token_pattern=r'(?u)\b[A-Za-z0-9_][A-Za-z0-9_]+\b'  # 保留带数字的词
    )
    
    tfidf_matrix = vectorizer.fit_transform(docs)
    feature_names = vectorizer.get_feature_names_out()
    
    keywords_list = []
    for i in range(tfidf_matrix.shape[0]):
        row = tfidf_matrix[i].toarray().flatten()
        top_indices = row.argsort()[-top_n:][::-1]
        keywords = [feature_names[idx] for idx in top_indices]
        keywords_list.append(keywords)
    
    return keywords_list

# ------------------
# 评估指标计算
# ------------------
def evaluate_science(true_list, pred_list):
    """
    改进的评估方法：
    1. 考虑部分匹配（如预测"neural network" vs 真实"neural networks"）
    2. 加权F1-score
    """
    y_true = []
    y_pred = []
    
    for true_kws, pred_kws in zip(true_list, pred_list):
        # 创建匹配集合
        matched = set()
        for pred in pred_kws:
            for true in true_kws:
                if pred.lower() in true.lower() or true.lower() in pred.lower():
                    matched.add(pred)
                    break
        
        # 生成二进制向量
        true_vec = [1] * len(true_kws)
        pred_vec = [1 if kw in matched else 0 for kw in pred_kws]
        
        y_true.extend(true_vec)
        y_pred.extend(pred_vec)
    
    return {
        'precision': precision_score(y_true, y_pred, average='micro'),
        'recall': recall_score(y_true, y_pred, average='micro'),
        'f1': f1_score(y_true, y_pred, average='micro')
    }

# ------------------
# 主流程
# ------------------
if __name__ == "__main__":
    # 加载数据
    train_docs, train_keywords = load_scienceie_data(TRAIN_DIR)
    test_docs, test_keywords = load_scienceie_data(TEST_DIR)
    
    # 预处理
    preprocessed_train = [science_preprocess(doc) for doc in train_docs]
    preprocessed_test = [science_preprocess(doc) for doc in test_docs]
    
    # 训练TF-IDF模型（在训练集上）
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        max_df=0.85,
        min_df=2,
        token_pattern=r'(?u)\b[A-Za-z0-9_][A-Za-z0-9_]+\b'
    ).fit(preprocessed_train)
    
    # 在测试集上提取关键词
    tfidf_test = vectorizer.transform(preprocessed_test)
    feature_names = vectorizer.get_feature_names_out()
    
    test_pred_keywords = []
    for i in range(tfidf_test.shape[0]):
        row = tfidf_test[i].toarray().flatten()
        top_indices = row.argsort()[-10:][::-1]
        test_pred_keywords.append([feature_names[idx] for idx in top_indices])
    
    # 评估
    metrics = evaluate_science(test_keywords, test_pred_keywords)
    print(f"测试集表现：")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1-score:  {metrics['f1']:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: Found input variables with inconsistent numbers of samples: [274, 1000]

In [12]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np

# ------------------
# 改进版评估函数
# ------------------
def evaluate_science_v2(true_list, pred_list):
    """
    文档级别的评估（每个文档独立计算指标）
    """
    precisions = []
    recalls = []
    f1s = []
    
    for true_kws, pred_kws in zip(true_list, pred_list):
        # 处理空预测情况
        if len(pred_kws) == 0:
            prec = 0.0 if len(true_kws) > 0 else 1.0
            rec = 0.0
            precisions.append(prec)
            recalls.append(rec)
            continue
            
        # 计算匹配数
        matched = set()
        for pred in pred_kws:
            for true in true_kws:
                if pred.lower() == true.lower():
                    matched.add(pred)
        
        tp = len(matched)
        precision = tp / len(pred_kws)
        recall = tp / len(true_kws) if len(true_kws) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    
    return {
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1': np.mean(f1s)
    }

# ------------------
# 关键代码调整点（主流程）
# ------------------
if __name__ == "__main__":
    # [保持数据加载部分不变]
    
    # 预处理时增加空文档过滤
    def safe_preprocess(doc):
        processed = science_preprocess(doc)
        return processed if len(processed) > 10 else "[EMPTY]"  # 过滤长度<10的无效文档
    
    # 预处理训练集
    preprocessed_train = [safe_preprocess(doc) for doc in train_docs]
    
    # 移除训练空文档
    valid_train_idx = [i for i, txt in enumerate(preprocessed_train) if txt != "[EMPTY]"]
    preprocessed_train = [preprocessed_train[i] for i in valid_train_idx]
    train_keywords = [train_keywords[i] for i in valid_train_idx]
    
    # 训练TF-IDF
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        max_df=0.85,
        min_df=2,
        token_pattern=r'(?u)\b[A-Za-z0-9_][A-Za-z0-9_]+\b'
    ).fit(preprocessed_train)
    
    # 预处理测试集
    preprocessed_test = [safe_preprocess(doc) for doc in test_docs]
    
    # 生成预测关键词
    tfidf_test = vectorizer.transform(preprocessed_test)
    feature_names = vectorizer.get_feature_names_out()
    
    test_pred_keywords = []
    for i in range(tfidf_test.shape[0]):
        row = tfidf_test[i].toarray().flatten()
        top_indices = row.argsort()[-10:][::-1]  # 取top10
        keywords = [feature_names[idx] for idx in top_indices if row[idx] > 0]  # 过滤零权重词
        test_pred_keywords.append(keywords)
    
    # 验证维度一致性
    assert len(test_keywords) == len(test_pred_keywords), \
        f"维度不匹配：真实标签{len(test_keywords)} vs 预测{len(test_pred_keywords)}"
    
    # 使用改进评估函数
    metrics = evaluate_science_v2(test_keywords, test_pred_keywords)
    print("\n评估结果（文档级平均）：")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1-score:  {metrics['f1']:.4f}")


评估结果（文档级平均）：
Precision: 0.0070
Recall:    0.0267
F1-score:  0.0110


In [16]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Path configurations (modify according to your directory structure)
TRAIN_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\scienceie2017_train\train2"
TEST_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\semeval_articles_test"

# Science-specific stopwords
SCIENCE_STOPWORDS = set([
    'figure', 'equation', 'table', 'method', 'result', 'study',
    'data', 'research', 'model', 'analysis', 'experiment'
])

def load_scienceie_data(data_dir):
    documents = []
    true_keywords = []
    
    txt_files = [f for f in os.listdir(data_dir) if f.endswith(".txt")]
    print(f"Found {len(txt_files)} text files")
    
    for txt_file in txt_files:
        ann_file = txt_file.replace('.txt', '.ann')
        ann_path = os.path.join(data_dir, ann_file)
        
        if not os.path.exists(ann_path):
            print(f"Skipping unannotated file: {txt_file}")
            continue
        
        txt_path = os.path.join(data_dir, txt_file)
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        keywords = parse_annotation(ann_path)
        
        documents.append(text)
        true_keywords.append(keywords)
    
    print(f"Successfully loaded {len(documents)} valid samples")
    return documents, true_keywords

def parse_annotation(ann_path):
    keywords = []
    try:
        with open(ann_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('T'):
                    parts = line.strip().split('\t')
                    if len(parts) >= 2:
                        content = parts[1]
                        keyword = content.split(' ')[-1]
                        keywords.append(keyword.strip())
    except Exception as e:
        print(f"Failed to parse annotation file {ann_path}: {str(e)}")
    return list(set(keywords))

def science_preprocess(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english')) | SCIENCE_STOPWORDS
    tokens = [token for token in tokens if token.lower() not in stop_words]
    tokens = [
        token for token in tokens 
        if not all(c in string.punctuation for c in token)
    ]
    return ' '.join(tokens)

def evaluate_science_v2(true_list, pred_list):
    precisions = []
    recalls = []
    f1s = []
    
    for true_kws, pred_kws in zip(true_list, pred_list):
        if len(pred_kws) == 0:
            prec = 0.0 if len(true_kws) > 0 else 1.0
            rec = 0.0
            precisions.append(prec)
            recalls.append(rec)
            continue
        
        matched = set()
        for pred in pred_kws:
            for true in true_kws:
                if pred.lower() == true.lower():
                    matched.add(pred)
        
        tp = len(matched)
        precision = tp / len(pred_kws)
        recall = tp / len(true_kws) if len(true_kws) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    
    return {
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1': np.mean(f1s)
    }

if __name__ == "__main__":
    print("Loading training set...")
    train_docs, train_keywords = load_scienceie_data(TRAIN_DIR)
    
    print("\nLoading test set...")
    test_docs, test_keywords = load_scienceie_data(TEST_DIR)
    
    def safe_preprocess(doc):
        processed = science_preprocess(doc)
        return processed if len(processed) > 10 else "[EMPTY]"
    
    preprocessed_train = [safe_preprocess(doc) for doc in train_docs]
        # Remove empty documents from the training set
    valid_train_idx = [i for i, txt in enumerate(preprocessed_train) if txt != "[EMPTY]"]
    preprocessed_train = [preprocessed_train[i] for i in valid_train_idx]
    train_keywords = [train_keywords[i] for i in valid_train_idx]
    
    # Train TF-IDF on preprocessed training data
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 3),
        max_df=0.85,
        min_df=2,
        token_pattern=r'(?u)\b[A-Za-z0-9_][A-Za-z0-9_]+\b'
    ).fit(preprocessed_train)
    
    # Preprocess test set
    preprocessed_test = [safe_preprocess(doc) for doc in test_docs]
    
    # Generate predicted keywords for the test set
    tfidf_test = vectorizer.transform(preprocessed_test)
    feature_names = vectorizer.get_feature_names_out()
    
    test_pred_keywords = []
    for i in range(tfidf_test.shape[0]):
        row = tfidf_test[i].toarray().flatten()
        top_indices = row.argsort()[-10:][::-1]  # Keep top 10
        keywords = [feature_names[idx] for idx in top_indices if row[idx] > 0]  # Filter zero weight words
        test_pred_keywords.append(keywords)
    
    # Validate dimensional consistency
    assert len(test_keywords) == len(test_pred_keywords), \
        f"Dimensional mismatch: true labels {len(test_keywords)} vs predictions {len(test_pred_keywords)}"
    
    # Use the improved evaluation function
    metrics = evaluate_science_v2(test_keywords, test_pred_keywords)
    print("\nEvaluation results (document-level average):")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1-score:  {metrics['f1']:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading training set...
Found 350 text files
Successfully loaded 350 valid samples

Loading test set...
Found 100 text files
Successfully loaded 100 valid samples

Evaluation results (document-level average):
Precision: 0.0010
Recall:    0.0008
F1-score:  0.0009


In [20]:
import os
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

# 修改为正确的路径
TRAIN_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\scienceie2017_train\train2"
TEST_DIR = r"E:\HKULearning\2025 spring\STAT8021\group work\scienceie.github.io-master\resources\semeval_articles_test"

# 移除可能误删关键信息的科学停用词
SCIENCE_STOPWORDS = set([
    'figure', 'equation', 'table'  # 仅保留明显非关键词的停用词
])

def load_scienceie_data(data_dir):
    documents = []
    true_keywords = []
    
    txt_files = [f for f in os.listdir(data_dir) if f.endswith(".txt")]
    
    for txt_file in txt_files:
        ann_file = txt_file.replace('.txt', '.ann')
        ann_path = os.path.join(data_dir, ann_file)
        
        if not os.path.exists(ann_path):
            continue
        
        txt_path = os.path.join(data_dir, txt_file)
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        keywords = parse_annotation(ann_path)
        
        documents.append(text)
        true_keywords.append(keywords)
    
    return documents, true_keywords

def parse_annotation(ann_path):
    keywords = []
    try:
        with open(ann_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('T'):
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        # 修正关键bug：正确提取关键词位置
                        keyword = parts[2].strip()
                        keywords.append(keyword)
    except Exception as e:
        print(f"Annotation error: {str(e)}")
    return list(set(keywords))

def science_preprocess(text):
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english')) | SCIENCE_STOPWORDS
    tokens = [token.lower() for token in tokens]  # 统一小写处理
    
    # 更宽松的过滤策略
    tokens = [
        token for token in tokens 
        if token not in stop_words and
        token not in string.punctuation and
        len(token) > 2  # 过滤过短词
    ]
    return ' '.join(tokens)

def evaluate_science_v2(true_list, pred_list):
    precisions = []
    recalls = []
    f1s = []
    
    for true_kws, pred_kws in zip(true_list, pred_list):
        true_set = set(kw.lower() for kw in true_kws)
        pred_set = set(kw.lower() for kw in pred_kws)
        
        tp = len(true_set & pred_set)
        precision = tp / len(pred_set) if len(pred_set) > 0 else 0
        recall = tp / len(true_set) if len(true_set) > 0 else 0
        f1 = 2*(precision*recall)/(precision+recall) if (precision+recall)>0 else 0
        
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    
    return {
        'precision': np.mean(precisions),
        'recall': np.mean(recalls),
        'f1': np.mean(f1s)
    }

if __name__ == "__main__":
    # 加载数据
    train_docs, train_keywords = load_scienceie_data(TRAIN_DIR)
    test_docs, test_keywords = load_scienceie_data(TEST_DIR)

    # 计算平均关键词数量
    avg_keywords = int(round(np.mean([len(kws) for kws in train_keywords])))
    print(f"Average keywords per document: {avg_keywords}")

    # 优化后的TF-IDF参数
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        max_df=0.75,
        min_df=1,
        token_pattern=r'(?u)\b[\w-]+\b',  # 支持连字符
        stop_words=list(SCIENCE_STOPWORDS)  # 仅使用科学停用词
    )
    # 预处理并训练
    preprocessed_train = [science_preprocess(doc) for doc in train_docs]
    vectorizer.fit(preprocessed_train)

    # 预处理测试集
    preprocessed_test = [science_preprocess(doc) for doc in test_docs]
    tfidf_test = vectorizer.transform(preprocessed_test)
    
    # 动态提取关键词数量
    feature_names = vectorizer.get_feature_names_out()
    test_pred_keywords = []
    for i in range(tfidf_test.shape[0]):
        row = tfidf_test[i].toarray().flatten()
        top_indices = row.argsort()[-avg_keywords:][::-1]
        keywords = [feature_names[idx] for idx in top_indices if row[idx] > 0]
        test_pred_keywords.append(keywords[:avg_keywords])  # 限制数量

    # 评估
    metrics = evaluate_science_v2(test_keywords, test_pred_keywords)
    print("\nOptimized Results:")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1-score:  {metrics['f1']:.4f}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sphy9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Average keywords per document: 17

Optimized Results:
Precision: 0.0759
Recall:    0.0779
F1-score:  0.0736
