In [None]:
import pandas as pd
import numpy as np
import os
import re
import logging
import torch
from transformers import BertTokenizer, BertModel
import jieba 
from collections import Counter
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Configure the root logger so that INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the classes and the script entry point in this file.
logger = logging.getLogger(__name__)


# 论文评估
class PaperEvaluator:
    """Heuristic scorer for the substantive content of a Markdown paper.

    The evaluator combines several text statistics (length, lexical
    diversity, term density, citation density, section structure,
    abstract/body consistency and novelty wording) into one weighted score.
    Word segmentation is done with ``jieba``.
    """

    def __init__(self):
        # Stop words that are ignored when counting or comparing words.
        self.stopwords = {
            '的', '了', '和', '是', '在', '我', '有', '这', '他', '它', '们', '与',
            '以', '为', '上', '下', '从', '但', '所', '如', '对', '之', '也', '而',
            '或', '自', '其', '那', '并', '等', '被', '一', '二', '三', '不', '能',
            '会', '就', '没', '到', '很', '可', '个', '又', '因', '此', '只', '每',
            '于', '你', '我们', '你们', '他们', '她们', '这些', '那些', '什么',
            '怎么', '这样', '那样', '这个', '那个', '如何', '为什么', '可以',
            '不能', '应该', '如此', '一些',
        }

        # Wording that signals a novelty claim.
        self.novelty_terms = [
            "新颖", "新的", "创新", "首次", "开创性", "原创的", "进步", "贡献",
            "提出", "突破", "前所未有的", "独特的", "革命性的", "改进", "增强的",
            "优于", "优越的", "超过", "提升",
        ]

        # A token must be strictly longer than this to count as a potential term.
        self.term_length_threshold = 2

    def _extract_title(self, text):
        """Return a best-effort title for ``text`` (``""`` if none is found).

        The first of the leading seven lines whose stripped length lies
        strictly between 10 and 200 characters wins; otherwise the first
        sentence of the first paragraph (or of the first 500 characters) is
        used when it is shorter than 200 characters.
        """
        for candidate in text.strip().split('\n')[:7]:
            candidate = candidate.strip()
            # Titles are usually neither very short nor very long.
            if 10 < len(candidate) < 200:
                return candidate

        # Fall back to the first sentence.
        first_part = text.split('\n\n')[0] if '\n\n' in text else text[:500]
        sentences = re.split(r'[。！？!?]+', first_part)
        if sentences and len(sentences[0]) < 200:
            return sentences[0]

        return ""

    def _extract_abstract(self, text):
        """Return the body of the section whose heading contains '摘要'.

        Collection starts after a ``#`` heading line containing '摘要' and
        stops at the next heading that does not contain it. The collected
        lines are stripped and concatenated without a separator.
        """
        collected = []
        inside = False

        for raw_line in text.strip().split('\n'):
            line = raw_line.strip()
            if line.startswith('#'):
                if '摘要' in line:
                    inside = True
                elif inside:
                    break
            elif inside:
                collected.append(line)

        return ''.join(collected).strip()

    def _extract_sections_title(self, text):
        """Return the text of every Markdown heading in ``text``, in order.

        A heading is a line that (after stripping) starts with one or more
        ``#`` characters followed by non-empty text. The whitespace between
        the ``#`` run and the text is optional, so ``#摘要`` is recognised
        just like ``# 摘要``; this keeps the method consistent with
        ``_extract_abstract``, which accepts any line starting with ``#``.
        """
        # The title must start with a character that is neither '#' nor
        # whitespace, so a bare '####' line is not reported as a heading.
        heading = re.compile(r'#+\s*([^#\s].*)')

        titles = []
        for line in text.strip().split('\n'):
            match = heading.match(line.strip())
            if match:
                titles.append(match.group(1).strip())
        return titles

    def _meaningful_words(self, text):
        """Segment ``text`` with jieba and drop stop words and blank tokens."""
        return [
            word for word in jieba.cut(text)
            if word not in self.stopwords and word.strip()
        ]

    def _count_words(self, text):
        """Return the number of non-stop-word, non-blank tokens in ``text``."""
        return len(self._meaningful_words(text))

    def _count_sentences(self, text):
        """Return the number of non-blank sentences, split on 。！？!? runs."""
        return sum(1 for part in re.split(r'[。！？!?]+', text) if part.strip())

    def _calculate_lexical_diversity(self, text):
        """Return a type/token diversity measure for ``text``.

        Below 100 meaningful tokens the plain type-token ratio is used; from
        100 tokens on the root TTR (types / sqrt(tokens)) is used instead.
        Returns 0 when there are no meaningful tokens.
        """
        words = self._meaningful_words(text)
        if not words:
            return 0

        word_types = len(set(words))
        word_tokens = len(words)

        if word_tokens < 100:
            return word_types / max(1, word_tokens)
        return word_types / math.sqrt(word_tokens)

    def _technical_term_density(self, text):
        """Estimate how densely ``text`` uses technical terms.

        A potential term is a token longer than ``term_length_threshold``
        that is not a stop word and contains no digit. The result is
        ``0.7 * (terms / tokens) + 0.3 * (repeated terms / terms)``, where a
        term is "repeated" when its token occurs more than three times.
        Returns 0 for text without any non-blank token.
        """
        all_words = [w for w in jieba.cut(text) if w.strip()]
        if not all_words:
            return 0

        potential_terms = [
            w for w in all_words
            if len(w) > self.term_length_threshold
            and w not in self.stopwords
            and not any(ch.isdigit() for ch in w)
        ]

        word_freq = Counter(all_words)
        repeated_terms = [w for w in potential_terms if word_freq[w] > 3]

        term_density = len(potential_terms) / len(all_words)
        term_repetition = len(repeated_terms) / max(1, len(potential_terms))

        return 0.7 * term_density + 0.3 * term_repetition

    def _citation_density(self, text):
        """Return the number of citation markers per 1000 counted words.

        Recognised markers are ``[1]`` / ``[1, 2, 3]`` and ``【1】``.
        Returns 0 when the text contains no counted words.
        """
        patterns = [
            r'\[\d+(,\s*\d+)*\]',  # [1] or [1,2,3]
            r'【\d+】',  # full-width bracket form 【1】
        ]

        # findall yields one entry per match, so the list length is the
        # match count even though the first pattern has a capturing group.
        citation_count = sum(len(re.findall(p, text)) for p in patterns)

        word_count = self._count_words(text)
        if word_count == 0:
            return 0
        return (citation_count / word_count) * 1000

    def _check_structure(self, sections):
        """Return the summed weight of the core sections found in ``sections``.

        ``sections`` is an iterable of heading strings; a core section counts
        as present when its name is a substring of any heading.
        """
        core_sections = {
            '摘要': 0.15,
            '目录': 0.2,
            '问题背景': 0.25,
            '分析': 0.2,
            '数据处理': 0.1,
            '模型': 0.05,
            '参考文献': 0.05
        }

        return sum(
            weight
            for name, weight in core_sections.items()
            if any(name in heading for heading in sections)
        )

    def _evaluate_argument_consistency(self, text, abstract):
        """Return the share of top abstract keywords that occur in ``text``.

        Keywords are the most frequent abstract tokens (non-stop-word, longer
        than one character); at most 10 and at most half of the token count
        are used. Returns the neutral value 0.5 when the abstract is missing,
        shorter than 20 characters, or yields no keywords.
        """
        if not abstract or not text or len(abstract) < 20:
            return 0.5

        abstract_words = [
            w for w in jieba.cut(abstract)
            if w not in self.stopwords and len(w) > 1
        ]

        # An empty word list gives top_n == 0, so one guard covers both cases.
        top_n = min(10, len(abstract_words) // 2)
        if top_n == 0:
            return 0.5

        top_keywords = [w for w, _ in Counter(abstract_words).most_common(top_n)]
        matches = sum(1 for keyword in top_keywords if keyword in text)
        return matches / max(1, len(top_keywords))

    def _assess_novelty(self, text, title):
        """Score novelty wording in ``title`` and ``text``, capped at 1.0.

        Each novelty term that occurs adds 0.3 when it appears in the title
        and 0.1 when it only appears in the body.
        """
        combined_text = title + " " + text

        novelty_score = 0
        for term in self.novelty_terms:
            if term in combined_text:
                # Terms used in the title carry more weight.
                novelty_score += 0.3 if term in title else 0.1

        return min(novelty_score, 1.0)

    def evaluate_paper(self, text):
        """Return the weighted substance score of ``text``.

        Texts shorter than 2000 characters are not scored and yield the
        literal value 1. Otherwise the weighted sum of all metrics is
        returned, scaled down when fewer than 2500 words are counted and when
        the structure score is below 0.5.
        """
        # Too short to contain substantive content.
        if len(text) < 2000:
            return 1

        paper_sections_title = self._extract_sections_title(text)
        abstract = self._extract_abstract(text)
        title = self._extract_title(text)

        metrics = {
            'word_count': self._count_words(text),
            'lexical_diversity': self._calculate_lexical_diversity(text),
            'technical_term_density': self._technical_term_density(text),
            'citation_density': self._citation_density(text),
            'structure_completeness': self._check_structure(paper_sections_title),
            'argument_consistency': self._evaluate_argument_consistency(text, abstract),
            'novelty_score': self._assess_novelty(text, title),
        }

        weights = {
            'word_count': 0.1,                 # paper length
            'lexical_diversity': 0.1,          # lexical diversity
            'technical_term_density': 0.15,    # term usage
            'citation_density': 0.1,           # citations
            'structure_completeness': 0.2,     # structural completeness
            'argument_consistency': 0.26,      # abstract/body consistency
            'novelty_score': 0.08,             # novelty
        }

        # NOTE(review): word_count enters the sum as a raw token count while
        # most other metrics are ratios, so paper length dominates the score.
        # The weights also sum to 0.99. Confirm whether word_count was meant
        # to be normalised before weighting.
        weighted_score = 0
        for key in weights:
            weighted_score += weights[key] * metrics[key]

        # Length penalty: scale the score down for short papers.
        if metrics['word_count'] < 2500:
            weighted_score = weighted_score * (metrics['word_count'] / 2500)

        # Structure penalty: scale the score down when core sections are missing.
        if metrics['structure_completeness'] < 0.5:
            structure_factor = 0.6 + 0.4 * metrics['structure_completeness']
            weighted_score = weighted_score * structure_factor
        return weighted_score

# 论文处理
class PaperProcessor:
    """Screen contest papers for team information, relevance and substance.

    For every team listed in the spreadsheet the processor reads the paper's
    Markdown file, checks whether team information appears in it, compares it
    with the contest topic (TF-IDF and BERT cosine similarity) and scores its
    substantive content with ``PaperEvaluator``.
    """

    # A paper counts as related to the topic when a similarity exceeds this.
    RELEVANCE_THRESHOLD = 0.35
    # Papers whose min-max-normalised substance score is below this are flagged.
    SUBSTANCE_THRESHOLD = 0.08

    def __init__(self, data_dir='./data'):
        """Load the BERT model from ``./model/bert-base-chinese`` and set up the evaluator.

        Args:
            data_dir: Directory holding the input data and receiving the result file.
        """
        self.data_dir = data_dir
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info("加载BERT模型...")
        model_path = './model/bert-base-chinese'
        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertModel.from_pretrained(model_path)
        self.model.to(self.device)
        self.model.eval()

        self.evaluator = PaperEvaluator()

    def load_data(self):
        """Load team info, the paper directory listing and the topic text.

        Returns:
            True on success; False (after logging the error) on any failure.
        """
        try:
            # Team information spreadsheet.
            self.team_info = pd.read_excel(os.path.join(self.data_dir, '附件1.xlsx'))

            # Names of the entries under transform/.
            self.pdf_files = os.listdir(os.path.join(self.data_dir, 'transform/'))

            # Contest topic text.
            topic_path = os.path.join(self.data_dir, '附件2/topic.txt')
            with open(topic_path, 'r', encoding='utf-8') as f:
                self.topic_text = f.read()

            # Embed the topic once; it is reused for every paper.
            self.topic_embedding = self.get_bert_embedding(self.topic_text)

            logger.info("数据加载完成")
            return True
        except Exception as e:
            logger.error(f"数据加载失败: {str(e)}")
            return False

    def get_bert_embedding(self, text):
        """Return the mean-pooled last hidden state of ``text`` as a NumPy array.

        The input is truncated to 512 tokens. Returns None (after logging)
        when encoding or inference fails.
        """
        try:
            encoded = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
            for key in encoded:
                encoded[key] = encoded[key].to(self.device)
            with torch.no_grad():
                outputs = self.model(**encoded)

            # Average over the token dimension.
            embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            return embedding
        except Exception as e:
            logger.error(f"获取BERT嵌入失败: {str(e)}")
            return None

    def calculate_similarity(self, text):
        """Return TF-IDF and BERT cosine similarity between ``text`` and the topic.

        Returns:
            ``{'TF-IDF': float, 'BERT': float}``; both are 0 when the
            computation fails, and 'BERT' is 0 when an embedding is missing.
        """
        try:
            # NOTE(review): TfidfVectorizer is used with its default
            # tokenisation, which presumably does not segment unspaced
            # Chinese text into words. Consider passing a jieba-based
            # tokenizer and re-tuning the relevance threshold.
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform([text, self.topic_text])
            tfidf_similarity = cosine_similarity(tfidf_matrix)[0][1]

            text_embedding = self.get_bert_embedding(text)
            if text_embedding is not None and self.topic_embedding is not None:
                bert_similarity = cosine_similarity(text_embedding, self.topic_embedding)[0][0]
            else:
                bert_similarity = 0

            return {
                'TF-IDF': tfidf_similarity,
                'BERT': bert_similarity
            }
        except Exception as e:
            logger.error(f"计算相似度失败: {str(e)}")
            return {'TF-IDF': 0, 'BERT': 0}

    @staticmethod
    def _contains_team_info(values, text):
        """Return True when any non-empty value occurs in ``text`` as a substring.

        Missing cells (NaN/None) and blank strings are skipped: ``str(nan)``
        is ``'nan'``, which matches ordinary words, and the empty string is a
        substring of every text.
        """
        for value in values:
            if pd.isna(value):
                continue
            needle = str(value).strip()
            if needle and needle in text:
                return True
        return False

    @staticmethod
    def _normalize_substance_scores(raw_scores):
        """Min-max normalise the scored entries of ``raw_scores``.

        ``None`` marks a paper without a score and is passed through
        unchanged, so it neither takes part in nor is confused with the
        normalised values. With at most one paper, or no scored paper, the
        input is returned as-is. When all scored papers share one value they
        all normalise to 0.0.
        """
        scored = [s for s in raw_scores if s is not None]
        if len(raw_scores) <= 1 or not scored:
            return list(raw_scores)

        low = min(scored)
        span = max(scored) - low
        return [
            None if s is None else ((s - low) / span if span else 0.0)
            for s in raw_scores
        ]

    @classmethod
    def _substance_flags(cls, normalized_scores):
        """Map normalised scores to flags: 0 = has substance, 1 = none.

        Unscored papers (``None``) and papers below ``SUBSTANCE_THRESHOLD``
        are flagged 1.
        """
        return [
            0 if score is not None and score >= cls.SUBSTANCE_THRESHOLD else 1
            for score in normalized_scores
        ]

    def _screen_paper(self, row):
        """Screen one team's paper.

        Returns:
            ``(paper_result, raw_score)`` where ``paper_result`` is
            ``[team id, contains team info, unrelated to topic, raw score]``
            and ``raw_score`` is None when the paper is missing or the
            evaluator returned its "too short" value 1.
        """
        paper_result = [row['参赛队号'], 0, 1, 1]

        # The spreadsheet cell may be numeric while directory names are
        # strings, so compare on the string form (the path uses it as well).
        code = str(row['加密号'])
        if code not in self.pdf_files:
            return paper_result, None

        paper_path = os.path.join(self.data_dir, f"transform/{code}/md/{code}.md")
        if not os.path.exists(paper_path):
            logger.warning(f"论文文件不存在: {paper_path}")
            return paper_result, None

        with open(paper_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # 1. Team information leaked into the paper? The last column is excluded.
        if self._contains_team_info(row.iloc[:-1], text):
            paper_result[1] = 1

        # 2. Related to the topic? Testing the average as well would be
        # redundant: it can only exceed the threshold if one of the two does.
        similarity = self.calculate_similarity(text)
        if (similarity['TF-IDF'] > self.RELEVANCE_THRESHOLD or
                similarity['BERT'] > self.RELEVANCE_THRESHOLD):
            paper_result[2] = 0

        # 3. Raw substance score; the evaluator returns 1 for too-short texts.
        raw_score = self.evaluator.evaluate_paper(text)
        paper_result[3] = raw_score
        return paper_result, (None if raw_score == 1 else raw_score)

    def process_papers(self):
        """Screen all papers, write ``result2.xlsx`` and return the flagged top 5.

        Returns:
            False when the input data cannot be loaded, None when saving the
            result fails, otherwise a DataFrame with the first five flagged
            papers ordered by team id.
        """
        if not self.load_data():
            return False

        raw_scores = []  # raw substance score per paper, None when unscored
        paper_info = []  # preliminary result row per paper

        for _, row in self.team_info.iterrows():
            team_id = row['参赛队号']
            try:
                paper_result, raw_score = self._screen_paper(row)
            except Exception as e:
                logger.error(f"处理论文出错 (参赛队号: {team_id}): {str(e)}")
                # A failed paper is unscored, so it cannot skew the
                # normalisation of the other papers.
                paper_result, raw_score = [team_id, -1, -1, 0], None
            else:
                logger.info(f"处理完成: 参赛队号 {team_id}, 初步结果: {paper_result}")
            paper_info.append(paper_result)
            raw_scores.append(raw_score)

        try:
            normalized_scores = self._normalize_substance_scores(raw_scores)
        except Exception as e:
            logger.error(f"归一化处理失败: {str(e)}")
            normalized_scores = raw_scores

        print("归一化后的实质性内容得分:", normalized_scores)

        # Decide on the "unscored" marker, not on the value 1: the best paper
        # normalises to exactly 1.0 and must not be mistaken for unscored.
        substance_flags = self._substance_flags(normalized_scores)
        final_results = [
            [paper[0], paper[1], paper[2], flag]
            for paper, flag in zip(paper_info, substance_flags)
        ]

        result_df = pd.DataFrame(final_results, columns=[
            '参赛队号', '是否包含参赛队信息', '是否与赛题无关', '是否无实质内容'
        ])
        try:
            result_df.to_excel(os.path.join(self.data_dir, 'result2.xlsx'), sheet_name="Sheet1", index=False)
            logger.info("结果已保存")

            # Keep only papers flagged by at least one check.
            filtered = result_df[
                (result_df['是否包含参赛队信息'] == 1) |
                (result_df['是否与赛题无关'] == 1) |
                (result_df['是否无实质内容'] == 1)
            ]
            filtered = filtered.sort_values('参赛队号')
            top5 = filtered.head(5)
            return top5

        except Exception as e:
            logger.error(f"保存结果失败: {str(e)}")
            return None

if __name__ == "__main__":
    # Script entry point: screen every paper and show the first flagged ones.
    try:
        processor = PaperProcessor()
        top5_results = processor.process_papers()
        # process_papers returns False when loading fails and None when
        # saving fails; only a DataFrame means the run succeeded.
        if isinstance(top5_results, pd.DataFrame):
            logger.info("处理完成，以下是符合筛选条件的前5篇论文:")
            print(top5_results)
        else:
            logger.error("处理失败")
    except Exception as e:
        logger.error(f"主程序执行失败: {str(e)}")

INFO:__main__:加载BERT模型...
INFO:__main__:数据加载完成
INFO:__main__:处理完成: 参赛队号 202190000001, 初步结果: [202190000001, 0, 0, 1235.423165002405]
INFO:__main__:处理完成: 参赛队号 202190000007, 初步结果: [202190000007, 0, 0, 682.6336316566425]
INFO:__main__:处理完成: 参赛队号 202190000008, 初步结果: [202190000008, 0, 0, 1312.2167087201044]
INFO:__main__:处理完成: 参赛队号 202190000025, 初步结果: [202190000025, 0, 0, 1576.11830510994]
INFO:__main__:处理完成: 参赛队号 202190000026, 初步结果: [202190000026, 0, 0, 2430.3462656627053]
INFO:__main__:处理完成: 参赛队号 202190000037, 初步结果: [202190000037, 0, 0, 809.5950397184866]
INFO:__main__:处理完成: 参赛队号 202190000045, 初步结果: [202190000045, 1, 0, 603.3870976162247]
INFO:__main__:处理完成: 参赛队号 202190000046, 初步结果: [202190000046, 0, 0, 663.394928047697]
INFO:__main__:处理完成: 参赛队号 202190000047, 初步结果: [202190000047, 0, 0, 59.8211792089651]
INFO:__main__:处理完成: 参赛队号 202190000048, 初步结果: [202190000048, 0, 0, 2219.7261937469625]
INFO:__main__:处理完成: 参赛队号 202190000049, 初步结果: [202190000049, 0, 0, 599.1907479169435]
INFO:__main__:处理完成

归一化后的实质性内容得分: [0.38133121825729677, 0.20795553214819182, 0.4054165753917233, 0.4881860881617078, 0.7561042556024453, 0.2477754272516182, 0.18310082455655555, 0.20192154771678922, 0.012618012628136067, 0.6900458422751792, 0.18178469072192388, 0.2555720019825537, 0.005933590875160599, 1.0, 0.09641617246012082, 1.0, 0.4397293570679433, 0.16859733896574577, 0.17376867260258472, 0.30440708189093835, 0.4419977784925298, 1.0, 0.12745359059305966, 0.2314843773236899, 0.3401447318387323, 0.1854103339535832, 0.24970037548120808, 0.27437652193527684, 0.19463474505303827, 0.049208204587714924, 0.10497387750378011, 0.3029846619842091, 0.11347167002148113, 1.0, 0.15303899174011562, 1.0, 1.0, 0.14789019267518072, 0.3449952994080581, 1.0, 0.09337143067712741, 0.055394018924061796, 1.0, 1.0, 1.0, 0.23700401488128994, 1.0, 0.26900658881534073, 0.06198382519060608, 0.2706275476651152, 0.1466216193878855, 0.16298133565620695, 0.10453951983181821, 0.3456173019203187, 0.21279241210095395, 0.3670960149517063