In [6]:
import bibtexparser
import pandas as pd
import re
from bs4 import BeautifulSoup


def extract_bib_info(bib_filename, conf_name, year):
    """
    从BibTeX文件中提取信息，并将其保存到Excel文件中。

    Args:
    bib_filename (str): BibTeX文件名。
    conf_name (str): 会议名称。
    year (int): 会议年份。

    Returns:
    DataFrame: 包含提取信息的DataFrame。
    """
    # 读取BibTeX文件
    with open(bib_filename, 'r', encoding='utf-8') as bibfile:
        bib_database = bibtexparser.load(bibfile)

    # 创建一个空的DataFrame来存储提取的信息
    data = {'title': [],
            'author': [],
            'year': [],
            'month': [],
            'address': [],
            'publisher': [],
            'url': [],
            'doi': [],
            'pages': []}

    # 遍历每个条目，并提取所需信息
    for entry in bib_database.entries:
        data['title'].append(entry.get('title', ''))
        data['author'].append(entry.get('author', ''))
        data['year'].append(entry.get('year', ''))
        data['month'].append(entry.get('month', ''))
        data['address'].append(entry.get('address', ''))
        data['publisher'].append(entry.get('publisher', ''))
        data['url'].append(entry.get('url', ''))
        data['doi'].append(entry.get('doi', ''))
        data['pages'].append(entry.get('pages', ''))

    # 将提取的信息转换为DataFrame
    df = pd.DataFrame(data)

    # 将DataFrame写入Excel文件
    excel_filename = f"{conf_name}{year}.xlsx"
    df.to_excel(excel_filename, index=False)

    print(f"Excel文件 {excel_filename} 已创建成功！")
    return df


def extract_html_info(html_filename, conf_name, year):
    """
    从HTML文件中提取文章标题和摘要，并将其添加到Excel文件中。

    Args:
    html_filename (str): HTML文件名。
    conf_name (str): 会议名称。
    year (int): 会议年份。

    Returns:
    DataFrame: 包含提取信息的DataFrame。
    """
    # 读取HTML文件
    with open(html_filename, 'r', encoding='utf-8') as html_file:
        html_content = html_file.read()

    # 使用Beautiful Soup解析HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # 创建一个空的DataFrame来存储文章信息
    data = {'paper_title': [], 'abstract': []}
    # 填充两行空白数据
    data['paper_title'].append('')
    data['abstract'].append('')
    if year == 2023 and conference == "ACL":
        data['paper_title'].append('')
        data['abstract'].append('')

    # 设置i的初始值
    i = 1

    # 循环搜索文章标题和摘要内容并提取
    while True:
        # 构建文章链接的正则表达式

        str_acl = f'https://aclanthology.org/{year}.acl-long.{i}/'
        str_emnlp = f'https://aclanthology.org/{year}.findings-emnlp.{i}/'

        if conference == "ACL":
            paper_link_regex = re.compile(fr'href={str_acl}')
        elif conference == "EMNLP":
            paper_link_regex = re.compile(fr'href={str_emnlp}')
        # paper_link_regex = re.compile(fr'href="https://aclanthology.org/{year}.acl-long.{i}/"')

        # 搜索文章链接
        paper_link_match = re.search(paper_link_regex, html_content)

        # 如果找到文章链接
        if paper_link_match:
            # 根据文章链接定位文章标题
            if conference == "ACL":
                paper_link_tag = soup.find(href=str_acl)
            elif conference == "EMNLP":
                paper_link_tag = soup.find(href=str_emnlp)

            if paper_link_tag:
                paper_title = paper_link_tag.text.strip()
                data['paper_title'].append(paper_title)
                print(f"文章 {i} 标题提取成功: {paper_title[:20]}...")
            else:
                data['paper_title'].append('')
                print(f"文章 {i} 标题未找到")

            # 根据摘要id定位摘要内容
            if conference == "ACL":
                abstract_div = soup.find(id=f'abstract-{year}--acl-long--{i}').find_next('div')
            elif conference == "EMNLP":
                abstract_div = soup.find(id=f'abstract-{year}--findings-emnlp--{i}').find_next('div')

            # 提取摘要文本并添加到DataFrame
            if abstract_div:
                abstract_text = abstract_div.text.strip()
                data['abstract'].append(abstract_text)
                print(f"文章 {i} 摘要提取成功: {abstract_text[:20]}...")
            else:
                data['abstract'].append('')
                print(f"摘要 {i} 未找到")

            # 增加i的值
            i += 1
        else:
            # 如果找不到文章链接，退出循环
            break

    # 将提取的文章信息转换为DataFrame
    df_articles = pd.DataFrame(data)

    # 读取原始Excel文件
    excel_filename = f"{conf_name}{year}.xlsx"
    df_original = pd.read_excel(excel_filename)

    # 将文章标题信息添加为原始Excel文件的新列
    df_original['paper_title'] = df_articles['paper_title']
    df_original['abstract'] = df_articles['abstract']

    # 将更新后的DataFrame写回原始Excel文件
    df_original.to_excel(excel_filename, index=False)

    print(f"文章标题和摘要信息已提取并添加到原始Excel文件 {excel_filename} 中！")


import pandas as pd


def fill_title_with_paper_title(conf_name, year):
    """
    从Excel文件中读取数据，将paper_title填充到title列中，并将更新后的DataFrame写回Excel文件中。

    Args:
    conf_name (str): 会议名称。
    year (int): 会议年份。

    Returns:
    None
    """
    # 读取Excel文件
    excel_filename = f"{conf_name}{year}.xlsx"
    df = pd.read_excel(excel_filename)

    # 遍历每一行
    for index, row in df.iterrows():
        # 获取当前行的paper_title和title属性值
        paper_title = row['paper_title']

        # 如果paper_title为空，则跳过该行
        if pd.isna(paper_title):
            print(f"行 {index + 1}: paper_title为空，跳过处理")
            continue

        # 如果paper_title不为空，则用其内容填充title属性
        df.at[index, 'title'] = paper_title
        print(f"行 {index + 1}: paper_title为 '{paper_title[:10]}'... 用其填充title属性")

    # 将更新后的DataFrame写回Excel文件
    df.to_excel(excel_filename, index=False)

    print(f"Excel文件 {excel_filename} 处理完成！")


In [None]:
conference = "EMNLP"
year = 2023

# 提取BibTeX信息并保存到Excel文件中
extracted_df = extract_bib_info(bib_filename=f'{conference}{year}.bib', conf_name=conference, year=year)

# 提取HTML信息并添加到对应的Excel文件中
extract_html_info(f'{conference}{year}.html', conf_name=conference, year=year)

# 将2023年的paper_title填充到title列中
fill_title_with_paper_title(conf_name=conference, year=year)

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
from nltk.corpus import stopwords

# 创建保存图片的文件夹
if not os.path.exists('figure_ACL21-23'):
    os.makedirs('figure_ACL21-23')

# 读取Excel文件
acl2019_df = pd.read_excel('ACL2019.xlsx')
acl2020_df = pd.read_excel('ACL2020.xlsx')
acl2021_df = pd.read_excel('ACL2021.xlsx')
acl2022_df = pd.read_excel('ACL2022.xlsx')
acl2023_df = pd.read_excel('ACL2023.xlsx')

# 将NaN值替换为空字符串
for df in [acl2019_df, acl2020_df, acl2021_df, acl2022_df, acl2023_df]:
    df.replace(np.nan, '', inplace=True)

# 获取NLTK停用词列表
nltk_stopwords = set(stopwords.words('english'))

# 自定义常见词，除了NLTK停用词之外
common_words = {'model', 'propose', 'based', 'via', 'task', 'dataset', 'paper', 'models', 'method', 'text', 'datasets', 'data', 'tasks', 'performance', 'training', 'language', 'methods', 'using', 'new', 'show', 'However', 'also', 'two','results','existing','learning','different','however,','paper,','experiments','demonstrate'}

# 将NLTK停用词与自定义常见词合并
common_words = common_words.union(nltk_stopwords)

# 定义函数用于生成词频统计和词云图
def clean_title(title):
    # 移除无效字符，并转换为小写
    cleaned_title = ' '.join(word.strip().lower() for word in title.split() if word.strip() and word.strip().lower() not in common_words)
    return cleaned_title

def plot_word_frequency_and_cloud(dataframe, year):
    # 对title进行词频统计和清洗
    dataframe['cleaned_title'] = dataframe['title'].apply(clean_title)
    title_words = dataframe['cleaned_title'].str.split().explode()
    title_words = title_words[title_words != '']  # 去除空字符串
    title_word_counts = title_words.value_counts()
    title_word_counts_top30 = title_word_counts.head(30)  # 只取前30个词
    top_three_words_title = title_word_counts_top30.head(3).index.tolist()  # 获取标题前三个词
    print(f"Top three words in titles for {year}: {top_three_words_title}")  # 输出标题前三个词

    # 绘制标题词频统计图并保存到本地文件
    plt.figure(figsize=(12, 8))  # 增加图形大小
    title_word_counts_top30.plot(kind='barh', color='skyblue')
    plt.title(f'Title Word Frequency {year}')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.tight_layout()
    plt.savefig(f'figure_ACL21-23/title_word_frequency_{year}.png')  # 保存到本地文件夹
    plt.close()  # 关闭绘图，释放资源

    # 生成标题词云图并保存到本地文件
    title_wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=common_words).generate(' '.join(title_words))
    plt.figure(figsize=(12, 8))  # 增加图形大小
    plt.imshow(title_wordcloud, interpolation='bilinear')
    plt.title(f'Title Word Cloud {year}')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(f'figure_ACL21-23/title_word_cloud_{year}.png')  # 保存到本地文件夹
    plt.close()  # 关闭绘图，释放资源

    # 如果是ACL2021、2022、2023年，对摘要进行词频统计和绘制词云图
    if year in [2021, 2022, 2023]:
        # 对abstract进行词频统计和清洗
        dataframe['cleaned_abstract'] = dataframe['abstract'].apply(clean_title)
        abstract_words = dataframe['cleaned_abstract'].astype(str).str.split().explode()
        abstract_words = abstract_words[abstract_words != '']  # 去除空字符串
        abstract_word_counts = abstract_words.value_counts()

        # 绘制摘要词频统计图并保存到本地文件
        abstract_word_counts_top30 = abstract_word_counts.head(30)  # 只取前30个词
        top_three_words_abstract = abstract_word_counts_top30.head(3).index.tolist()  # 获取摘要前三个词
        print(f"Top three words in abstracts for {year}: {top_three_words_abstract}")  # 输出摘要前三个词
        plt.figure(figsize=(12, 8))  # 增加图形大小
        abstract_word_counts_top30.plot(kind='barh', color='lightgreen')
        plt.title(f'Abstract Word Frequency {year}')
        plt.xlabel('Frequency')
        plt.ylabel('Words')
        plt.tight_layout()
        plt.savefig(f'figure_ACL21-23/abstract_word_frequency_{year}.png')  # 保存到本地文件夹
        plt.close()  # 关闭绘图，释放资源

        # 生成摘要词云图并保存到本地文件
        abstract_wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=common_words).generate(' '.join(abstract_words))
        plt.figure(figsize=(12, 8))  # 增加图形大小
        plt.imshow(abstract_wordcloud, interpolation='bilinear')
        plt.title(f'Abstract Word Cloud {year}')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(f'figure_ACL21-23/abstract_word_cloud_{year}.png')  # 保存到本地文件夹
        plt.close()  # 关闭绘图，释放资源



# 对ACL2019和ACL2020年调用函数生成标题的统计图和词云图
plot_word_frequency_and_cloud(acl2019_df, 2019)
plot_word_frequency_and_cloud(acl2020_df, 2020)

# 对ACL2021、2022、2023年调用函数生成标题和摘要的统计图和词云图
plot_word_frequency_and_cloud(acl2021_df, 2021)
plot_word_frequency_and_cloud(acl2022_df, 2022)
plot_word_frequency_and_cloud(acl2023_df, 2023)


Top three words in titles for 2019: ['neural', 'translation', 'machine']
Top three words in titles for 2020: ['neural', 'generation', 'translation']
Top three words in titles for 2021: ['neural', 'translation', 'generation']
Top three words in abstracts for 2021: ['knowledge', 'state-of-the-art', 'information']
Top three words in titles for 2022: ['generation', 'neural', 'translation']
Top three words in abstracts for 2022: ['knowledge', 'state-of-the-art', 'information']
Top three words in titles for 2023: ['generation', 'knowledge', 'extraction']
Top three words in abstracts for 2023: ['knowledge', 'generation', 'novel']
