# Assignment 2 
### \<Weiyou Liu> \<A1872800>
### \<Hengyi Ma> \<A1875198>
### \<name3> \<id3>

## A. Tasks as specified for your team structure

**One headings for each task.**

#### 1. Load date and pre-process

In [4]:
import chardet   
import spacy
import stanza
import neuralcoref
import random
import nltk
import torch
import re
import sys
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from torch.nn.functional import softmax
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
file_path = "/homes/Adam/NLP/ASM2/news_dataset.csv"

with open(file_path, 'rb') as file:
    rawdata = file.read()

result = chardet.detect(rawdata)
encoding = result['encoding']

print(f"Detected encoding: {encoding}")

if encoding:
    df = pd.read_csv(file_path, encoding=encoding)
else:
    print("Failed to detect encoding. Consider specifying encoding manually.")


Detected encoding: utf-8


In [6]:
sample_df = df.sample(n=100, random_state=np.random.seed(0))
sample_df.head()
# Randomly select 100 samples to speed up training

Unnamed: 0,id,author,date,year,month,topic,article
993,18456,Victor Mather,13/02/2017,2017,2,politics,At least six members of the Super New Englan...
859,18308,Ken Belson,8/02/2017,2017,2,sports,"HOUSTON ? There was the game on the field, ..."
298,17636,Ron Lieber,14/01/2017,2017,1,business,When Wells Fargo announced its quarterly earni...
553,17938,Dale Russakoff,29/01/2017,2017,1,politics,When Indira Islas was in third grade at Centen...
672,18070,Laurie Goodstein,30/01/2017,2017,1,crime,"Over the past decade, Christians in the United..."


In [16]:
nlp = spacy.load('en_core_web_sm')

stemmer = PorterStemmer()

# Assuming the following are placeholders for previously defined preprocessing and entity extraction functions
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'\W+', ' ', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords and stem the words
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    return ' '.join(filtered_tokens)

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

def extract_entities(text):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))
    return entities

In [8]:
# Clean the text column
sample_df['cleaned_article'] = sample_df['article'].apply(clean_text)

# Vectorize the processed text
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sample_df['cleaned_article'])
# tfidf_matrix is the processed numerical data, which can be used for subsequent NLP tasks

# Apply function to extract entities from each article
sample_df['entities'] = sample_df['article'].apply(extract_entities)

In [9]:
doc_id_to_index = {doc_id: index for index, doc_id in enumerate(sample_df.index)}
# Because the index is random, we need a dictionary to map doc_id to index

inverted_index = defaultdict(set)
for index, row in sample_df.iterrows():
    for entity in row['entities']:
        inverted_index[entity[0]].add(index)  # entity[0] is the text of the entity

In [10]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse,ner,coref', use_gpu=True)

# Define a function to resolve coreferences in a text
def coref_resolve(text, nlp):
    try:
        # Process the text with the NLP pipeline
        doc = nlp(text)
        # Initialize an empty list to store representative mentions
        representative_mentions = []
        # Iterate over sentences, tokens, and words in the document
        for sentence in doc.sentences:
            for token in sentence.tokens:
                for word in token.words:
                    # If a word has coreference chains
                    if hasattr(word, 'coref_chains'):
                        # Check each chain to see if it is a representative mention
                        for chain in word.coref_chains:
                            if chain.is_representative:
                                # If it is, add the word text to the list
                                representative_mentions.append(word.text)
                                break
        # Return the list of representative mentions
        return representative_mentions
    except IndexError:
        # Print an error message if there is an issue with processing the text
        print("Error processing text:", text)
        return []

2024-04-12 14:27:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 30.0MB/s]                    
2024-04-12 14:27:48 INFO: Downloaded file to /home/vscode/stanza_resources/resources.json
2024-04-12 14:27:50 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| coref     | ontonotes_electra-large   |
| depparse  | combined_charlm           |
| ner       | ontonotes-ww-multi_charlm |

2024-04-12 14:27:50 INFO: Using device: cpu
2024-04-12 14:27:50 INFO: Loading: tokenize
2024-04-

In [11]:
sample_df['coref_chains'] = sample_df['article'].apply(lambda x: coref_resolve(x, nlp))



In [12]:
def map_entities_to_corefs(entities, coref_chains):
    entity_coref_map = {}
    for entity in entities:
        entity_text = entity[0]  #entities are tuples of (text, type)
        entity_coref_map[entity_text] = []
        for coref in coref_chains:
            if entity_text in coref:
                entity_coref_map[entity_text].append(coref)
    return entity_coref_map



In [13]:
# Apply this function row-wise, assuming 'coref_chains' and 'entities' are available for each row
sample_df['entity_coref_map'] = sample_df.apply(lambda row: map_entities_to_corefs(row['entities'], row['coref_chains']), axis=1)


In [14]:
def search_documents(query_text):
    # 对查询进行预处理和实体识别
    cleaned_query = clean_text(query_text)
    query_vector = vectorizer.transform([cleaned_query])
    query_entities = extract_entities(query_text)
    
    # 基于实体和共指信息查找文档
    docs_based_on_content = set()
    for entity_text, _ in query_entities:
        # 使用倒排索引直接根据实体查找文档
        docs_based_on_content.update(inverted_index.get(entity_text, []))
        
        # 使用共指信息查找相关文档
        for doc_id in sample_df.index:
            coref_map = sample_df.at[doc_id, 'entity_coref_map']
            for coref_entity, mentions in coref_map.items():
                if entity_text == coref_entity or entity_text in mentions:
                    docs_based_on_content.add(doc_id)
                    break  # 如果找到匹配，就跳出循环

    # 直接使用找到的文档进行评分和排序
    if docs_based_on_content:
        docs_indices = [doc_id_to_index[doc_id] for doc_id in docs_based_on_content if doc_id in doc_id_to_index]
        docs_tfidf = tfidf_matrix[docs_indices]
        cos_similarities = cosine_similarity(query_vector, docs_tfidf).flatten()
        
        scored_docs = sorted(zip(docs_based_on_content, cos_similarities), key=lambda x: x[1], reverse=True)
        ranked_docs = [doc[0] for doc in scored_docs]
        
        return ranked_docs
    else:
        return []


In [17]:
# 对索引的功能进行测试

query_text = input("Enter your query text: ")


ranked_docs = search_documents(query_text)
if ranked_docs:
    print(f"Found documents: {ranked_docs}")
else:
    print("No documents found or query format incorrect.")


Found documents: [494]


In [18]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

qa_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
qa_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

def get_bert_embeddings(texts):
    # 使用全局变量bert_tokenizer和bert_model处理文本
    inputs = bert_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings.detach().numpy()

nlp = spacy.load("en_core_web_sm")
def find_top_n_relevant_contents(question, article_content, top_n=5, window_size=5):
    # 使用spaCy进行句子分割
    doc = nlp(article_content)
    sentences = [sent.text.strip() for sent in doc.sents]  # 获取句子文本并去除首尾空白

    # 其余的逻辑与之前相同
    paragraph_embeddings = []
    for i in range(len(sentences) - window_size + 1):
        window_sentences = ' '.join(sentences[i:i+window_size])
        window_embedding = get_bert_embeddings([window_sentences])[0]  # 假设get_bert_embeddings返回numpy数组
        paragraph_embeddings.append(window_embedding)

    paragraph_embeddings = np.array(paragraph_embeddings)
    question_embedding = get_bert_embeddings([question])[0]

    # 计算相似度并找到最相关的窗口
    similarities = cosine_similarity([question_embedding], paragraph_embeddings).flatten()
    top_n_indices = np.argsort(similarities)[-top_n:][::-1]
    top_n_contents = [(' '.join(sentences[i:i+window_size]), similarities[i]) for i in top_n_indices]

    return top_n_contents


def encode_question_and_context(question, context):
    # 使用全局变量qa_tokenizer对问题和上下文进行编码
    return qa_tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt", truncation=True, max_length=512)


def find_answer(question, context, top_k=3):
    inputs = encode_question_and_context(question, context)
    input_ids = inputs["input_ids"].tolist()[0]

    outputs = qa_model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # 计算开始和结束分数的softmax分布
    start_probs = softmax(answer_start_scores, dim=-1)
    end_probs = softmax(answer_end_scores, dim=-1)

    # 提取排名前top_k的开始和结束位置
    start_topk = torch.topk(start_probs, top_k)
    end_topk = torch.topk(end_probs, top_k)

    top_answers = []
    for start_index, start_score in zip(start_topk.indices[0], start_topk.values[0]):
        for end_index, end_score in zip(end_topk.indices[0], end_topk.values[0]):
            # 确保结束位置在开始位置之后
            if end_index >= start_index:
                answer = qa_tokenizer.convert_tokens_to_string(qa_tokenizer.convert_ids_to_tokens(input_ids[start_index:end_index + 1]))
                score = (start_score.item() + end_score.item()) / 2  # 简单地取平均分数作为置信度
                top_answers.append((answer, score))
                break  # 只添加每个开始位置的最佳结束位置

    # 根据置信度分数降序排列答案
    top_answers = sorted(top_answers, key=lambda x: x[1], reverse=True)[:top_k]
    return top_answers



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
def compute_f1_single(predicted, truth):
    # 如果truth是字符串，将其转换为只包含一个元素的列表
    if isinstance(truth, str):
        truth = [truth]
    # 将所有真实答案合并为一个大的token集合，以处理答案列表的情况
    truth_tokens = set(token.lower() for answer in truth for token in answer.split())

    pred_tokens = set(predicted.lower().split())
    common_tokens = pred_tokens.intersection(truth_tokens)
    if not common_tokens:
        return 0.0
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)
    if precision + recall == 0:
        return 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1


def compute_mrr(predicted_article_ids, truth_article_ids):
    if not isinstance(predicted_article_ids, list):
        predicted_article_ids = [predicted_article_ids]
    rank = 0
    for i, predicted_id in enumerate(predicted_article_ids, start=1):
        if predicted_id in truth_article_ids:
            rank = 1 / i
            break
    return rank

def compute_map(predicted_article_ids, truth_article_ids):
    if not isinstance(predicted_article_ids, list):
        predicted_article_ids = [predicted_article_ids]
    avg_precisions = []
    for i, predicted_id in enumerate(predicted_article_ids, start=1):
        if predicted_id in truth_article_ids:
            relevant_count = sum(pred_id in truth_article_ids for pred_id in predicted_article_ids[:i])
            precision_at_i = relevant_count / i
            avg_precisions.append(precision_at_i)
    return np.mean(avg_precisions) if avg_precisions else 0




def lcs_length(x, y):
    """计算两个序列的最长公共子序列（LCS）的长度"""
    if not x or not y:
        return 0
    dp = [[0] * (len(y) + 1) for _ in range(len(x) + 1)]
    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            if x[i - 1] == y[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    return dp[-1][-1]

def compute_rouge_l_multi(predicted_answers, truths):
    def compute_rouge_l(predicted, truth):
        """计算单个预测答案与单个真实答案之间的ROUGE-L分数。"""
        lcs = lcs_length(predicted, truth)
        if lcs == 0:
            return 0
        precision = lcs / len(predicted)
        recall = lcs / len(truth)
        f1 = 2 * (precision * recall) / (precision + recall + 1e-8)
        return f1
    
    rouge_l_scores = []
    for predicted in predicted_answers:  # 直接处理每个预测答案文本
        # 计算当前预测答案与每个真实答案之间的ROUGE-L分数，并取最大值
        scores_for_this_answer = [compute_rouge_l(predicted, truth) for truth in truths]
        rouge_l_scores.append(max(scores_for_this_answer))
    
    return np.mean(rouge_l_scores) if rouge_l_scores else 0




In [45]:
questions_answers = {
    "fact_based": [
        {
            "question": "What organization began compiling opposition research on Donald Trump in September 2015?",
            "answer": "Fusion GPS",
            "article_id": 230
        },
        {
            "question": "What Netflix series is a reboot of a Norman Lear sitcom that discusses class divide?",
            "answer": "One Day at a Time",
            "article_id": 77
        },
        {
            "question": "Who was the American ambassador to the United Nations who issued a warning to allies and rivals in her first remarks?",
            "answer": "Nikki R. Haley",
            "article_id": 614
        },
        {
            "question": "In which year did Xi Jinping come to power in China?",
            "answer": ["2012"],
            "article_id": 494
        },
        {
            "question": "What is the title of Yang Jisheng's book about the Cultural Revolution?",
            "answer": ["The World Turned Upside Down"],
            "article_id": 494
        },
        {
            "question": "What year did the Cultural Revolution begin?",
            "answer": ["1966"],
            "article_id": 494
        },
        {
            "question": "Which state is Monticello located in?",
            "answer": "Iowa",
            "article_id": 261
        },
        {
            "question": "What is the name of Yang Jisheng's other book about the famine caused by the Great Leap Forward?",
            "answer": ["Tombstone"],
            "article_id": 494
        },       
        {
            "question": " What is the name of the film starring Matthew McConaughey that involves the mining industry and is set in the 1980s?",
            "answer": ["Gold"],
            "article_id": 601
        },
        {
            "question": "Which Taiwanese official provided updates on the Liaoning's transit?",
            "answer": "Alex Huang, the president's spokesman",
            "article_id": 239
        }
    ],
    "explanation_based": [
        {
            "question": "What did Bonnie S. Glaser say about the potential response from the Trump administration to China's actions?",
            "answer": "Bonnie S. Glaser suggested that if the Trump administration views this as a test of U.S. resolve, they are likely to push back pretty forcefully.",
            "article_id": 239
        },
        {
            "question": "How does 'One Day at a Time' reflect the issue of class divide in its narrative?",
            "answer": "The show reflects the class divide by showcasing a family that is closer to the lower side of the economic spectrum, discussing real-life economic struggles within a sitcom format.",
            "article_id": 77
        },
        {
            "question": "What was the main message Nikki R. Haley conveyed in her first remarks at the United Nations?",
            "answer": "The main message was that the Trump administration would hold to account those who do not back the United States, signaling a change in the way the U.S. interacts with the UN.",
            "article_id": 614
        },
        {
            "question": "Who is the famed chronicler of the Mao era who finished writing a history of the Cultural Revolution?",
            "answer": ["Yang Jisheng"],
            "article_id": 494
        },
        {
            "question": "What was Yang Jisheng's role during the early phase of the Cultural Revolution?",
            "answer": ["He was a university student in Beijing who immersed himself in the early phase."],
            "article_id": 494
        },
        {
            "question": "What is described as 'historical nihilism' by the Chinese government?",
            "answer": ["Delving into events like the Cultural Revolution, described as subversive to corrode the party’s authority."],
            "article_id": 494
        },
        {
            "question": "Why did Yang Jisheng decide to write a book about the Cultural Revolution?",
            "answer": ["To expose lies and restore the truth."],
            "article_id": 494
        },
        {
            "question": "Why was Yang Jisheng advised not to discuss his book with foreign media after its publication?",
            "answer": ["The article implies there's political pressure, but does not provide a specific reason."],
            "article_id": 494
        },
        {
            "question": "Why do the residents of Monticello, Iowa, appear to have little angst about Donald Trump's presidency?",
            "answer": "Residents seem to have little angst because they perceive that Trump is making changes and influencing Congress, reflecting their desire for political action and change.",
            "article_id": 261
        },
        {
            "question": "Why was Hillary Clinton's visit to Monticello significant during her campaign?",
            "answer": "Hillary Clinton's visit to Monticello was significant because it was her first campaign stop after declaring her candidacy for the Democratic nomination in 2015.",
            "article_id": 261
        }
    ],
    "list_based": [
        {
            "question": "List the military activities by China that led to regional tensions, as mentioned in the article.",
            "answer": "Sending the Liaoning through the Taiwan Strait, Chinese bombers and surveillance planes flying over the East China Sea and the Sea of Japan,A Chinese warship seizing a U.S. Navy underwater drone",
            "article_id": 239
        },
        {
            "question": "Who are the main characters in the Netflix reboot of 'One Day at a Time'?",
            "answer": "Penelope,Alex ,Lydia, Elena",
            "article_id": 77
        },
        {
            "question": "What are the implications of the Trump administration's stance towards the United Nations as expressed by Nikki R. Haley?",
            "answer": "Holding to account those who do not support the U.S., A potential reduction in U.S. funding for the UN, A focus on showing strength and value in U.N. participation",
            "article_id": 614
        },
        {
            "question": "Two challenges Yang Jisheng faced while writing his historical works.",
            "answer": "He was warned against publishing his book and barred from traveling to the United States; he was told not to discuss the book with foreign media.",
            "article_id": 494
        },
        {
            "question": "The types of jobs held by the men who meet at the Table of Knowledge.",
            "answer": "optometrist, farmers, former employees of a utility company",
            "article_id": 261
        },
        {
            "question": "The types of information alleged in the memos about Trump's ties to Russia.",
            "answer": "Blackmail with sex tapes, Bribery with business deals, Meetings with Russian operatives to discuss hacking and leaking emails",
            "article_id": 230
        },
        {
            "question": "List some of the initiatives taken by OGC Nice to enhance fan experience at their matches.",
            "answer": "Handing out free tickets to local children, Refitting all V.I.P. areas, some of which remain open until the early hours, Installing an ice rink outside the Allianz Riviera before Christmas",
            "article_id": 996
        },
        {
            "question": "What are the different ways OGC Nice has tried to draw fans into the stadium?",
            "answer": "Signing high-profile players like Mario Balotelli, Making matches a complete experience with various entertainment options, Promoting a 'popular' style of play",
            "article_id": 996
        },
        {
            "question": "What accolades have members of the Cowboys' offensive line achieved?",
            "answer": "Three of the starters have been chosen to the Pro Bowl, with Tyron Smith having four selections, Travis Frederick three, and Zack Martin three.",
            "article_id": 331
        },
        {
            "question": "What are the gifts Ezekiel Elliott gave to his linemen for Christmas?",
            "answer": "John Deere utility vehicles",
            "article_id": 331
        }

    ],  
    "cause_based": [
        {
            "question": "What geopolitical implications does the passage of the Liaoning through the Taiwan Strait have for Taiwan and China relations?",
            "answer": "The passage increases uncertainty and tensions, possibly exacerbating existing disputes over sovereignty and territorial integrity, as well as impacting the broader regional security dynamics.",
            "article_id": 239
        },
        {
            "question": "Why did many farmers switch from voting for Obama to Trump?",
            "answer": "Many farmers switched to Trump despite his opposition to the TPP because they voted against what they perceived as their economic interests, possibly influenced by broader political or cultural concerns.",
            "article_id": 261
        },
        {
            "question": "What concerns did Mike Staudt have about the Affordable Care Act?",
            "answer": "Mike Staudt described the Affordable Care Act as a form of socialism, reflecting his concern about government overreach into healthcare.",
            "article_id": 261
        },
        {
            "question": "Why might American intelligence agencies have not confirmed the claims in the memos about Trump?",
            "answer": "Because the claims were unsubstantiated and based on opposition research that neither intelligence agencies nor news organizations like The New York Times could verify.",
            "article_id": 230
        },
        {
            "question": "Why has OGC Nice been compared to Leicester City?",
            "answer": "OGC Nice has been compared to Leicester City due to their similar stories of being smaller clubs defying financial odds to compete at high levels in their respective leagues.",
            "article_id": 996
        },
        {
            "question": "How has OGC Nice's approach to building its team differed from the perceived 'wonderful accident' of Leicester City's success?",
            "answer": "Unlike Leicester City's unexpected success, OGC Nice's rise has been strategic and planned, involving steady growth since Jean-Pierre Rivère took control, investment in facilities, and intelligent recruitment.",
            "article_id": 996
        },
        {
            "question": "Why is the Cowboys' offensive line considered crucial to their success?",
            "answer": "The line consistently provides strong protection and blocking, which is fundamental for both their running and passing games, supporting the performances of both quarterback Dak Prescott and running back Ezekiel Elliott.",
            "article_id": 331
        },
        {
            "question": "How did the Cowboys' management build such a strong offensive line?",
            "answer": "The team focused on strengthening the line by using high draft picks on blockers, selecting Tyron Smith in 2011, Travis Frederick in 2013, and Zack Martin in 2014, which laid the foundation for their current success.",
            "article_id": 331
        },
        {
        "question": "Why is Justice Teori Zavascki's death particularly impactful for Brazil's judiciary?",
        "answer": "Justice Zavascki was a key figure in major corruption investigations, including overseeing the plea deal ratification for Odebrecht, which implicated numerous politicians. His death throws the continuation and direction of these high-stakes cases into uncertainty.",
        "article_id": 456
        },
        {
        "question": "What concerns are raised by the timing of Justice Zavascki's death?",
        "answer": "The timing of his death is concerning because it coincides with escalating investigations into high-profile corruption cases involving Petrobras and Odebrecht, leading to speculation about potential foul play to disrupt these proceedings.",
        "article_id": 456
        }
    ]
}


In [21]:
def auto_answer_question(query_text, use_relevant_content=True):
    # 假设ranked_docs是通过某种方式得到的文档ID列表
    ranked_docs = search_documents(query_text)
    if not ranked_docs:
        return "No documents found or query format incorrect."

    # 获取所有相关文档的内容
    all_articles = ''
    for doc_id in ranked_docs:
        # 假设可以通过doc_id获取到文档内容
        article = sample_df.loc[doc_id, 'article']
        all_articles += article + ' '

    # 根据use_relevant_content变量决定使用哪种内容寻找答案
    if use_relevant_content:
        # 找到与问题最相关的内容片段
        top_n_contents = find_top_n_relevant_contents(query_text, all_articles, top_n=3)
        relevant_context = ' '.join([content[0] for content in top_n_contents])
        context_to_use = relevant_context
    else:
        # 使用全部内容
        context_to_use = all_articles

    # 使用选择的内容寻找答案
    answer = find_answer(query_text, context_to_use)
    return answer


In [22]:
def run_evaluation(test_data, qa_system_func, use_relevant_content=True):
    f1_scores, mrr_scores, map_scores, rouge_l_scores = [], [], [], []

    for items in test_data:
        print(f"Evaluating {items} questions...")
        print("")
        # 初始化每个类别的分数列表，以便在每个类别结束时计算平均值
        cat_f1_scores, cat_mrr_scores, cat_map_scores, cat_rouge_l_scores = [], [], [], []

        for i in test_data[items]:
            question, true_answer, true_article_id = i["question"], [i["answer"]], [i["article_id"]]
        
            predicted_answers_with_scores = qa_system_func(question, use_relevant_content)
            
            # 假设 search_documents 返回与预测相关的文章ID列表
            predicted_article_ids = search_documents(question)
            predicted_answers = [ans[0] for ans in predicted_answers_with_scores]
            f1 = compute_f1_single(predicted_answers[0], true_answer[0])  # 假设只有一个真实答案和一个预测答案
            mrr = compute_mrr(predicted_article_ids, true_article_id)
            map_score = compute_map(predicted_article_ids, true_article_id)
            rouge_l = compute_rouge_l_multi(predicted_answers, true_answer)

            cat_f1_scores.append(f1)
            cat_mrr_scores.append(mrr)
            cat_map_scores.append(map_score)
            cat_rouge_l_scores.append(rouge_l)

        # 计算当前类型问题的平均性能指标
        avg_f1 = np.mean(cat_f1_scores)
        avg_mrr = np.mean(cat_mrr_scores)
        avg_map = np.mean(cat_map_scores)
        avg_rouge_l = np.mean(cat_rouge_l_scores)

        # 打印当前类型问题的平均性能指标
        print(f"Average F1 Score for {items}: {avg_f1:.3f}")
        print(f"Average MRR for {items}: {avg_mrr:.3f}")
        print(f"Average MAP for {items}: {avg_map:.3f}")
        print(f"Average ROUGE-L for {items}: {avg_rouge_l:.3f}")
        print("")  # 打印空行以分隔不同类型的输出


In [46]:

print('Rustl with whole content')
print("")
run_evaluation(questions_answers, auto_answer_question , use_relevant_content=False)

Rustl with whole content

Evaluating fact_based questions...

Average F1 Score for fact_based: 0.629
Average MRR for fact_based: 0.850
Average MAP for fact_based: 0.850
Average ROUGE-L for fact_based: 0.290

Evaluating explanation_based questions...

Average F1 Score for explanation_based: 0.327
Average MRR for explanation_based: 0.900
Average MAP for explanation_based: 0.900
Average ROUGE-L for explanation_based: 0.169

Evaluating list_based questions...

Average F1 Score for list_based: 0.174
Average MRR for list_based: 0.800
Average MAP for list_based: 0.800
Average ROUGE-L for list_based: 0.183

Evaluating cause_based questions...

Average F1 Score for cause_based: 0.065
Average MRR for cause_based: 0.833
Average MAP for cause_based: 0.833
Average ROUGE-L for cause_based: 0.235



In [47]:
print('Rustl with relevant content')
print("")
run_evaluation(questions_answers, auto_answer_question)

Rustl with relevant content

Evaluating fact_based questions...

Average F1 Score for fact_based: 0.100
Average MRR for fact_based: 0.850
Average MAP for fact_based: 0.850
Average ROUGE-L for fact_based: 0.094

Evaluating explanation_based questions...

Average F1 Score for explanation_based: 0.164
Average MRR for explanation_based: 0.900
Average MAP for explanation_based: 0.900
Average ROUGE-L for explanation_based: 0.017

Evaluating list_based questions...

Average F1 Score for list_based: 0.056
Average MRR for list_based: 0.800
Average MAP for list_based: 0.800
Average ROUGE-L for list_based: 0.169

Evaluating cause_based questions...

Average F1 Score for cause_based: 0.083
Average MRR for cause_based: 0.833
Average MAP for cause_based: 0.833
Average ROUGE-L for cause_based: 0.228



In [24]:
def user_interaction(qa_system_func, use_relevant_content=True):
    print("Welcome to the QA System!")
    print("Type 'exit' to leave the system.")
    
    while True:
        question = input("Please enter your question: ")
        if question.lower() == 'exit':
            print("Exiting the QA system. Goodbye!")
            break  # 退出循环，结束程序

        # 可以在这里添加更多的逻辑来处理用户输入，比如清洗和预处理
        # 假设 search_documents 返回与预测相关的文章ID
        predicted_article_ids = search_documents(question)

        if not predicted_article_ids:
            print("No relevant documents found. Try another question.")
            continue

        # 获取所有相关文档的内容并选择内容
        all_articles = ''
        for doc_id in predicted_article_ids:
            article = sample_df.loc[doc_id, 'article']  # 假设可以通过doc_id获取到文档内容
            all_articles += article + ' '

        # 根据设置决定使用哪种内容寻找答案
        if use_relevant_content:
            # 找到与问题最相关的内容片段
            top_n_contents = find_top_n_relevant_contents(question, all_articles, top_n=3)
            relevant_context = ' '.join([content[0] for content in top_n_contents])
            context_to_use = relevant_context
        else:
            # 使用全部内容
            context_to_use = all_articles

        # 使用选择的内容寻找答案
        answers = find_answer(question, context_to_use)
        print("Answer(s):")
        for answer, score in answers:
            print(f"- {answer} (confidence: {score:.2f})")




In [None]:
def auto_answer_question(query_text, use_relevant_content=True):
    # 假设ranked_docs是通过某种方式得到的文档ID列表
    ranked_docs = search_documents(query_text)
    if not ranked_docs:
        return "No documents found or query format incorrect."

    # 获取所有相关文档的内容
    all_articles = ''
    for doc_id in ranked_docs:
        # 假设可以通过doc_id获取到文档内容
        article = sample_df.loc[doc_id, 'article']
        all_articles += article + ' '

    # 根据use_relevant_content变量决定使用哪种内容寻找答案
    if use_relevant_content:
        # 找到与问题最相关的内容片段
        top_n_contents = find_top_n_relevant_contents(query_text, all_articles, top_n=3)
        relevant_context = ' '.join([content[0] for content in top_n_contents])
        context_to_use = relevant_context
    else:
        # 使用全部内容
        context_to_use = all_articles

    # 使用选择的内容寻找答案
    answer = find_answer(query_text, context_to_use)
    return answer


In [48]:
# 调用用户交互函数
user_interaction(auto_answer_question,use_relevant_content = False)

Welcome to the QA System!
Type 'exit' to leave the system.
Answer(s):
- mr. trump has said they are a complete fabrication (confidence: 0.58)
- they are a complete fabrication (confidence: 0.52)
- trump has said they are a complete fabrication (confidence: 0.48)
Exiting the QA system. Goodbye!


## B. References

## C. Appendix