# Assignment 2 
### \<name1> \<id1>
### \<name2> \<id2>
### \<name3> \<id3>

In [72]:
# This model can only run on python 3.7, for 3.11 doesn't support neurcoef.
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
import neuralcoref
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering



In [74]:

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load data
df = pd.read_csv('news_dataset.csv', encoding='ISO-8859-1')
# test with the previous 10
df_sample = df[:10]
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load Spacy model and add neuralcoref to pipe
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

def preprocess_text(text):
    # Resolve coreferences
    doc = nlp(text)
    text = doc._.coref_resolved

    # Split the text into sentences using spaCy's sentence segmentation
    sentences = list(doc.sents)

    # Process each sentence
    processed_sentences = []
    for sentence in sentences:
        # Convert sentence to lower case
        sentence_text = sentence.text.lower()
        # Remove non-alphanumeric characters
        sentence_text = re.sub(r'[^\w\s]', ' ', sentence_text)
        # Tokenize
        tokens = word_tokenize(sentence_text)
        # Lemmatize and remove stopwords
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        # Join tokens back to string
        processed_sentence = ' '.join(lemmatized_tokens)
        processed_sentences.append(processed_sentence)

    # Join all processed sentences back to a single string
    return processed_sentences

# Apply preprocessing to each article in the dataframe
df_sample['processed_article'] = df_sample['article'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yulun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yulun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yulun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [110]:
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
# load data
df = pd.read_csv('news_dataset.csv', encoding='ISO-8859-1')
# test with the previous 10
df_sample = df[:10]
# 下载必要的 NLTK 数据集
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 初始化文本处理工具
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm', disable=["ner", "parser"])  # 加载模型，禁用不需要的管道组件以提升速度
nlp.add_pipe(nlp.create_pipe('sentencizer'))
neuralcoref.add_to_pipe(nlp)

def sliding_window(text, window_size=512, step_size=256):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Generate windows
    for i in range(0, len(tokens), step_size):
        window_tokens = tokens[i:i+window_size]
        yield ' '.join(window_tokens)

def preprocess_text(text, window_size=512, step_size=256):
    # 使用NeuralCoref进行指代消解
    doc = nlp(text)
    text = doc._.coref_resolved

    # 生成处理后的文本窗口
    processed_windows = []
    for window in sliding_window(text, window_size, step_size):
        # 清洗文本
        clean_text = re.sub(r'[^\w\s]', ' ', window.lower())
        tokens = word_tokenize(clean_text)
        # 过滤停用词并进行词形还原
        filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        processed_window = ' '.join(filtered_tokens)
        processed_windows.append(processed_window)

    return processed_windows
df_sample['processed_article'] = df_sample['article'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yulun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yulun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yulun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [105]:
df_sample

Unnamed: 0,id,author,date,year,month,topic,article,processed_article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...,"[paris, islamic state drive ancient city palmy..."
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...,[angel everywhere mu iz family apartment bronx...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...,"[finally, second avenue subway open new york c..."
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...,"[washington, time republican, tumultuous decad..."
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB...",[megyn kelly shift fox news nbc host daily day...
5,17340,John Koblin and Michael M. Grynbaum,4/01/2017,2017,1,business,Megyn Kelly?s new office at NBC News sits a bl...,[megyn kelly new office nbc news sit block nor...
6,17342,Farhad Manjoo,5/01/2017,2017,1,business,"In the technology industry, the sharks have ne...",[technology industry shark never long safe min...
7,17344,Chris Buckley and Adam Wu,5/01/2017,2017,1,business,BEIJING ? A city official in southwest Chin...,"[beijing, city official southwest china unleas..."
8,17346,Beverly Gage,7/01/2017,2017,1,business,Our new president is a billionaire Ivy Leag...,[new president billionaire ivy league graduate...
9,17284,Benjamin Mueller and Al Baker,19/06/2017,2017,6,crime,"After the bullet shells get counted, the blood...",[bullet shell get count blood dry votive candl...


In [93]:
from collections import Counter
import numpy as np

def calculate_f1(predicted, truth):
    pred_tokens = predicted.split()
    truth_tokens = truth.split()
    common_tokens = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common_tokens.values())

    if num_same == 0:
        return 0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def calculate_mrr(rank_list):
    """ Calculates Mean Reciprocal Rank (MRR) """
    mrr = 0
    for rank in rank_list:
        mrr += 1.0 / rank
    mrr /= len(rank_list)
    return mrr


In [111]:
import numpy as np

def evaluate_qa_system(ids, questions, correct_answers, df_sample, model_name="distilbert-base-cased-distilled-squad", confidence_threshold=0.3):
    assert len(questions) == len(correct_answers) == len(ids)  # 确保问题、答案和 ID 数量相同

    predictions = []
    results = []
    ranks = []
    
    for id, question, correct_answer in zip(ids, questions, correct_answers):
        # 使用提供的 ID 来找到对应的段落
        try:
            passages = df_sample[df_sample['id'] == id]['processed_article'].iloc[0]
        except IndexError:
            print(f"No passage found for ID: {id}")
            continue

        # 调用 find_best_answer 函数来寻找最佳答案
        predicted_answer = find_best_answer(question, id, df_sample, model_name)
        predictions.append(predicted_answer)

        # 计算 F1 分数
        f1_score = calculate_f1(predicted_answer, correct_answer)
        results.append({
            "Question": question,
            "Search Answer": predicted_answer,
            "Correct Answer": correct_answer,
            "Score": f1_score
        })

        # 添加排名以计算 MRR
        rank = 1 if predicted_answer == correct_answer else 2
        ranks.append(rank)

    # 计算总体 F1 和 MRR
    f1_scores = [result['Score'] for result in results]
    mean_f1 = np.mean(f1_scores)
    mrr = calculate_mrr(ranks)

    print("F1 Scores:", f1_scores)
    print("Mean F1 Score:", mean_f1)
    print("MRR:", mrr)
    print("Detailed Results:")
    for result in results:
        print(f"Question: {result['Question']} | Search Answer: {result['Search Answer']} | Correct Answer: {result['Correct Answer']} | Score: {result['Score']}")

questions = [
    "What is the title of the exhibition discussed in the article?",
    "Who opened the 'Eternal Sites' exhibition and how did he describe it?",
    "What is the purpose of showing the beauty of Middle Eastern heritage according to the article?"
]

correct_answers = [
    "Eternal Sites: From Bamiyan to Palmyra",
    "President François Hollande opened the exhibition and described it as an act of resistance against terror and intolerance.",
    "Showing the beauty of the Middle Eastern heritage is the best answer to the Islamist propaganda of hate, destruction, and death."
]

ids = [17307, 17307, 17307]  # Assuming all questions refer to the same article identified by ID 17307


evaluate_qa_system(ids, questions, correct_answers, df_sample)

F1 Scores: [0.007142857142857143, 0.027303754266211604, 0.07142857142857142]
Mean F1 Score: 0.035291727612546724
MRR: 0.5
Detailed Results:
Question: What is the title of the exhibition discussed in the article? | Search Answer: Question: What is the title of the exhibition discussed in the article?
Best Answer: [CLS]
From Sentence: pillaging neglect destruction accessible public mr said aimed mobilize public opinion face devastation unique heritage besides image palmyra multimedia show project enormous photograph video immersing visitor different era including ancient iraqi city khorsabad around 700 b c mosque damascus medieval christian citadel mr ubelmann dismissed criticism collaboration government syrian president bashar working pro bono government help archaeologist mr said shared work syrian archaeologist mr said adding also train colleague later paramount memory potential restoration last year mr team flown drone 20 historic site syria recently syria moved zone iraq close front

In [99]:
questions = [ "What was the purpose of the 'Eternal Sites: From Bamiyan to Palmyra' exhibition at the Grand Palais in Paris?", "Who's early life in Puerto Rico was like something from a tropical Dickens novel?"]
correct_answers = ["terrorists", "Zoraida"]
ids = [17307, 17292] 

evaluate_qa_system(ids, questions, correct_answers, df_sample)

F1 Scores: [0, 0]
Mean F1 Score: 0.0
MRR: 0.5
Detailed Results:
Question: What was the purpose of the 'Eternal Sites: From Bamiyan to Palmyra' exhibition at the Grand Palais in Paris? | Search Answer: Question: What was the purpose of the 'Eternal Sites: From Bamiyan to Palmyra' exhibition at the Grand Palais in Paris?
Best Answer: collect reliable data area
From Sentence: collect reliable data area
Confidence: 0.4978686054394075 | Correct Answer: terrorists | Score: 0
Question: Who's early life in Puerto Rico was like something from a tropical Dickens novel? | Search Answer: Question: Who's early life in Puerto Rico was like something from a tropical Dickens novel?
Best Answer: uncle judge
From Sentence: permission uncle judge marry
Confidence: 0.5903977155685425 | Correct Answer: Zoraida | Score: 0


In [106]:
df_sample['processed_article'][0]

['paris',
 'islamic state drive ancient city palmyra march yves ubelmann get call syria director antiquity come hurry',
 'architect train mr ubelmann 36 work syria country engulf war',
 'special urgency kind work youthful team architect mathematician designer cramped office paris produce digital copy threatened historical site',
 'palmyra part already destroy islamist deem monument idolatrous still rig explosive',
 'houmam saad syrian colleague spend four day fly drone robot camera crumbled arch temple',
 'drone four six rotor hover really close register structural detail every crack hole take precise measurement say mr ubelmann found company iconem',
 'stuff architect archaeologist need',
 'need new push virtual preservation scientist archaeologist others like mr ubelmann compile large scale',
 'record could use create computer model would show monument endanger historical site might one day restore repaired reconstruct',
 'special interest today ancient site syria also iraq suffer wa

## A. Tasks as specified for your team structure

**One headings for each task.**

## B. References

## C. Appendix