# Assignment 2 
### Kusal Bista

In [2]:
# Libraries for reading data
import random
import numpy as np
import pandas as pd 
import glob
import json
from tqdm import tqdm

# Libraries for pre-processing
import re
import nltk

from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Libraries for information retrieval
import spacy
from spacy import displacy
from spacy.lang.en import English
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Libraries for data analysis
import matplotlib.pyplot as plt
from tabulate import tabulate

In [3]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !pip install tabulate
# !pip install nltk

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\a1881044\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### 1 Reading dataset and pre-processing

In [5]:
news_dataset = pd.read_csv('news_dataset.csv', encoding='latin-1')

In [6]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB..."


In [7]:
# # selecting 100 sample 
# sample_size = 100
# if news_dataset.shape[0] >= sample_size:
#     news_dataset_sm = news_dataset.sample(n=sample_size, random_state=42)  # Adjusting random_state for reproducibility
#     news_dataset_sm.reset_index(drop=True, inplace=True) 
#     print("Sampled dataset shape:", news_dataset_sm.shape)
# else:
#     print("Dataset size is less than the sample size. Cannot perform sampling.")

In [8]:
news_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   author   994 non-null    object
 2   date     1000 non-null   object
 3   year     1000 non-null   object
 4   month    1000 non-null   object
 5   topic    1000 non-null   object
 6   article  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


### 1.2 Handling missing value

In [9]:
print("Missing value")
print(news_dataset.isnull().sum())

Missing value
id         0
author     6
date       0
year       0
month      0
topic      0
article    0
dtype: int64


In [10]:
# Handling missing value
news_dataset['author'] = news_dataset['author'].fillna('No author')
# checking missing value after handling missing value

In [11]:
print("After handling missing value")
print(news_dataset.isnull().sum())

After handling missing value
id         0
author     0
date       0
year       0
month      0
topic      0
article    0
dtype: int64


### 1.3 Data pre-processing

In [12]:
def pre_process(data):
    # Define stop words
    stop_words = set(stopwords.words('english'))
    stop_words.update(["This", "The", "the"])

    s = " \[(?=.*\d).*?\]" 

    # Lemmatization and removal of stopwords
    lemmatizer = WordNetLemmatizer()

    result = []
    for text in data:
        # Clean text
        # Remove non-ASCII characters
        text = ''.join([char for char in text if ord(char) < 128])

        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)

        # Remove question mark problems
        text = re.sub(r'(\s\?)',' ',text)
        text = re.sub(r"\b\?\b", "\'", text)
        text = re.sub(r"(,\?)",",", text)
        text = re.sub(r"\?+", "?", text)
        text = text.strip()

        # Lemmatization and removal of stopwords
        processed_text = " ".join([lemmatizer.lemmatize(word) for word in re.sub(s, "", text).split() if word.lower() not in stop_words])

        result.append(processed_text)

    return result

In [13]:
news_dataset['processed_article'] = pre_process(news_dataset['article'])

In [14]:
news_dataset.head()

Unnamed: 0,id,author,date,year,month,topic,article,processed_article
0,17307,Marlise Simons,1/01/2017,2017,1,architecture,PARIS ? When the Islamic State was about to...,PARIS Islamic State driven ancient city Palmyr...
1,17292,Andy Newman,31/12/2016,2016,12,art,Angels are everywhere in the Mu?iz family?s ap...,Angels everywhere Mu'iz family's apartment Bro...
2,17298,Emma G. Fitzsimmons,2/01/2017,2017,1,business,Finally. The Second Avenue subway opened in Ne...,Finally. Second Avenue subway opened New York ...
3,17311,Carl Hulse,3/01/2017,2017,1,business,WASHINGTON ? It?s or time for Republica...,WASHINGTON time Republicans. tumultuous decade...
4,17339,Jim Rutenberg,5/01/2017,2017,1,business,"For Megyn Kelly, the shift from Fox News to NB...","Megyn Kelly, shift Fox News NBC host daily day..."


In [15]:
class NamedEntityRecognition:
    def __init__(self):
        self.ner_dict = {}
        self.nlp = spacy.load("en_core_web_sm")

    def example(self, article):
        """
        Display named entities in the given document using displacy.
        """
        text = self.nlp(article)
        displacy.render(text, style="ent", jupyter=True)
    
    def get_ner(self, data):
        """
        Extract named entities from the given data and store them in a dictionary.
        
        Args:
        data (DataFrame): DataFrame containing 'id' and 'clean' columns.
        
        Returns:
        dict: A dictionary containing named entities for each document.
        """
        for i in range(data.shape[0]):
            id = data['id'][i]
            text = self.nlp(str(data['processed_article'][i]))
            if id not in self.ner_dict:
                self.ner_dict[id] = [{"text": ent.text.strip(), "label": ent.label_} for ent in text.ents]
            else:
                existing_entities = set((entity['text'], entity['label']) for entity in self.ner_dict[id])
                new_entities = [{"text": ent.text.strip(), "label": ent.label_} for ent in text.ents
                                if (ent.text.strip(), ent.label_) not in existing_entities]
                self.ner_dict[id].extend(new_entities)
        return self.ner_dict

In [16]:
# Initializing NamedEntityRecognition object
ner = NamedEntityRecognition()

# Process example document and display named entities
ner.example(news_dataset['processed_article'][1][:800])

In [17]:
article_ner_dict = ner.get_ner(news_dataset)

In [18]:
article_ner_dict

{17307: [{'text': 'PARIS Islamic State', 'label': 'ORG'},
  {'text': 'March', 'label': 'DATE'},
  {'text': 'Yves Ubelmann', 'label': 'PERSON'},
  {'text': 'Syria', 'label': 'GPE'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': '36', 'label': 'DATE'},
  {'text': 'Syria', 'label': 'GPE'},
  {'text': 'Paris', 'label': 'GPE'},
  {'text': 'Islamists', 'label': 'NORP'},
  {'text': 'Houmam Saad', 'label': 'PERSON'},
  {'text': 'Syrian', 'label': 'NORP'},
  {'text': 'four day', 'label': 'DATE'},
  {'text': 'four', 'label': 'CARDINAL'},
  {'text': 'six', 'label': 'CARDINAL'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': 'Iconem', 'label': 'GPE'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': 'today', 'label': 'DATE'},
  {'text': 'Syria', 'label': 'GPE'},
  {'text': 'Iraq', 'label': 'GPE'},
  {'text': 'Islamic', 'label': 'NORP'},
  {'text': 'Ubelmann', 'label': 'PERSON'},
  {'text': 'Grand Palais Paris', 'label': 'FAC'},
  {'text': 'Jan. 9', 'label': 'DATE'},
  {'text': 

In [19]:
print("A example of entities is \n", article_ner_dict[17292], '\n',article_ner_dict[18465])

A example of entities is 
 [{'text': 'Bronx', 'label': 'GPE'}, {'text': 'angel wall', 'label': 'PERSON'}, {'text': 'Jos', 'label': 'PERSON'}, {'text': 'Zoraida', 'label': 'PERSON'}, {'text': 'Puerto Rico', 'label': 'GPE'}, {'text': 'Dickens', 'label': 'PERSON'}, {'text': 'Zoraida', 'label': 'PERSON'}, {'text': 'Jos', 'label': 'PERSON'}, {'text': 'Vietnam', 'label': 'GPE'}, {'text': '14.', 'label': 'CARDINAL'}, {'text': '29', 'label': 'CARDINAL'}, {'text': 'New York', 'label': 'GPE'}, {'text': '1983', 'label': 'DATE'}, {'text': '1987', 'label': 'DATE'}, {'text': "Mu'iz", 'label': 'PERSON'}, {'text': 'eight 10 day', 'label': 'DATE'}, {'text': 'first', 'label': 'ORDINAL'}, {'text': 'Jos? Jr.', 'label': 'PERSON'}, {'text': '2 six', 'label': 'CARDINAL'}, {'text': "Mu'iz", 'label': 'PERSON'}, {'text': '50', 'label': 'DATE'}, {'text': "Mu'izes", 'label': 'ORG'}, {'text': 'Westchester Avenue Bronx', 'label': 'FAC'}, {'text': "Mu'iz", 'label': 'PERSON'}, {'text': 'second', 'label': 'ORDINAL'}, 

In [20]:
count = 0
selected_article_ner = {}
for key, value in article_ner_dict.items():
    selected_article_ner[key] = value
    count += 1
    if count == 100:
        break

print(selected_article_ner.keys())

dict_keys([17307, 17292, 17298, 17311, 17339, 17340, 17342, 17344, 17346, 17284, 17300, 17302, 17314, 17319, 17323, 17325, 17336, 17345, 17355, 17285, 17286, 17297, 17306, 17308, 17327, 17347, 17296, 17337, 17318, 17360, 17361, 17362, 17363, 17283, 17287, 17289, 17295, 17301, 17305, 17309, 17312, 17313, 17317, 17321, 17328, 17330, 17332, 17333, 17334, 17335, 17349, 17350, 17352, 17353, 17354, 17356, 17358, 17364, 17365, 17294, 17303, 17331, 17326, 17341, 17291, 17366, 17367, 17368, 17369, 17370, 17371, 17374, 17376, 17378, 17379, 17381, 17382, 17383, 17384, 17385, 17386, 17387, 17388, 17389, 17390, 17391, 17392, 17393, 17394, 17395, 17396, 17397, 17398, 17399, 17400, 17401, 17402, 17403, 17404, 17406])


In [21]:
class KnowledgeBase:
    def __init__(self, data):
        self.data = data
        self.kb= {}
        self.nlp = spacy.load("en_core_web_sm")

    def getSentences(self,text):
        nlp = English()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        document = nlp(text)
        return [sent.string.strip() for sent in document.sents]

    def appendChunk(self, original, chunk):
        return original + ' ' + chunk

    def isRelationCandidate(self, token):
        deps = ["ROOT", "adj", "attr", "agent", "amod"]
        return any(subs in token.dep_ for subs in deps)

    def isConstructionCandidate(self, token):
        deps = ["compound", "prep", "conj", "mod"]
        return any(subs in token.dep_ for subs in deps)

    def processSubjectObjectPairs(self, tokens):
        sub = ''
        obj = ''
        relation = ''
        subjectConstruction = ''
        objectConstruction = ''
        for token in tokens:
            if "punct" in token.dep_:
                continue
            if self.isRelationCandidate(token):
                relation = self.appendChunk(relation, token.lemma_)
            if self.isConstructionCandidate(token):
                if subjectConstruction:
                    subjectConstruction = self.appendChunk(subjectConstruction, token.text)
                if objectConstruction:
                    objectConstruction = self.appendChunk(objectConstruction, token.text)
            if "subj" in token.dep_:
                sub = self.appendChunk(sub, token.text)
                sub = self.appendChunk(subjectConstruction, sub)
                subjectConstruction = ''
            if "obj" in token.dep_:
                obj = self.appendChunk(obj, token.text)
                obj = self.appendChunk(objectConstruction, obj)
                objectConstruction = ''

        return (sub.strip(), relation.strip(), obj.strip())

    def auto_build_kb(self):
        for i in range(len(self.data)):
            doc = self.data['processed_article'][i]
            sentences = list(doc.split('.'))
            for sentence in sentences:                 
                s = self.nlp(sentence)
                sub, rel, obj = self.processSubjectObjectPairs(s)
                # set conditions
                if sub not in self.kb.keys() and sub.strip() != "" and rel.strip() != "" and obj.strip() != "":
                    self.kb[sub] = [{"relation":rel, "object": obj}]
                elif sub in self.kb.keys() and sub.strip() != "" and rel.strip() != "" and obj.strip() != "":
                    self.kb[sub].append({"relation":rel, "object": obj})
        print(self.kb)           


    def df_plot(self):
        """Get DataFrame from kb dictionary"""
        subjects, relations, objects = [],[],[]  
        for k in self.kb.keys():
            print(k)
            subjects.append(k)
            for i in range(len(self.kb[k])):
              relations.append(self.kb[k][i]['relation'])
              objects.append(self.kb[k][i]['object'])

        kg_df = pd.DataFrame({'subject':subjects,'relation':relations, 'object': objects})       
        return kg_df

    def knowledge_graph(self):
        """knowledge graph"""
        knowledge_dict = {}
        for name in self.kb.keys():
            for i in range(len(self.kb[name])):
              knowledge_dict[name] = self.kb[name][i]['object']

        # Create a new graph
        G = nx.Graph()

        # Add nodes to the graph
        G.add_nodes_from(knowledge_dict.keys())

        # Add edges to the graph
        for node, neighbors in knowledge_dict.items():
            for neighbor in neighbors:
                G.add_edge(node, neighbor)

        # Set node positions
        pos = nx.spring_layout(G)

        # Draw the graph
        nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=1000)
        nx.draw_networkx_edges(G, pos, edge_color='grey')
        nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')

        # Show the plot
        plt.axis('off')
        plt.show()


    def search(self, query):
        candidates = []
        for name in self.kb.keys():  
            if query == name:
                candidates.extend(self.kb[name]) 
        if candidates == []:
          print("There is no associated name for this canonical name.")
        return candidates

In [22]:
# build KB2 using the first 30 articles
KB2 = KnowledgeBase(news_dataset)
# Auto build kb
KB2.auto_build_kb()



In [23]:
print(KB2.kb['Islamists'])

[{'relation': '2015 send human mythical wing', 'object': 'video militant figure campaign'}]


In [24]:
KB2.search("Islamists")

[{'relation': '2015 send human mythical wing',
  'object': 'video militant figure campaign'}]

In [25]:
def InvertedIndex(query, ent_dic, data):
    """
    Retrieve a set of documents containing the given query term
    Input:
    query -> one word representing the query term
    ent_dic -> control the range of entities
    data -> control the size of data
    output:
    doc_index -> list of document IDs
    """
    # empty dictionary to store a set of documents
    docs_index = []

    # Retrieve the set of documents containing each query term
    # k: docid; v: name entities and labels
    for k, v in ent_dic.items():
        for i in v[0]['text'].split():
           if query == i:
              # if doc_id not in docs_index list
              if k not in docs_index:
                  docs_index.append(k)

    # In case cannot find the named_entities, use query to match text
    for i in range(data.shape[0]):
       if query in data['processed_article'][i]:
          id = data['id'][i]
          if id not in docs_index:
            docs_index.append(id)

    return docs_index


In [26]:
# Loading spaCy model
nlp_md = spacy.load("en_core_web_md")

class TextMatchingUtility:
    def __init__(self, entities, dataset, kb):
        """
        Initialize the TextMatchingUtility object.

        Args:
        entities: Named entity recognition result.
        dataset: Dataset containing articles.
        kb: Knowledge base for entity mapping.
        """
        self.ner = entities  # Named Entity Recognition result
        self.data = dataset  # Dataset
        self.kb = kb  # Knowledge Base
        self.nlp = spacy.load("en_core_web_sm")

    def preprocess_query(self, query):
        """
        Preprocess the query by removing stopwords and lemmatizing.

        Args:
        query: Input query string.

        Returns:
        List containing preprocessed query.
        """
        # Regular expression to match text patterns
        s = " \[(?=.*\d).*?\]"

        # Removing stopwords and Lemmatization
        stop_words = stopwords.words('english')
        stop_words.extend(["This", "The", "the"])
        lemmatizer = WordNetLemmatizer()
        result = [" ".join([lemmatizer.lemmatize(word) for word in re.sub(s, "", query).split() if word not in stop_words])]
        return result
    
    def get_article_ids(self, query):
        """
        Use inverted index method to get article IDs relevant to the query.

        Args:
        query: List of words in the query.

        Returns:
        List of article IDs.
        """
        index_dic = []
        if len(query) == 1:
            docs_index = InvertedIndex("".join(query), self.ner, self.data) 
            index_dic.extend(docs_index)
        else:
            for i in query:
                # Use inverted index method
                docs_index =  InvertedIndex(i, self.ner, self.data) 
                index_dic.extend(docs_index)
        # Avoid repeated doc_id
        index_dic = list(set(index_dic))
        return index_dic

    def tf_idf_score(self, query, articles):
        """
        Calculate TF-IDF similarity score between query and articles.

        Args:
        query: Preprocessed query.
        articles: List of articles.

        Returns:
        Array of similarity scores.
        """          
        vectorizer = TfidfVectorizer()
        # Convert to word vector
        articles_wv = vectorizer.fit_transform(articles)
        # Convert to word vector
        query_wv = vectorizer.transform(query)
        # Calculate similarity
        similarities = cosine_similarity(query_wv, articles_wv)[0]
        return similarities

    def spacy_score(self, query, articles):
        """
        Calculate SpaCy similarity score between query and articles.

        Args:
        query: Preprocessed query.
        articles: List of articles.

        Returns:
        List of similarity scores.
        """
        # Convert to word vector
        query_nlp = nlp_md(str(query))
        # Convert to word vector
        articles_nlp = [nlp_md(article) for article in articles]
        # Calculate similarity
        similarities = [query_nlp.similarity(article_nlp) for article_nlp in articles_nlp]
        return similarities
    
    def get_best_sentence(self, query, article, article_id, word_vector):
        """
        Get the best matching sentence in an article for a query. 

        Args:
        query: Preprocessed query.
        article: Article text.
        article_id: ID of the article.
        word_vector: Word vector method ("tf-idf" or "spaCy").

        Returns:
        Tuple containing best matching sentence and its score.
        """
        # Convert text into sentences
        sentences_clean = tokenize.sent_tokenize(article)

        # Calculate similarity
        if word_vector == "tf-idf":
            similarities = self.tf_idf_score(query, sentences_clean)
        elif word_vector == "spaCy":
            similarities = self.spacy_score(query, sentences_clean)

        # Get the maximum score index position
        best_idx = np.array(similarities).argmax()
        # Get the best score
        best_score = max(similarities)
        # Get original data
        for j in range(len(self.data['id'])):
            if self.data['id'][j] == article_id:
                topic = self.data['article'][j]
        sentences_topic = tokenize.sent_tokenize(topic)
        answer = sentences_topic[best_idx]
        return answer, best_score
        

    def get_topk_answers(self, query, article_ids, k, word_vector):
        """
        Get top-k answers based on similarity scores between query and articles.

        Args:
        query: Preprocessed query.
        article_ids: List of article IDs.
        k: Number of top results to retrieve.
        word_vector: Word vector method ("tf-idf" or "spaCy").

        Returns:
        DataFrame containing top-k answers and scores.
        """
        answer_score_df = pd.DataFrame()
        # Store article ids
        answer_score_df['id'] = article_ids       

        # Get articles based on ids
        articles = []
        for i in article_ids:
            for j in range(len(self.data['id'])):
                if self.data['id'][j] == i:
                    articles.append(self.data['processed_article'][j])
        answer_score_df['article'] = articles

        # Select method according to word_vector
        if word_vector == "tf-idf":
            similarities = self.tf_idf_score(query, articles)
        elif word_vector == "spaCy":
            similarities = self.spacy_score(query, articles)
        
        # Store similarity score
        answer_score_df['article_Score'] = similarities

        # Sort articles by their similarity scores and get the top results
        answer_score_df2 = answer_score_df.sort_values(by='article_Score', ascending=False)[:k]

        # Select the best answer sentence in the top k articles
        answer_list = []
        scores = []
        cnt = 0
        for index, row in answer_score_df2.iterrows():  
            article = row['article']
            article_id = row['id']
            answer, best_score = self.get_best_sentence(query, article, article_id, word_vector)
            answer_list.append(answer)
            scores.append(best_score)
            cnt += 1
            if cnt == k:
                break
        answer_score_df2['answer'] = answer_list
        answer_score_df2['score'] = scores

        # Sort the best answer according to sentence score
        answer_score_final = answer_score_df2.sort_values(by='score', ascending=False)[:k]

        return answer_score_final
    
    def text_matching(self, question, k=3, word_vector="spaCy"):
        """
        Perform text matching to find answers to the question.

        Args:
        question: Input question string.
        k: Number of top results to retrieve (default is 3).
        word_vector: Word embedding method ("tf-idf" or "spaCy", default is "spaCy").

        Returns:
        DataFrame containing top-k answers and scores.
        """
        # Question pre-processing
        query = self.preprocess_query(question)

        # Use NER to identify named entities in the question
        query_ents = [ent.text.strip() for ent in self.nlp(str(query)).ents]

        # Use InvertedIndex method and NER to get article IDs
        if query_ents == []:
            article_ids = self.get_article_ids(query)
        else:
            article_ids = self.get_article_ids(query_ents)

        # Use KB to find associated names of named entities in the question
        if not article_ids:
            query_str = " ".join(query)
            if len(query) == 1 and len(query_str.split(" ")) == 1:
                lst = self.kb.search(query_str)
                for i in lst:
                    article_ids.extend(self.get_article_ids(i))
            else:
                for word in query_str.split(" "):
                    lst = self.kb.search(word)
                    for i in lst:
                        article_ids.extend(self.get_article_ids(i)) 

        # If article_ids is empty, finish the function 
        if not article_ids:
            print("Sorry, no answer for this question.")
            return None

        # Remove repeated article_ids
        article_ids = list(set(article_ids))

        # Get top k articles answer and score   
        top_results = self.get_topk_answers(query, article_ids, k, word_vector)
        
        return top_results

    def display_results(self, top_results): 
        """
        Display top-k results.

        Args:
        top_results: DataFrame containing top-k answers and scores.
        """
        if top_results is not None:       
            for index, row in top_results.iterrows():
                print("The article id is ", row['id'])  
                print("The answer is \n", row['answer'])
                print("The score is ", row['score'],"\n")
        return None


In [27]:
# Create an instance of TextMatchingUtility
tm_utility = TextMatchingUtility(article_ner_dict, news_dataset, KB2)

# Sample question
question = "What Hollywood actor lent his name to the event and received some of the loudest cheers of the evening?"

# Perform text matching and get top results
top_results = tm_utility.text_matching(question)

# Display top results
tm_utility.display_results(top_results)

The article id is  17509
The answer is 
 included HBO, the Weinstein Company, Fox, NBCUniversal, Netflix, Amazon, Warner Bros. and InStyle.
The score is  0.5151522631183384 

The article id is  17510
The answer is 
 And when it comes to the red carpet, at least pretending to dress as yourself as opposed to, say, a cut flower or Disney caricature, has power.
The score is  0.48566571376004464 

The article id is  17511
The answer is 
 It was a fitting choice, if not an especially hilarious one, at the most escapist of awards shows.
The score is  0.47108653202391615 



In [28]:
# question for text matching
question = "What Hollywood actor lent his name to the event and received some of the loudest cheers of the evening?"
result = matching.text_matching(question, k=3,word_vector="spaCy")
print("Question: ",question)
matching.display_results(result)   

NameError: name 'matching' is not defined

## A. Tasks as specified for your team structure

**One headings for each task.**

## B. References

## C. Appendix