In [66]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('punkt') # one time execution
import re




In [67]:
entire_df = pd.read_csv("capstone_data.csv")

In [68]:
#Generate a single article from all the articles. 
text=entire_df ['text_body'][1]
text

"Introduction\n\nThe sudden emergence of severe acute respiratory syndrome (SARS) in April 2003 caused much concern and reaction. Refereed medical journals ever since have been rife with articles about SARS. The eventual containment and treatment of SARS has seen a diminution of the massive media publicity and overt public concern. However, fears have recently surfaced about the potential for re-emergence of SARS in the near future. As we confront the potential need to return to more stringent infection control measures once again, this is an appropriate time to reflect on the ethical values that underlay the strict visitation restrictions imposed in hospitals in Ontario during the SARS outbreak and the moderate restrictions in place since SARS. This reflection will facilitate future decision making with respect to visitation restrictions.\n\nWhen public health trumps civil liberties: the collateral damage associated with victims of SARS\n\nOur infectious disease colleagues are adamant

In [69]:
#Split Text into Sentences
# break the text into individual sentences. using the sent_tokenize( ) function of the nltk library to do this.
#Flattening lists means converting a multidimensional or nested list into a one-dimensional list.
from nltk.tokenize import sent_tokenize
sentences= sent_tokenize(text)




In [70]:
#Let’s print a few elements of the list sentences.

sentences[:5]

['Introduction\n\nThe sudden emergence of severe acute respiratory syndrome (SARS) in April 2003 caused much concern and reaction.',
 'Refereed medical journals ever since have been rife with articles about SARS.',
 'The eventual containment and treatment of SARS has seen a diminution of the massive media publicity and overt public concern.',
 'However, fears have recently surfaced about the potential for re-emergence of SARS in the near future.',
 'As we confront the potential need to return to more stringent infection control measures once again, this is an appropriate time to reflect on the ethical values that underlay the strict visitation restrictions imposed in hospitals in Ontario during the SARS outbreak and the moderate restrictions in place since SARS.']

In [71]:
# Extract word vectors
#GloVe is a commonly used algorithm for natural language processing (NLP). It was trained on Wikipedia and Gigawords.
#Get the word embeddings
# https://www.kaggle.com/datasets/thanakomsn/glove6b300dtxt
word_embeddings = {}
f = open("archive/glove.6B.300d.txt", encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [72]:
#Getting the number of word embeddings in this glove model 
len(word_embeddings)


400000

In [73]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [74]:
#Get rid of the stopwords (commonly used words of a language – is, am, the, of, in, etc.) present in the sentences. 
#If you have not downloaded nltk-stopwords, then execute the following line of code:

nltk.download('stopwords')
#Now we can import the stopwords.

from nltk.corpus import stopwords
stop_words = stopwords.words('english')
#Let’s define a function to remove these stopwords from our dataset.

# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
#We will use clean_sentences to create vectors for sentences in our data with the help of the GloVe word vectors.

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


In [75]:
clean_sentences

['introduction sudden emergence severe acute respiratory syndrome (sars) april 2003 caused much concern reaction.',
 'refereed medical journals ever since rife articles sars.',
 'eventual containment treatment sars seen diminution massive media publicity overt public concern.',
 'however, fears recently surfaced potential re-emergence sars near future.',
 'confront potential need return stringent infection control measures again, appropriate time reflect ethical values underlay strict visitation restrictions imposed hospitals ontario sars outbreak moderate restrictions place since sars.',
 'reflection facilitate future decision making respect visitation restrictions.',
 'public health trumps civil liberties: collateral damage associated victims sars infectious disease colleagues adamant restricting movement people around hospital setting effective clinical epidemiological strategies help protect vulnerable patient population health care providers themselves, need stay healthy may care 

In [76]:
# Extract word vectors
word_embeddings = {}
f = open("archive/glove.6B.300d.txt", encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [77]:
#Now, let’s create vectors for our sentences. 
#We will first fetch vectors (each of size 100 elements) for the constituent words in a sentence and then 
#take mean/average of those vectors to arrive at a consolidated vector for the sentence.

sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((300,))) for w in i.split()])/(len(i.split())+0.003)
  else:
    v = np.zeros((300,))
  sentence_vectors.append(v)

In [78]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])
#We will use Cosine Similarity to compute the similarity between a pair of sentences.

from sklearn.metrics.pairwise import cosine_similarity
#And initialize the matrix with cosine similarity scores.

for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,300), sentence_vectors[j].reshape(1,300))[0,0]
 

In [79]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
 

In [80]:
#Summary Extraction
#Finally,  extract the top N sentences based on their rankings for summary generation.
summary = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
# Extract no1 ranked sentences as the summary
for i in range(1):
    print(summary[i][1])

It could be argued that visitation restrictions, in light of a potential outbreak of a contagious disease, are ethically sound because of the compelling need to protect public health.However, even when public health concerns trump individual liberties, the ethical operationalization of this value would demand that 'those whose rights are being infringed' need to be managed in 'an ethical and even-handed manner so that they are not unfairly or disproportionately harmed by such measures' [1].This is an important and far-reaching consideration because SARS caused collateral damage and we know that the implementation of visitation restrictions will have an impact on a broad range of individuals.


In [81]:
#Evaluation
#Read reference summary
ref_summary = '''Our infectious disease colleagues are adamant that restricting the movement of people into and around the hospital setting are effective clinical and epidemiological strategies that will help protect both the vulnerable patient population and health care providers themselves, 
who need to stay healthy so that they may care for their patients. In a health care institution, visitation restrictions not only affect inpatients but also have an impact on ambulatory patients who must come for diagnostic tests or interventions and who, if deprived access, might develop urgent or emergent conditions.
Feedback should be sought from those individuals who would be affected by visitation restrictions, such as staff, patients and family members.Health care workers, being in direct communication with patients and families, bear the brunt of their anger and frustration regarding any restriction in visitation.
If a family is allowed to visit a patient whose death is presumed to be imminent, then the patient's identity should be protected by using privacy strategies.
'''
summary = '''It could be argued that visitation restrictions, in light of a potential outbreak of a contagious disease, 
are ethically sound because of the compelling need to protect public health.However, even when public health concerns trump 
individual liberties, the ethical operationalization of this value would demand that 'those whose rights are being infringed' 
need to be managed in 'an ethical and even-handed manner so that they are not unfairly or disproportionately harmed by such 
measures' [1].This is an important and far-reaching consideration because SARS caused collateral damage and we know that the 
implementation of visitation restrictions will have an impact on a broad range of individuals.'''

#The abstract is the target: Text containing the target (ground truth) text.- The Gold Standard
# the summary generated is the prediction: Text containing the predicted text.


from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1','rouge2'], use_stemmer=True)
scores = scorer.score(ref_summary,summary)

scores

{'rouge1': Score(precision=0.44036697247706424, recall=0.2962962962962963, fmeasure=0.3542435424354244),
 'rouge2': Score(precision=0.09259259259259259, recall=0.062111801242236024, fmeasure=0.07434944237918215)}