## Pegagus

In [35]:
import time
import requests
import nltk
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup

In [4]:
API_KEY = 'LaeKPAYxU9a7LXNw9yzbbw5mydUhg3VQ'

In [5]:
def send_request(topic):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/topstories/v2/'
    url = base_url + '/' + topic + '.json?api-key=' + str(API_KEY)
    response = requests.get(url).json()
    time.sleep(2)
    return response

In [6]:
def parse_responses(response):

    data = {'title': [],
            'section': [],
            'url': [],
            }

    articles = response['results']
    for article in articles:
        title = article['title']
        section = article['section']
        url = article['url']
        if title:
            data['title'].append(title)
            data['section'].append(section)
            data['url'].append(url)

    return pd.DataFrame(data)

In [7]:
def get_data(topic):
    response = send_request(topic)
    return parse_responses(response)

In [8]:
def get_text_from_url(url):
    session = requests.Session()

    req = session.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    paragraphs = soup.find_all('p')

    text = [p.get_text() for p in paragraphs]

    if text[-2].startswith('['):
        return ', '.join(text[4:-2])
    else:
        return ', '.join(text[4:-1])

In [9]:
def add_text_columns(df):

    df = df[:5].copy()
    t = time.time()
    df['article_text'] = [get_text_from_url(url) for url in df.url]
    print(time.time() - t)

    return df

In [12]:
def get_text(topic):

    df = get_data(topic)
    return add_text_columns(df)

In [78]:
def get_summary(title):

    # Get the text of the article based on the title that is given
    text = df.loc[df.title == title].article_text.values
    
    url = df.loc[df.title == title].url.values

    # Make sure article length
    #max_len = min(len(text[0]), 1024)
    #text = text[0][:max_len]

    return text, url

In [36]:
def nest_sentences(document):
    nested = []
    sent = []
    length = 0
    for sentence in nltk.sent_tokenize(document):
        length += len(sentence)
        if length < 1024:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = [sentence]
            length = len(sentence)

    if sent:
        nested.append(sent)
    return nested

In [25]:
topic = "science"

In [26]:
df = get_text(topic)

0.9856843948364258


In [66]:
title = df.title[2]

In [81]:
text,url = get_summary(title)
text, url = text[0], url[0]

In [90]:
from io import StringIO
import sys

result = StringIO()
sys.stdout = result

In [91]:
result.getvalue()

''

In [None]:
result_string = result.getvalue()

In [84]:
h = print(url)

https://www.nytimes.com/2020/10/13/health/eli-lilly-antibody.html


In [85]:
h

In [68]:
nested = nest_sentences(text)

In [69]:
nested[0]

['A government-sponsored clinical trial that is testing an antibody treatment for Covid-19 developed by the drugmaker Eli Lilly has been paused because of a “potential safety concern,” according to emails that government officials sent on Tuesday to researchers at testing sites.',
 'The company confirmed the pause., The news comes just a day after Johnson & Johnson announced the pause of its coronavirus vaccine trial because of a sick volunteer, and a month after AstraZeneca’s vaccine trial was halted over concerns about two participants who had fallen ill after getting the company’s vaccine., The Eli Lilly trial was designed to test the benefits of the antibody therapy on hundreds of people hospitalized with Covid-19, compared with a placebo.',
 'All of the study participants also received another experimental drug, remdesivir, which has become commonly used to treat coronavirus patients.']

In [70]:
import sys
sys.path.append('../')
from src.models.build_model import build_model
from src.configs.yacs_configs import get_cfg_defaults

class SummaryPredictor:
    """
    Summarizes a given text
    """
    def __init__(self, cfg):
        self.cfg = cfg
        self.model, self.tokenizer = build_model(self.cfg)

    def __call__(self, text, cfg):
        """
        Summarize on a single text
        """
        assert isinstance(text, str), "text must be a string"

        input_ids = self.tokenizer.encode(text, return_tensors="pt")
        input_ids = input_ids.to(cfg.DEVICE)
        output = self.model.generate(input_ids,
                                     max_length=cfg.MAX_LENGTH,
                                     num_beams=cfg.NUM_BEAMS,
                                     repetition_penalty=cfg.REPETITION_PENALTY,
                                     length_penalty=cfg.LENGTH_PENALTY,
                                     early_stopping=cfg.EARLY_STOPPING)

        return [self.tokenizer.decode(g,
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True) for g in output][0]

    def generate_long_summary(self, nested_sentences, device):
        '''Generate summary on text with <= 1024 tokens'''

        summaries = []
        for nested in nested_sentences:
            input_tokenized = self.tokenizer.encode(' '.join(nested), truncation=True, return_tensors='pt')
            input_tokenized = input_tokenized.to(device)
            summary_ids = self.model.to(device).generate(input_tokenized,
                                                         length_penalty=3.0,
                                                         min_length=30,
                                                         max_length=100)
            output = [self.tokenizer.decode(g,
                                            skip_special_tokens=True,
                                            clean_up_tokenization_spaces=False) for g in summary_ids]
            summaries.append(output)
        summaries = [sentence for sublist in summaries for sentence in sublist]
        return summaries

In [71]:
cfg = get_cfg_defaults()
cfg.MODEL.DEVICE = 'cpu'
device = 'cpu'

In [72]:
predictor = SummaryPredictor(cfg.MODEL)

In [77]:
nested[:4]

[['A government-sponsored clinical trial that is testing an antibody treatment for Covid-19 developed by the drugmaker Eli Lilly has been paused because of a “potential safety concern,” according to emails that government officials sent on Tuesday to researchers at testing sites.',
  'The company confirmed the pause., The news comes just a day after Johnson & Johnson announced the pause of its coronavirus vaccine trial because of a sick volunteer, and a month after AstraZeneca’s vaccine trial was halted over concerns about two participants who had fallen ill after getting the company’s vaccine., The Eli Lilly trial was designed to test the benefits of the antibody therapy on hundreds of people hospitalized with Covid-19, compared with a placebo.',
  'All of the study participants also received another experimental drug, remdesivir, which has become commonly used to treat coronavirus patients.'],
 ['It is unclear exactly what safety issues prompted the pause., In large clinical trials, 

In [73]:
predictor.generate_long_summary(nested, device)

'A government-sponsored clinical trial that is testing an antibody treatment for Covid-19 has been paused. The company confirmed the pause. The news comes just a day after Johnson & Johnson announced the pause of its coronavirus vaccine trial because of a sick volunteer.\nIt is unclear exactly what safety issues prompted the pause., In large clinical trials, pauses are not unusual. Such halts are meant to allow an independent board of scientific experts to review the data.\nThe N.I.H. statement did not specify the nature of the participants’ conditions. The independent safety board found that after five days of treatment, the group of patients who had received the antibodies showed a different ‘clinical status’\n“It’s so amorphous,” Dr. Topol said. The safety board will review the data again on Oct. 26, and advise the N.I.H.\nEli Lilly is one of several companies pursuing experimental treatments for Covid-19. Mr. Trump has promoted such treatments, without evidence, as a “cure” for his

In [57]:
text = ['More than 38 million people worldwide have been infected with the coronavirus. As of Monday, fewer than five of those cases have been confirmed by scientists to be reinfections.',
 'An 89-year-old woman in the Netherlands died during her second illness. For at least three people, including one patient in Ecuador, the illness was more severe the second time around.',
 'Since the first confirmed case of reinfection, reported in Hong Kong on Aug. 24, there have been three published cases; reports of another 20 await scientific review. It’s impossible to know exactly how widespread the phenomenon is.',
 'A vast majority of people who do get reinfected may go undetected. The man in Hong Kong had no symptoms the second time, and his infection was discovered only because of routine screening at the airport.',
 '“If this was a very common event, we would have seen thousands of cases,” Dr. Iwasaki said. Reinfections can occur for any number of reasons.',
 'In other reinfected patients, genetic factors or the lack of certain previous immune exposures may have blunted the body’s ability to fight off a second attack. In a vast majority of known infected patients, experts said, the immune system functions as it should against other pathogens.',
 'For every confirmed case of reinfection, there are dozens of anecdotal reports of infected people who were sick and seemingly recovered. Usually there are crucial data missing in those cases, like a confirmed lab diagnosis.',
 'The virus may set off an inflammatory response that can flare up even weeks later. In rare cases, some patients may develop a chronic low-grade infection with the virus.',
 'South Korean researchers investigated hundreds of reports of reinfection and were able to rule them out as real cases. Similar procedures would be needed to rule out the possibility of transmission in each patient.',
 'The worry is that the immunity produced by vaccines will not be sufficient in preventing reinfections with the virus. In reality, vaccines have a better chance at generating robust immunity than does natural infection.',
 'Vaccines can also be manipulated to enhance immune memory, in that way producing more lasting, more protective responses. Vaccine trials are designed to look for an absence of disease, not infection.']

In [64]:
print('\n'.join(text))

More than 38 million people worldwide have been infected with the coronavirus. As of Monday, fewer than five of those cases have been confirmed by scientists to be reinfections.
An 89-year-old woman in the Netherlands died during her second illness. For at least three people, including one patient in Ecuador, the illness was more severe the second time around.
Since the first confirmed case of reinfection, reported in Hong Kong on Aug. 24, there have been three published cases; reports of another 20 await scientific review. It’s impossible to know exactly how widespread the phenomenon is.
A vast majority of people who do get reinfected may go undetected. The man in Hong Kong had no symptoms the second time, and his infection was discovered only because of routine screening at the airport.
“If this was a very common event, we would have seen thousands of cases,” Dr. Iwasaki said. Reinfections can occur for any number of reasons.
In other reinfected patients, genetic factors or the lack 