In [1]:
import bs4  # Scraping HTML
import requests  # Making Requests to a websites

import spacy
import pytextrank

import pandas as pd # Data Wrangling
import numpy as np # Math
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

import re # Regex

In [2]:
# Make a Request to a healthcare and pharmaceuticals subsite of Reuters
request = requests.get("https://www.reuters.com/business/healthcare-pharmaceuticals/")

In [3]:
# Parse Healthcare Subsite of Reuters
soup = bs4.BeautifulSoup(request.text, 'html.parser')

In [4]:
# Find all html parts containing links
anchor = soup.find_all("a")

In [5]:
# Print all extracted Links found on website
for attr in anchor:
    print(attr["href"])

#main-content
/differentiator/
/
/world/
/business/
/markets/
/sustainability/
/legal/
/breakingviews/
/technology/
https://www.reuters.com/investigates/
/account/register/sign-up/&journeyStart=navigation
/business/healthcare-pharmaceuticals/lilly-build-25-bln-diabetes-drug-plant-germany-2023-11-17/
/business/healthcare-pharmaceuticals/
/business/healthcare-pharmaceuticals/lilly-build-25-bln-diabetes-drug-plant-germany-2023-11-17/
/business/future-of-money/
/business/healthcare-pharmaceuticals/boom-weight-loss-drugs-drive-up-us-employers-medical-costs-2024-mercer-2023-11-17/
/business/healthcare-pharmaceuticals/boom-weight-loss-drugs-drive-up-us-employers-medical-costs-2024-mercer-2023-11-17/
/tags/mergers-acquisitions/
/business/healthcare-pharmaceuticals/manufacturer-lonza-says-it-will-not-fill-obesity-drug-syringes-2023-11-17/
/business/healthcare-pharmaceuticals/manufacturer-lonza-says-it-will-not-fill-obesity-drug-syringes-2023-11-17/
/tags/regulatory-oversight/
/business/healthca

In [6]:
# Get only links to articles about novo nordisk company
novo_news = [attr["href"] for attr in anchor if "novo" in attr["href"] or "ozempic" in attr["href"] or "wegovy" in attr["href"]]

In [7]:
novo_news

['https://www.reuters.com/business/healthcare-pharmaceuticals/healthcare-companies-counter-investor-worries-over-wegovy-effect-2023-10-27/',
 'https://www.reuters.com/business/healthcare-pharmaceuticals/wegovy-other-weight-loss-drugs-scrutinized-over-reports-suicidal-thoughts-2023-09-28/']

In [8]:
all_article_paragraphs = []

for article in novo_news:
    # Make a Request Append Https Reuters string for proper url format.
    request1 = requests.get("https://www.reuters.com" if "https" not in article else "" + article)
    
    # Parse HTML
    soup_subsite = bs4.BeautifulSoup(request1.text, "html.parser")
    
    # Seek All Paragraphs
    paragraphs = soup_subsite.find_all("p")
    
    # Delete html stuff
    # extracted_text = [re.sub("<[^>]+>", "", str(paragraph)) for paragraph in paragraphs]
    extracted_text = [paragraph.text for paragraph in paragraphs]
    

    # Append to a list of all articles
    all_article_paragraphs.append(extracted_text)

Article Requires Further Cleaning - Images, Legal Notices and other should be removed.

In [9]:
def article_tokenization(article):
    article = " ".join(article)
    sentences = nltk.sent_tokenize(article)
    tokenized = [nltk.word_tokenize(sentence) for sentence in sentences]
    return tokenized    

In [10]:
def process_words(tokenized_article, stemmer):
    words_to_remove = set(stopwords.words("english"))
    return [[stemmer.stem(word) for word in sentence if word not in words_to_remove] for sentence in tokenized_article]

In [11]:
tokenized_articles = [article_tokenization(article) for article in all_article_paragraphs]

In [12]:
ps = nltk.PorterStemmer()

In [13]:
tokenized_articles = [process_words(article, ps) for article in tokenized_articles]

In [14]:
tokenized_articles

[[['[',
   '1/3',
   ']',
   'a',
   'view',
   'plastic',
   'model',
   'stomach',
   'interview',
   'doctor',
   'thoma',
   'horbach',
   ',',
   'specialist',
   'surgeri',
   ',',
   'viscer',
   'surgeri',
   'nutrit',
   'medicin',
   'novo',
   'nordisk',
   ',',
   'start',
   'sell',
   'huge',
   'popular',
   'obes',
   'drug',
   'wegovi',
   'germani',
   'later',
   'month',
   ',',
   'munich',
   ',',
   'germani',
   ',',
   'juli',
   '17',
   ',',
   '2023',
   '....',
   'acquir',
   'licens',
   'right',
   'read',
   'oct',
   '27',
   '(',
   'reuter',
   ')',
   '-',
   'healthcar',
   'compani',
   'profit',
   'treat',
   'obes',
   'overweight',
   'patient',
   'tri',
   'convinc',
   'investor',
   'power',
   'new',
   'weight-loss',
   'drug',
   'wo',
   "n't",
   'shrink',
   'busi',
   '.'],
  ['the',
   'global',
   'market',
   'obes',
   'drug',
   'could',
   'reach',
   'much',
   '$',
   '100',
   'billion',
   'within',
   'decad',
   'due',


In [15]:
bow = CountVectorizer()

In [16]:
bow.fit([" ".join(sentence) for sentence in tokenized_articles[0]])

In [17]:
bow.transform([" ".join(sentence) for sentence in tokenized_articles[0]]).toarray()

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [52]:

# example text
text = " ".join([paragraph for paragraph in all_article_paragraphs[1] if "reuters" not in paragraph.lower()])
# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")
doc = nlp(text)

In [53]:
len(text)

8697

In [54]:
len(doc)

1611

In [55]:
print(*[paragraph for paragraph in doc.text.split(".") if len(paragraph) > 2], sep="\n")

Each Sunday for more than a year, the 53-year-old Ohio real estate agent took her weekly injection to help control her blood sugar
 Then every Tuesday, she felt lethargic, depressed and sometimes suicidal, thinking her husband and four children might be better off without her
 These feelings would last a few days, and the cycle repeated every week — except when she skipped a dose
 “I knew it was the drug,” said Heidlebaugh, who said she had not previously suffered from depression
 Moore, faculty associate at the Johns Hopkins Bloomberg School of Public Health, who has studied the incidence of rare psychiatric side effects
 “This doesn’t mean to automatically blame the drug,” he said
 “It does mean that a patient's complaint should not be automatically dismissed
” More than half of the narrative summaries describe suicidal thoughts appearing shortly after the person started the medicine or increased the dose
 About two fifths of them said symptoms ceased after the patient stopped taking