In [1]:
import re
import time
import urllib
from bs4 import BeautifulSoup

In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
# VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool
# that is specifically attuned to sentiments expressed in social media.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mblume\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mblume\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\mblume\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\mblume\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mblume\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
exampleItem = 'The incredibly intimidating NLP scares people away who are sissies.'
contentArray =['Starbucks is not doing very well lately.',
               'Overall, while it may seem there is already a Starbucks on every corner, Starbucks still has a lot of room to grow.',
               'They just began expansion into food products, which has been going quite well so far for them.',
               'I can attest that my own expenditure when going to Starbucks has increased, in lieu of these food products.',
               'Starbucks is also indeed expanding their number of stores as well.',
               'Starbucks still sees strong sales growth here in the united states, and intends to actually continue increasing this.',
               'Starbucks also has one of the more successful loyalty programs, which accounts for 30%  of all transactions being loyalty-program-based.',
               'As if news could not get any more positive for the company, Brazilian weather has become ideal for producing coffee beans.',
               'Brazil is the world\'s #1 coffee producer, the source of about 1/3rd of the entire world\'s supply!',
               'Given the dry weather, coffee farmers have amped up production, to take as much of an advantage as possible with the dry weather.',
               'Increase in supply... well you know the rules...',]

In [4]:
tokenized = nltk.word_tokenize(contentArray[3])
print(tokenized)

['I', 'can', 'attest', 'that', 'my', 'own', 'expenditure', 'when', 'going', 'to', 'Starbucks', 'has', 'increased', ',', 'in', 'lieu', 'of', 'these', 'food', 'products', '.']


In [5]:
# POS tag list:
# * CC coordinating conjunction
# * CD cardinal digit
# * DT determiner
# * EX existential there (like: "there is" ... think of it like "there exists")
# * FW foreign word
# * IN preposition/subordinating conjunction
# * JJ adjective 'big'
# * JJR adjective, comparative 'bigger'
# * JJS adjective, superlative 'biggest'
# * LS list marker 1)
# * MD modal could, will
# * NN noun, singular 'desk'
# * NNS noun plural 'desks'
# * NNP proper noun, singular 'Harrison'
# * NNPS proper noun, plural 'Americans'
# * PDT predeterminer 'all the kids'
# * POS possessive ending parent's
# * PRP personal pronoun I, he, she
# * PRP$ possessive pronoun my, his, hers
# * RB adverb very, silently,
# * RBR adverb, comparative better
# * RBS adverb, superlative best
# * RP particle give up
# * TO to go 'to' the store.
# * UH interjection errrrrrrrm
# * VB verb, base form take
# * VBD verb, past tense took
# * VBG verb, gerund/present participle taking
# * VBN verb, past participle taken
# * VBP verb, sing. present, non-3d take
# * VBZ verb, 3rd person sing. present takes
# * WDT wh-determiner which
# * WP wh-pronoun who, what
# * WP$ possessive wh-pronoun whose
# * WRB wh-abverb where, when
tagged = nltk.pos_tag(tokenized)
print(tagged)

[('I', 'PRP'), ('can', 'MD'), ('attest', 'VB'), ('that', 'IN'), ('my', 'PRP$'), ('own', 'JJ'), ('expenditure', 'NN'), ('when', 'WRB'), ('going', 'VBG'), ('to', 'TO'), ('Starbucks', 'NNP'), ('has', 'VBZ'), ('increased', 'VBN'), (',', ','), ('in', 'IN'), ('lieu', 'NN'), ('of', 'IN'), ('these', 'DT'), ('food', 'NN'), ('products', 'NNS'), ('.', '.')]


In [6]:
# Entity types:
# * LOCATION
# * ORGANIZATION
# * PERSON
# * GPE = geopolitical entity
# * DURATION
# * DATE
# * CARDINAL
# * PERCENT
# * MONEY
# * MEASURE
namedEnt = nltk.ne_chunk(tagged)
print(namedEnt)
# namedEnt.draw()

(S
  I/PRP
  can/MD
  attest/VB
  that/IN
  my/PRP$
  own/JJ
  expenditure/NN
  when/WRB
  going/VBG
  to/TO
  (PERSON Starbucks/NNP)
  has/VBZ
  increased/VBN
  ,/,
  in/IN
  lieu/NN
  of/IN
  these/DT
  food/NN
  products/NNS
  ./.)


In [7]:
def processLanguage():
    try:
        for item in contentArray:
            tokenized = nltk.word_tokenize(item)
            tagged = nltk.pos_tag(tokenized)
            namedEnt = nltk.ne_chunk(tagged)
            # namedEnt.draw()
            # time.sleep(1)
            print(namedEnt)

    except Exception:
        print(str(Exception))

In [8]:
processLanguage()

(S
  Starbucks/NNS
  is/VBZ
  not/RB
  doing/VBG
  very/RB
  well/RB
  lately/RB
  ./.)
(S
  (GPE Overall/JJ)
  ,/,
  while/IN
  it/PRP
  may/MD
  seem/VB
  there/EX
  is/VBZ
  already/RB
  a/DT
  Starbucks/NNS
  on/IN
  every/DT
  corner/NN
  ,/,
  (PERSON Starbucks/NNP)
  still/RB
  has/VBZ
  a/DT
  lot/NN
  of/IN
  room/NN
  to/TO
  grow/VB
  ./.)
(S
  They/PRP
  just/RB
  began/VBD
  expansion/NN
  into/IN
  food/NN
  products/NNS
  ,/,
  which/WDT
  has/VBZ
  been/VBN
  going/VBG
  quite/RB
  well/RB
  so/RB
  far/RB
  for/IN
  them/PRP
  ./.)
(S
  I/PRP
  can/MD
  attest/VB
  that/IN
  my/PRP$
  own/JJ
  expenditure/NN
  when/WRB
  going/VBG
  to/TO
  (PERSON Starbucks/NNP)
  has/VBZ
  increased/VBN
  ,/,
  in/IN
  lieu/NN
  of/IN
  these/DT
  food/NN
  products/NNS
  ./.)
(S
  Starbucks/NNS
  is/VBZ
  also/RB
  indeed/RB
  expanding/VBG
  their/PRP$
  number/NN
  of/IN
  stores/NNS
  as/IN
  well/RB
  ./.)
(S
  Starbucks/NNS
  still/RB
  sees/VBZ
  strong/JJ
  sales/NNS
  growth

In [9]:
sia = SentimentIntensityAnalyzer()

In [10]:
# The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized 
# between -1 (most extreme negative) and +1 (most extreme positive).
# * positive sentiment : (compound score >= 0.05)
# * neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
# * negative sentiment : (compound score <= -0.05)
sia.polarity_scores("Good")

{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4404}

In [11]:
sia.polarity_scores(contentArray[7])

{'neg': 0.0, 'neu': 0.723, 'pos': 0.277, 'compound': 0.807}

In [12]:
link = "https://khn.org/news/trump-touted-abbotts-quick-covid-19-test-hhs-document-shows-only-5500-are-on-way-for-entire-u-s/"
# link = "https://text.npr.org/s.php?sId=841401198"
f = urllib.request.urlopen(link)
myfile = f.read()
print(myfile)

b'<!DOCTYPE html>\n<!--[if IE 6]>\n<html id="ie6" lang="en-US">\n<![endif]-->\n<!--[if IE 7]>\n<html id="ie7" lang="en-US">\n<![endif]-->\n<!--[if IE 8]>\n<html id="ie8" lang="en-US">\n<![endif]-->\n<!--[if !(IE 6) | !(IE 7) | !(IE 8)  ]><!-->\n<html lang="en-US">\n<!--<![endif]-->\n<head>\n\t<meta charset="UTF-8" />\n\t<meta http-equiv="X-UA-Compatible" content="IE=edge">\n\t<meta name="viewport" content="width=device-width, initial-scale=1">\n\t<title>Trump Touted Abbott\xe2\x80\x99s Quick COVID-19 Test. HHS Document Shows Only 5,500 Are On Way For Entire U.S. | Kaiser Health News</title>\n\n\t\t\t<meta name="description" content="States urgently need millions of tests, and the game changer they\xe2\x80\x99ve been waiting on falls well short of what is needed, according to government documents obtained by KHN." />\n\t\t\t\t\t<meta name="author" content="Rachana Pradhan" />\n\t\t\n\t<meta name="pocket-site-verification" content="53d9be31d11b661044e9f301a46a89" />\n\n\t<!-- Styles -->\

In [43]:
soup = BeautifulSoup(myfile, 'lxml')
mytext = soup.text
print(mytext)







Trump Touted Abbott’s Quick COVID-19 Test. HHS Document Shows Only 5,500 Are On Way For Entire U.S. | Kaiser Health News















































































				Kaiser Health News				







				Connect With Us 

















						Toggle navigation					


Support Our Work






Support Our Work



						Connect With Us:					



								Contact							



Twitter


Facebook


LinkedIn


Instagram


RSS






COVID-19
Health Law
Aging
Pharma
Investigations

Bill Of The Month
No Mercy
Electronic Health Records
Hidden Harm: Medical Devices
UVA Lawsuits
ALL KHN INVESTIGATIONS


More Topics

Podcasts
Medicaid
Medicare
Cost and Quality
Health Care Costs
Insurance
Mental Health
Public Health
Uninsured


Data & Docs
 

















		COVID-19	


Trump Touted Abbott’s Quick COVID-19 Test. HHS Document Shows Only 5,500 Are On Way For Entire U.S.




		By Rachana Pradhan 
April 2, 2020 



Republish This Story




 TwitterFacebookLinkedInPrint

In [44]:
mytext = re.sub(r'window._wpemojiSettings.+ga\(\'send\'\, \'pageview\'\)\;', '\n', mytext, 1, flags=re.S)
print(mytext)







Trump Touted Abbott’s Quick COVID-19 Test. HHS Document Shows Only 5,500 Are On Way For Entire U.S. | Kaiser Health News















































































				Kaiser Health News				







				Connect With Us 

















						Toggle navigation					


Support Our Work






Support Our Work



						Connect With Us:					



								Contact							



Twitter


Facebook


LinkedIn


Instagram


RSS






COVID-19
Health Law
Aging
Pharma
Investigations

Bill Of The Month
No Mercy
Electronic Health Records
Hidden Harm: Medical Devices
UVA Lawsuits
ALL KHN INVESTIGATIONS


More Topics

Podcasts
Medicaid
Medicare
Cost and Quality
Health Care Costs
Insurance
Mental Health
Public Health
Uninsured


Data & Docs
 

















		COVID-19	


Trump Touted Abbott’s Quick COVID-19 Test. HHS Document Shows Only 5,500 Are On Way For Entire U.S.




		By Rachana Pradhan 
April 2, 2020 



Republish This Story




 TwitterFacebookLinkedInPrint

In [45]:
mytext = re.sub(r'\s*\n', '\n', mytext)
mytext = re.sub(r'(?s)Connect With Us.*?Data . Docs', '\n', mytext, 1)
mytext = re.sub(r'\s*(Republish This Story|TwitterFacebookLinkedInPrint)\s*\n', '\n', mytext)
mytext = re.sub(r'\s*This story can be republished for free \(details\)\.', '', mytext, 1)
mytext = re.sub(r'\s*This story can be republished for free \(details\)\..*', '', mytext, 1, flags=re.S)
print(mytext)


Trump Touted Abbott’s Quick COVID-19 Test. HHS Document Shows Only 5,500 Are On Way For Entire U.S. | Kaiser Health News
				Kaiser Health News
				

		COVID-19
Trump Touted Abbott’s Quick COVID-19 Test. HHS Document Shows Only 5,500 Are On Way For Entire U.S.
		By Rachana Pradhan
April 2, 2020


 U.S. President Donald Trump holds a 5-minute test for COVID-19 from Abbott Laboratories. (Mandel Ngan/AFP via Getty Images)
	This story also ran on Daily Beast.
A coronavirus test made by Abbott Laboratories and introduced with considerable fanfare by President Donald Trump in a Rose Garden news conference this week is giving state and local health officials very little added capacity to perform speedy tests needed to control the COVID-19 pandemic.
“That’s a whole new ballgame,” Trump said. “I want to thank Abbott Labs for the incredible work they’ve done. They’ve been working around-the-clock.”
Yet a document circulated among officials at the Department of Health and Human Services and the 

In [46]:
words = re.split(r'\W+', mytext)
# print(words)
nWords = len(words)
print(nWords)

1657
