In [1]:
# 1. Use urllib or requests package to read this CNBC article through its URL link
import urllib
html = urllib.request.urlopen('https://www.cnbc.com/2019/01/17/netflix-price-hike-helps-disney-upcoming-streaming-service-analyst.html').read()

In [2]:
# 2. Use BeautifulSoup (Links to an external site.) or another HTML parsing package to extract text from the article.
from bs4 import BeautifulSoup
from bs4.element import Comment
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

txt=text_from_html(html)
print(txt)



In [7]:
# 3. Use re (regular expression) package to:
# Find all matches of $ amounts in the article
import re
print('$ amounts:')
print(re.findall('\$\d*\.?\d+?', txt))

# the position of $ amounts in the article
first_pos=0
ans=[]
for i in range(txt.count('$')):
    new_list = txt[first_pos:]
    next_pos = new_list.index('$') + 1
    ans.append(first_pos + new_list.index('$'))
    first_pos += next_pos
print('positions: ')
print(ans)

$ amounts:
['$125', '$308', '$156', '$11.6', '$11.6', '$325', '$351', '$1']
positions: 
[2194, 2321, 2372, 3404, 3447, 6499, 6541, 8413]


In [8]:
# Substitute all numbers with # character and print the output
print(re.sub(r'[0-9]','#',txt))



In [9]:
# Count (using regular expressions) ”Netflix” and “Disney” mentions
print('Netflix: '+str(len(re.findall('Netflix', txt, flags=0))))
print('Disney: '+str(len(re.findall('Disney', txt, flags=0))))

Netflix: 13
Disney: 7


In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ivy06\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# 4. Use NTLK and/or Spacy (Links to an external site.) tokenization features to:
# Tokenize sentences and words
from nltk import word_tokenize, sent_tokenize, ngrams, pos_tag, RegexpParser
from collections import Counter
sentences = sent_tokenize(txt)

In [12]:
for sentence in sentences:
    print(sentence)
    print()

Skip Navigation × LOG IN SIGN UP Keep Me Logged In SIGN IN Pro Watchlist Make It USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small B

In [13]:
tokens_all=word_tokenize(txt)
print(tokens_all)



In [14]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivy06\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
for sentence in sentences:
    tokens = word_tokenize(sentence)
    print(tokens)
    print()

['Skip', 'Navigation', '×', 'LOG', 'IN', 'SIGN', 'UP', 'Keep', 'Me', 'Logged', 'In', 'SIGN', 'IN', 'Pro', 'Watchlist', 'Make', 'It', 'USA', 'INTL', 'Markets', 'Pre-Markets', 'U.S.', 'Markets', 'Currencies', 'Cryptocurrency', 'Futures', '&', 'Commodities', 'Bonds', 'Funds', '&', 'ETFs', 'Watchlist', 'Business', 'Economy', 'Finance', 'Health', '&', 'Science', 'Media', 'Real', 'Estate', 'Energy', 'Transportation', 'Industrials', 'Retail', 'Wealth', 'Small', 'Business', 'Investing', 'Invest', 'In', 'You', 'Personal', 'Finance', 'Financial', 'Advisors', 'Trading', 'Nation', 'Options', 'Action', 'ETF', 'Street', 'Buffett', 'Archive', 'Earnings', 'Trader', 'Talk', 'Tech', 'Cybersecurity', 'Enterprise', 'Internet', 'Media', 'Mobile', 'Social', 'Media', 'Venture', 'Capital', 'Tech', 'Guide', 'Politics', 'White', 'House', 'Policy', 'Defense', 'Congress', '2020', 'Elections', 'CNBC', 'TV', 'Live', 'TV', 'Live', 'Audio', 'Latest', 'Video', 'Top', 'Video', 'CEO', 'Interviews', 'Business', 'Day', 'S

In [16]:
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivy06\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ivy06\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Remove all English stop words
stop_words = set(stopwords.words('english')) 
filtered_words = [] 

# Lemmatize and deduplicate unigrams into a dictionary
# using Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
dedup=[]

# using stemming
from nltk.stem.porter import PorterStemmer
porter_stemmer=PorterStemmer()
dedup2=[]

for w in tokens_all: 
    if w not in stop_words: 
        filtered_words.append(w) 
        if wordnet_lemmatizer.lemmatize(w) not in dedup:
            dedup.append(wordnet_lemmatizer.lemmatize(w))
        if porter_stemmer.stem(w) not in dedup2:
            dedup2.append(porter_stemmer.stem(w))
print('After removing all English stop words: \n',filtered_words,'\n')

print('Lemmatizing and deduplicating unigrams into a dictionary: \n',dedup,'\n')
print('Length of the dictionary: ',len(dedup),'\n')
#print(Counter(ngrams(dedup,1)))  # this command can be used to calculate the frequency of each word after lemmatization

print('Stemming: \n',dedup2,'\n')
print('Length of the dictionary: ',len(dedup2),'\n')
#print(Counter(ngrams(dedup2,1)))

After removing all English stop words: 

Lemmatizing and deduplicating unigrams into a dictionary: 

Length of the dictionary:  663 

Stemming: 
 ['skip', 'navig', '×', 'log', 'IN', 'sign', 'UP', 'keep', 'Me', 'In', 'pro', 'watchlist', 'make', 'It', 'usa', 'intl', 'market', 'pre-market', 'u.s.', 'currenc', 'cryptocurr', 'futur', '&', 'commod', 'bond', 'fund', 'etf', 'busi', 'economi', 'financ', 'health', 'scienc', 'media', 'real', 'estat', 'energi', 'transport', 'industri', 'retail', 'wealth', 'small', 'invest', 'you', 'person', 'financi', 'advisor', 'trade', 'nation', 'option', 'action', 'street', 'buffett', 'archiv', 'earn', 'trader', 'talk', 'tech', 'cybersecur', 'enterpris', 'internet', 'mobil', 'social', 'ventur', 'capit', 'guid', 'polit', 'white', 'hous', 'polici', 'defens', 'congress', '2020', 'elect', 'cnbc', 'TV', 'live', 'audio', 'latest', 'video', 'top', 'ceo', 'interview', 'day', 'show', 'primetim', 'world', 'digit', 'origin', 'full', 'episod', 'menu', 'search', 'quot', 'st

In [21]:
len(filtered_words)

1285

In [39]:
# List and count n-grams for any given input n
def ListAndCount(tokens,n):
    for item in ngrams(tokens,n):
        print(item)
    print(Counter(ngrams(tokens,n)))

# example: n=3
ListAndCount(filtered_words,3)

('Skip', 'Navigation', '×')
('Navigation', '×', 'LOG')
('×', 'LOG', 'IN')
('LOG', 'IN', 'SIGN')
('IN', 'SIGN', 'UP')
('SIGN', 'UP', 'Keep')
('UP', 'Keep', 'Me')
('Keep', 'Me', 'Logged')
('Me', 'Logged', 'In')
('Logged', 'In', 'SIGN')
('In', 'SIGN', 'IN')
('SIGN', 'IN', 'Pro')
('IN', 'Pro', 'Watchlist')
('Pro', 'Watchlist', 'Make')
('Watchlist', 'Make', 'It')
('Make', 'It', 'USA')
('It', 'USA', 'INTL')
('USA', 'INTL', 'Markets')
('INTL', 'Markets', 'Pre-Markets')
('Markets', 'Pre-Markets', 'U.S.')
('Pre-Markets', 'U.S.', 'Markets')
('U.S.', 'Markets', 'Currencies')
('Markets', 'Currencies', 'Cryptocurrency')
('Currencies', 'Cryptocurrency', 'Futures')
('Cryptocurrency', 'Futures', '&')
('Futures', '&', 'Commodities')
('&', 'Commodities', 'Bonds')
('Commodities', 'Bonds', 'Funds')
('Bonds', 'Funds', '&')
('Funds', '&', 'ETFs')
('&', 'ETFs', 'Watchlist')
('ETFs', 'Watchlist', 'Business')
('Watchlist', 'Business', 'Economy')
('Business', 'Economy', 'Finance')
('Economy', 'Finance', 'Health

In [44]:
# Print bigrams and trigrams in the first 5 sentences
for i in range(5):
    print(sentences[i])
    tokens = word_tokenize(sentences[i])
    print(ListAndCount(tokens,2))
    print(ListAndCount(tokens,3))
    print()

Skip Navigation × LOG IN SIGN UP Keep Me Logged In SIGN IN Pro Watchlist Make It USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Futures Now Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail We

In [45]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ivy06\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [46]:
# Print POS tags in the first 5 sentences
for i in range(5):
    tokens = word_tokenize(sentences[i])
    sentence_pos = pos_tag(tokens)
    print(sentence_pos)
    print()

[('Skip', 'NNP'), ('Navigation', 'NNP'), ('×', 'NNP'), ('LOG', 'NNP'), ('IN', 'NNP'), ('SIGN', 'NNP'), ('UP', 'NNP'), ('Keep', 'NNP'), ('Me', 'NNP'), ('Logged', 'NNP'), ('In', 'IN'), ('SIGN', 'NNP'), ('IN', 'NNP'), ('Pro', 'NNP'), ('Watchlist', 'NNP'), ('Make', 'NNP'), ('It', 'PRP'), ('USA', 'NNP'), ('INTL', 'NNP'), ('Markets', 'NNP'), ('Pre-Markets', 'NNP'), ('U.S.', 'NNP'), ('Markets', 'NNP'), ('Currencies', 'NNP'), ('Cryptocurrency', 'NNP'), ('Futures', 'NNP'), ('&', 'CC'), ('Commodities', 'NNP'), ('Bonds', 'NNP'), ('Funds', 'NNP'), ('&', 'CC'), ('ETFs', 'NNP'), ('Watchlist', 'NNP'), ('Business', 'NNP'), ('Economy', 'NNP'), ('Finance', 'NNP'), ('Health', 'NNP'), ('&', 'CC'), ('Science', 'NNP'), ('Media', 'NNP'), ('Real', 'NNP'), ('Estate', 'NNP'), ('Energy', 'NNP'), ('Transportation', 'NNP'), ('Industrials', 'NNP'), ('Retail', 'NNP'), ('Wealth', 'NNP'), ('Small', 'NNP'), ('Business', 'NNP'), ('Investing', 'NNP'), ('Invest', 'NNP'), ('In', 'IN'), ('You', 'PRP'), ('Personal', 'NNP'), 