In [1]:
import os
import pandas as pd
%matplotlib inline

In [2]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\melis\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# News Headlines Sentiment

Use the news api to pull the latest news articles for bitcoin and ethereum and create a DataFrame of sentiment scores for each coin. 

Use descriptive statistics to answer the following questions:
1. Which coin had the highest mean positive score?
2. Which coin had the highest negative score?
3. Which coin had the highest positive score?

In [3]:
# Read your api key environment variable
from newsapi import NewsApiClient
api_key = os.getenv('NEWS_API')

In [4]:
# Create a newsapi client
newsapi = NewsApiClient(api_key = api_key)

In [5]:
# Fetch the Bitcoin news articles
bitcoin_headlines = newsapi.get_everything(q = 'bitcoin', language = 'en', page_size = 100, sort_by = 'relevancy' )


In [6]:
# Fetch the Ethereum news articles
eth_headlines = newsapi.get_everything(q = 'ethereum', language = 'en', page_size= 100, sort_by='relevancy')

In [7]:
# Create the Bitcoin sentiment scores DataFrame
bitcoin_sentiments = []

for article in bitcoin_headlines["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        bitcoin_sentiments.append({
            "text": text,
            "date": date,
            "compound": compound,
            "positive": pos,
            "negative": neg,
            "neutral": neu
            
        })
        
    except AttributeError:
        pass
bitcoin_df = pd.DataFrame(bitcoin_sentiments)

# Reorder DataFrame columns
cols = ["text", "compound", "positive", "negative", "neutral"]
bitcoin_df = bitcoin_df[cols]

bitcoin_df.head()

 

Unnamed: 0,text,compound,positive,negative,neutral
0,The Winklevoss twinsthe brothers Mark Zuckerbe...,0.0,0.0,0.0,1.0
1,If youre planning to do some protesting and yo...,-0.3464,0.046,0.134,0.819
2,Reuters\r\n<ul><li>Goldman Sachs gave five rea...,-0.2755,0.0,0.07,0.93
3,<ul><li>Famed investor Michael Novogratz tweet...,0.0,0.0,0.0,1.0
4,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.09,0.0,0.91


In [8]:
# Create the ethereum sentiment scores DataFrame
eth_sentiments = []

for article in eth_headlines["articles"]:
    try:
        text = article["content"]
        date = article["publishedAt"][:10]
        sentiment = analyzer.polarity_scores(text)
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        eth_sentiments.append({
            "text": text,
            "date": date,
            "compound": compound,
            "positive": pos,
            "negative": neg,
            "neutral": neu
            
        })
        
    except AttributeError:
        pass
eth_df = pd.DataFrame(eth_sentiments)

# Reorder DataFrame columns
cols = ["text", "compound", "positive", "negative", "neutral"]
eth_df = eth_df[cols]

eth_df.head()

Unnamed: 0,text,compound,positive,negative,neutral
0,Editor’s note:Andreessen HorowitzsCrypto Start...,0.0,0.0,0.0,1.0
1,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.09,0.0,0.91
2,For developers looking to quickly build identi...,0.0,0.0,0.0,1.0
3,"Introducing Web3Torrent\r\nJune 18, 2020\r\nTo...",0.5574,0.137,0.0,0.863
4,"Akron, Ohio, the hometown of LeBron James and ...",0.0,0.0,0.0,1.0


In [9]:
# Describe the Bitcoin Sentiment
bitcoin_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,95.0,95.0,95.0,95.0
mean,0.161498,0.070011,0.033211,0.896789
std,0.397383,0.06549,0.050842,0.074951
min,-0.8176,0.0,0.0,0.665
25%,-0.149,0.0,0.0,0.855
50%,0.2023,0.075,0.0,0.91
75%,0.507,0.09,0.0645,0.9405
max,0.9198,0.335,0.215,1.0


In [10]:
# Describe the Ethereum Sentiment
eth_df.describe()

Unnamed: 0,compound,positive,negative,neutral
count,95.0,95.0,95.0,95.0
mean,0.262743,0.088295,0.029895,0.881779
std,0.372512,0.070544,0.048266,0.07946
min,-0.8074,0.0,0.0,0.665
25%,0.0,0.0155,0.0,0.836
50%,0.3182,0.09,0.0,0.873
75%,0.54985,0.134,0.056,0.937
max,0.9198,0.335,0.223,1.0


### Questions:

Q: Which coin had the highest mean positive score?

A: Ethereum

Q: Which coin had the highest compound score?

A: Ethereum

Q. Which coin had the highest positive score?

A: Ethereum

---

# Tokenizer

In this section, you will use NLTK and Python to tokenize the text for each coin. Be sure to:
1. Lowercase each word
2. Remove Punctuation
3. Remove Stopwords

In [19]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [20]:
# Expand the default stopwords list if necessary


In [21]:
# Complete the tokenizer function
def tokenizer(text):
    """Tokenizes text."""
    lemmatizer = WordNetLemmatizer()
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', text)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word.lower() for word in lem if word.lower() not in sw]


    return tokens


In [26]:
# Create a new tokens column for bitcoin
bitcoin_df['tokens']= bitcoin_df['text'].apply(tokenizer)
bitcoin_df.head()

Unnamed: 0,text,compound,positive,negative,neutral,tokens
0,The Winklevoss twinsthe brothers Mark Zuckerbe...,0.0,0.0,0.0,1.0,"[winklevoss, twinsthe, brother, mark, zuckerbe..."
1,If youre planning to do some protesting and yo...,-0.3464,0.046,0.134,0.819,"[youre, planning, protesting, dont, want, risk..."
2,Reuters\r\n<ul><li>Goldman Sachs gave five rea...,-0.2755,0.0,0.07,0.93,"[reutersulligoldman, sachs, gave, five, reason..."
3,<ul><li>Famed investor Michael Novogratz tweet...,0.0,0.0,0.0,1.0,"[ullifamed, investor, michael, novogratz, twee..."
4,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.09,0.0,0.91,"[robot, colleague, satoshi, nakaboto, writes, ..."


In [27]:
# Create a new tokens column for ethereum
eth_df['tokens'] = eth_df['text'].apply(tokenizer)
eth_df.head()

Unnamed: 0,text,compound,positive,negative,neutral,tokens
0,Editor’s note:Andreessen HorowitzsCrypto Start...,0.0,0.0,0.0,1.0,"[editors, noteandreessen, horowitzscrypto, sta..."
1,Our robot colleague Satoshi Nakaboto writes ab...,0.507,0.09,0.0,0.91,"[robot, colleague, satoshi, nakaboto, writes, ..."
2,For developers looking to quickly build identi...,0.0,0.0,0.0,1.0,"[developer, looking, quickly, build, identity,..."
3,"Introducing Web3Torrent\r\nJune 18, 2020\r\nTo...",0.5574,0.137,0.0,0.863,"[introducing, webtorrentjune, today, excited, ..."
4,"Akron, Ohio, the hometown of LeBron James and ...",0.0,0.0,0.0,1.0,"[akron, ohio, hometown, lebron, james, seat, u..."


---

# NGrams and Frequency Analysis

In this section you will look at the ngrams and word frequency for each coin. 

1. Use NLTK to produce the n-grams for N = 2. 
2. List the top 10 words for each coin. 

In [28]:
from collections import Counter
from nltk import ngrams

In [47]:
# Generate the Bitcoin N-grams where N=2
btc_tokens = [item for tokens in bitcoin_df.tokens for item in tokens]
bigram_counts = Counter(ngrams(btc_tokens, n = 2))
print(dict(bigram_counts))

a', 'struck'): 1, ('struck', 'char'): 1, ('char', 'aboutthis'): 1, ('aboutthis', 'website'): 1, ('website', 'let'): 1, ('let', 'subscribe'): 1, ('subscribe', 'rss'): 1, ('rss', 'feed'): 1, ('feed', 'website'): 1, ('website', 'support'): 1, ('support', 'rss'): 1, ('rss', 'using'): 1, ('using', 'respective'): 1, ('respective', 'website'): 1, ('website', 'api'): 1, ('api', 'translating'): 1, ('translating', 'data'): 1, ('data', 'rss'): 1, ('rss', 'feedsif'): 1, ('feedsif', 'get'): 1, ('get', 'p'): 1, ('p', 'char'): 2, ('char', 'imagine'): 1, ('imagine', 'sending'): 1, ('sending', 'friend'): 1, ('friend', 'charged'): 1, ('charged', 'million'): 1, ('million', 'fee'): 1, ('fee', 'seems'): 1, ('seems', 'case'): 1, ('case', 'cryptocurrency'): 1, ('cryptocurrency', 'user'): 1, ('user', 'paid'): 1, ('paid', 'million'): 1, ('million', 'transaction'): 1, ('transaction', 'fee'): 1, ('fee', 'trade'): 1, ('wednesday', 'morning'): 1, ('morning', 'char'): 1, ('char', 'january'): 1, ('january', 'financi

In [48]:
# Generate the Ethereum N-grams where N=2
eth_tokens = [item for tokens in eth_df.tokens for item in tokens]
bigram_counts = Counter(ngrams(eth_tokens, n = 2))
print(dict(bigram_counts))

t', 'brought'): 1, ('brought', 'node'): 1, ('node', 'online'): 1, ('online', 'first'): 1, ('first', 'vol'): 1, ('vol', 'char'): 1, ('ethereum', 'wa'): 1, ('wa', 'created'): 1, ('created', 'carry'): 1, ('carry', 'complex'): 1, ('complex', 'application'): 1, ('application', 'bitcoin'): 1, ('bitcoin', 'supposedly'): 1, ('supposedly', 'couldnt'): 1, ('couldnt', 'handle'): 1, ('handle', 'new'): 1, ('new', 'app'): 1, ('app', 'brings'): 1, ('brings', 'idea'): 1, ('idea', 'full'): 1, ('full', 'circlepseudonymous'): 1, ('circlepseudonymous', 'developer'): 1, ('developer', 'fiatjaf'): 1, ('fiatjaf', 'ha'): 1, ('ha', 'created'): 1, ('created', 'etleneum'): 1, ('etleneum', 'describ'): 1, ('describ', 'char'): 1, ('char', 'reddit'): 2, ('reddit', 'anticipating'): 1, ('anticipating', 'big'): 1, ('big', 'demand'): 1, ('demand', 'ethereumbased'): 1, ('ethereumbased', 'community'): 1, ('community', 'points'): 1, ('points', 'system'): 1, ('system', 'one'): 1, ('one', 'month'): 1, ('month', 'rolling'): 1,

In [49]:
# Use the token_count function to generate the top 10 words from each coin
def token_count(tokens, N=10):
    """Returns the top N tokens from the frequency count"""
    
    return Counter(tokens).most_common(N)

In [56]:
# Get the top 10 words for Bitcoin
token_count(btc_tokens, N=10)

[('bitcoin', 124),
 ('char', 95),
 ('satoshi', 41),
 ('nakaboto', 38),
 ('today', 24),
 ('ha', 23),
 ('price', 22),
 ('another', 21),
 ('btc', 20),
 ('whats', 20)]

In [57]:
# Get the top 10 words for Ethereum
token_count(eth_tokens, N=10)

[('char', 94),
 ('ethereum', 35),
 ('would', 31),
 ('level', 31),
 ('bitcoin', 27),
 ('first', 21),
 ('ha', 21),
 ('day', 20),
 ('blockchain', 19),
 ('market', 19)]

# Word Clouds

In this section, you will generate word clouds for each coin to summarize the news for each coin

In [59]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [20.0, 10.0]

ModuleNotFoundError: No module named 'wordcloud'

In [63]:
# Generate the Bitcoin word cloud
lemmatizer = WordNetLemmatizer()
def process_text(text):
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', text)
    words = word_tokenize(re_clean)
    lem = [lemmatizer.lemmatize(word) for word in words]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return ' '.join(output)

big_string = ' '.join(bitcoin_df['text'])
input_text = process_text(big_string)

wc = WordCloud().generate(input_text)
plt.imshow(wc)


NameError: name 'WordCloud' is not defined

In [64]:
# Generate the Ethereum word cloud
big_string = ' '.join(eth_df['text'])
input_text = process_text(big_string)
wc = WorldCloud().generate(input_text)
plt.imshow(wc)

NameError: name 'WorldCloud' is not defined

# Named Entity Recognition

In this section, you will build a named entity recognition model for both coins and visualize the tags using SpaCy.

In [66]:
import spacy
from spacy import displacy

In [75]:
# Optional - download a language model for SpaCy
!python -m spacy download en_core_web_sm


✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [76]:
# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

## Bitcoin NER

In [72]:
# Concatenate all of the bitcoin text together
btc_article = bitcoin_df['text'].str.cat()
print(btc_article)

The Winklevoss twinsthe brothers Mark Zuckerberg allegedly stole the idea for Facebook fromare making a movie about themselves, Deadline reported this week. If that doesnt seem masturbatory enough, r… [+2657 chars]If youre planning to do some protesting and you dont want to risk your expensive smartphone in the processor you simply want to keep yourself as anonymous as possible while youre outconsider buying a… [+5568 chars]Reuters
<ul><li>Goldman Sachs gave five reasons why it thinks bitcoin is "not an asset class" nor "a suitable investment" in a presentation Wednesday morning.</li><li>Goldman observed that the crypt… [+3412 chars]<ul><li>Famed investor Michael Novogratz tweeted on Monday that bitcoin will soon take out the $10,000 resistance level, telling his followers, "Get on the train."</li><li>Novogratz said that turmoil… [+2757 chars]Our robot colleague Satoshi Nakaboto writes about Bitcoin BTC every fucking day.
Welcome to another edition of Bitcoin Today, where I, Satoshi Na

In [77]:
# Run the NER processor on all of the text
btc_doc = nlp(btc_text)
# Add a title to the document
btc_doc.user_data['title'] = 'Bitcoin NER'

NameError: name 'nlp' is not defined

In [32]:
# Render the visualization
displacy.render(btc_doc, style = 'ent')

In [33]:
# List all Entities

Binance PERSON
Binance GPE
around $8,130 MONEY
60.84 percent PERCENT
the past month DATE
3,086.14 MONEY
the latter half of 2017 DATE
$40 million MONEY
Bitcoin GPE
Flexa ORG
today DATE
GameStop ORG
Nordstrom NORP
7,000 CARDINAL
Bitcoin GPE
Thursday DATE
Bitcoin GPE
6,000 MONEY
first ORDINAL
November last year DATE
$15 million MONEY
Europol PERSON
three CARDINAL
second ORDINAL
more than €550,000 MONEY
around $615,000 MONEY
last week DATE
’m PERSON
May 7 DATE
about 10,000 CARDINAL
Baltimore GPE
RobbinHood ORG
13 CARDINAL
76,280 MONEY
today DATE
102,310 MONEY
Binance PERSON
last week's DATE
7,000 CARDINAL
roughly $40 million MONEY
Monday DATE
Binance GPE
one CARDINAL
about 7,000 CARDINAL
around $40 million MONEY
Bloomb PERSON
Tether PERSON
New Yorks GPE
month DATE
California GPE
Maine GPE
New York GPE
Albany GPE
Peter da Silva PERSON
the National Academy of Sciencies ORG
1965 DATE
Bitcoin BTC ORG
Tether ORG
Bitcoin GPE
Bitcoin GPE
the Wall Street Market FAC
European NORP
U.S. GPE
Millions 

---

## Ethereum NER

In [34]:
# Concatenate all of the bitcoin text together

'President Trump tweeted insults at Twitter again this morning, but this time Jack took the conversation off platform to the White House. In other news, a controversial Census question creates some strange bedfellows, and a "blockchain bandit" is pilfering mil… [+1705 chars]After announcing that they were launching a managed blockchain service late last year, Amazon Web Services is now opening that service up for general availability.\r\nIt was only about five months ago that AWS chief executive Andy Jassy announced that the compa… [+1220 chars]Captain Kirk and neo-Dadaists. Repugnant markets and legendary cryptographers. “Digital couture” auctioned by CryptoKitties developers. Distributed autonomous art organizations. A keynote speech looking back from 2047 at the near-apocalypse of 2026, from whic… [+5265 chars]So long as cryptocurrency exists, so too will the extraordinarylengths to which thieves will go to try to steal it. Unfortunately, that also includes preying on weak private k

In [35]:
# Run the NER processor on all of the text

# Add a title to the document

In [36]:
# Render the visualization

In [37]:
# List all Entities

Trump PERSON
Twitter GPE
this morning TIME
Jack PERSON
the White House ORG
late last year DATE
Amazon Web Services ORG
only about five months ago DATE
Andy Jassy PERSON
Kirk PERSON
neo-Dadaists NORP
Digital ORG
CryptoKitties ORG
2047 DATE
2026 DATE
one CARDINAL
millions CARDINAL
7,000 CARDINAL
Bitcoin GPE
Thursday DATE
Bitcoin GPE
6,000 MONEY
first ORDINAL
November last year DATE
Ether PERSON
ETH ORG
ETH ORG
Vitalik Buterin PERSON
Twitter GPE
Ethereum GPE
early this morning TIME
April 30 DATE
201 CARDINAL
chars]Have PERSON
first ORDINAL
SEC ORG
CoinDesk Korea GPE
over $6.1 million MONEY
Ethereum GPE
Independent Security Evaluators ORG
ISE ORG
ETH ORG
just 7 percent PERCENT
a third CARDINAL
Chainanalysis GPE
chars]Cisco CVE-2019-1804 PERSON
Cisco GPE
9000 PRODUCT
Cisco PERSON
hundreds of millions CARDINAL
Ion ORG
Airbnb PERSON
Facebook PERSON
TRON ORG
TRON ORG
Opera ORG
millions of dollars MONEY
Ethereum GPE
Morgan PERSON
Microsoft ORG
Jane Connolly PERSON
Quorum   PERSON
Ethereum GPE
f