<h1>Emotion Detection and Information Retrieval</h1>

<h2>Installing Libraries</h2>

In [17]:
#!pip install duckduckgo-search
#!pip install search-engines
#!pip install beautifulsoup4
#!pip install spacy
#!pip install spacy-langdetect
#!python -m spacy download en_core_web_lg
#!python -m spacy download en_core_web_trf
#!pip install spacytextblob
#!pip install pymongo

<h2>Defining Relevant Tokens</h2>

In [18]:
cryptocurrency_name = "bitcoin"
cryptocurrency_symbol = "$BTC"
keywords = f"{cryptocurrency_name} {cryptocurrency_symbol} sentiment"

In [19]:
expected_tokens_text = f"{keywords} " \
                       f"cryptos cryptocurrencies currencies markets price"

In [20]:
expected_tokens_text

'bitcoin $BTC sentiment cryptos cryptocurrencies currencies markets price'

<h2>Retrieve Links from Search Engines</h2>

In [21]:
# References:
############# DuckDuckGo
# (GoogleSearch) 1. https://medium.com/@nutanbhogendrasharma/how-to-scrape-google-search-engines-in-python-44770b8eab5
# (DuckDuckGo)   2. https://pypi.org/project/duckduckgo-search/
# (DuckDuckGo vs GoogleSearch) 3. https://medium.com/hackernoon/duckduckgo-vs-google-what-you-need-to-know-869368b08c4f
# (DuckDuckGo vs GoogleSearch) 4. https://www.cnet.com/tech/mobile/in-ios-17-apple-adds-ability-to-change-search-engine-in-safari-private-browsing/

############# Serch engines like Bing or Yahoo
#https://pypi.org/project/search-engines/

<h3> Importing Libraries </h3>

In [22]:
from duckduckgo_search import DDGS
from search_engines import bing_search, yahoo_search
import requests

In [23]:
MAX_SITES_RESULTS = 100
TIMEOUT_SECONDS = 5

<h3>Functions</h3>

In [24]:
def get_results(search_engine, page_url):
    try:
        response = requests.get(page_url, timeout=TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for HTTP errors
        html = response.text
        results, next_page_url = search_engine.extract_search_results(html, page_url)
        return results, response.url
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during 'get_results' function execution: {e}")
        return [], None

In [25]:
def fetch_search_results(search_engine, query, max_results):
    search_results = []
    search_url = search_engine.get_search_url(query)

    while len(search_results) < max_results and search_url:
        try:
            next_search_results, search_url = get_results(search_engine, search_url)
            for result in next_search_results:
                if "url" in result:
                    search_results.append(result['url'])
        except Exception as e:
            print(f"An error occurred during 'fetch_search_results' function execution: {e}")
            break

    return search_results[:max_results]


<h3>DuckDuckGo for SearchEngine</h3>

In [26]:
ddgs_results = list(map(lambda r: r["href"], DDGS().text(keywords, max_results=MAX_SITES_RESULTS)))
len(ddgs_results)

100

<h3>Bing for SearchEngine</h3>

In [27]:
bing_search_results = fetch_search_results(bing_search, keywords, MAX_SITES_RESULTS)
len(bing_search_results)

Extracted 9 results from page .
Extracted 9 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 9 results from page .
Extracted 0 results from page .
An error occurred during 'get_results' function execution: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))


27

<h3>Yahoo for SearchEngine</h3>

In [28]:
yahoo_search_results = fetch_search_results(yahoo_search, keywords, MAX_SITES_RESULTS)
len(yahoo_search_results)

100

<h3> Mergin all Results </h3>

In [29]:
search_engines_results = set().union(yahoo_search_results, bing_search_results, ddgs_results)
search_engines_results

{'',
 'https://academy.binance.com/en/articles/what-is-crypto-market-sentiment',
 'https://alternative.me/crypto/',
 'https://alternative.me/crypto/fear-and-greed-index/',
 'https://ambcrypto.com/bitcoin-what-do-the-next-2-years-hold-for-btc/',
 'https://beincrypto.com/bitcoin-btc-color-charts-indicate-neutral-sentiment/',
 'https://beincrypto.com/price/bitcoin/price-prediction/',
 'https://bitcoinist.com/bitcoin-sentiment-returns-neutral-prices-down/',
 'https://bitcoinist.com/bitcoin-sentiment-turns-neutral-btc-plunges-29000/',
 'https://capital.com/bitcoin-price-prediction-2030-2050',
 'https://coincodex.com/crypto/bitcoin/price-prediction/',
 'https://coincodex.com/sentiment/',
 'https://coinmarketcap.com/currencies/bitcoin/',
 'https://coinpedia.org/price-prediction/bitcoin-price-prediction/',
 'https://cointelegraph.com/news/btc-mining-difficulty-5-things-bitcoin-this-week',
 'https://cointelegraph.com/news/btc-price-fear-svb-5-things-bitcoin-this-week',
 'https://cointelegraph.c

In [30]:
len(search_engines_results)

103

<h2>Web Scraping</h2>

In [31]:
# References:
# (BeautifulSoup) 1. https://ai.plainenglish.io/mastering-web-scraping-and-sentiment-analysis-with-python-and-machine-learning-255d1d6234c5
#                 2. https://j2logo.com/python/web-scraping-con-python-guia-inicio-beautifulsoup/
#                 3. https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/

<h3> Importing Libraries </h3>

In [32]:
from bs4 import BeautifulSoup
import re

In [33]:
selectors_to_remove = ["header", "div.header", "footer", "script", "noscript", "iframe"]

<h3>Functions</h3>

In [34]:
def fetch_page_content(page_url):
    try:
        response = requests.get(page_url, timeout=TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching page content: {e}")
        return None

In [35]:
def remove_unwanted_elements(soup):
    for selector in selectors_to_remove:
        elements = soup.body.select(selector)
        for element in elements:
            element.decompose()

In [36]:
def get_mineable_text_from_soup(soup):
    remove_unwanted_elements(soup)
    return " ".join(soup.stripped_strings)

In [37]:
def get_page_content(page_url):
    try:
        page_content = fetch_page_content(page_url)
        if page_content:
            soup = BeautifulSoup(page_content, "html.parser")
        
            page_title = soup.find("title").get_text().strip()
            print(f"Title web page: {page_title}")
        
            mineable_text = get_mineable_text_from_soup(soup)
            
            return {
                'title': page_title,
                'text': mineable_text,
                'page_url': page_url
            }
    except Exception as e:
        print(f"Error in retrieving {page_url}")
        print(f"An error occurred {e}")
        return None

In [38]:
web_scrap_pages = list(filter(lambda wbp: wbp is not None, map(get_page_content, search_engines_results)))

Error fetching page content: Invalid URL '': No scheme supplied. Perhaps you meant http://?
Error fetching page content: 403 Client Error: Forbidden for url: https://cointelegraph.com/news/btc-mining-difficulty-5-things-bitcoin-this-week
Title web page: Fidelity Spot Bitcoin ETF Amendment Spurs BTC Jump on SEC Dialog Speculation
Title web page: MSN
Title web page: Bitcoin (BTC) Latest – Breaking Higher, A Change of Sentiment?
Title web page: Introducing Gemini's BitcoinBuzz Sentiment Indicator | Gemini
Title web page: Trade USD | USD to  | Bitcoin (USD)  | IG International
Title web page: Bitcoin Price Prediction, News, and Analysis (BTC)
Title web page: Bitcoin Price Prediction: BTC Stops The Drop - Bullish Sentiments Taking Over?
Title web page: Crypto Sentiment API - BittsAnalytics API Documentation (2.1.0)
Error fetching page content: 403 Client Error: Forbidden for url: https://cointelegraph.com/news/the-economics-of-bitcoin-halving-understanding-the-effects-on-price-and-market-se

Title web page: Popular Crypto Indicators and Token Metrics | Gemini
Title web page: Bitcoin (BTC) has a Neutral Sentiment Score, is Falling, and Underperforming the Crypto Market Sunday: What's Next?
Title web page: Is the future of bitcoin safe? A triangulation approach in the reality of BTC market through a sentiments analysis | SpringerLink
Title web page: Bitcoin: What do the next 2 years hold for BTC? - AMBCrypto
Error fetching page content: 403 Client Error: Forbidden for url: https://www.lookintobitcoin.com/
Title web page: What Is Crypto Market Sentiment? | Binance Academy
Title web page: Crypto Dashboard - Alternative.me
Title web page: Bitcoin Sentiment On Social Media Now Most Red Since COVID Crash, Bottom Signal?
Error fetching page content: 403 Client Error: Forbidden for url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9581699/
Title web page: Bitcoin (BTC) Sentiment is Bullish But is $98 Million Even Possible?
Error fetching page content: HTTPSConnectionPool(host='www.

In [39]:
len(web_scrap_pages)

71

In [40]:
web_scrap_pages[0]['text']

'Fidelity Spot Bitcoin ETF Amendment Spurs BTC Jump on SEC Dialog Speculation share on Facebook share on LinkedIn share on Twitter Bitcoin jumped to a two-month high on Wednesday after Fidelity joined the ranks of firms amending their spot bitcoin ETF filings. The amendments could indicate open communication lines between the firms and regulators. Bitcoin ( BTC ) shot to as high as $28,817 earlier today as amendments to a spot bitcoin ETF filing in the U.S. fired up bullish sentiment. The largest cryptocurrency added 2.8% in the past 24 hours, reaching a two-month high and leading gains among major tokens. The advance extended its weekly rally to almost 7%. Asset management giant Fidelity filed an amendment to its proposed spot bitcoin ETF, the Wise Origin Bitcoin Trust, with the U.S. Securities and Exchange Commission (SEC) late Tuesday, specifying how it will safeguard customers’ bitcoin in custody accounts and disclose risks related to the shaky regulatory environment around cryptoc

<h2>Sentiment Analysis</h2>

In [41]:
# References:
    #1. https://cnvrg.io/sentiment-analysis-python/
    #2. https://spacy.io/usage/spacy-101
    #3. https://importsem.com/evaluate-sentiment-analysis-in-bulk-with-spacy-and-python/
    #4. https://www.analyticsvidhya.com/blog/2021/06/nlp-application-named-entity-recognition-ner-in-python-with-spacy/
    #5. https://www.geeksforgeeks.org/python-named-entity-recognition-ner-using-spacy/
    #6. https://josiah-adesola.medium.com/how-to-use-named-entity-recognition-in-spacy-to-analyze-blog-content-c46d50a8eb94
    #7. https://spacy.io/models/en

<h3> Importing Libraries </h3>

In [42]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from datetime import date
from spacy import displacy

In [43]:
NER = spacy.load("en_core_web_lg")
#NER = spacy.load("en_core_web_trf")
NER.add_pipe("spacytextblob")
today = date.today()

<h3> Defining Custom CountVectorizer </h3>

In [44]:
class EnglishStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        stemmer = SnowballStemmer('english')
        stop_words = set(stopwords.words('english'))

        def analyze(doc):
            words = word_tokenize(doc.lower())
            words = [stemmer.stem(word) for word in words if word not in stop_words]
            return words

        return analyze

In [45]:
english_analyzer = EnglishStemmedCountVectorizer()

In [46]:
X1 = english_analyzer.fit_transform([expected_tokens_text])

In [47]:
expected_tokens = english_analyzer.get_feature_names_out().tolist() 
expected_tokens

['$',
 'bitcoin',
 'btc',
 'crypto',
 'cryptocurr',
 'currenc',
 'market',
 'price',
 'sentiment']

<h3>Functions</h3>

In [48]:
def get_page_sentiment(doc, threshold=0.02):
    polarity = doc._.blob.polarity
    page_sentiment = 'NEUTRAL'
    if polarity < -threshold:
        page_sentiment = "NEGATIVE"
    elif polarity > threshold:
        page_sentiment = "POSITIVE"
    
    return {
        "polarity": polarity,
        "sentiment": page_sentiment
    }

In [49]:
def get_NERS_from_page(doc, mineable_words):
    ners_with_frequency = []

    for token in doc.ents:
        ner_frequency = mineable_words.count(token.lemma_)

        if ner_frequency != 0:
            ner_info = {
                'lemma': token.lemma_,
                'label': token.label_,
                'frequency': ner_frequency
            }
            if ner_info not in ners_with_frequency:
                ners_with_frequency.append(ner_info)

    return ners_with_frequency

In [50]:
def analyze_page_sentiment(page_title, mineable_text):
    try:
        print(f"=== Analyzing Sentiment for {page_title} ===")
        doc = NER(mineable_text)
        #displacy.render(doc,style="ent",jupyter=True)
        mineable_words = list(word_tokenize(mineable_text))
        ners = get_NERS_from_page(doc, mineable_text)
        sentiment = get_page_sentiment(doc)
        return {
            "ners": ners,
            "timestamp": today.isoformat(),
            "sentiment": sentiment
        }
    except Exception as e:
        print(f"Error in analyzing sentiment for {page_title}")
        print(f"An error occurred {e}")
        return None

In [51]:
def analyze_page_content(web_page):
    page_title = web_page['title']
    mineable_text = web_page['text']
    english_analyzer = EnglishStemmedCountVectorizer()
    try:
        print(f"====== Analyzing Content for {page_title} ======")
        X1 = english_analyzer.fit_transform([mineable_text])
        page_tokens = english_analyzer.get_feature_names_out().tolist() 
        if all(token not in expected_tokens for token in page_tokens):
            raise Exception("Web content discarded, not a single expected token was found")
        
        sentiment_result = analyze_page_sentiment(page_title, mineable_text)
        sentiment_result['page_title'] = page_title
        sentiment_result['page_url'] = web_page['page_url']
        sentiment_result['mineable_text'] = mineable_text
        return sentiment_result

    except Exception as e:
        print(f"Error in analyzing {page_title}")
        print(f"An error occurred {e}")
        return None

In [52]:
result_analysis = list(filter(lambda wbp: wbp is not None, map(analyze_page_content, web_scrap_pages)))

=== Analyzing Sentiment for Fidelity Spot Bitcoin ETF Amendment Spurs BTC Jump on SEC Dialog Speculation ===
Error in analyzing MSN
An error occurred Web content discarded, not a single expected token was found
=== Analyzing Sentiment for Bitcoin (BTC) Latest – Breaking Higher, A Change of Sentiment? ===
=== Analyzing Sentiment for Introducing Gemini's BitcoinBuzz Sentiment Indicator | Gemini ===
=== Analyzing Sentiment for Trade USD | USD to  | Bitcoin (USD)  | IG International ===
=== Analyzing Sentiment for Bitcoin Price Prediction, News, and Analysis (BTC) ===
=== Analyzing Sentiment for Bitcoin Price Prediction: BTC Stops The Drop - Bullish Sentiments Taking Over? ===
=== Analyzing Sentiment for Crypto Sentiment API - BittsAnalytics API Documentation (2.1.0) ===
=== Analyzing Sentiment for BDCC | Free Full-Text | A Complete VADER-Based Sentiment Analysis of Bitcoin (BTC) Tweets during the Era of COVID-19 ===
=== Analyzing Sentiment for This Is ‘Not Priced In’—A ‘Huge Shift’ Just H

=== Analyzing Sentiment for Bitcoin Sentiment Now Close To Extreme Fear: Why This Matters ===
Error in analyzing Bloomberg - Are you a robot?
An error occurred Web content discarded, not a single expected token was found
=== Analyzing Sentiment for Bitcoin Sentiment Returns To Neutral As Price Tumbles Down ===
=== Analyzing Sentiment for Chainlink (LINK) Defies Market Sentiment, Bitcoin (BTC) Stalls at $26.5K (Weekend Watch) ===
=== Analyzing Sentiment for Bitcoin (BTC) Price, Real-time Quote & News - Google Finance ===
=== Analyzing Sentiment for Crypto Fear and Greed Index - Bitcoin Momentum Tracker ===
=== Analyzing Sentiment for 7 Cryptos to Watch as Market Sentiment Hits a Snag | InvestorPlace ===
=== Analyzing Sentiment for Bitcoin (BTC), Immutable (IMX), and VC Spectra (SPCT) Sustain Bullish Sentiment in Crypto Community | Finbold ===
=== Analyzing Sentiment for Bitcoin Overview - CME Group ===
=== Analyzing Sentiment for Bitcoin's Hidden Indicator - Social Dominance - Santiment

In [53]:
len(result_analysis)

66

In [54]:
result_analysis[0]

{'ners': [{'lemma': 'Fidelity Spot', 'label': 'ORG', 'frequency': 1},
  {'lemma': 'Facebook', 'label': 'ORG', 'frequency': 1},
  {'lemma': 'Twitter Bitcoin', 'label': 'ORG', 'frequency': 1},
  {'lemma': 'two-month', 'label': 'DATE', 'frequency': 2},
  {'lemma': 'Wednesday', 'label': 'DATE', 'frequency': 1},
  {'lemma': 'Fidelity', 'label': 'ORG', 'frequency': 4},
  {'lemma': 'as high as $28,817', 'label': 'MONEY', 'frequency': 1},
  {'lemma': 'U.S.', 'label': 'GPE', 'frequency': 2},
  {'lemma': '2.8%', 'label': 'PERCENT', 'frequency': 1},
  {'lemma': 'the past 24 hour', 'label': 'TIME', 'frequency': 1},
  {'lemma': 'weekly', 'label': 'DATE', 'frequency': 1},
  {'lemma': 'almost 7%', 'label': 'PERCENT', 'frequency': 1},
  {'lemma': 'the Wise Origin Bitcoin Trust', 'label': 'ORG', 'frequency': 1},
  {'lemma': 'the U.S. Securities and Exchange Commission',
   'label': 'ORG',
   'frequency': 1},
  {'lemma': 'SEC', 'label': 'ORG', 'frequency': 5},
  {'lemma': 'late Tuesday', 'label': 'DATE'

<h2>Storage of Sentiment and Pages</h2>

In [55]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [56]:
#References
    #1. https://www.mongodb.com/languages/python

In [57]:
def get_database():
    password = 'INSERT_PASSWORD'
     # Provide the mongodb atlas url to connect python to mongodb using pymongo
    CONNECTION_STRING = f"mongodb+srv://admin:{password}@tp2bdnosql.3g6xqw3.mongodb.net/?retryWrites=true&w=majority"
 
    # Create a new client and connect to the server
    client = MongoClient(CONNECTION_STRING, server_api=ServerApi('1'))
 
    # Send a ping to confirm a successful connection
    try:
        client.admin.command('ping')
        print("Pinged your deployment. You successfully connected to MongoDB!")
        return client['nlp']
    except Exception as e:
        print("Error connecting MongoDB!")
        print(f"Error: {e}")

In [58]:
dbname = get_database()

Pinged your deployment. You successfully connected to MongoDB!


In [61]:
dbcollection = dbname[f'crypto_sentiment_{cryptocurrency_name}']

In [62]:
dbcollection.insert_many(result_analysis)

<pymongo.results.InsertManyResult at 0x2a81815da00>