<h1>Emotion Detection and Information Retrieval</h1>

<h2>Installing Libraries</h2>

In [53]:
#!pip install duckduckgo-search
#!pip install search-engines
#!pip install beautifulsoup4
#!pip install spacy
#!pip install spacy-langdetect
#!python -m spacy download en_core_web_lg
#!python -m spacy download en_core_web_trf
#!pip install spacytextblob

<h2>Defining Relevant Tokens</h2>

In [54]:
cryptocurrency_name = "bitcoin"
cryptocurrency_symbol = "$BTC"
keywords = f"{cryptocurrency_name} {cryptocurrency_symbol} sentiment"

In [55]:
expected_tokens_text = f"{keywords} " \
                       f"cryptos cryptocurrencies currencies markets price"

In [56]:
expected_tokens_text

'bitcoin $BTC sentiment cryptos cryptocurrencies currencies markets price'

<h2>Retrieve Links from Search Engines</h2>

In [57]:
# References:
############# DuckDuckGo
# (GoogleSearch) 1. https://medium.com/@nutanbhogendrasharma/how-to-scrape-google-search-engines-in-python-44770b8eab5
# (DuckDuckGo)   2. https://pypi.org/project/duckduckgo-search/
# (DuckDuckGo vs GoogleSearch) 3. https://medium.com/hackernoon/duckduckgo-vs-google-what-you-need-to-know-869368b08c4f
# (DuckDuckGo vs GoogleSearch) 4. https://www.cnet.com/tech/mobile/in-ios-17-apple-adds-ability-to-change-search-engine-in-safari-private-browsing/

############# Serch engines like Bing or Yahoo
#https://pypi.org/project/search-engines/

<h3> Importing Libraries </h3>

In [58]:
from duckduckgo_search import DDGS
from search_engines import bing_search, yahoo_search
import requests

In [59]:
MAX_SITES_RESULTS = 100
TIMEOUT_SECONDS = 5

<h3>Functions</h3>

In [60]:
def get_results(search_engine, page_url):
    try:
        response = requests.get(page_url, timeout=TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for HTTP errors
        html = response.text
        results, next_page_url = search_engine.extract_search_results(html, page_url)
        return results, response.url
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during 'get_results' function execution: {e}")
        return [], None

In [61]:
def fetch_search_results(search_engine, query, max_results):
    search_results = []
    search_url = search_engine.get_search_url(query)

    while len(search_results) < max_results and search_url:
        try:
            next_search_results, search_url = get_results(search_engine, search_url)
            for result in next_search_results:
                if "url" in result:
                    search_results.append(result['url'])
        except Exception as e:
            print(f"An error occurred during 'fetch_search_results' function execution: {e}")
            break

    return search_results[:max_results]


<h3>DuckDuckGo for SearchEngine</h3>

In [62]:
ddgs_results = list(map(lambda r: r["href"], DDGS().text(keywords, max_results=MAX_SITES_RESULTS)))
len(ddgs_results)

100

<h3>Bing for SearchEngine</h3>

In [63]:
bing_search_results = fetch_search_results(bing_search, keywords, MAX_SITES_RESULTS)
len(bing_search_results)

Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
An error occurred during 'get_results' f

0

<h3>Yahoo for SearchEngine</h3>

In [64]:
yahoo_search_results = fetch_search_results(yahoo_search, keywords, MAX_SITES_RESULTS)
len(yahoo_search_results)

100

<h3> Mergin all Results </h3>

In [65]:
search_engines_results = set().union(yahoo_search_results, bing_search_results, ddgs_results)
search_engines_results

{'https://academy.binance.com/en/articles/what-is-crypto-market-sentiment',
 'https://alternative.me/crypto/',
 'https://alternative.me/crypto/fear-and-greed-index/',
 'https://ambcrypto.com/bitcoin-what-do-the-next-2-years-hold-for-btc/',
 'https://beincrypto.com/bitcoin-btc-color-charts-indicate-neutral-sentiment/',
 'https://beincrypto.com/price/bitcoin/price-prediction/',
 'https://bitcoinist.com/bitcoin-sentiment-returns-neutral-prices-down/',
 'https://bitcoinist.com/bitcoin-sentiment-turns-neutral-btc-plunges-29000/',
 'https://capital.com/bitcoin-price-prediction-2030-2050',
 'https://cfgi.io/bitcoin-fear-greed-index/',
 'https://coincodex.com/crypto/bitcoin/price-prediction/',
 'https://coincodex.com/sentiment/',
 'https://coinmarketcap.com/currencies/bitcoin/',
 'https://coinpedia.org/price-prediction/bitcoin-price-prediction/',
 'https://cointelegraph.com/news/btc-mining-difficulty-5-things-bitcoin-this-week',
 'https://cointelegraph.com/news/btc-price-fear-svb-5-things-bitc

In [66]:
len(search_engines_results)

100

<h2>Web Scraping</h2>

In [67]:
# References:
# (BeautifulSoup) 1. https://ai.plainenglish.io/mastering-web-scraping-and-sentiment-analysis-with-python-and-machine-learning-255d1d6234c5
#                 2. https://j2logo.com/python/web-scraping-con-python-guia-inicio-beautifulsoup/
#                 3. https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/

<h3> Importing Libraries </h3>

In [68]:
from bs4 import BeautifulSoup
import re

In [69]:
selectors_to_remove = ["header", "div.header", "footer", "script", "noscript", "iframe"]

<h3>Functions</h3>

In [70]:
def fetch_page_content(page_url):
    try:
        response = requests.get(page_url, timeout=TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching page content: {e}")
        return None

In [71]:
def remove_unwanted_elements(soup):
    for selector in selectors_to_remove:
        elements = soup.body.select(selector)
        for element in elements:
            element.decompose()

In [72]:
def get_mineable_text_from_soup(soup):
    remove_unwanted_elements(soup)
    return " ".join(soup.stripped_strings)

In [74]:
def get_page_content(page_url):
    try:
        page_content = fetch_page_content(page_url)
        if page_content:
            soup = BeautifulSoup(page_content, "html.parser")
        
            page_title = soup.find("title").get_text().strip()
            print(f"Title web page: {page_title}")
        
            mineable_text = get_mineable_text_from_soup(soup)
            
            return {
                'title': page_title,
                'text': mineable_text,
                'page_url': page_url
            }
    except Exception as e:
        print(f"Error in retrieving {page_url}")
        print(f"An error occurred {e}")
        return None

In [75]:
web_scrap_pages = list(filter(lambda wbp: wbp is not None, map(get_page_content, search_engines_results)))

Error fetching page content: 403 Client Error: Forbidden for url: https://news.bitcoin.com/bitcoin-ethereum-technical-analysis-btc-eth-consolidate-ahead-of-us-gdp-consumer-sentiment-data/
Title web page: Bitcoin Price Today | BTC Live Chart and Forecast
Error in retrieving https://twitter.com/btcsentimentCOM
An error occurred 'NoneType' object has no attribute 'get_text'
Title web page: Bitcoin Investor Sentiment Remains Steady As BTC Stalls At $16,000
Title web page: 3 Bitcoin Sentiment Analysis Tools: How They Work & How to Use Them
Error fetching page content: 403 Client Error: Forbidden for url: https://www.lookintobitcoin.com/charts/active-address-sentiment-indicator/
Error fetching page content: 403 Client Error: Forbidden for url: https://news.bitcoin.com/bitcoin-ethereum-technical-analysis-btc-eth-consolidate-ahead-of-us-consumer-sentiment-data
Title web page: Bitcoin shorts vs Longs - Click for BTC margin charts - Datamish
Title web page: sentix Bitcoin sentiment index - Crypt

Title web page: Bitcoin Sentiment Turns Neutral As BTC Plunges Below $29,000 | Bitcoinist.com
Error fetching page content: 403 Client Error: Forbidden for url: https://www.lookintobitcoin.com/charts/bitcoin-fear-and-greed-index/
Error fetching page content: 403 Client Error: Forbidden for url: https://news.bitcoin.com/bitcoin-ethereum-technical-analysis-btc-hits-1-week-low-bullish-sentiment-fades-on-monday
Error fetching page content: HTTPSConnectionPool(host='www.coindesk.com', port=443): Read timed out. (read timeout=5)
Error fetching page content: 403 Client Error: Forbidden for url: https://www.researchgate.net/publication/346808998_A_complete_vader-based_sentiment_analysis_of_bitcoin_BTC_tweets_during_the_ERA_of_COVID-19
Title web page: Bitcoin's Hidden Indicator - Social Dominance - Santiment Community Insights
Title web page: Bitcoin Sentiment Reaches Local High, But Can’t Shake Recent Fear
Title web page: Bitcoin IG Client Sentiment: Our data shows traders are now at their leas

In [76]:
len(web_scrap_pages)

69

In [77]:
web_scrap_pages[0]['text']



<h2>Sentiment Analysis</h2>

In [78]:
# References:
    #1. https://cnvrg.io/sentiment-analysis-python/
    #2. https://spacy.io/usage/spacy-101
    #3. https://importsem.com/evaluate-sentiment-analysis-in-bulk-with-spacy-and-python/
    #4. https://www.analyticsvidhya.com/blog/2021/06/nlp-application-named-entity-recognition-ner-in-python-with-spacy/
    #5. https://www.geeksforgeeks.org/python-named-entity-recognition-ner-using-spacy/
    #6. https://josiah-adesola.medium.com/how-to-use-named-entity-recognition-in-spacy-to-analyze-blog-content-c46d50a8eb94
    #7. https://spacy.io/models/en

<h3> Importing Libraries </h3>

In [79]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from datetime import date
from spacy import displacy

In [80]:
NER = spacy.load("en_core_web_lg")
#NER = spacy.load("en_core_web_trf")
NER.add_pipe("spacytextblob")
today = date.today()

<h3> Defining Custom CountVectorizer </h3>

In [81]:
class EnglishStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        stemmer = SnowballStemmer('english')
        stop_words = set(stopwords.words('english'))

        def analyze(doc):
            words = word_tokenize(doc.lower())
            words = [stemmer.stem(word) for word in words if word not in stop_words]
            return words

        return analyze

In [82]:
english_analyzer = EnglishStemmedCountVectorizer()

In [83]:
X1 = english_analyzer.fit_transform([expected_tokens_text])

In [84]:
expected_tokens = english_analyzer.get_feature_names_out().tolist() 
expected_tokens

['$',
 'bitcoin',
 'btc',
 'crypto',
 'cryptocurr',
 'currenc',
 'market',
 'price',
 'sentiment']

<h3>Functions</h3>

In [118]:
def get_page_sentiment(doc, threshold=0.02):
    polarity = doc._.blob.polarity
    page_sentiment = 'NEUTRAL'
    if polarity < -threshold:
        page_sentiment = "NEGATIVE"
    elif polarity > threshold:
        page_sentiment = "POSITIVE"
    
    return {
        "polarity": polarity,
        "sentiment": page_sentiment
    }

In [162]:
def get_NERS_from_page(doc, mineable_words):
    ners_with_frequency = []

    for token in doc.ents:
        ner_frequency = mineable_words.count(token.lemma_)

        if ner_frequency != 0:
            ner_info = {
                'lemma': token.lemma_,
                'label': token.label_,
                'frequency': ner_frequency
            }
            if ner_info not in ners_with_frequency:
                ners_with_frequency.append(ner_info)

    return ners_with_frequency

In [151]:
def analyze_page_sentiment(page_title, mineable_text):
    try:
        print(f"=== Analyzing Sentiment for {page_title} ===")
        doc = NER(mineable_text)
        #displacy.render(doc,style="ent",jupyter=True)
        mineable_words = list(word_tokenize(mineable_text))
        ners = get_NERS_from_page(doc, mineable_text)
        sentiment = get_page_sentiment(doc)
        return {
            "ners": ners,
            "timestamp": today.isoformat(),
            "sentiment": sentiment
        }
    except Exception as e:
        print(f"Error in analyzing sentiment for {page_title}")
        print(f"An error occurred {e}")
        return None

In [119]:
def analyze_page_content(web_page):
    page_title = web_page['title']
    mineable_text = web_page['text']
    english_analyzer = EnglishStemmedCountVectorizer()
    try:
        print(f"====== Analyzing Content for {page_title} ======")
        X1 = english_analyzer.fit_transform([mineable_text])
        page_tokens = english_analyzer.get_feature_names_out().tolist() 
        if all(token not in expected_tokens for token in page_tokens):
            raise Exception("Web content discarded, not a single expected token was found")
        
        sentiment_result = analyze_page_sentiment(page_title, mineable_text)
        sentiment_result['page_title'] = page_title
        sentiment_result['page_url'] = web_page['page_url']
        sentiment_result['mineable_text'] = mineable_text
        return sentiment_result

    except Exception as e:
        print(f"Error in analyzing {page_title}")
        print(f"An error occurred {e}")
        return None

In [164]:
result_analysis = list(filter(lambda wbp: wbp is not None, map(analyze_page_content, web_scrap_pages)))

=== Analyzing Sentiment for Bitcoin Price Today | BTC Live Chart and Forecast ===
=== Analyzing Sentiment for Bitcoin Investor Sentiment Remains Steady As BTC Stalls At $16,000 ===
=== Analyzing Sentiment for 3 Bitcoin Sentiment Analysis Tools: How They Work & How to Use Them ===
=== Analyzing Sentiment for Bitcoin shorts vs Longs - Click for BTC margin charts - Datamish ===
=== Analyzing Sentiment for sentix Bitcoin sentiment index - Crypto Currencies Sentiment ===
=== Analyzing Sentiment for BTC Dominance Is Behaving Weirdly, and That’s Sort of Good ===
=== Analyzing Sentiment for Bitcoin (BTC), Immutable (IMX), and VC Spectra (SPCT) Sustain Bullish Sentiment in Crypto Community | Finbold ===
Error in analyzing MSN
An error occurred Web content discarded, not a single expected token was found
=== Analyzing Sentiment for Is the future of bitcoin safe? A triangulation approach in the reality of BTC market through a sentiments analysis | SpringerLink ===
=== Analyzing Sentiment for The 

=== Analyzing Sentiment for BDCC | Free Full-Text | A Complete VADER-Based Sentiment Analysis of Bitcoin (BTC) Tweets during the Era of COVID-19 ===
=== Analyzing Sentiment for What Is Long/Short Ratio and What Does It Convey in Cryptocurrency Futures? | Binance Blog ===
=== Analyzing Sentiment for Bitcoin Price | BTC Price Index and Live Chart - CoinDesk ===
=== Analyzing Sentiment for Bitcoin Price Prediction, News, and Analysis (BTC) ===
=== Analyzing Sentiment for Bitcoin Price Prediction 2023, 2024, 2025, 2026 - 2030 ===
=== Analyzing Sentiment for Bitcoin, XRP and Two Other Trends Are Driving the Crypto Markets Right Now, According to Analytics Firm Santiment - The Daily Hodl ===
=== Analyzing Sentiment for Bitcoin Sentiment Index - CoinDesk ===
=== Analyzing Sentiment for Bitcoin historical data for csv download, BTC free prices, sentiment and other crypto data sets history ===
=== Analyzing Sentiment for Bitcoin Sentiment Returns To Neutral, Will Traders Embrace Greed Next? ===

In [165]:
len(result_analysis)

66

In [167]:
result_analysis[0]

{'ners': [{'lemma': 'Oct 22', 'label': 'DATE', 'frequency': 10},
  {'lemma': 'Gold', 'label': 'ORG', 'frequency': 16},
  {'lemma': 'Powell', 'label': 'PERSON', 'frequency': 10},
  {'lemma': 'ECB', 'label': 'ORG', 'frequency': 10},
  {'lemma': 'BoC', 'label': 'GPE', 'frequency': 8},
  {'lemma': 'Australia', 'label': 'GPE', 'frequency': 8},
  {'lemma': 'Germany Ifo', 'label': 'GPE', 'frequency': 8},
  {'lemma': 'UK', 'label': 'GPE', 'frequency': 8},
  {'lemma': 'Oct 21', 'label': 'DATE', 'frequency': 8},
  {'lemma': '2023', 'label': 'CARDINAL', 'frequency': 53},
  {'lemma': 'Gold/Silver Weekly', 'label': 'ORG', 'frequency': 3},
  {'lemma': '150', 'label': 'CARDINAL', 'frequency': 5},
  {'lemma': 'Alphabet', 'label': 'ORG', 'frequency': 2},
  {'lemma': 'Microsoft', 'label': 'ORG', 'frequency': 2},
  {'lemma': 'Amazon', 'label': 'ORG', 'frequency': 2},
  {'lemma': '2023 US Dollar', 'label': 'MONEY', 'frequency': 2},
  {'lemma': 'Fed', 'label': 'ORG', 'frequency': 5},
  {'lemma': 'US', 'lab

<h2>Storage of Sentiment and Pages</h2>