<h1>Emotion Detection and Information Retrieval</h1>

<h2>Installing Libraries</h2>

In [84]:
#!pip install duckduckgo-search
#!pip install search-engines
#!pip install beautifulsoup4
#!pip install spacy
#!pip install spacy-langdetect
#!python -m spacy download en_core_web_lg
#!pip install spacytextblob

<h2>Defining Relevant Tokens</h2>

In [85]:
cryptocurrency_name = "bitcoin"
cryptocurrency_symbol = "$BTC"
keywords = f"{cryptocurrency_name} {cryptocurrency_symbol} sentiment"

In [86]:
expected_tokens_text = f"{cryptocurrency_name} " \
                       f"{cryptocurrency_symbol} " \
                       f"sentiments cryptos cryptocurrencies currencies markets"

In [87]:
expected_tokens_text

'bitcoin $BTC sentiments cryptos cryptocurrencies currencies markets'

<h2>Retrieve Links from Search Engines</h2>

In [88]:
# References:
############# DuckDuckGo
# (GoogleSearch) 1. https://medium.com/@nutanbhogendrasharma/how-to-scrape-google-search-engines-in-python-44770b8eab5
# (DuckDuckGo)   2. https://pypi.org/project/duckduckgo-search/
# (DuckDuckGo vs GoogleSearch) 3. https://medium.com/hackernoon/duckduckgo-vs-google-what-you-need-to-know-869368b08c4f
# (DuckDuckGo vs GoogleSearch) 4. https://www.cnet.com/tech/mobile/in-ios-17-apple-adds-ability-to-change-search-engine-in-safari-private-browsing/

############# Serch engines like Bing or Yahoo
#https://pypi.org/project/search-engines/

<h3> Importing Libraries </h3>

In [89]:
from duckduckgo_search import DDGS
from search_engines import bing_search, yahoo_search
import requests

In [90]:
MAX_SITES_RESULTS = 100
TIMEOUT_SECONDS = 5

<h3>Functions</h3>

In [91]:
def get_results(search_engine, page_url):
    try:
        response = requests.get(page_url, timeout=TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for HTTP errors
        html = response.text
        results, next_page_url = search_engine.extract_search_results(html, page_url)
        return results, response.url
    except requests.exceptions.RequestException as e:
        print(f"An error occurred during 'get_results' function execution: {e}")
        return [], None

In [92]:
def fetch_search_results(search_engine, query, max_results):
    search_results = []
    search_url = search_engine.get_search_url(query)

    while len(search_results) < max_results and search_url:
        try:
            next_search_results, search_url = get_results(search_engine, search_url)
            for result in next_search_results:
                if "url" in result:
                    search_results.append(result['url'])
        except Exception as e:
            print(f"An error occurred during 'fetch_search_results' function execution: {e}")
            break

    return search_results[:max_results]


<h3>DuckDuckGo for SearchEngine</h3>

In [93]:
ddgs_results = list(map(lambda r: r["href"], DDGS().text(keywords, max_results=MAX_SITES_RESULTS)))
len(ddgs_results)

100

<h3>Bing for SearchEngine</h3>

In [94]:
bing_search_results = fetch_search_results(bing_search, keywords, MAX_SITES_RESULTS)
len(bing_search_results)

Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 0 results from page .
Extracted 9 results from page .
Extracted 10 results from page .
Extracted 10 results from page .
Extracted 10 results from page .
Extracted 10 results from page .
Extracted 10 results from page .
Extracted 10 results from page .
Extracted 10 results from page .
Extracted 10 results from page .


100

<h3>Yahoo for SearchEngine</h3>

In [95]:
yahoo_search_results = fetch_search_results(yahoo_search, keywords, MAX_SITES_RESULTS)
len(yahoo_search_results)

100

<h3> Mergin all Results </h3>

In [96]:
search_engines_results = set().union(yahoo_search_results, bing_search_results, ddgs_results)
search_engines_results

{'',
 'https://academy.binance.com/en/articles/what-is-crypto-market-sentiment',
 'https://academy.binance.com/en/glossary/halving',
 'https://alternative.me/crypto/',
 'https://alternative.me/crypto/fear-and-greed-index/',
 'https://beincrypto.com/bitcoin-btc-will-not-reach-new-ath-until-2025-7991-people-poll/',
 'https://beincrypto.com/learn/how-to-short-bitcoin/',
 'https://beincrypto.com/price/bitcoin/price-prediction/',
 'https://beincrypto.com/unbelievable-bitcoin-price-predictions-2023-top-analysts/',
 'https://bitcoinist.com/bitcoin-sentiment-returns-neutral-prices-down/',
 'https://bitcoinist.com/bitcoin-sentiment-surges-neutral-first-september/',
 'https://coincodex.com/crypto/bitcoin-cash/price-prediction/',
 'https://coincodex.com/crypto/bitcoin/price-prediction/',
 'https://coincodex.com/sentiment/',
 'https://coinmarketcap.com/currencies/bitcoin/',
 'https://coinpedia.org/price-prediction/bitcoin-price-prediction/',
 'https://cointelegraph.com/news/bitcoin-price-holds-as-

In [97]:
len(search_engines_results)

103

<h2>Web Scraping</h2>

In [98]:
# References:
# (BeautifulSoup) 1. https://ai.plainenglish.io/mastering-web-scraping-and-sentiment-analysis-with-python-and-machine-learning-255d1d6234c5
#                 2. https://j2logo.com/python/web-scraping-con-python-guia-inicio-beautifulsoup/
#                 3. https://www.geeksforgeeks.org/remove-all-style-scripts-and-html-tags-using-beautifulsoup/

<h3> Importing Libraries </h3>

In [99]:
from bs4 import BeautifulSoup
import re

In [100]:
selectors_to_remove = ["header", "div.header", "footer", "script", "noscript", "iframe"]

<h3>Functions</h3>

In [101]:
def fetch_page_content(page_url):
    try:
        response = requests.get(page_url, timeout=TIMEOUT_SECONDS)
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"Error fetching page content: {e}")
        return None

In [102]:
def remove_unwanted_elements(soup):
    for selector in selectors_to_remove:
        elements = soup.body.select(selector)
        for element in elements:
            element.decompose()

In [103]:
def get_mineable_text_from_soup(soup):
    remove_unwanted_elements(soup)
    return " ".join(soup.stripped_strings)

In [104]:
def get_page_content(page_url):
    try:
        page_content = fetch_page_content(page_url)
        if page_content:
            soup = BeautifulSoup(page_content, "html.parser")
        
            page_title = soup.find("title").get_text().strip()
            print(f"Title web page: {page_title}")
        
            mineable_text = get_mineable_text_from_soup(soup)
            
            return {
                'title': page_title,
                'text': mineable_text
            }
    except Exception as e:
        print(f"Error in retrieving {page_url}")
        print(f"An error occurred {e}")
        return None

In [105]:
web_scrap_pages = list(filter(lambda wbp: wbp is not None, map(get_page_content, search_engines_results)))

Error fetching page content: Invalid URL '': No scheme supplied. Perhaps you meant http://?
Title web page: Bitcoin bulls in full retreat as BTC sentiment slumps to 'fear' territory
Error fetching page content: 403 Client Error: Forbidden for url: https://cointelegraph.com/news/uptober-might-be-over-bitcoin-price-data-shows-investor-sentiment-at-3-month-low
Title web page: Crypto Market Sentiment Analysis - Stockgeist
Title web page: Market Wrap: Bitcoin Sentiment Turns Extremely Bearish
Error fetching page content: 403 Client Error: Forbidden for url: https://news.bitcoin.com/bitcoin-ethereum-technical-analysis-btc-eth-consolidate-ahead-of-us-consumer-sentiment-data
Error fetching page content: 403 Client Error: Forbidden for url: https://cointelegraph.com/news/crypto-fear-and-greed-index-hits-highest-level-since-bitcoin-s-all-time-high
Error fetching page content: HTTPSConnectionPool(host='www.forbes.com', port=443): Read timed out. (read timeout=5)
Title web page: Fear And Greed Ind

Title web page: Mike Novogratz: SEC will approve bitcoin ETF as early as this year
Title web page: Crypto Dashboard - Alternative.me
Title web page: 7 Cryptos to Watch as a Hot Jobs Report Cools Sentiment | InvestorPlace
Error fetching page content: 403 Client Error: Forbidden for url: https://www.coinbase.com/price/bitcoin/usd
Title web page: Bitcoin Price Today | BTC Live Chart and Forecast
Title web page: 3 Bitcoin Sentiment Analysis Tools: How They Work & How to Use Them
Title web page: Why Do Crypto Traders Care About Spot Market Bitcoin (BTC) Exchange-Traded Funds (ETFs)?
Error fetching page content: 403 Client Error: Forbidden for url: https://cointelegraph.com/news/bitcoin-short-term-holders-panic-amid-nearly-100-unrealized-loss
Title web page: Bitcoin Price Prediction: What Elliott Wave Theory Suggests Is Next
Title web page: Bitcoin Price Prediction: BTC Stops The Drop - Bullish Sentiments Taking Over?
Title web page: IG Client Sentiment Report 2023-10-18 20:00
Title web page

In [106]:
len(web_scrap_pages)

60

In [107]:
web_scrap_pages[0]['text']

'Bitcoin bulls in full retreat as BTC sentiment slumps to \'fear\' territory Skip to main content TRENDING: EURUSD | GBPUSD | XAUUSD | AUDUSD | USDCAD TRENDING: EURUSD | GBPUSD | XAUUSD | AUDUSD | USDCAD | GET THE APP | Newsletter RATES & CHARTS Live Chart Forecast Poll Rates Table Technical Levels Technical Confluences Detector ASSETS EUR/USD GBP/USD USD/JPY AUD/USD NZD/USD USD/CAD GBP/JPY EUR/JPY Dollar Index Gold Oil SP500 News Forex News Institutional Research LATEST NEWS BY ASSETS EUR/USD GBP/USD USD/JPY AUD/USD NZD/USD USD/CAD USD/CHF EUR/GBP Dollar Index Commodities Bonds Equities Analysis Latest Analysis EDITORIAL SELECTION EUR/USD GBP/USD USD/JPY AUD/USD USD/CAD Dollar Index Oil Gold Stocks Commodities Bonds Risk On/Off Support and Resistance Elliott Wave Cycles Sentiment Economic Calendar Economic Calendar Interest Rates Market Hours TOP EVENTS Fed US CPI Nonfarm Payrolls BoC ECB BoE BoJ RBA RBNZ SNB Cryptos SECTIONS Latest News Industry News Rates & Charts Education SECTIONS

<h2>Sentiment Analysis</h2>

In [108]:
# References:
    #1. https://cnvrg.io/sentiment-analysis-python/
    #2. https://spacy.io/usage/spacy-101
    #3. https://importsem.com/evaluate-sentiment-analysis-in-bulk-with-spacy-and-python/

<h3> Importing Libraries </h3>

In [122]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from datetime import date

In [123]:
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("spacytextblob")
today = date.today()

<h3> Defining Custom CountVectorizer </h3>

In [111]:
class EnglishStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        stemmer = SnowballStemmer('english')
        stop_words = set(stopwords.words('english'))

        def analyze(doc):
            words = word_tokenize(doc.lower())
            words = [stemmer.stem(word) for word in words if word not in stop_words]
            return words

        return analyze

In [112]:
english_analyzer = EnglishStemmedCountVectorizer()

In [113]:
X1 = english_analyzer.fit_transform([expected_tokens_text])

In [114]:
expected_tokens = english_analyzer.get_feature_names_out().tolist() 
expected_tokens

['$',
 'bitcoin',
 'btc',
 'crypto',
 'cryptocurr',
 'currenc',
 'market',
 'sentiment']

<h3>Functions</h3>

In [None]:
def get_page_sentiment(doc):
    polarity = doc._.blob.polarity
    

In [None]:
def analyze_page_sentiment(page_title, mineable_text):
    try:
        print(f"=== Analyzing Sentiment for {page_title} ===")
        doc = nlp(mineable_text)
        #mineable_words = list(word_tokenize(mineable_text))
        return {
            "timestamp": today.isoformat(),
        }
    except Exception as e:
        print(f"Error in analyzing sentiment for {page_title}")
        print(f"An error occurred {e}")
        return None

In [119]:
def analyze_page_content(page_title, mineable_text):
    english_analyzer = EnglishStemmedCountVectorizer()
    try:
        print(f"====== Analyzing Content for {page_title} ======")
        X1 = english_analyzer.fit_transform([mineable_text])
        page_tokens = english_analyzer.get_feature_names_out().tolist() 
        if all(token not in expected_tokens for token in page_tokens):
            raise Exception("Web content discarded, not a single token was found")
        
        return  analyze_page_sentiment(page_title, mineable_text)

    except Exception as e:
        print(f"Error in analyzing {page_title}")
        print(f"An error occurred {e}")
        return None

In [117]:
def analyze_pages(web_pages):
    try:
        for web_page in web_pages:
            result = analyze_page_content(web_page['title'], web_page['text'])
    except Exception as e:
        return None

In [118]:
analyze_pages(web_scrap_pages)

===Analyzing Bitcoin bulls in full retreat as BTC sentiment slumps to 'fear' territory===
===Analyzing Crypto Market Sentiment Analysis - Stockgeist===
===Analyzing Market Wrap: Bitcoin Sentiment Turns Extremely Bearish===
===Analyzing Fear And Greed Index | LookIntoBitcoin===
===Analyzing Bitcoin price today, BTC to USD live price, marketcap and chart | CoinMarketCap===
===Analyzing Bitcoin Sentiment – Bull & Bear Index – Augmento===
===Analyzing Bitcoin Price Live | LookIntoBitcoin===
===Analyzing Trade USD | USD to  | Bitcoin (USD)  | IG International===
===Analyzing sentix Bitcoin sentiment index - Crypto Currencies Sentiment===
===Analyzing Is the future of bitcoin safe? A triangulation approach in the reality of BTC market through a sentiments analysis | SpringerLink===
===Analyzing Is the Bitcoin Price in Danger? BTC Holders Sitting on Major Unrealized Losses===
===Analyzing Should You Buy Bitcoin While It's Below $30,000? | The Motley Fool===
===Analyzing BTCUSD history — Timel

<h2>Storage of Sentiment and Pages</h2>