# NLP - Equity Research Analysis

In [1]:
!pip install torch transformers sentence-transformers langchain langchain_community langchain-huggingface langchain_experimental langchain_chroma langchainhub streamlit unstructured faiss-cpu

Collecting langchain_community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting unstructured
  Downloading unstructured-0.16.13-py3-none-any.whl.metadata (24 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-ss

## Scraping latest articles in web related to finance and stocks

In [2]:
import re
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [3]:
query = "latest Financial News Outlets, Stock Market News, Financial Data Providers, Company Earnings Reports, Brokerage Research"
url = f"https://duckduckgo.com/html/?q={query}"
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
urls = []

# Extract result links
for result in soup.find_all('a', class_='result__url'):
    # print(result.get('href'))
    urls.append("https:" + result.get('href'))

urls

['https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.marketwatch.com%2F&rut=ccfb3a40f50b5c52867caf6a33b53da5d7fd38a4e85c9440763b7c572bdd1dbc',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.bloomberg.com%2F&rut=69b84c78f742a3cf7fc533f450d4144ddd3219258dbf49c78622d389e41f00c5',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.wsj.com%2F&rut=d52bda4c4d03738057357a20e335c4f5f7c36f49e9d4a3d781ff31a4a75667e2',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Ffinance.yahoo.com%2F&rut=c7a42c7bdbb3ef83974b2f2a5e6aabe83da19481b721adc17efaf81fef1daf2b',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.google.com%2Ffinance%2F&rut=252ac5565b096704cb33344d60f45939b769408a37b208a89719e876c228ab5f',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.wsj.com%2Flivecoverage%2Fcpi%2Djpmorgan%2Dciti%2Dgoldman%2Dearnings%2Dstock%2Dmarket%2D01%2D15%2D2025&rut=b88c778b0f3738378676e558f09fed390d80aeab4353b93f4d3e0c9021a1a759',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.marketwatch.com%2Fmarkets%2

In [4]:
# Load embeddings model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert URLs or titles to embeddings
embeddings = model.encode(urls)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance
index.add(np.array(embeddings))

# Save URLs with embeddings
url_to_index = {i: url for i, url in enumerate(urls)}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
def search_relevant_urls(query, index, model, url_to_index, top_k=5):
    query_embedding = model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    results = [url_to_index[idx] for idx in indices[0]]
    return results

# Example Query
query = "latest Financial News Outlets, Stock Market News, Financial Data Providers, Company Earnings Reports, Brokerage Research"
relevant_urls = search_relevant_urls(query, index, model, url_to_index)
# print(relevant_urls)
for url in relevant_urls:
  print(url)

https://duckduckgo.com/l/?uddg=https%3A%2F%2Fapnews.com%2Farticle%2Fstock%2Dmarkets%2Dinflation%2Ddata%2Dearnings%2D42efcede3bc3db358117c72f2504edf2&rut=8e729d50597ecce60202f87db63eb8623307c82034d712145630018b2271a0e2
https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.wsj.com%2Flivecoverage%2Fcpi%2Djpmorgan%2Dciti%2Dgoldman%2Dearnings%2Dstock%2Dmarket%2D01%2D15%2D2025&rut=b88c778b0f3738378676e558f09fed390d80aeab4353b93f4d3e0c9021a1a759
https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.marketwatch.com%2Fmarkets%2Ffinancial%2Dmarkets&rut=9e0c593042da93a1df19f84a2c02d4a448d72ffe3e1f1da12193a88a5cebfbce
https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.marketwatch.com%2F&rut=ccfb3a40f50b5c52867caf6a33b53da5d7fd38a4e85c9440763b7c572bdd1dbc
https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.bloomberg.com%2F&rut=69b84c78f742a3cf7fc533f450d4144ddd3219258dbf49c78622d389e41f00c5


In [6]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Co

In [7]:
!pip install lxml[html_clean]

Collecting lxml-html-clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml-html-clean
Successfully installed lxml-html-clean-0.4.1


In [8]:
import urllib.parse

def extract_final_url(duckduckgo_url):
    parsed_url = urllib.parse.urlparse(duckduckgo_url)
    query_params = urllib.parse.parse_qs(parsed_url.query)
    final_url = query_params.get('uddg', [None])[0]
    return final_url

resolved_urls = [extract_final_url(url) for url in relevant_urls]

# Print the resolved URLs to check
for url in resolved_urls:
    print(url)


https://apnews.com/article/stock-markets-inflation-data-earnings-42efcede3bc3db358117c72f2504edf2
https://www.wsj.com/livecoverage/cpi-jpmorgan-citi-goldman-earnings-stock-market-01-15-2025
https://www.marketwatch.com/markets/financial-markets
https://www.marketwatch.com/
https://www.bloomberg.com/


## Extracting data from relavant URLs

In [9]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import NewsURLLoader

loader = NewsURLLoader(urls=resolved_urls)
data = loader.load()
print(len(data))
data

ERROR:langchain_community.document_loaders.news:Error fetching or processing https://www.wsj.com/livecoverage/cpi-jpmorgan-citi-goldman-earnings-stock-market-01-15-2025, exception: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.wsj.com/livecoverage/cpi-jpmorgan-citi-goldman-earnings-stock-market-01-15-2025 on URL https://www.wsj.com/livecoverage/cpi-jpmorgan-citi-goldman-earnings-stock-market-01-15-2025
ERROR:langchain_community.document_loaders.news:Error fetching or processing https://www.marketwatch.com/markets/financial-markets, exception: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.marketwatch.com/markets/financial-markets on URL https://www.marketwatch.com/markets/financial-markets
ERROR:langchain_community.document_loaders.news:Error fetching or processing https://www.marketwatch.com/, exception: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.marketwatch.com/ on 

2


[Document(metadata={'title': 'Stock market today: Wall Street drifts lower as momentum slows for US stocks', 'link': 'https://apnews.com/article/stock-markets-inflation-data-earnings-42efcede3bc3db358117c72f2504edf2', 'authors': [], 'language': 'en', 'description': 'U.S. stock indexes drifted lower following a mixed set of earnings reports from Morgan Stanley, UnitedHealth Group and other big companies.', 'publish_date': datetime.datetime(2025, 1, 16, 5, 20, 34)}, page_content='NEW YORK (AP) — U.S. stock indexes drifted lower Thursday following a mixed set of earnings reports from Morgan Stanley, UnitedHealth Group and other big companies.\n\nThe S&P 500 slipped 0.2% after flipping between small gains and losses through the day. More stocks rose within the index than fell, but drops for some influential stocks like Tesla outweighed them.\n\nThe Dow Jones Industrial Average dropped 68 points, or 0.2%, and the Nasdaq composite fell 0.9%.\n\nThe relatively modest moves for stocks came a d

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500)
docs = text_splitter.split_documents(data)

print("Total number of documents: ",len(docs))

Total number of documents:  13


In [11]:
docs

[Document(metadata={'title': 'Stock market today: Wall Street drifts lower as momentum slows for US stocks', 'link': 'https://apnews.com/article/stock-markets-inflation-data-earnings-42efcede3bc3db358117c72f2504edf2', 'authors': [], 'language': 'en', 'description': 'U.S. stock indexes drifted lower following a mixed set of earnings reports from Morgan Stanley, UnitedHealth Group and other big companies.', 'publish_date': datetime.datetime(2025, 1, 16, 5, 20, 34)}, page_content='NEW YORK (AP) — U.S. stock indexes drifted lower Thursday following a mixed set of earnings reports from Morgan Stanley, UnitedHealth Group and other big companies.\n\nThe S&P 500 slipped 0.2% after flipping between small gains and losses through the day. More stocks rose within the index than fell, but drops for some influential stocks like Tesla outweighed them.\n\nThe Dow Jones Industrial Average dropped 68 points, or 0.2%, and the Nasdaq composite fell 0.9%.'),
 Document(metadata={'title': 'Stock market toda

## Making the data retrieval-ready!

In [12]:
!pip install langchain_google_genai

Collecting langchain_google_genai
  Downloading langchain_google_genai-2.0.9-py3-none-any.whl.metadata (3.6 kB)
Downloading langchain_google_genai-2.0.9-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_google_genai
Successfully installed langchain_google_genai-2.0.9


In [13]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from google.colab import userdata

os.environ["GOOGLE_API_KEY"] = userdata.get('GenAI_API_KEY')

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("test embeddings")
# vector

In [14]:
from langchain_community.vectorstores.utils import filter_complex_metadata

# Filter complex metadata from the documents
docs = filter_complex_metadata(docs)

print(docs)
vectorstore = Chroma.from_documents(documents=docs, embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"))

[Document(metadata={'title': 'Stock market today: Wall Street drifts lower as momentum slows for US stocks', 'link': 'https://apnews.com/article/stock-markets-inflation-data-earnings-42efcede3bc3db358117c72f2504edf2', 'language': 'en', 'description': 'U.S. stock indexes drifted lower following a mixed set of earnings reports from Morgan Stanley, UnitedHealth Group and other big companies.'}, page_content='NEW YORK (AP) — U.S. stock indexes drifted lower Thursday following a mixed set of earnings reports from Morgan Stanley, UnitedHealth Group and other big companies.\n\nThe S&P 500 slipped 0.2% after flipping between small gains and losses through the day. More stocks rose within the index than fell, but drops for some influential stocks like Tesla outweighed them.\n\nThe Dow Jones Industrial Average dropped 68 points, or 0.2%, and the Nasdaq composite fell 0.9%.'), Document(metadata={'title': 'Stock market today: Wall Street drifts lower as momentum slows for US stocks', 'link': 'http

In [15]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retrieved_docs = retriever.invoke("what is the price of tiago iCNG?")
print('Length of retrieved documents: ', len(retrieved_docs))
print(retrieved_docs)
print(retrieved_docs[1].page_content)

Length of retrieved documents:  3
[Document(id='c1428005-da45-40a2-84f5-74579caec8fd', metadata={'description': '', 'language': 'en', 'link': 'https://www.bloomberg.com/', 'title': 'Are you a robot?'}, page_content='Why did this happen?\n\nPlease make sure your browser supports JavaScript and cookies and that you are not blocking them from loading. For more information you can review our Terms of Service and Cookie Policy.'), Document(id='b4c2bd03-5a0b-44ce-8f27-454518609256', metadata={'description': 'U.S. stock indexes drifted lower following a mixed set of earnings reports from Morgan Stanley, UnitedHealth Group and other big companies.', 'language': 'en', 'link': 'https://apnews.com/article/stock-markets-inflation-data-earnings-42efcede3bc3db358117c72f2504edf2', 'title': 'Stock market today: Wall Street drifts lower as momentum slows for US stocks'}, page_content='It was the company’s first financial report since the shooting of one of its executives outside a New York City hotel e

## Performing RAG

In [16]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro",temperature=0.3, max_tokens=500)

In [17]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an expert equity research analyst. Use the following pieces of context to answer the question."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [18]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

## Output Evaluation

In [19]:
response = rag_chain.invoke({"input": "how is Tech titan Oracle doing?"})
print(response["answer"])
print('sources:')

sources = set()

for doc in response['context']:
    source = doc.metadata.get('link', 'Unknown')
    sources.add(source)

print(sources)


This article doesn't mention Oracle's performance.  It focuses on the negative performance of Tesla, U.S. Bancorp, and UnitedHealth Group, and the positive performance of Taiwan Semiconductor.  There is no information provided about Oracle.
sources:
{'https://apnews.com/article/stock-markets-inflation-data-earnings-42efcede3bc3db358117c72f2504edf2'}


In [20]:
response = rag_chain.invoke({"input": "is boing resuming production of its bestselling plane?"})
print(response["answer"])
print('sources:')

sources = set()

for doc in response['context']:
    source = doc.metadata.get('link', 'Unknown')
    sources.add(source)

print(sources)

The provided text doesn't offer any information about Boeing resuming production of its bestselling plane.  It focuses on broader market trends, Tesla's performance, and general economic indicators.  There's no mention of Boeing or its production plans.
sources:
{'https://apnews.com/article/stock-markets-inflation-data-earnings-42efcede3bc3db358117c72f2504edf2'}


## UI using gradio

In [21]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.5.4 (from gradio)
  Downloading gradio_client-1.5.4-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.me

In [None]:
import gradio as gr

def query_model(user_question):
    try:
        response = rag_chain.invoke({"input": user_question})
        answer = response.get("answer", "No answer available.")

        # Extract sources
        sources = set()
        for doc in response["context"]:
            source = doc.metadata.get("link", "Unknown")
            sources.add(source)

        sources_text = "\n".join(sources)
        return answer, sources_text
    except Exception as e:
        return f"An error occurred: {e}", "No sources available."

# Create the Gradio interface
interface = gr.Interface(
    fn=query_model,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Textbox(label="Sources"),
    ],
    title="Equity Research Assistant",
    description="Ask questions about stock market and equity research. Powered by a retrieval-augmented generation (RAG) model."
)

# Launch the Gradio app
interface.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://444548534270921aea.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


### References
- [Codebasics](https://www.youtube.com/watch?v=MoqgmWV1fm8&t=80s)
- [Document Loaders](https://www.comet.com/site/blog/langchain-document-loaders-for-web-data/)