# NLP - Equity Research Analysis

In [None]:
!pip install torch transformers sentence-transformers langchain langchain_community langchain-huggingface langchain_experimental langchain_chroma langchainhub streamlit unstructured faiss-cpu

Collecting aiofiles>=24.1.0 (from unstructured-client->unstructured)
  Using cached aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Using cached aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
  Attempting uninstall: aiofiles
    Found existing installation: aiofiles 23.2.1
    Uninstalling aiofiles-23.2.1:
      Successfully uninstalled aiofiles-23.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.9.1 requires aiofiles<24.0,>=22.0, but you have aiofiles 24.1.0 which is incompatible.[0m[31m
[0mSuccessfully installed aiofiles-24.1.0


## Scraping the latest finance and stock-market articles from the web

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Query DuckDuckGo's HTML endpoint for finance / stock-market news sources.
query = "latest Financial News Outlets, Stock Market News, Financial Data Providers, Company Earnings Reports, Brokerage Research"
url = "https://duckduckgo.com/html/"
headers = {"User-Agent": "Mozilla/5.0"}

# Pass the query through `params` so requests percent-encodes it, bound the
# wait with a timeout, and stop early on an HTTP error status.
response = requests.get(url, params={"q": query}, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
urls = []

# Extract result links
for result in soup.find_all('a', class_='result__url'):
    href = result.get('href')
    if not href:
        # Anchor without an href: nothing to collect.
        continue
    # Result hrefs are protocol-relative ("//duckduckgo.com/l/?..."); only add
    # the scheme when it is actually missing.
    urls.append("https:" + href if href.startswith("//") else href)

urls

['https://duckduckgo.com/l/?uddg=https%3A%2F%2Ffinance.yahoo.com%2Ftopic%2Fstock%2Dmarket%2Dnews%2F&rut=83e985eaeed9214d28eb9eccd1f6d573259812b3ea40ecbe6690556e9cbd3576',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.investors.com%2Fnews%2Fstock%2Dmarket%2Dtoday%2Dstock%2Dmarket%2Dnews%2F&rut=ef86255c20841f361f0cb595748d3f38e536888c9c2b9253daf26b5bc87811cf',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.reuters.com%2Fmarkets%2Fus%2F&rut=3855f727256969c53994a9a30b57b732711fe0a26971c80ded9c92907864517f',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fstock%2Dmarket%2Dnews&rut=72d3564cd4c9918305c4e2da09402a9c7c7e3e0734c33e35ec755969077e443d',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.reuters.com%2Fmarkets%2Fstocks%2F&rut=17afeefdfad32b11eb595149471c363786dafbfac161de62cd9b33d390b2bc35',
 'https://duckduckgo.com/l/?uddg=https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Flive%2Fstock%2Dmarket%2Dtoday%2Dsp%2D500%2Dnasdaq%2Ddow%2Dslide%2Dwith%2Drate%2Dcuts%

In [None]:
# Fail early with a clear message: with no URLs there is nothing to embed or index.
if not urls:
    raise ValueError("No search-result URLs were scraped; nothing to index.")

# Load embeddings model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the URL strings themselves; FAISS expects a float32 matrix.
embeddings = np.asarray(model.encode(urls), dtype="float32")

# Create FAISS index (exact search, L2 distance) sized to the embedding width
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Map each FAISS row id back to the URL it was built from
url_to_index = dict(enumerate(urls))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def search_relevant_urls(query, index, model, url_to_index, top_k=5):
    """Return the URLs whose embeddings are closest to ``query``.

    Args:
        query: Search text to embed.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``, e.g. a FAISS index.
        model: Object exposing ``encode(list_of_str)`` that returns the
            embedding matrix for the given texts.
        url_to_index: Mapping from index row id to URL.
        top_k: Maximum number of URLs to return.

    Returns:
        A list of at most ``top_k`` URLs, in the order returned by the index.
    """
    query_embedding = np.asarray(model.encode([query]))
    _distances, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; skip those slots instead of failing on url_to_index[-1].
    return [url_to_index[int(idx)] for idx in indices[0] if idx >= 0]

# Demo: rank the scraped links against the same text that was used for the web search.
query = (
    "latest Financial News Outlets, Stock Market News, Financial Data Providers, "
    "Company Earnings Reports, Brokerage Research"
)
relevant_urls = search_relevant_urls(query, index, model, url_to_index)

# One URL per line
for url in relevant_urls:
    print(url)

https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.investors.com%2Fnews%2Fstock%2Dmarket%2Dtoday%2Dstock%2Dmarket%2Dnews%2F&rut=ef86255c20841f361f0cb595748d3f38e536888c9c2b9253daf26b5bc87811cf
https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.reuters.com%2Fmarkets%2Fstocks%2F&rut=17afeefdfad32b11eb595149471c363786dafbfac161de62cd9b33d390b2bc35
https://duckduckgo.com/l/?uddg=https%3A%2F%2Fwww.reuters.com%2Fmarkets%2Fus%2F&rut=3855f727256969c53994a9a30b57b732711fe0a26971c80ded9c92907864517f
https://duckduckgo.com/l/?uddg=https%3A%2F%2Ffinance.yahoo.com%2Ftopic%2Fstock%2Dmarket%2Dnews%2F&rut=83e985eaeed9214d28eb9eccd1f6d573259812b3ea40ecbe6690556e9cbd3576
https://duckduckgo.com/l/?uddg=https%3A%2F%2Ffinance.yahoo.com%2Fnews%2Fstock%2Dmarket%2Dnews&rut=72d3564cd4c9918305c4e2da09402a9c7c7e3e0734c33e35ec755969077e443d


In [None]:
!pip install newspaper3k



In [None]:
!pip install lxml[html_clean]



In [None]:
import urllib.parse

def extract_final_url(duckduckgo_url):
    """Return the destination carried in a redirect link's ``uddg`` query parameter.

    The value is returned percent-decoded. ``None`` is returned when the link
    has no ``uddg`` parameter (or its value is blank).
    """
    params = urllib.parse.parse_qs(urllib.parse.urlparse(duckduckgo_url).query)
    if 'uddg' not in params:
        return None
    return params['uddg'][0]

# Decode every redirect link. extract_final_url returns None when a link has
# no `uddg` target, so drop those rather than handing None to the loader.
resolved_urls = [
    final_url
    for final_url in map(extract_final_url, relevant_urls)
    if final_url
]

# Print the resolved URLs to check
for url in resolved_urls:
    print(url)


https://www.investors.com/news/stock-market-today-stock-market-news/
https://www.reuters.com/markets/stocks/
https://www.reuters.com/markets/us/
https://finance.yahoo.com/topic/stock-market-news/
https://finance.yahoo.com/news/stock-market-news


## Extracting data from relevant URLs

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import NewsURLLoader

# Download and parse each resolved URL into a Document.
loader = NewsURLLoader(urls=resolved_urls)
data = loader.load()

# Stop here with a clear message if nothing could be fetched; every later
# step (splitting, embedding, retrieval) needs at least one document.
if not data:
    raise RuntimeError("None of the resolved URLs could be loaded; no documents to process.")

print(len(data))
data

ERROR:langchain_community.document_loaders.news:Error fetching or processing https://www.investors.com/news/stock-market-today-stock-market-news/, exception: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.investors.com/news/stock-market-today-stock-market-news/ on URL https://www.investors.com/news/stock-market-today-stock-market-news/
ERROR:langchain_community.document_loaders.news:Error fetching or processing https://www.reuters.com/markets/stocks/, exception: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/stocks/ on URL https://www.reuters.com/markets/stocks/
ERROR:langchain_community.document_loaders.news:Error fetching or processing https://www.reuters.com/markets/us/, exception: Article `download()` failed with 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/markets/us/ on URL https://www.reuters.com/markets/us/


2


[Document(metadata={'title': 'Latest Stock Market News', 'link': 'https://finance.yahoo.com/topic/stock-market-news/', 'authors': [], 'language': 'en', 'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.', 'publish_date': None}, page_content='Trump Trade Comes to Europe, Giving Jolt to Cheap Hungary Market\n\n(Bloomberg) -- There is arguably no leader in all of Europe who’s more chummy, or ideologically aligned, with Donald Trump than Hungary’s Viktor Orban.Most Read from BloombergHow California Sees the World, and ItselfWhich goes a long way to explaining why at a time when the mood is markedly glum across European financial markets, there is a sense of optimism in Budapest. Here, investors aren’t fretting so much about the prospect of new Trump tariffs or NATO-funding feuds — like they are in, say,'),
 Document(metadata={'title': 'Latest Stock Market News', 'link': 'https://finan

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded articles into chunks (chunk_size=500) for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500)
docs = text_splitter.split_documents(data)

# Report how many chunks came out of the split
print("Total number of documents: ", len(docs))

Total number of documents:  5


In [None]:
# Preview only the first few chunks instead of dumping the whole list into the output.
docs[:3]

[Document(metadata={'title': 'Latest Stock Market News', 'link': 'https://finance.yahoo.com/topic/stock-market-news/', 'authors': [], 'language': 'en', 'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.', 'publish_date': None}, page_content='Trump Trade Comes to Europe, Giving Jolt to Cheap Hungary Market'),
 Document(metadata={'title': 'Latest Stock Market News', 'link': 'https://finance.yahoo.com/topic/stock-market-news/', 'authors': [], 'language': 'en', 'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.', 'publish_date': None}, page_content='(Bloomberg) -- There is arguably no leader in all of Europe who’s more chummy, or ideologically aligned, with Donald Trump than Hungary’s Viktor Orban.Most Read from BloombergHow California Sees the World, and ItselfWhich goes a long 

## Making the data retrieval-ready!

In [None]:
!pip install langchain_google_genai



In [None]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os

import getpass

# Never hard-code credentials in a notebook: read the key from the environment
# and prompt for it (without echo) only when it is not already set. Any key
# that was previously committed in this cell should be treated as leaked and
# revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Smoke test: embed a short string to confirm the key and model are usable.
vector = embeddings.embed_query("test embeddings")

In [None]:
from langchain_community.vectorstores.utils import filter_complex_metadata

# Run the chunks through filter_complex_metadata before storing them
# (the printed chunks no longer carry the 'authors' and 'publish_date' entries).
docs = filter_complex_metadata(docs)

print(docs)

# Embed the filtered chunks and load them into a Chroma vector store
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
)

[Document(metadata={'title': 'Latest Stock Market News', 'link': 'https://finance.yahoo.com/topic/stock-market-news/', 'language': 'en', 'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.'}, page_content='Trump Trade Comes to Europe, Giving Jolt to Cheap Hungary Market'), Document(metadata={'title': 'Latest Stock Market News', 'link': 'https://finance.yahoo.com/topic/stock-market-news/', 'language': 'en', 'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.'}, page_content='(Bloomberg) -- There is arguably no leader in all of Europe who’s more chummy, or ideologically aligned, with Donald Trump than Hungary’s Viktor Orban.Most Read from BloombergHow California Sees the World, and ItselfWhich goes a long way to explaining why at a time when the mood is markedly glum across Europ

In [None]:
# Similarity retriever returning up to 3 chunks per query
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retrieved_docs = retriever.invoke("what is the price of tiago iCNG?")
print('Length of retrieved documents: ', len(retrieved_docs))
print(retrieved_docs)

# The store may hold fewer than two chunks, so only show the second hit when it exists.
if len(retrieved_docs) > 1:
    print(retrieved_docs[1].page_content)

Length of retrieved documents:  3
[Document(metadata={'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.', 'language': 'en', 'link': 'https://finance.yahoo.com/news/stock-market-news', 'title': 'Latest Stock Market News'}, page_content='Business\n\nBloomberg'), Document(metadata={'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.', 'language': 'en', 'link': 'https://finance.yahoo.com/topic/stock-market-news/', 'title': 'Latest Stock Market News'}, page_content='Trump Trade Comes to Europe, Giving Jolt to Cheap Hungary Market'), Document(metadata={'description': 'Get the latest news on the stock market and events that move stocks, with in-depth analyses to help you make investing and trading decisions.', 'language': 'en', 'link': 'https://finance.yahoo.com/topic/stock-market-n

## Performing RAG

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used to generate the answers
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0.3,
    max_tokens=500,
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message: the analyst instruction, a blank line, then the `{context}` placeholder.
system_prompt = "\n\n".join(
    [
        "You are an expert equity research analyst. Use the following pieces of context to answer the question.",
        "{context}",
    ]
)

# Two-message chat template: the system instruction followed by the user's
# question (supplied through the `{input}` placeholder).
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [None]:
# Build the document-answering chain from the LLM and prompt, then put the
# retriever in front of it to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

## Output Evaluation

In [None]:
# Ask the chain a question and show the answer it produced
response = rag_chain.invoke({"input": "how is Tech titan Oracle doing?"})
print(response["answer"])
print('sources:')

# Distinct source links of the documents the chain used as context
sources = {doc.metadata.get('link', 'Unknown') for doc in response['context']}

print(sources)


This text does not contain information about Oracle's performance. It discusses European markets, specifically positive sentiment in Hungary amidst European market glums, and the recovering performance of European automakers.  Therefore, I cannot answer your question about Oracle's performance based on the provided context.

sources:
{'https://finance.yahoo.com/topic/stock-market-news/', 'https://finance.yahoo.com/news/stock-market-news'}


In [None]:
# Ask the chain a second question. The company name is spelled correctly
# ("Boeing", previously "boing") because the text is sent as-is to both the
# retriever and the LLM.
response = rag_chain.invoke({"input": "is Boeing resuming production of its bestselling plane?"})
print(response["answer"])
print('sources:')

# Distinct source links of the documents the chain used as context
sources = {doc.metadata.get('link', 'Unknown') for doc in response['context']}

print(sources)

This text discusses European automakers, not Boeing.  Therefore, there is no information in the provided text about Boeing resuming production of its bestselling plane.

sources:
{'https://finance.yahoo.com/topic/stock-market-news/', 'https://finance.yahoo.com/news/stock-market-news'}


## UI using Gradio

In [None]:
!pip install gradio

Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Using cached aiofiles-23.2.1-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
  Attempting uninstall: aiofiles
    Found existing installation: aiofiles 24.1.0
    Uninstalling aiofiles-24.1.0:
      Successfully uninstalled aiofiles-24.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unstructured-client 0.28.1 requires aiofiles>=24.1.0, but you have aiofiles 23.2.1 which is incompatible.[0m[31m
[0mSuccessfully installed aiofiles-23.2.1


In [None]:
import gradio as gr

def query_model(user_question):
    """Answer a question with the RAG chain and list the sources it drew on.

    Args:
        user_question: Question text entered by the user.

    Returns:
        A ``(answer, sources_text)`` tuple of strings. ``sources_text`` holds
        one source link per line, de-duplicated and in the order the chain
        returned the context documents. If the chain raises, the first element
        carries the error message and the second a placeholder.
    """
    try:
        response = rag_chain.invoke({"input": user_question})
        answer = response.get("answer", "No answer available.")

        # Extract sources. dict.fromkeys de-duplicates while keeping the
        # order of the context documents, so the output is deterministic
        # (joining a set is not).
        links = (
            doc.metadata.get("link", "Unknown")
            for doc in response.get("context", [])
        )
        sources_text = "\n".join(dict.fromkeys(links))
        return answer, sources_text
    except Exception as e:
        # Deliberately broad: any failure is reported in the UI rather than raised.
        return f"An error occurred: {e}", "No sources available."

# Wire query_model into a simple form: one question box in, answer and sources out.
interface = gr.Interface(
    title="Equity Research Assistant",
    description="Ask questions about stock market and equity research. Powered by a retrieval-augmented generation (RAG) model.",
    fn=query_model,
    inputs=gr.Textbox(label="Ask a Question"),
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Sources")],
)

# Start the app; share=True also requests a temporary public URL.
interface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3ee19f3d18b438f08b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




### References
- [Codebasics](https://www.youtube.com/watch?v=MoqgmWV1fm8&t=80s)
- [Document Loaders](https://www.comet.com/site/blog/langchain-document-loaders-for-web-data/)