In [1]:
# Install the notebook's dependencies: the LangChain stack, FAISS, PDF parsing,
# Hugging Face libraries, the DuckDuckGo search client and INSTRUCTOR embeddings.
# NOTE(review): none of these installs is version-pinned. The recorded output of this
# cell shows pip reporting langchain-core 0.3.79 as incompatible with packages that
# require langchain-core>=1.0 — pin versions to make the environment reproducible.
!pip install -q langchain faiss-cpu unstructured PyPDF2
!pip install -q huggingface_hub
!pip install -U langchain-community langchain-huggingface
# NOTE(review): langchain-huggingface is already installed by the previous line.
!pip install -q langchain-huggingface
!pip install transformers datasets tqdm ddgs
!pip install langchain-embedding
!pip install InstructorEmbedding

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m449.8/449.8 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-classic 1.0.0 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.79 which is incompatible.
langchain-classic 1.0.0 requires langchain-text-splitters<2.0.0,>=1.0.0, but you have langchain-text-splitters 0.3.11 which is incompatible.
langchain-huggingface 1.0.0 requires langchain-core<2.0.0,>=1.0.0, but you have langchain-core 0.3.79 which is incompatible.
langchain-community 0.4.1 requires langchain-core<2.0.0,>=1.0.1, but you have langchain-core 0.3.79 which is incompatible.[0m[31m
Collecting langchain-core<2.0.0,>=1.0.1 (from langchain-community)
  Using cached langchain_core-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-text-splitters<

In [2]:
import os
import json
from tqdm import tqdm
from langchain.vectorstores import FAISS

# Model Load

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from InstructorEmbedding import INSTRUCTOR
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# === 1️⃣ Load fine-tuned financial LLaMA3 model for .invoke() ===
llm_model_path = "/kaggle/input/investing-fine-tuned-model-llama-3-2/other/default/1"
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_path)
# NOTE(review): the weights are placed on the CPU even when `device` is "cuda";
# confirm this is intentional (e.g. to keep GPU memory free for the embedding model).
llm_model = AutoModelForCausalLM.from_pretrained(llm_model_path, torch_dtype=torch.float16, device_map="cpu")

llm_pipeline = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    device_map="auto",
    torch_dtype=torch.float16,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only applies when sampling, so it is not passed.
    do_sample=False
)
llm = HuggingFacePipeline(pipeline=llm_pipeline)
print("✅ Fine-tuned financial model loaded.")

# === 2️⃣ Load local embedding model ===
# The path points at Qwen3 embedding weights, loaded through the INSTRUCTOR wrapper.
# NOTE(review): confirm these weights are compatible with HuggingFaceInstructEmbeddings.
# embedding_model_path = "/kaggle/input/qwen-3-embedding/transformers/4b/1"  # 4B parameters
embedding_model_path = "/kaggle/input/qwen-3-embedding/transformers/0.6b/1"  # 0.6B parameters
# embeddings_model = INSTRUCTOR(embedding_model_path, device=device)

embeddings_model = HuggingFaceInstructEmbeddings(
    model_name=embedding_model_path,
    model_kwargs={"device": device}
)

print("✅ Embedding model loaded successfully.")

# Smoke test: LLM
response = llm.invoke("Get Tesla 2023 annual report")
print(response)

# Smoke test: embeddings
# embed_documents() expects a list of texts; a bare string would be iterated character by
# character. embed_query() is the single-text entry point and returns one vector.
vector = embeddings_model.embed_query("Tesla revenue 2023")
print(len(vector))  # embedding dimension

Using device: cuda
Get Tesla 2023 annual report
The passage is referring to the Tesla 2023 annual report. However, it does not provide any details about the content of the report. To know the details, you would need to refer to the report itself. Additionally, the passage does not provide any context or information about the report's release date or author. For a complete understanding, you would need to refer to the report itself. (Note: The passage is not providing the actual details, but it mentions the report itself, which might be a reference to a different document or event.) (Note: The passage is not providing the actual details, but it mentions the report itself, which might be a reference to a different document or event.) (Note: The passage is not providing the actual details, but it mentions the report itself, which might be a reference to a different document or event.) (Note: The passage is not providing the actual details, but it mentions the report itself, which might be

# Common Functions

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List

def get_split_docs(documents, chunk_size=2000, chunk_overlap=350):
    """Split documents into overlapping text chunks.

    Args:
        documents: Iterable of LangChain ``Document`` objects to split.
        chunk_size: Maximum size of each chunk, as measured by the splitter
            (characters by default). Defaults to 2000.
        chunk_overlap: Amount of text shared between consecutive chunks.
            Defaults to 350.

    Returns:
        The list of chunked ``Document`` objects produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    split_docs = text_splitter.split_documents(documents)
    print(f"Created {len(split_docs)} text chunks.")
    return split_docs


# Example documents
# Two tiny in-memory documents used only to smoke-test get_split_docs().
docs = [
    Document(page_content="This is the content of document 1."),
    Document(page_content="This is the content of document 2.")
]

# Each example document is far shorter than the chunk size, so the recorded
# output shows one chunk per document.
chunks = get_split_docs(docs)
print(chunks[0].page_content)  # first chunk

Created 2 text chunks.
This is the content of document 1.


In [7]:
def get_combined_vector_db(*args):
    """Build a single FAISS vector store from any number of document lists.

    Args:
        *args: Lists of ``Document`` chunks. ``None`` and empty lists are skipped.

    Returns:
        A FAISS vector store containing every supplied document, embedded with
        the module-level ``embeddings_model``.

    Raises:
        ValueError: If no documents were supplied at all; an empty collection
            cannot be indexed, so this fails early with a clear message.
    """
    combined_docs = [doc for doc_list in args if doc_list for doc in doc_list]

    if not combined_docs:
        raise ValueError("Cannot build a vector DB: no documents were provided.")

    # Create one vector DB for all docs
    return FAISS.from_documents(combined_docs, embeddings_model)


In [8]:
from langchain_community.document_loaders import PyPDFLoader
def get_pdf_split_docs(pdf_folder):
    """Load every PDF in a folder and return its pages as text chunks.

    Args:
        pdf_folder: Directory to scan (non-recursively) for PDF files.

    Returns:
        List of chunked ``Document`` objects produced by ``get_split_docs``.
        PDFs that cannot be parsed are skipped with a warning.

    Raises:
        FileNotFoundError: If ``pdf_folder`` does not exist.
    """
    # Sort for a deterministic load order; match the extension case-insensitively
    # so files named "*.PDF" are not silently ignored.
    pdf_files = sorted(
        os.path.join(pdf_folder, name)
        for name in os.listdir(pdf_folder)
        if name.lower().endswith(".pdf")
    )

    all_documents = []
    for pdf_path in pdf_files:
        try:
            # load() returns a list of Document objects
            all_documents.extend(PyPDFLoader(pdf_path).load())
        except Exception as exc:
            # Best-effort: one corrupt or truncated file (e.g. a bad web download)
            # should not abort loading of the remaining PDFs.
            print(f"⚠️ Skipping unreadable PDF {pdf_path}: {exc}")

    return get_split_docs(all_documents)

# Three sources of information will be used. 
### 1. PDF books added to Kaggle input directory
### 2. Web search results
### 3. Company Annual Reports

# 1. PDF Embedding

In [9]:
# Kaggle input folder that holds the investing book PDFs.
pdf_folder = "/kaggle/input/investing-books-pdf"
# Load and chunk the books once, when this cell runs; the recorded output shows
# 8126 chunks for this dataset.
local_pdf_split_docs = get_pdf_split_docs(pdf_folder)

def get_local_pdf_split_docs():
    """Return the book chunks loaded when this cell ran, without re-reading the PDFs."""
    return local_pdf_split_docs

Created 8126 text chunks.


# 2. Web Search

In [10]:
from ddgs import DDGS
import requests
from bs4 import BeautifulSoup

def web_search(query, max_results=3, timeout=10, min_chars=500):
    """Search DuckDuckGo and return the visible text of the result pages.

    Args:
        query: Search string.
        max_results: Maximum number of search results to fetch.
        timeout: Per-request timeout in seconds when downloading a page.
        min_chars: Pages whose extracted text is not longer than this are dropped.

    Returns:
        List of ``Document`` objects (one per usable page) with ``source`` and
        ``title`` metadata. Pages that fail to download or parse are skipped
        with a warning, so the list may be shorter than ``max_results``.
    """
    # Search the web using DuckDuckGo
    with DDGS() as ddgs:
        results = list(ddgs.text(query, max_results=max_results))

    docs = []
    for result in results:
        url = result.get("href")
        title = result.get("title")
        if not url:
            # A result without a link cannot be fetched.
            continue

        try:
            # Fetch the web page content
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()

            # Parse visible text from HTML
            soup = BeautifulSoup(response.text, "html.parser")
            for tag in soup(["script", "style", "noscript"]):
                tag.extract()
            text = " ".join(soup.get_text(separator=" ").split())
        except Exception as exc:
            # Best-effort retrieval: report the failure instead of hiding it,
            # then carry on with the remaining results.
            print(f"⚠️ Skipping {url}: {exc}")
            continue

        # Only keep if text length is reasonable
        if len(text) > min_chars:
            docs.append(Document(page_content=text, metadata={"source": url, "title": title}))

    return docs

# Demo: run one live search and preview the first page that was retrieved.
query = "Tesla 2025 annual revenue analysis"
documents = web_search(query)

print(f"Number of documents: {len(documents)}")
# web_search() may return an empty list (failed downloads, short pages),
# so guard the preview instead of indexing blindly.
if documents:
    print(documents[0].metadata)
    print(documents[0].page_content[:500])  # preview text

Number of documents: 1
{'source': 'https://carbuzz.com/12-percent-tesla-owners-choose-full-self-driving/', 'title': 'Tesla Customers Are Not Paying Extra To Get FSD'}
Tesla Customers Are Not Paying Extra To Get FSD Menu Sign in now Close News Features Car Brands Best Cars Submenu Best SUVs Best Crossovers Best Trucks Best Vans Best Sedans Best Coupes Best Hatchbacks Best Convertibles Best Hybrid Cars Best Electric Cars Best Sports Cars Best Luxury Cars Best Small Cars Best Wagons Car Comparisons Reviews Car Advice Videos Threads CarBuzz Awards Sign in Newsletter Menu Follow Followed Like Threads More Action Summary Generate a summary of this story Sign in now


In [11]:
def get_web_search_split_docs(query, max_results=3):
    """Run a web search for ``query`` and return the fetched pages as text chunks."""
    return get_split_docs(web_search(query, max_results))

# 3. Web PDF Retriever

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

def _first_json_object(text):
    """Return the first JSON object embedded in ``text`` as a dict, or None if there is none."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None


def _clean_optional_text(value):
    """Map None, empty strings and textual nulls ("null", "none") to None; otherwise return a stripped string."""
    if value is None:
        return None
    text = str(value).strip()
    return None if text.lower() in {"", "null", "none"} else text


def analyze_query_with_llm(query: str):
    """
    Use the fine-tuned LLM to decide if annual report retrieval is required,
    and extract the company name and year if applicable.

    Returns a dictionary like:
    {
        "required": True,
        "company": "Tesla",
        "year": "2024"
    }

    The three keys are always present. If the model output contains no parsable
    JSON object, ``{"required": False, "company": None, "year": None}`` is returned.
    """
    prompt = f"""
    You are an intelligent financial assistant. 
    Your task is to analyze the user query and decide:
    1. Whether it requires downloading a company's annual report.
    2. If yes, extract the company name and the year of the report.
    
    Respond strictly in this JSON format:
    {{
      "required": true or false,
      "company": "Company name or null",
      "year": "Year or null"
    }}
    
    User query: "{query}"
    """
    # Tokenize input
    inputs = llm_tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # Generate model output (greedy decoding; temperature is irrelevant without sampling)
    outputs = llm_model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False
    )

    # The generated sequence starts with the prompt tokens. Decode only the new tokens;
    # otherwise the JSON *template* inside the prompt is picked up by the parser and the
    # answer can never be read.
    prompt_length = inputs["input_ids"].shape[1]
    response_text = llm_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    parsed = _first_json_object(response_text)
    if parsed is None:
        print("⚠️ Could not parse model output properly. Raw output:")
        print(response_text)
        return {"required": False, "company": None, "year": None}

    # Normalise the fields so callers can rely on the documented shape.
    required = parsed.get("required", False)
    if isinstance(required, str):
        required = required.strip().lower() == "true"

    return {
        "required": bool(required),
        "company": _clean_optional_text(parsed.get("company")),
        "year": _clean_optional_text(parsed.get("year")),
    }


In [13]:
from ddgs import DDGS

def search_pdfs(company_name, year=2024, max_results=5):
    """Search DuckDuckGo for annual-report PDFs of a company.

    Args:
        company_name: Company to search for. If empty or None, no search is
            performed and an empty list is returned.
        year: Report year; omitted from the search query when falsy.
        max_results: Maximum number of search results to inspect.

    Returns:
        List of result URLs whose path ends in ``.pdf`` (case-insensitive,
        ignoring any query string or fragment).
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urlparse

    if not company_name:
        return []

    query_terms = [str(company_name)]
    if year:
        query_terms.append(str(year))
    query = " ".join(query_terms) + " annual report filetype:pdf"

    with DDGS() as ddgs:
        results = list(ddgs.text(query, max_results=max_results))

    pdf_links = []
    for result in results:
        href = result.get("href")
        # Check the URL path rather than the raw string so links such as
        # ".../report.PDF" or ".../report.pdf?download=1" are recognised.
        if href and urlparse(href).path.lower().endswith(".pdf"):
            pdf_links.append(href)
    return pdf_links

# Example:
# Runs a live search when the cell executes, so the links can differ between runs.
# The recorded output includes a pharmamar.com link: results are filtered by file
# extension only, not by whether they belong to the requested company.
pdf_urls = search_pdfs("Square Pharma", 2024)
print(pdf_urls)

['https://www.squarepharma.com.bd/downloads/Square+Pharma_AR_24+dt-24-11-24_compressed_1.pdf', 'https://pharmamar.com/wp-content/uploads/2025/04/ANNUAL-REPORT-2024.pdf', 'https://www.squarepharma.com.bd/SPL+1st+Qtr+Financial+Report+2023-2024.pdf', 'https://www.squarepharma.com.bd/Latest+Audited+Financial+Statement.pdf', 'https://www.squarepharma.com.bd/Square+Pharma_AR_2023.pdf']


In [14]:
import requests

def download_pdf(pdf_links, save_dir, timeout=30):
    """Download PDF files into a directory.

    Args:
        pdf_links: Iterable of URLs to download.
        save_dir: Target directory; created if it does not exist.
        timeout: Per-request timeout in seconds.

    Returns:
        List of file paths that were written successfully. Files are named
        ``web_pdf_<n>.pdf`` where ``n`` is the 1-based position of the URL in
        ``pdf_links``. URLs that fail (network error, non-200 status, or a
        non-PDF content type) are reported and skipped.
    """
    os.makedirs(save_dir, exist_ok=True)  # ensure directory exists

    saved_paths = []
    for i, url in enumerate(pdf_links, 1):
        file_path = os.path.join(save_dir, f"web_pdf_{i}.pdf")
        try:
            # The context manager releases the streamed connection even on failure.
            with requests.get(url, stream=True, timeout=timeout) as response:
                content_type = response.headers.get('Content-Type', '')
                if response.status_code != 200 or 'application/pdf' not in content_type:
                    print(f"❌ Failed to download from {url}")
                    continue
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
        except (requests.RequestException, OSError) as exc:
            print(f"❌ Failed to download from {url}: {exc}")
            # Do not leave a truncated file behind for the PDF loader to choke on.
            if os.path.exists(file_path):
                os.remove(file_path)
            continue

        print(f"✅ Downloaded: {file_path}")
        saved_paths.append(file_path)

    return saved_paths

# for i in range(len(pdf_urls)):
# download_pdf(pdf_urls[4], "apple_2024_annual_report.pdf")
# download_pdf("https://www.squarepharma.com.bd/downloads/Square%20Pharma_AR_24%20dt-24-11-24_compressed_1.pdf", "hjd.pdf")

In [15]:
import shutil
def clean_output_dir(path="/kaggle/working/"):
    """Delete everything inside a directory while keeping the directory itself.

    Args:
        path: Directory to empty. Defaults to the Kaggle working directory.
            If it does not exist, the call is a no-op.
    """
    if not os.path.isdir(path):
        return

    for name in os.listdir(path):
        entry = os.path.join(path, name)
        # Files and symlinks are unlinked (a symlink to a directory is removed
        # without following it); real directories are removed recursively.
        if os.path.isfile(entry) or os.path.islink(entry):
            os.unlink(entry)
        elif os.path.isdir(entry):
            shutil.rmtree(entry)

In [16]:
def get_web_pdf_split_docs(query, max_results=5):
    """Find, download and chunk annual-report PDFs relevant to a user query.

    The LLM first decides whether an annual report is needed and which company
    and year it concerns. If no report is required, no company was identified,
    or the search finds no PDF links, an empty list is returned.

    Args:
        query: The user's question.
        max_results: Maximum number of search results to inspect for PDF links.

    Returns:
        List of chunked ``Document`` objects from the downloaded PDFs.
    """
    result = analyze_query_with_llm(query)

    company = result.get("company")
    if not result.get("required") or not company:
        # Nothing to retrieve; avoid searching for a literal "None".
        return []

    year = result.get("year")
    if year:
        pdf_links = search_pdfs(company, year, max_results=max_results)
    else:
        # No year extracted: fall back to search_pdfs' own default year.
        pdf_links = search_pdfs(company, max_results=max_results)

    if not pdf_links:
        return []

    # NOTE(review): every PDF present in this directory is loaded afterwards, including
    # files left over from earlier runs — call clean_output_dir() between queries.
    output_dir = "/kaggle/working/"

    download_pdf(pdf_links, output_dir)

    return get_pdf_split_docs(output_dir)
    

# Retrieval

In [17]:
def rag_retrieval(vector_db, query, k=5):
    """
    Perform similarity search on the vector DB using the query.

    Args:
        vector_db: Vector store exposing ``similarity_search(query, k=...)``.
        query: Text to search for.
        k: Number of results to return (defaults to 5).

    Returns:
        Whatever ``vector_db.similarity_search`` returns for the top ``k`` matches.
    """
    return vector_db.similarity_search(query, k=k)


# Augmentation

In [18]:
from langchain_core.prompts import PromptTemplate # <-- Fix this line

# Define the template string
# `{context}` and `{question}` are the two placeholders filled in by
# investing_prompt.format() below.
prompt = """
You are a financial and investing expert specializing in long-term investing, company analysis, and stock market strategies. 
You have deep knowledge of business fundamentals, financial statements, market trends, and investment analysis.

Use the following context from company reports, web data, and relevant documents to answer the user query. 
Always base your answers on the provided context and do not make unsupported claims. 

Context:
{context}

User Question:
{question}

Instructions:
- Analyze the financial and business information carefully.
- Provide long-term investment insights.
- Give clear reasoning and avoid generic statements.
- If the context does not provide sufficient information, say "Insufficient data provided."
- Summarize your analysis in a professional and concise manner.

Answer:
"""

# Create a PromptTemplate instance
investing_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt
)

# Example usage
# NOTE(review): `retrieved_context`, `user_query` and `final_prompt` are notebook-level
# names that later cells read or reassign, so these demo values stay live after this cell.
retrieved_context = "Tesla's revenue in 2023 increased by 15% compared to 2022. Gross margin is 25%."
user_query = "Should I invest in Tesla for the next 5 years?"

# Render the template with the demo values and show the resulting prompt text.
final_prompt = investing_prompt.format(context=retrieved_context, question=user_query)
print(final_prompt)


You are a financial and investing expert specializing in long-term investing, company analysis, and stock market strategies. 
You have deep knowledge of business fundamentals, financial statements, market trends, and investment analysis.

Use the following context from company reports, web data, and relevant documents to answer the user query. 
Always base your answers on the provided context and do not make unsupported claims. 

Context:
Tesla's revenue in 2023 increased by 15% compared to 2022. Gross margin is 25%.

User Question:
Should I invest in Tesla for the next 5 years?

Instructions:
- Analyze the financial and business information carefully.
- Provide long-term investment insights.
- Give clear reasoning and avoid generic statements.
- If the context does not provide sufficient information, say "Insufficient data provided."
- Summarize your analysis in a professional and concise manner.

Answer:



# Generation

In [19]:
# The user question for the end-to-end run; replaces the `query` value set in the web-search demo cell.
query = "Give me detailed fundamental analysis on Marico Bangladesh Limited"

### After the user query is set, three functions are called to get split docs from the three different data sources. The split docs are then combined and embedded with the get_combined_vector_db function. All the embedded vectors are stored in one vector_db, which is used to retrieve relevant context for prompt augmentation.

In [None]:
# Gather chunks from the three sources: local books, downloaded annual-report PDFs
# and general web search results.
local_pdf_split_docs = get_local_pdf_split_docs()
web_pdf_split_docs = get_web_pdf_split_docs(query)
# Use the web *search* retriever here; calling the PDF retriever a second time would
# drop the web-search source entirely and repeat the PDF downloads.
web_search_split_docs = get_web_search_split_docs(query)
# Remove the downloaded PDFs now that their text has been loaded into memory.
clean_output_dir()
vector_db = get_combined_vector_db(local_pdf_split_docs, web_pdf_split_docs, web_search_split_docs)
vector_db

In [None]:
# Retrieve the most relevant chunks and join their text. Formatting the list of Document
# objects directly would put their repr (metadata included) into the prompt.
retrieved_docs = rag_retrieval(vector_db, query)
retrieved_context = "\n\n".join(doc.page_content for doc in retrieved_docs)
# Ask the question that was actually used for retrieval, not the `user_query` left over
# from the prompt-template demo.
final_prompt = investing_prompt.format(context=retrieved_context, question=query)

In [None]:
# Generate the final answer from the augmented prompt.
# NOTE(review): the recorded smoke-test output shows the pipeline echoing the prompt
# before the generated text, so `ans` presumably starts with `final_prompt` — confirm
# and strip it if only the answer is wanted.
ans = llm.invoke(final_prompt)
print(ans)

# Chain Construction

In [None]:
# from langchain.schema import Document
# from langchain.runnables import (
#     RunnablePassthrough,
#     RunnableLambda,
#     RunnableSequence,
#     RunnableParallel,
# )

# # 2️⃣ Parallel execution of three branches
# parallel_chain = RunnableParallel(
#     {
#         "books": RunnableSequence(
#             [
#                 RunnableLambda(lambda query: get_local_pdf_split_docs(query)),   # split preloaded book PDFs
#             ]
#         ),
#         "online_text": RunnableSequence(
#             [
#                 RunnableLambda(lambda query: get_web_search_split_docs(query)),  # search + split text
#             ]
#         ),
#         "online_pdf": RunnableSequence(
#             [
#                 RunnableLambda(lambda query: get_web_pdf_split_docs(query)),  # download + split PDFs
#             ]
#         ),
#     },
#     combine_mode="list",  # returns a list of outputs for each branch
# )

# # 3️⃣ Combine embeddings and build RAG pipeline
# combine_chain = RunnableSequence(
#     [
#         RunnableLambda(lambda split_docs_lists: get_combined_vector_db(*split_docs_lists)),  # combine all split docs
#         RunnableLambda(lambda vector_db: rag_retrieval(vector_db)),                           # retrieval + augmentation
#         RunnableLambda(lambda result: generate_answer(result)),                               # LLM generation
#     ]
# )

# # 4️⃣ Full pipeline: sequence of parallel + combine
# full_pipeline = RunnableSequence(
#     [
#         RunnablePassthrough(),

#         # Step 2: Parallel execution
#         RunnableLambda(
#             lambda query: parallel_chain.invoke({
#                 "books": query,  # load local book PDFs internally
#                 "online_text": query,                # user query for web search
#                 "online_pdf": query                  # user query for online PDFs
#             })
#         ),

#         # Step 3: Combine embeddings and run RAG
#         combine_chain
#     ]
# )