### Document loaders & processing

In [21]:
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, TextLoader
from dotenv import load_dotenv
import os
load_dotenv()
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')
os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')



In [2]:
def load_documents_from_pdf(folder_path):
    """
    Load documents from a folder containing multiple PDF files.
    """
    import os
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.pdf')]
    pdf_documents = []
    for pdf_file in pdf_files:
        loader = PyPDFLoader(pdf_file)
        pdf_documents.extend(loader.load())
    return pdf_documents
folder_path = "./data/pdf"
pdf_documents = load_documents_from_pdf(folder_path)
print(f"Loaded {len(pdf_documents)} documents from {folder_path}")


Loaded 1638 documents from ./data/pdf


In [3]:
print(len(pdf_documents))

1638


In [4]:
def load_documents_from_url(file_path):
    """
    Load documents from a given URL using WebBaseLoader.
    """
    with open(file_path, 'r') as f:
        urls = f.readlines()
    urls = [url.strip() for url in urls if url.strip()]
    web_documents = []
    for url in urls:
        loader = WebBaseLoader(url)
        web_documents.extend(loader.load())
    print(web_documents)
    return web_documents
file_path = "./data/urls/urls.txt"
web_documents = load_documents_from_url(file_path)
print(f"Loaded {len(web_documents)} documents from {file_path}")


[Document(metadata={'source': 'https://www.cbinsights.com/research', 'title': 'ERROR: The request could not be satisfied', 'language': 'No language found.'}, page_content="\n\nERROR: The request could not be satisfied\n\n403 ERROR\nThe request could not be satisfied.\n\nRequest blocked.\nWe can't connect to the server for this app or website at this time. There might be too much traffic or a configuration error. Try again later, or contact the app or website owner.\n\nIf you provide content to customers through CloudFront, you can find steps to troubleshoot and help prevent this error by reviewing the CloudFront documentation.\n\n\n\nGenerated by cloudfront (CloudFront)\nRequest ID: uWWCp92wTVQd-Jg3kw-Qi531EnGj2VHtqU3S4W875jWaIn9zHWlfpg==\n\n\n\n"), Document(metadata={'source': 'https://a16z.com', 'title': 'Andreessen Horowitz | Software Is Eating the World', 'description': 'Andreessen Horowitz (a16z) is a venture capital firm in Silicon Valley, California, investing in bold founders, 

In [5]:
print(len(web_documents))

49


In [6]:


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma, FAISS



In [9]:
# from langchain.schema import Document

## Flatten nested lists
# def flatten(doc_lists):
#     return [doc for sublist in doc_lists for doc in sublist if isinstance(doc, Document)]

# # Example: apply to your documents
# pdf_documents = flatten(pdf_documents)
# web_documents = flatten(web_documents)

# # Now combine safely
all_documents = pdf_documents + web_documents


In [10]:
print(len(all_documents))

1687


In [11]:

# ---- Split into chunks ----
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(web_documents)

# ---- Embed and Store ----
embedding = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(chunks, embedding=embedding, persist_directory="./data/chroma_db")
# Persist the vectorstore
vectorstore.persist()

print(f"✅ Loaded and embedded {len(chunks)} chunks into ChromaDB.")


  embedding = HuggingFaceBgeEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


✅ Loaded and embedded 145 chunks into ChromaDB.


  vectorstore.persist()


In [None]:
# FAISS example
# vectorstore2 = FAISS.from_documents(chunks, embedding)


### Creating Retriever Tool

In [13]:
from langchain.tools.retriever import create_retriever_tool
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Create a retriever tool
retriever_tool = create_retriever_tool(
    retriever=retriever,
    name="startup reports Retriever",
    description="A tool to retrieve documents from ChromaDB on startup reports.",
    
)

In [14]:
from langchain.tools import tool, DuckDuckGoSearchRun, ArxivQueryRun, WikipediaQueryRun

@tool
def market_trend_tool(industry: str) -> str:
    """Returns a summary of current market trends for a given industry."""
    query = f"What are current market trends in {industry}?"
    return qa.run(query)

In [15]:
@tool
def swot_tool(idea: str) -> str:
    """Performs a SWOT analysis on a given idea."""
    prompt = f"Give a SWOT analysis of the idea: {idea}."
    return qa.run(prompt)


In [16]:
@tool
def competitor_tool(idea: str) -> str:
    """Finds top 5 competitors for the idea."""
    prompt = f"List top 5 competitors for this startup idea: {idea}."
    return qa.run(prompt)


In [17]:
@tool
def startup_score_tool(idea: str) -> str:
    """Gives a startup viability score out of 100."""
    prompt = f"Rate the viability of this startup idea on a scale from 0 to 100: {idea}."
    return qa.run(prompt)


In [19]:
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_community.tools import DuckDuckGoSearchResults

wrapper = DuckDuckGoSearchAPIWrapper(region="in-en", time="m", max_results=3)

search_tool = DuckDuckGoSearchResults(api_wrapper=wrapper, source="news")


In [20]:

from langchain.agents import initialize_agent, AgentType

tools = [
    retriever_tool,
    market_trend_tool,
    swot_tool,
    competitor_tool,
    startup_score_tool,
    search_tool
]

In [27]:
from langchain_groq import ChatGroq
import os

# Set up your ChatGroq LLM with necessary parameters
llm = ChatGroq(
    model="DeepSeek-R1-Distill-Llama-70b",  # Ensure the correct model name (e.g., llama-3-7b)
    temperature=0.7,  # Adjust temperature if needed
    max_tokens=2000,  # Set maximum tokens
    groq_api_key=os.getenv("GROQ_API_KEY"),  # API key from environment variables
    streaming=True,  # Enable streaming (if needed)
    verbose=True  # Set to True for detailed logs
)



In [3]:
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent

prompt = hub.pull("hwchase17/react")
print(prompt)

input_variables=['agent_scratchpad', 'input', 'tool_names', 'tools'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'hwchase17', 'lc_hub_repo': 'react', 'lc_hub_commit_hash': 'd15fe3c426f1c4b3f37c9198853e4a86e20c425ca7f4752ec0c9b0e97ca7ea4d'} template='Answer the following questions as best you can. You have access to the following tools:\n\n{tools}\n\nUse the following format:\n\nQuestion: the input question you must answer\nThought: you should always think about what to do\nAction: the action to take, should be one of [{tool_names}]\nAction Input: the input to the action\nObservation: the result of the action\n... (this Thought/Action/Action Input/Observation can repeat N times)\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n\nBegin!\n\nQuestion: {input}\nThought:{agent_scratchpad}'


In [None]:


# from langchain_core.output_parsers.base import BaseOutputParser
# from langchain_core.prompts import PromptTemplate

# class CustomOutputParser(BaseOutputParser):
#     def parse(self, output: str) -> dict:
#         # Implement your custom parsing logic here
#         # Example: Convert the raw output string into a dictionary or structured data
#         if "business idea" in output:
#             return {"idea": output.strip()}
#         else:
#             return {"error": "No business idea found in the output."}
# custom_output_parser = CustomOutputParser()


# agent  = create_react_agent(
#     llm=llm,
#     tools=tools,
#     prompt=prompt,
#     output_parser=custom_output_parser
# )

# response = agent.invoke({
#     "input": "Tell me a business idea using AI and fitness",
#     "intermediate_steps": []
# })




In [1]:
from langchain_core.output_parsers import BaseOutputParser

class StartupIdeaValidatorParser(BaseOutputParser):
    def parse(self, output):
        # Directly use the raw output string instead of accessing it as a dictionary
        response = output  # This should be a string
        
        # Extract the sections of the response using string operations
        idea_start = response.find("**Business Idea:") + len("**Business Idea:")
        idea_end = response.find("**Market Opportunity:")
        business_idea = response[idea_start:idea_end].strip()
        
        market_start = response.find("**Market Opportunity:") + len("**Market Opportunity:")
        market_end = response.find("**Competitive Landscape:")
        market_opportunity = response[market_start:market_end].strip()
        
        competitive_start = response.find("**Competitive Landscape:") + len("**Competitive Landscape:")
        competitive_end = response.find("**SWOT Analysis:")
        competitive_landscape = response[competitive_start:competitive_end].strip()

        swot_start = response.find("**SWOT Analysis:") + len("**SWOT Analysis:")
        swot_end = response.find("**Financial Viability:")
        swot_analysis = response[swot_start:swot_end].strip()

        financial_start = response.find("**Financial Viability:") + len("**Financial Viability:")
        financial_end = response.find("**Risk Assessment:")
        financial_viability = response[financial_start:financial_end].strip()
        
        risk_start = response.find("**Risk Assessment:") + len("**Risk Assessment:")
        risk_end = response.find("**Recommendation:")
        risk_assessment = response[risk_start:risk_end].strip()

        recommendation_start = response.find("**Recommendation:") + len("**Recommendation:")
        recommendation = response[recommendation_start:].strip()

        return {
            "business_idea": business_idea,
            "market_opportunity": market_opportunity,
            "competitive_landscape": competitive_landscape,
            "swot_analysis": swot_analysis,
            "financial_viability": financial_viability,
            "risk_assessment": risk_assessment,
            "recommendation": recommendation
        }


In [4]:
# Update agent with the custom output parser
agent1 = create_react_agent(
    llm=llm,
    tools=tools,
    prompt=prompt,
    output_parser=CustomOutputParser()  # Use the custom output parser
)




NameError: name 'llm' is not defined

In [None]:
# Invoke the agent and parse the response
response = agent1.invoke({
    "input": "Tell me a business idea using AI and healthcare",
    "intermediate_steps": []
})

# Display the structured response
print(response)