<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Search_Sources.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!pip install --upgrade --quiet sentence-transformers langchain langchain-groq langchain-pinecone langchain_cohere
!pip install --quiet -U "langchain-community>=0.2.16" langchain-exa langchain-google-community goose3 crawl4ai[all]
!pip install --upgrade --quiet faiss-cpu langchain_cohere
!pip install -qU langgraph

In [33]:
import asyncio
from typing import List, Dict, Any, Optional
from pydantic import BaseModel
from datetime import datetime
import pytz
from dotenv import load_dotenv
import nest_asyncio
import os
import getpass

from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_community.tools import TavilySearchResults
from langchain_google_community import GoogleSearchAPIWrapper
from exa_py import Exa

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load environment variables
load_dotenv()

# API Keys (hidden for security purposes)
GROQ_API_KEY = getpass.getpass("Enter your Groq API key: ")
PINECONE_API_KEY = "8e15b925-3b96-497d-b20a-08d308782b83"
PINECONE_ENVIRONMENT = "us-east-1"
ASKNEWS_CLIENT_ID = "a0de4609-b760-4c83-9609-5c04d7743b84"
ASKNEWS_CLIENT_SECRET = "D5Mlhkztk4TcW24diUgcW0FA2w"
SERPER_API_KEY = "d8e815ef6caa94dbef7b977a0ea7d505b43a5a06"
EXA_API_KEY = "953b5801-11be-4b37-a313-f8df8f37027c"
GOOGLE_API_KEY = "AIzaSyBIQo9X6acoBazBfte9jF9Pl0QEZ9oe8pk"
GOOGLE_CSE_ID = "63053004a7e2445c3"
TAVILY_API_KEY = "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"
FIRECRAWL_API_KEY = "fc-9c7bf92d1db44ae1a34f9dc56a6031e6"
COHERE_API_KEY = "7e9js19mjC1pb3dNHKg012u6J9LRl8614KFL4ZmL"

# Set environment variables for Search Tools
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
os.environ["ASKNEWS_CLIENT_ID"] = ASKNEWS_CLIENT_ID
os.environ["ASKNEWS_CLIENT_SECRET"] = ASKNEWS_CLIENT_SECRET
os.environ["SERPER_API_KEY"] = SERPER_API_KEY
os.environ["EXA_API_KEY"] = EXA_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
os.environ["GOOGLE_CSE_ID"] = GOOGLE_CSE_ID
os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY
os.environ["FIRECRAWL_API_KEY"] = FIRECRAWL_API_KEY
os.environ["COHERE_API_KEY"] = COHERE_API_KEY

Enter your Groq API key: ··········


In [36]:
# Initialize the Groq model
llm = ChatGroq(
    model="llama-3.2-3b-preview",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Initialize the embeddings with advanced BGE model
embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True}
)

# Initialize Pinecone and vector store
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT)
pinecone_index = pc.Index("new-cyber-search")
vector_store = PineconeVectorStore(index=pinecone_index, embedding=embeddings)

# Initialize search tools
google_serper = GoogleSerperAPIWrapper()
tavily_search = TavilySearchResults(
    max_results=5,
    search_depth="advanced",
    include_answer=True,
    include_raw_content=True,
    include_images=True,
)
google_search = GoogleSearchAPIWrapper()
exa = Exa(api_key=EXA_API_KEY)

# Define the retriever
retriever = vector_store.as_retriever()

In [37]:
from typing import TypedDict, List, Dict, Any, Optional
from pydantic import BaseModel
class SearchResult(BaseModel):
    source: str
    title: str
    snippet: str
    url: str
    date: Optional[str]
    media: Optional[List[str]] = []
    media_content: Optional[List[Dict[str, str]]] = []
    links: Optional[List[str]] = []
    source_weight: Optional[float] = None
    source_name: Optional[str] = None
    final_score: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = {}

def parse_date(date_str: Optional[str]) -> Optional[datetime]:
    if not date_str:
        return None
    try:
        return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
    except ValueError:
        try:
            return datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            return None

In [38]:
def vector_search(query: str, vector_store: PineconeVectorStore) -> List[SearchResult]:
    if not vector_store:
        print("Vector store is not initialized.")
        return []

    print(f"Performing vector search with query: {query}")
    results = vector_store.similarity_search(query, k=5)
    return [
        SearchResult(
            source="Vector Search",
            title=f"Result {i+1}",
            snippet=doc.page_content,
            url=doc.metadata.get("source", "No URL"),
            date=doc.metadata.get("date"),
            metadata=doc.metadata
        ) for i, doc in enumerate(results)
    ]

def google_serper_search(query: str, google_serper: GoogleSerperAPIWrapper) -> List[SearchResult]:
    print(f"Performing Google Serper search with query: {query}")
    results = google_serper.results(query)
    return [
        SearchResult(
            source="Google Serper",
            title=result.get("title", "No title"),
            snippet=result.get("snippet", "No snippet"),
            url=result.get("link", "No link"),
            date=result.get("date"),
            metadata={
                "author": result.get("author"),
                "location": result.get("location")
            }
        ) for result in results.get("organic", [])
    ]

def exa_search(query: str, exa: Exa) -> List[SearchResult]:
    try:
        print(f"Starting Exa Search with query: {query}")
        response = exa.search_and_contents(
            query, use_autoprompt=True, num_results=5, text=True, highlights=True
        )
        print(f"Raw results from Exa Search: {response}")

        results = response.results  # Extract the list of results from the SearchResponse object

        search_results = [
            SearchResult(
                source="Exa Search",
                title=result.get("title", "No title"),
                snippet=result.get("snippet", "No snippet"),
                url=result.get("link", "No link"),
                date=result.get("date"),
                metadata={
                    "author": result.get("author"),
                    "location": result.get("location")
                }
            ) for result in results
        ]

        print(f"Processed Exa Search results: {search_results}")
        return search_results
    except Exception as e:
        print(f"ERROR in Exa Search: {str(e)}")
        return []

def tavily_search(query: str, tavily_search: TavilySearchResults) -> List[SearchResult]:
    try:
        print(f"Performing Tavily search with query: {query}")
        results = tavily_search.invoke({"query": query})
        return [
            SearchResult(
                source="Tavily Search",
                title=result.get("title", "No title"),
                snippet=result.get("content", "No snippet"),
                url=result.get("url", "No link"),
                date=result.get("date"),
                metadata={
                    "author": result.get("author"),
                    "location": result.get("location")
                }
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Tavily Search: {str(e)}")
        return []

def google_programmable_search(query: str, google_search: GoogleSearchAPIWrapper) -> List[SearchResult]:
    try:
        print(f"Performing Google Programmable search with query: {query}")
        results = google_search.results(query, num_results=5)
        return [
            SearchResult(
                source="Google Programmable Search",
                title=result.get("title", "No title"),
                snippet=result.get("snippet", "No snippet"),
                url=result.get("link", "No link"),
                date=result.get("date"),
                metadata={
                    "author": result.get("author"),
                    "location": result.get("location")
                }
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Google Programmable Search: {str(e)}")
        return []

def google_serper_image_search(query: str) -> List[SearchResult]:
    print(f"Performing Google Serper Image search with query: {query}")
    search_images = GoogleSerperAPIWrapper(type="images")
    results_images = search_images.results(query)
    return [
        SearchResult(
            source="Google Serper Image Search",
            title=result.get("title", "No title"),
            snippet=result.get("snippet", "No snippet"),
            url=result.get("imageUrl", "No link"),
            date=None,
            media=[result.get("imageUrl", "No link")]
        ) for result in results_images.get("images", [])
    ]

def google_programmable_image_search(query: str, google_search: GoogleSearchAPIWrapper) -> List[SearchResult]:
    try:
        print(f"Performing Google Programmable Image search with query: {query}")
        results = google_search.results(query + " image", num_results=5)
        return [
            SearchResult(
                source="Google Programmable Image Search",
                title=result.get("title", "No title"),
                snippet=result.get("snippet", "No snippet"),
                url=result.get("link", "No link"),
                date=None,
                media=[result.get("link", "No link")]
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Google Programmable Image Search: {str(e)}")
        return []

def aggregate_search_results(query: str, *args: List[SearchResult]) -> List[SearchResult]:
    all_results = []
    sources = ['vector', 'serper', 'exa', 'tavily', 'google', 'google_serper_image', 'google_programmable_image']
    weights = [0.6, 1.0, 0.9, 0.85, 0.8, 0.75, 0.7]  # Adjusted weights to prioritize Google Serper, Google Programmable Search, Exa.ai, and Tavily

    for results, source, weight in zip(args, sources, weights):
        all_results.extend([(result, source, weight, result.source_weight or 0, parse_date(result.date)) for result in results])

    seen_urls = set()
    unique_results = []

    for result, source, weight, source_weight, date in all_results:
        if result.url not in seen_urls:
            seen_urls.add(result.url)
            result.source_weight = source_weight
            result.source_name = source
            date_score = calculate_recency_score(date)
            final_score = weight + source_weight + date_score
            result.final_score = final_score
            unique_results.append(result)

    unique_results.sort(reverse=True, key=lambda x: x.final_score)
    return unique_results

def calculate_recency_score(date: Optional[datetime]) -> float:
    if date is None:
        return 0.0
    current_date = datetime.now(pytz.utc)
    days_old = (current_date - date).days
    if days_old < 0:  # Future date
        return 0.0
    return 0.9 ** days_old  # Exponential decay with base 0.9

In [39]:
async def execute_searches(query: str, tools: Dict[str, Any]) -> Dict[str, Any]:
    print(f"Executing searches for query: {query}")

    # Execute all searches in parallel
    search_functions = [
        (google_serper_search, tools["google_serper"]),
        (google_programmable_search, tools["google_search"]),
        (exa_search, tools["exa"]),
        (tavily_search, tools["tavily_search"]),
        (vector_search, tools["vector_store"]),
        (google_serper_image_search, None),
        (google_programmable_image_search, tools["google_search"])
    ]
    search_tasks = [asyncio.to_thread(func, query, tool) if tool else asyncio.to_thread(func, query) for func, tool in search_functions]
    search_results = await asyncio.gather(*search_tasks, return_exceptions=True)

    # Handle exceptions and filter out failed searches
    successful_results = []
    for results in search_results:
        if isinstance(results, Exception):
            print(f"ERROR in search: {str(results)}")
        else:
            successful_results.append(results)

    # Aggregate and deduplicate results with metadata scoring
    combined_results = aggregate_search_results(query, *successful_results)

    # Extract URLs and images from the combined results
    urls = [result.url for result in combined_results]
    media = [media for result in combined_results for media in result.media]

    return {
        "results": combined_results,
        "urls": urls,
        "media": media
    }

def initialize_api_keys():
    # This function is already handled by setting environment variables directly
    pass

def initialize_models_and_tools():
    # Initialize the Groq model
    llm = ChatGroq(
        model="llama-3.2-3b-preview",
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

    # Initialize the embeddings with advanced BGE model
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True}
    )

    try:
        # Initialize Pinecone and vector store
        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"), environment=os.getenv("PINECONE_ENVIRONMENT"))
        pinecone_index = pc.Index("new-cyber-search")
        vector_store = PineconeVectorStore(index=pinecone_index, embedding=embeddings)
    except PineconeConfigurationError as e:
        print(f"ERROR initializing Pinecone: {str(e)}")
        vector_store = None

    # Initialize other models and tools
    google_serper = GoogleSerperAPIWrapper()
    tavily_search = TavilySearchResults(
        max_results=5,
        search_depth="advanced",
        include_answer=True,
        include_raw_content=True,
        include_images=True,
    )
    google_search = GoogleSearchAPIWrapper()
    exa = Exa(api_key=os.getenv("EXA_API_KEY"))

    return {
        "google_serper": google_serper,
        "google_search": google_search,
        "exa": exa,
        "tavily_search": tavily_search,
        "vector_store": vector_store
    }

In [40]:
# Example usage
if __name__ == "__main__":
    initialize_api_keys()
    tools = initialize_models_and_tools()

    if tools:
        query = "Latest Cyber Incidents by Lockbit Ransomware Group?"
        results = asyncio.run(execute_searches(query, tools))

        print("Search Results:")
        for result in results["results"]:
            print(f"Title: {result.title}, URL: {result.url}")

        print("URLs List:")
        print(results["urls"])

        print("Media List:")
        print(results["media"])
    else:
        print("Failed to initialize models and tools.")

NameError: name 'PineconeConfigurationError' is not defined