<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Search_Sources.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install --quiet -U "langchain-community>=0.2.16" langchain-exa langchain-google-community
!pip install --upgrade --quiet faiss-cpu

In [9]:
import asyncio
from typing import List, Dict, Any, Optional
from pydantic import BaseModel
from datetime import datetime
import pytz
from dotenv import load_dotenv
import nest_asyncio
import os
import getpass

from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain_community.tools import TavilySearchResults
from langchain_google_community import GoogleSearchAPIWrapper
from exa_py import Exa

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load environment variables
load_dotenv()

# API Keys (hidden for security purposes)
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "d8e815ef6caa94dbef7b977a0ea7d505b43a5a06")
EXA_API_KEY = os.getenv("EXA_API_KEY", "953b5801-11be-4b37-a313-f8df8f37027c")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "AIzaSyBIQo9X6acoBazBfte9jF9Pl0QEZ9oe8pk")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID", "63053004a7e2445c3")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")

# Set environment variables for Search Tools
os.environ["SERPER_API_KEY"] = SERPER_API_KEY
os.environ["EXA_API_KEY"] = EXA_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
os.environ["GOOGLE_CSE_ID"] = GOOGLE_CSE_ID
os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY

In [10]:
# Initialize search tools
google_serper = GoogleSerperAPIWrapper()
tavily_search = TavilySearchResults(
    max_results=5,
    search_depth="advanced",
    include_answer=True,
    include_raw_content=True,
    include_images=True,
)
google_search = GoogleSearchAPIWrapper()
exa = Exa(api_key=EXA_API_KEY)

In [11]:
class SearchResult(BaseModel):
    source: str
    title: str
    snippet: str
    url: str
    date: Optional[str]
    media: Optional[List[str]] = []
    media_content: Optional[List[Dict[str, str]]] = []
    links: Optional[List[str]] = []
    source_weight: Optional[float] = None
    source_name: Optional[str] = None
    final_score: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = {}

def parse_date(date_str: Optional[str]) -> Optional[datetime]:
    if not date_str:
        return None
    try:
        return datetime.fromisoformat(date_str.replace('Z', '+00:00'))
    except ValueError:
        try:
            return datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            return None

In [12]:
def google_serper_search(query: str, google_serper: GoogleSerperAPIWrapper) -> List[SearchResult]:
    results = google_serper.results(query)
    return [
        SearchResult(
            source="Google Serper",
            title=result.get("title", "No title"),
            snippet=result.get("snippet", "No snippet"),
            url=result.get("link", "No link"),
            date=result.get("date"),
            metadata={
                "author": result.get("author"),
                "location": result.get("location")
            }
        ) for result in results.get("organic", [])
    ]

def exa_search(query: str, exa: Exa) -> List[SearchResult]:
    try:
        response = exa.search_and_contents(
            query, use_autoprompt=True, num_results=5, text=True, highlights=True
        )
        results = response.results
        return [
            SearchResult(
                source="Exa Search",
                title=result.title,
                snippet=result.snippet,
                url=result.link,
                date=result.published_date,
                metadata={
                    "author": result.author,
                    "location": result.location
                },
                media_content=[{"image_url": result.image}] if result.image else []
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Exa Search: {str(e)}")
        return []

def tavily_search(query: str, tavily_search: TavilySearchResults) -> List[SearchResult]:
    try:
        results = tavily_search.search({"query": query})
        return [
            SearchResult(
                source="Tavily Search",
                title=result.get("title", "No title"),
                snippet=result.get("content", "No snippet"),
                url=result.get("url", "No link"),
                date=result.get("date"),
                metadata={
                    "author": result.get("author"),
                    "location": result.get("location")
                }
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Tavily Search: {str(e)}")
        return []

def google_programmable_search(query: str, google_search: GoogleSearchAPIWrapper) -> List[SearchResult]:
    try:
        results = google_search.results(query, num_results=5)
        return [
            SearchResult(
                source="Google Programmable Search",
                title=result.get("title", "No title"),
                snippet=result.get("snippet", "No snippet"),
                url=result.get("link", "No link"),
                date=result.get("date"),
                metadata={
                    "author": result.get("author"),
                    "location": result.get("location")
                }
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Google Programmable Search: {str(e)}")
        return []

def google_serper_image_search(query: str) -> List[SearchResult]:
    search_images = GoogleSerperAPIWrapper(type="images")
    results_images = search_images.results(query)
    return [
        SearchResult(
            source="Google Serper Image Search",
            title=result.get("title", "No title"),
            snippet=result.get("snippet", "No snippet"),
            url=result.get("imageUrl", "No link"),
            date=None,
            media=[result.get("imageUrl", "No link")],
            media_content=[{"image_url": result.get("imageUrl", "No link")}]
        ) for result in results_images.get("images", [])
    ]

def google_programmable_image_search(query: str, google_search: GoogleSearchAPIWrapper) -> List[SearchResult]:
    try:
        results = google_search.results(query + " image", num_results=5)
        return [
            SearchResult(
                source="Google Programmable Image Search",
                title=result.get("title", "No title"),
                snippet=result.get("snippet", "No snippet"),
                url=result.get("link", "No link"),
                date=None,
                media=[result.get("link", "No link")],
                media_content=[{"image_url": result.get("link", "No link")}]
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Google Programmable Image Search: {str(e)}")
        return []

def aggregate_search_results(query: str, *args: List[SearchResult]) -> List[SearchResult]:
    all_results = []
    media_content = []
    sources = ['serper', 'exa', 'tavily', 'google', 'google_serper_image', 'google_programmable_image']
    weights = [1.0, 0.9, 0.85, 0.8, 0.75, 0.7]

    for results, source, weight in zip(args, sources, weights):
        all_results.extend([(result, source, weight, result.source_weight or 0, parse_date(result.date)) for result in results])
        media_content.extend([media for result in results for media in result.media_content])

    seen_urls = set()
    unique_results = []

    for result, source, weight, source_weight, date in all_results:
        if result.url not in seen_urls:
            seen_urls.add(result.url)
            result.source_weight = source_weight
            result.source_name = source
            date_score = calculate_recency_score(date)
            final_score = weight + source_weight + date_score
            result.final_score = final_score
            unique_results.append(result)

    unique_results.sort(reverse=True, key=lambda x: x.final_score)
    return unique_results, media_content

def calculate_recency_score(date: Optional[datetime]) -> float:
    if date is None:
        return 0.0
    current_date = datetime.now(pytz.utc)
    days_old = (current_date - date).days
    if days_old < 0:
        return 0.0
    return 0.9 ** days_old

In [13]:
async def execute_searches(query: str, tools: Dict[str, Any]) -> Dict[str, Any]:
    search_functions = [
        (google_serper_search, tools["google_serper"]),
        (google_programmable_search, tools["google_search"]),
        (exa_search, tools["exa"]),
        (tavily_search, tools["tavily_search"]),
        (google_serper_image_search, None),
        (google_programmable_image_search, tools["google_search"])
    ]
    search_tasks = [asyncio.to_thread(func, query, tool) if tool else asyncio.to_thread(func, query) for func, tool in search_functions]
    search_results = await asyncio.gather(*search_tasks, return_exceptions=True)

    successful_results = []
    for results in search_results:
        if isinstance(results, Exception):
            print(f"ERROR in search: {str(results)}")
        else:
            successful_results.append(results)

    combined_results, media_content = aggregate_search_results(query, *successful_results)
    urls = [result.url for result in combined_results]

    return {
        "results": combined_results,
        "urls": urls,
        "media_content": media_content
    }

def initialize_models_and_tools():
    return {
        "google_serper": google_serper,
        "google_search": google_search,
        "exa": exa,
        "tavily_search": tavily_search,
    }

In [14]:
if __name__ == "__main__":
    tools = initialize_models_and_tools()
    query = "Latest Cyber Incidents by Lockbit Ransomware Group?"
    results = asyncio.run(execute_searches(query, tools))

    print("Search Results:")
    for result in results["results"]:
        print(f"Title: {result.title}, URL: {result.url}")
        if result.media_content:
            print(f"Media Content: {result.media_content}")

    print("URLs List:")
    print(results["urls"])

    print("Media Content List:")
    print(results["media_content"])

ERROR in Tavily Search: 'function' object has no attribute 'search'
ERROR in Exa Search: 'Result' object has no attribute 'snippet'
Search Results:
Title: United States Charges Dual Russian and Israeli National as ..., URL: https://www.justice.gov/opa/pr/united-states-charges-dual-russian-and-israeli-national-developer-lockbit-ransomware-group
Title: Latest LockBit news - Bleeping Computer, URL: https://www.bleepingcomputer.com/tag/lockbit/
Title: Beware Feb. 3, 2025—Diabolic Ransomware Gang Issues ... - Forbes, URL: https://www.forbes.com/sites/daveywinder/2024/12/21/notorious-ransomware-gang-warns-new-attacks-incoming-on-feb-3-2025/
Title: A notorious ransomware group demanded millions from Fulton ..., URL: https://www.cbsnews.com/news/ransomware-group-lockbit-russia-fulton-county-georgia-robb-pitts/
Title: LockBit Latest News, URL: https://therecord.media/tag/lockbit
Title: U.S. Charges Dual Russian And Israeli National As Developer Of ..., URL: https://www.justice.gov/usao-nj/pr/us