<a href="https://colab.research.google.com/github/KaifAhmad1/code-test/blob/main/Search_Sources.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install -qU telethon tweepy feedparser google-api-python-client requests tavily-python exa_py python-dotenv

In [6]:
import asyncio
from typing import List, Dict, Any, Optional
from pydantic import BaseModel
from datetime import datetime
import pytz
from dotenv import load_dotenv
import nest_asyncio
import os
import requests
from googleapiclient.discovery import build
from exa_py import Exa
from tavily import TavilyClient
from telethon import TelegramClient
import tweepy
import feedparser

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Load environment variables
load_dotenv()

# API Keys (hidden for security purposes)
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "d8e815ef6caa94dbef7b977a0ea7d505b43a5a06")
EXA_API_KEY = os.getenv("EXA_API_KEY", "953b5801-11be-4b37-a313-f8df8f37027c")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "AIzaSyBIQo9X6acoBazBfte9jF9Pl0QEZ9oe8pk")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID", "63053004a7e2445c3")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "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")
TELEGRAM_API_ID = os.getenv("TELEGRAM_API_ID", "your_telegram_api_id")
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH", "your_telegram_api_hash")
TWITTER_API_KEY = os.getenv("TWITTER_API_KEY", "your_twitter_api_key")
TWITTER_API_SECRET = os.getenv("TWITTER_API_SECRET", "your_twitter_api_secret")
TWITTER_ACCESS_TOKEN = os.getenv("TWITTER_ACCESS_TOKEN", "your_twitter_access_token")
TWITTER_ACCESS_TOKEN_SECRET = os.getenv("TWITTER_ACCESS_TOKEN_SECRET", "your_twitter_access_token_secret")

# Set environment variables for Search Tools
os.environ["SERPER_API_KEY"] = SERPER_API_KEY
os.environ["EXA_API_KEY"] = EXA_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
os.environ["GOOGLE_CSE_ID"] = GOOGLE_CSE_ID
os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY

In [7]:
# Constants
SECURITY_RSS_FEEDS = [
    "https://www.bleepingcomputer.com/feed/",
    "https://feeds.feedburner.com/TheHackersNews",
    "https://krebsonsecurity.com/feed/",
    "https://www.darkreading.com/rss.xml",
    "https://www.securityweek.com/feed/",
    "https://www.csoonline.com/feed/",
    "https://www.threatpost.com/feed/",
    "https://www.helpnetsecurity.com/feed/",
    "https://www.infosecurity-magazine.com/rss/news/",
    "https://www.cybersecurity-insiders.com/feed/",
    "https://www.zdnet.com/topic/security/rss.xml",
    "https://www.schneier.com/feed/atom/",
    "https://www.theregister.com/security/headlines.atom",
    "https://www.govinfosecurity.com/rss/feeds/rss",
    "https://www.crowdstrike.com/blog/feed/"
]

TELEGRAM_CHANNELS = [
    'cybersecuritynews',
    'ransomware_news',
    'malware_news',
    'infosec_latest',
    'cyber_threat_intel',
    'hacking_news',
    'cyber_security_updates',
    'data_breach_news',
    'cybercrime_updates',
    'security_research_news',
    'cyber_attack_alerts',
    'privacy_news',
    'cyber_defense_updates',
    'threat_intelligence',
    'cyber_news_daily'
]

In [8]:
# Initialize services
exa = Exa(api_key=EXA_API_KEY)
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
google_service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
telegram_client = TelegramClient('session_name', TELEGRAM_API_ID, TELEGRAM_API_HASH)
twitter_auth = tweepy.OAuth1UserHandler(TWITTER_API_KEY, TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)
twitter_api = tweepy.API(twitter_auth)

ValueError: invalid literal for int() with base 10: 'your_telegram_api_id'

In [9]:
class SearchResult(BaseModel):
    source: str
    title: str
    snippet: str
    url: str
    date: Optional[str] = None
    media: Optional[List[str]] = []
    media_content: Optional[List[Dict[str, str]]] = []

In [10]:
def google_serper_search(query: str) -> List[SearchResult]:
    url = "https://google.serper.dev/search"
    payload = {
        "q": query,
        "gl": "us",
        "hl": "en",
        "autocorrect": True
    }
    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        data = response.json()

        return [
            SearchResult(
                source="Google Serper",
                title=result.get("title", "No title"),
                snippet=result.get("snippet", "No snippet"),
                url=result.get("link", "No link"),
                date=result.get("date")
            ) for result in data.get("organic", [])
        ]
    except Exception as e:
        print(f"ERROR in Serper Search: {str(e)}")
        return []

def google_programmable_search(query: str) -> List[SearchResult]:
    try:
        search_results = google_service.cse().list(
            q=query,
            cx=GOOGLE_CSE_ID,
            num=5
        ).execute()

        return [
            SearchResult(
                source="Google Programmable Search",
                title=item.get("title", "No title"),
                snippet=item.get("snippet", "No snippet"),
                url=item.get("link", "No link"),
                date=item.get("pagemap", {}).get("metatags", [{}])[0].get("article:published_time")
            ) for item in search_results.get("items", [])
        ]
    except Exception as e:
        print(f"ERROR in Google Programmable Search: {str(e)}")
        return []

def google_serper_image_search(query: str) -> List[SearchResult]:
    url = "https://google.serper.dev/images"
    payload = {
        "q": query,
        "gl": "us",
        "hl": "en"
    }
    headers = {
        "X-API-KEY": SERPER_API_KEY,
        "Content-Type": "application/json"
    }

    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        data = response.json()

        return [
            SearchResult(
                source="Google Serper Image Search",
                title=result.get("title", "No title"),
                snippet=result.get("snippet", "No snippet"),
                url=result.get("imageUrl", "No link"),
                media=[result.get("imageUrl", "No link")],
                media_content=[{"image_url": result.get("imageUrl", "No link")}]
            ) for result in data.get("images", [])
        ]
    except Exception as e:
        print(f"ERROR in Serper Image Search: {str(e)}")
        return []

def google_programmable_image_search(query: str) -> List[SearchResult]:
    try:
        search_results = google_service.cse().list(
            q=query,
            cx=GOOGLE_CSE_ID,
            num=5,
            searchType="image"
        ).execute()

        return [
            SearchResult(
                source="Google Programmable Image Search",
                title=item.get("title", "No title"),
                snippet=item.get("snippet", "No snippet"),
                url=item.get("link", "No link"),
                media=[item.get("link", "No link")],
                media_content=[{"image_url": item.get("link", "No link")}]
            ) for item in search_results.get("items", [])
        ]
    except Exception as e:
        print(f"ERROR in Google Programmable Image Search: {str(e)}")
        return []

def exa_search(query: str) -> List[SearchResult]:
    try:
        response = exa.search_and_contents(
            query,
            use_autoprompt=True,
            num_results=5,
            text=True,
            highlights=True
        )
        results = response.results

        return [
            SearchResult(
                source="Exa Search",
                title=result.title,
                snippet=result.highlights[0] if result.highlights else "No snippet",
                url=result.url,
                date=result.publishedDate if hasattr(result, 'publishedDate') else None,
                media_content=[{"image_url": result.url}] if result.url else []
            ) for result in results
        ]
    except Exception as e:
        print(f"ERROR in Exa Search: {str(e)}")
        return []

def tavily_search(query: str) -> List[SearchResult]:
    try:
        response = tavily_client.search(
            query,
            search_depth="advanced",
            include_answer=True,
            include_raw_content=True,
            include_images=True,
            max_results=5
        )

        return [
            SearchResult(
                source="Tavily Search",
                title=result.get("title", "No title"),
                snippet=result.get("content", "No snippet"),
                url=result.get("url", "No link"),
                date=result.get("published_date"),
                media=[result.get("url", "No link")] if result.get("url") else [],
                media_content=[{"image_url": result.get("url", "No link")}] if result.get("url") else []
            ) for result in response.get("results", [])
        ]
    except Exception as e:
        print(f"ERROR in Tavily Search: {str(e)}")
        return []

async def telegram_search(query: str) -> List[SearchResult]:
    try:
        await telegram_client.start()
        all_results = []
        for channel in TELEGRAM_CHANNELS:
            results = await telegram_client.get_messages(channel, search=query, limit=5)
            all_results.extend([
                SearchResult(
                    source=f"Telegram ({channel})",
                    title=result.message,
                    snippet=result.message,
                    url=f"https://t.me/{channel}/{result.id}",
                    date=result.date.isoformat()
                ) for result in results
            ])
        return all_results
    except Exception as e:
        print(f"ERROR in Telegram Search: {str(e)}")
        return []

def twitter_search(query: str) -> List[SearchResult]:
    try:
        tweets = twitter_api.search_tweets(q=query, count=5)
        return [
            SearchResult(
                source="Twitter",
                title=tweet.user.name,
                snippet=tweet.text,
                url=f"https://twitter.com/{tweet.user.screen_name}/status/{tweet.id}",
                date=tweet.created_at.isoformat()
            ) for tweet in tweets
        ]
    except Exception as e:
        print(f"ERROR in Twitter Search: {str(e)}")
        return []

def rss_feed_search(query: str) -> List[SearchResult]:
    try:
        all_results = []
        for feed_url in SECURITY_RSS_FEEDS:
            feed = feedparser.parse(feed_url)
            all_results.extend([
                SearchResult(
                    source=f"RSS Feed ({feed_url})",
                    title=entry.title,
                    snippet=entry.summary,
                    url=entry.link,
                    date=entry.published
                ) for entry in feed.entries if query.lower() in entry.title.lower() or query.lower() in entry.summary.lower()
            ])
        return all_results
    except Exception as e:
        print(f"ERROR in RSS Feed Search: {str(e)}")
        return []

def aggregate_search_results(*args: List[SearchResult]) -> List[SearchResult]:
    all_results = []
    media_content = []

    for results in args:
        all_results.extend(results)
        media_content.extend([media for result in results for media in result.media_content])

    seen_urls = set()
    unique_results = []

    for result in all_results:
        if result.url not in seen_urls:
            seen_urls.add(result.url)
            unique_results.append(result)

    # Sort results by date (most recent first)
    unique_results.sort(key=lambda x: datetime.strptime(x.date, "%Y-%m-%dT%H:%M:%S") if x.date else datetime.min, reverse=True)

    # Sort media content by date (most recent first)
    media_content.sort(key=lambda x: datetime.strptime(x.get("date", ""), "%Y-%m-%dT%H:%M:%S") if x.get("date") else datetime.min, reverse=True)

    return unique_results, media_content

In [11]:
async def execute_searches(query: str) -> Dict[str, Any]:
    search_functions = [
        google_serper_search,
        google_programmable_search,
        exa_search,
        tavily_search,
        google_serper_image_search,
        google_programmable_image_search,
        telegram_search,
        lambda q: twitter_search(q),
        lambda q: rss_feed_search(q)
    ]
    search_tasks = [asyncio.to_thread(func, query) if callable(func) else func(query) for func in search_functions]
    search_results = await asyncio.gather(*search_tasks, return_exceptions=True)

    successful_results = []
    for results in search_results:
        if isinstance(results, Exception):
            print(f"ERROR in search: {str(results)}")
        else:
            successful_results.append(results)

    combined_results, media_content = aggregate_search_results(*successful_results)
    urls = [result.url for result in combined_results]

    return {
        "results": combined_results,
        "urls": urls,
        "media_content": media_content
    }

In [12]:
if __name__ == "__main__":
    query = "Latest Cyber Incidents by Lockbit Ransomware Group?"
    results = asyncio.run(execute_searches(query))

    print("\nSearch Results:")
    for result in results["results"]:
        print(f"\nTitle: {result.title}")
        print(f"Source: {result.source}")
        print(f"URL: {result.url}")
        print(f"Snippet: {result.snippet}")
        if result.media_content:
            print(f"Media Content: {result.media_content}")

    print("\nURLs List:")
    print(results["urls"])

    print("\nMedia Content List:")
    print(results["media_content"])

ERROR in Twitter Search: name 'twitter_api' is not defined
ERROR in RSS Feed Search: object has no attribute 'summary'


TypeError: 'coroutine' object is not iterable