# In [None]:  (Jupyter notebook cell marker left over from the export)
# First, let's make sure we have the necessary libraries installed.
# !pip install langgraph langchain langchain_groq groq sentence-transformers Pillow requests beautifulsoup4 serpapi numpy scikit-learn

import json
import logging
import os
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Annotated, Sequence, TypedDict, List, Dict, Optional
from urllib.parse import urljoin, urlparse

# Langchain and LangGraph imports
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage, AIMessage, ToolMessage
from langchain_core.tools import tool
from langchain_groq import ChatGroq
from groq import Groq

# HTTP, scraping, image and ML imports
import numpy as np
import requests
from bs4 import BeautifulSoup
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# --- Configuration & Setup ---

# 1. Logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# 2. API Keys
# Secrets are read from the environment so they never live in the source file.
# Export GROQ_API_KEY, SERPER_API_KEY and SERPAPI_API_KEY before running.
# NOTE(review): any key that was ever committed to this file must be treated
# as leaked and rotated with the provider.
#
# setdefault() keeps a real key that is already exported; the "YOUR_..."
# placeholder is only installed when nothing is set, and is what get_llm()
# looks for to report a missing key.
os.environ.setdefault("GROQ_API_KEY", "YOUR_GROQ_API_KEY")
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")
SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY", "")

# 3. Directories
BASE_OUTPUT_DIR = Path("osint_results")
LOCAL_IMAGE_DB_DIR = Path("static/images")
# parents=True: "static/images" is nested, so a plain mkdir() fails when
# the "static" directory does not exist yet.
BASE_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
LOCAL_IMAGE_DB_DIR.mkdir(parents=True, exist_ok=True)


# --- LLM and Model Initialization (Lazy Loading) ---

# Module-level caches. Both start as None and are filled on first use by
# get_llm() and get_clip_model() respectively, which rebind them via `global`.
llm = None
clip_model = None

def get_llm():
    """Return the shared ChatGroq client, creating it on the first call.

    Raises:
        ValueError: If GROQ_API_KEY is unset/empty or still contains the
            "YOUR_" placeholder.
        Exception: Whatever the ChatGroq constructor raises; it is logged
            and then re-raised unchanged.
    """
    global llm
    if llm is not None:
        # Already built by an earlier call: reuse the cached client.
        return llm

    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key or "YOUR_" in api_key:
        raise ValueError("GROQ_API_KEY is not set. Please add your key to the script.")

    try:
        llm = ChatGroq(model="gemma2-9b-it", api_key=api_key, temperature=0.1)
    except Exception as e:
        logger.error(f"Failed to initialize LLM: {e}")
        raise
    return llm

def get_clip_model():
    """Return the shared CLIP SentenceTransformer, loading it on first use."""
    global clip_model
    if clip_model is not None:
        # Model was loaded by a previous call.
        return clip_model

    logger.info("Loading CLIP model for image similarity...")
    clip_model = SentenceTransformer("sentence-transformers/clip-ViT-B-32")
    logger.info("CLIP model loaded.")
    return clip_model


# --- Agent State Definition ---

class ListingAgentState(TypedDict):
    """State dictionary shared by the nodes of the listing-investigation graph."""
    # Message history; annotated with LangGraph's add_messages so node outputs
    # are merged into the existing list instead of replacing it.
    messages: Annotated[Sequence[BaseMessage], add_messages]
    # Name of the listing being investigated (used in the search query and report prompt).
    listing_name: str
    # URLs returned by the URL search tool, stored by tool_execution_node.
    search_urls: List[str]
    # Per-URL scrape results, keyed by the scraped URL.
    extracted_data: Dict[str, Dict]
    # Parsed output of the image similarity tool.
    similarity_report: Dict[str, List[Dict]]
    # Text of the generated report.
    final_report: str
    # Labels of finished pipeline steps: "url_search", "content_extraction",
    # "similarity_check", "final_report".
    completed_tasks: List[str]

# --- Tool Definitions ---

@tool
def listing_url_search_tool(listing_name: str) -> str:
    """
    Performs a search to find URLs related to an Airbnb listing.
    Returns a JSON string of the search results.
    """
    # NOTE: the docstring above doubles as the tool description, so it is kept
    # short; implementation notes live in comments.
    #
    # Input:  listing_name - free-text query sent to the Serper search API.
    # Output: JSON-encoded list of unique result links whose URL mentions one
    #         of the known listing sites, or an "Error: ..." string when the
    #         request or response handling fails.
    print(f"🔎 URL Search Tool - Searching for: {listing_name}")
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": listing_name, "location": "Morocco", "gl": "ma", "num": 20})
    headers = {'X-API-KEY': SERPER_API_KEY, 'Content-Type': 'application/json'}
    relevant_sites = ('airbnb.com', 'booking.com', 'instagram.com', 'agoda.com', 'trivago')
    try:
        response = requests.post(url, headers=headers, data=payload, timeout=20)
        response.raise_for_status()
        results = response.json()

        urls = []
        for item in results.get('organic', []):
            # Tolerate entries without a link instead of failing the whole search.
            link = item.get('link')
            if not link or link in urls:
                continue
            if any(site in link for site in relevant_sites):
                urls.append(link)

        print(f"✅ Found {len(urls)} relevant URLs.")
        return json.dumps(urls)
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        logger.error(f"URL search failed: {e}")
        return f"Error: Could not perform search. {e}"

@tool
def url_content_extractor_tool(url_to_scrape: str) -> str:
    """
    Browses a single URL, extracts all images, saves them to a unique folder,
    and extracts the host's name if possible.
    """
    # Input:  url_to_scrape - page to download and parse.
    # Output: JSON object with "source_url", "saved_images" (local file paths)
    #         and "found_host", or an "Error: ..." string if the page itself
    #         cannot be fetched or parsed. Individual image failures are only
    #         logged.
    # NOTE(review): the folder is derived from the domain only, so two pages
    # on the same site share one folder.
    print(f"🖼️ Content Extractor Tool - Scraping: {url_to_scrape}")
    try:
        domain = urlparse(url_to_scrape).netloc.replace('.', '_')
        output_dir = BASE_OUTPUT_DIR / domain
        output_dir.mkdir(parents=True, exist_ok=True)

        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url_to_scrape, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        image_urls = [img['src'] for img in soup.find_all('img') if img.get('src')]
        saved_image_paths = []
        for i, raw_src in enumerate(image_urls):
            # Inline data: URIs carry no downloadable resource.
            if raw_src.startswith('data:'):
                continue
            # urljoin resolves relative ("img/a.jpg"), root-relative ("/a.jpg")
            # and protocol-relative ("//cdn/a.jpg") sources against the page URL.
            img_url = urljoin(url_to_scrape, raw_src)
            if not img_url.startswith(('http:', 'https:')):
                continue
            try:
                # Context manager releases the streamed connection in every case.
                with requests.get(img_url, stream=True, timeout=15) as img_response:
                    if img_response.status_code != 200:
                        continue
                    img_name = Path(urlparse(img_url).path).name
                    if not img_name or len(img_name) > 100:
                        img_name = f"image_{i}.jpg"
                    save_path = output_dir / img_name
                    # Different URLs can share a basename; keep both files.
                    if str(save_path) in saved_image_paths:
                        save_path = output_dir / f"{i}_{img_name}"
                    with open(save_path, 'wb') as f:
                        for chunk in img_response.iter_content(1024):
                            f.write(chunk)
                    saved_image_paths.append(str(save_path))
            except Exception as e:
                # Best effort: one broken image must not abort the page scrape.
                logger.warning(f"Could not download image {img_url}: {e}")

        host_name = "Not found"
        text_content = soup.get_text(" ", strip=True)
        # The case-insensitive flag is scoped to the label only, so the name
        # itself must really start with an uppercase letter.
        match = re.search(r"(?i:hosted by|host:|géré par|owner:|propriétaire:)\s*([A-Z][a-zA-Z\s]+)", text_content)
        if match:
            host_name = match.group(1).strip()

        result = {"source_url": url_to_scrape, "saved_images": saved_image_paths, "found_host": host_name}
        print(f"✅ Extracted {len(saved_image_paths)} images and host '{host_name}' from {url_to_scrape}")
        return json.dumps(result)
    except Exception as e:
        logger.error(f"Failed to scrape {url_to_scrape}: {e}")
        return f"Error: Could not scrape URL. {e}"

def _load_rgb_image(image_path):
    """Return an in-memory RGB copy of the image at *image_path*, or None if it cannot be read."""
    try:
        # convert() yields a loaded copy, so the file handle can be closed right away.
        with Image.open(image_path) as img:
            return img.convert("RGB")
    except (OSError, ValueError) as e:
        logger.warning(f"Skipping unreadable image {image_path}: {e}")
        return None


@tool
def image_similarity_tool(new_image_paths_json: str) -> str:
    """
    Compares new images against a local DB to find visual similarities using a CLIP model.
    """
    # Input:  new_image_paths_json - JSON list of local image file paths.
    # Output: JSON object mapping each new image path to its matches (up to
    #         three local images with cosine similarity above 0.85), or a
    #         plain-text message when there is nothing to compare or the
    #         check fails.
    print(f"✨ Image Similarity Tool - Comparing images...")
    try:
        new_image_paths = json.loads(new_image_paths_json)
    except (TypeError, ValueError) as e:
        logger.error(f"Image similarity check failed: {e}", exc_info=True)
        return f"Error during similarity check: {e}"
    if not new_image_paths:
        return "No new images provided to compare."
    try:
        model = get_clip_model()

        # Keep paths and loaded images aligned; unreadable files are dropped from both.
        db_image_paths, db_images = [], []
        for p in LOCAL_IMAGE_DB_DIR.glob('*'):
            if p.suffix.lower() not in ('.png', '.jpg', '.jpeg'):
                continue
            image = _load_rgb_image(p)
            if image is not None:
                db_image_paths.append(p)
                db_images.append(image)
        if not db_image_paths:
            return "No images found in the local database to compare against."
        db_embeddings = model.encode(db_images)

        report = {}
        for new_img_path_str in new_image_paths:
            new_img_path = Path(new_img_path_str)
            if not new_img_path.exists():
                continue
            # Scraped files may be SVGs, icons or truncated downloads; skip those.
            new_image = _load_rgb_image(new_img_path)
            if new_image is None:
                continue
            new_embedding = model.encode(new_image).reshape(1, -1)
            scores = cosine_similarity(new_embedding, db_embeddings)[0]
            # Three best candidates, highest score first.
            top_indices = scores.argsort()[-3:][::-1]
            matches = [{"local_image": str(db_image_paths[i]), "score": float(scores[i])} for i in top_indices if scores[i] > 0.85]
            if matches:
                report[str(new_img_path)] = matches
        print(f"✅ Similarity check complete. Found {len(report)} potential matches.")
        return json.dumps(report, indent=2)
    except Exception as e:
        logger.error(f"Image similarity check failed: {e}", exc_info=True)
        return f"Error during similarity check: {e}"

@tool
def final_report_generation_tool(listing_name: str, extracted_data_json: str, similarity_report_json: str) -> str:
    """Generates a final OSINT report from all gathered findings."""
    # Builds one analyst prompt from the three inputs, sends it to the shared
    # LLM as a system message and returns the reply text. Any failure is
    # logged and returned as an "Error generating report: ..." string.
    print("📋 Final Report Tool - Compiling all findings...")
    report_prompt = f"""You are an OSINT analyst. Compile a detailed intelligence report for the Airbnb listing: "{listing_name}".
Structure the report clearly with the following sections. Be factual and only use the provided data.
**Data Provided:**
- Extracted Website Data: {extracted_data_json}
- Image Similarity Analysis: {similarity_report_json}
**Report Structure:**
1. **Executive Summary:** Brief overview of key findings. Was the listing verified? Was a host identified? Are there visual matches to known images?
2. **Online Presence Analysis:** List all URLs where the listing was found.
3. **Visual Verification (Image Matching):** Detail the findings. For each new image that matched a local one, present the evidence: `New Image -> Local DB Image (Similarity Score)`. Conclude if the images from the web strongly match the local database.
4. **Host Identification:** State the name of the host found and from which website.
5. **Conclusion:** Summarize the investigation and provide a confidence level on whether the online listings are for the same entity as the one in the local database."""
    try:
        analyst_messages = [SystemMessage(content=report_prompt)]
        llm_reply = get_llm().invoke(analyst_messages)
        print("✅ Final Report Generated.")
        return llm_reply.content
    except Exception as e:
        logger.error(f"Report generation failed: {e}")
        return f"Error generating report: {e}"

# --- Graph Nodes ---

def coordinator_node(state: ListingAgentState) -> dict:
    """Pick the next pipeline step and emit the tool call that starts it.

    Steps are checked in order against ``completed_tasks``: URL search (tool
    call produced by the LLM), content extraction (no message emitted here),
    image similarity check and final report (tool calls built by hand).

    Returns:
        ``{"messages": [<AI message with one tool call>]}`` when a tool has to
        run, otherwise an empty dict.
    """
    print("\n🎯 Coordinator Agent - Deciding next step...")
    done = state.get("completed_tasks", [])

    if "url_search" not in done:
        print("📋 Task: Search for listing URLs.")
        search_llm = get_llm().bind_tools([listing_url_search_tool])
        request = f"Find all relevant online URLs for the listing: {state['listing_name']}"
        return {"messages": [search_llm.invoke(request)]}

    if "content_extraction" not in done:
        # Nothing to emit for this step.
        return {}

    if "similarity_check" not in done:
        print("📋 Task: Perform image similarity check.")
        # Flatten the saved image paths of every scraped page into one list.
        image_paths = []
        for page_data in state.get("extracted_data", {}).values():
            image_paths.extend(page_data.get("saved_images", []))
        similarity_call = {
            "name": "image_similarity_tool",
            "args": {"new_image_paths_json": json.dumps(image_paths)},
            "id": f"call_similarity_{time.time()}",
        }
        return {"messages": [AIMessage("", tool_calls=[similarity_call])]}

    if "final_report" not in done:
        print("📋 Task: Generate final report.")
        report_call = {
            "name": "final_report_generation_tool",
            "args": {
                "listing_name": state['listing_name'],
                "extracted_data_json": json.dumps(state.get("extracted_data", {})),
                "similarity_report_json": json.dumps(state.get("similarity_report", {})),
            },
            "id": f"call_report_{time.time()}",
        }
        return {"messages": [AIMessage("", tool_calls=[report_call])]}

    return {}

def tool_execution_node(state: ListingAgentState) -> dict:
    """Run the pending pipeline step and return the resulting state update.

    Two kinds of work are handled:

    * Content extraction is state-driven: once "url_search" is done and
      "content_extraction" is not, every URL in ``search_urls`` is scraped
      with ``url_content_extractor_tool``.
    * Otherwise the first tool call on the last message is executed and its
      result is stored under the matching state key.

    Returns:
        A partial state dict. It is empty when there is nothing to run. On a
        tool error only an error ``ToolMessage`` is returned and the task is
        not marked as completed.
    """
    print("🔧 Tool Execution Node - Running tools...")
    # Work on a copy so the incoming state is never mutated in place.
    completed_tasks = list(state.get("completed_tasks", []))

    if "url_search" in completed_tasks and "content_extraction" not in completed_tasks:
        urls_to_scrape = state.get("search_urls", [])
        if not urls_to_scrape:
            print("⚠️ No URLs to scrape. Skipping content extraction.")
            return {"completed_tasks": completed_tasks + ["content_extraction"]}
        extracted_data = dict(state.get("extracted_data") or {})
        for url in urls_to_scrape:
            result_json = url_content_extractor_tool.invoke({"url_to_scrape": url})
            try:
                extracted_data[url] = json.loads(result_json)
            except json.JSONDecodeError:
                # The extractor returns a plain "Error: ..." string on failure.
                extracted_data[url] = {"error": result_json}
        return {"extracted_data": extracted_data, "completed_tasks": completed_tasks + ["content_extraction"]}

    messages = state.get("messages") or []
    tool_calls = getattr(messages[-1], "tool_calls", None) if messages else None
    if not tool_calls:
        return {}

    tool_call = tool_calls[0]
    tool_name, call_id = tool_call['name'], tool_call['id']
    all_tools = {"listing_url_search_tool": listing_url_search_tool, "image_similarity_tool": image_similarity_tool, "final_report_generation_tool": final_report_generation_tool}
    if tool_name not in all_tools:
        return {"messages": [ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=call_id)]}

    try:
        result = all_tools[tool_name].invoke(tool_call['args'])
        updated_state = {"messages": [ToolMessage(content=str(result), tool_call_id=call_id)]}
        if tool_name == "listing_url_search_tool":
            # A non-JSON "Error: ..." result raises here, so the search is
            # reported as failed and stays pending.
            updated_state["search_urls"] = json.loads(result)
            completed_tasks.append("url_search")
            print("✅ url_search completed.")
        elif tool_name == "image_similarity_tool":
            # The tool answers with plain text when there is nothing to
            # compare or the check failed; record an empty report so the
            # pipeline can still reach the final report.
            try:
                report = json.loads(result)
            except (TypeError, ValueError):
                report = None
            updated_state["similarity_report"] = report if isinstance(report, dict) else {}
            completed_tasks.append("similarity_check")
            print("✅ similarity_check completed.")
        elif tool_name == "final_report_generation_tool":
            updated_state["final_report"] = result
            completed_tasks.append("final_report")
            print("🎉 Final report generated!")
        updated_state["completed_tasks"] = completed_tasks
        return updated_state
    except Exception as e:
        error_msg = f"Error executing tool {tool_name}: {e}"
        logger.error(error_msg, exc_info=True)
        return {"messages": [ToolMessage(content=error_msg, tool_call_id=call_id)]}

# --- Graph Definition & Execution ---
def should_continue(state: ListingAgentState) -> str:
    """Choose the edge to follow after the coordinator node.

    Returns:
        "end" once the final report exists; "execute_tools" when a tool call
        is pending on the last message or when content extraction is due;
        "coordinate" otherwise.
    """
    completed = state.get("completed_tasks", [])
    if "final_report" in completed:
        return "end"
    # Content extraction is driven by state, not by a tool call: the
    # coordinator emits no message for it, so it must be routed to the tools
    # node explicitly or the graph spins on the coordinator.
    if "url_search" in completed and "content_extraction" not in completed:
        return "execute_tools"
    messages = state.get("messages") or []
    if messages and getattr(messages[-1], "tool_calls", None):
        return "execute_tools"
    return "coordinate"

# Build the two-node graph: the coordinator decides, the tools node executes.
workflow = StateGraph(ListingAgentState)
workflow.add_node("coordinator", coordinator_node)
workflow.add_node("tools", tool_execution_node)
# Every run starts at the coordinator.
workflow.set_entry_point("coordinator")
# After the coordinator, should_continue() returns a label that is mapped to
# the next node: run tools, finish, or re-enter the coordinator.
workflow.add_conditional_edges("coordinator", should_continue, {"execute_tools": "tools", "end": END, "coordinate": "coordinator"})
# Tool execution always hands control back to the coordinator.
workflow.add_edge("tools", "coordinator")
# Compiled, runnable graph used by run_listing_investigation().
app = workflow.compile()

# --- Main Application Logic ---
def run_listing_investigation(listing_name: str):
    """Run the full investigation graph for one listing and save the report.

    Seeds the local image database with two dummy images when it is empty,
    executes the compiled graph, prints the final report and writes it to
    ``BASE_OUTPUT_DIR`` as ``<listing>_osint_report.md``.

    Args:
        listing_name: Name of the listing to investigate.

    Errors raised while running the graph or writing the file are logged and
    not propagated.
    """
    print(f"\n🚀 Launching OSINT investigation for: \"{listing_name}\" 🚀")
    if not any(LOCAL_IMAGE_DB_DIR.iterdir()):
        print("Local image DB is empty. Creating dummy images for demonstration.")
        Image.new('RGB', (100, 100), color = 'red').save(LOCAL_IMAGE_DB_DIR / 'known_image_1.jpg')
        Image.new('RGB', (100, 100), color = 'green').save(LOCAL_IMAGE_DB_DIR / 'known_image_2.jpg')
    initial_state = {"messages": [HumanMessage(content=f"Start investigation for {listing_name}")], "listing_name": listing_name, "completed_tasks": []}
    try:
        # invoke() returns the final state values. Iterating stream() in its
        # default mode yields per-node update dicts instead, in which
        # "final_report" is never a top-level key.
        final_state = app.invoke(initial_state, {"recursion_limit": 15}) or {}
        print("\n\n✅ Investigation Complete! ✅")
        report = final_state.get("final_report") or "No report was generated."
        print("\n" + "="*80 + "\nFINAL OSINT REPORT\n" + "="*80 + f"\n{report}")
        # The name may come straight from an LLM reply: keep only characters
        # that are safe in a file name (no path separators, quotes, ...).
        safe_name = re.sub(r"[^\w.-]+", "_", listing_name).strip("._") or "listing"
        report_filename = f"{safe_name}_osint_report.md"
        with open(BASE_OUTPUT_DIR / report_filename, 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"\n💾 Report saved to: {BASE_OUTPUT_DIR / report_filename}")
    except Exception as e:
        logger.error(f"An error occurred during the investigation: {e}", exc_info=True)

def main_chat_loop():
    """Run the interactive console loop.

    Each user line is first sent to the LLM with a dispatch prompt. If the
    reply is a property name, an investigation is started for it; if the
    reply is 'NONE', the line is answered as ordinary chat. The loop ends on
    'exit'/'quit', Ctrl+C, or end of input.
    """
    print("💬 OSINT Chatbot is live!")
    print("You can ask me to 'run a report on Riad Green Vines' or 'investigate Riad Cherifa'. Type 'exit' to quit.")

    while True:
        try:
            user_input = input("\n👤 You: ")
            if user_input.strip().lower() in ["exit", "quit"]:
                print("👋 Exiting. Goodbye!")
                break
            if not user_input.strip():
                # Nothing to dispatch; avoid spending LLM calls on a blank line.
                continue

            decision_prompt = f"""You are a dispatch assistant. Your job is to determine if the user wants to run an investigation on a property.
If they mention words like 'investigate', 'report on', 'deep dive', 'research', or 'find info on' followed by a property name, respond with the property name ONLY.
Otherwise, respond with 'NONE'.

User request: "{user_input}" """

            response = get_llm().invoke([SystemMessage(content=decision_prompt)]).content.strip()
            # Models often wrap the answer in quotes or add a trailing period.
            property_name = response.strip("\"'` ")

            if property_name and property_name.upper().rstrip(".") != "NONE":
                run_listing_investigation(property_name)
                print("\n💬 You can now continue chatting or request another investigation.")
            else:
                # General chat response
                chat_response = get_llm().invoke([HumanMessage(content=user_input)]).content
                print(f"🤖 {chat_response}")

        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed. Without this, input() would fail on
            # every iteration and the generic handler below would loop forever.
            print("\n👋 Exiting. Goodbye!")
            break
        except Exception as e:
            print(f"❌ An unexpected error occurred: {str(e)}")
            logger.error("Error in main chat loop", exc_info=True)

# Script entry point: validate configuration first, then start the chat loop.
if __name__ == "__main__":
    try:
        get_llm() # Pre-check for API key at startup
        main_chat_loop()
    except ValueError as e:
        # get_llm() raises ValueError when GROQ_API_KEY is missing or a placeholder.
        print(f"🚨 Configuration Error: {e}")
    except Exception as e:
        # Any other failure escaping get_llm() or main_chat_loop().
        print(f"🚨 A critical error occurred on startup: {e}")