In [1]:
import requests
import json
# from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from anthropic._exceptions import OverloadedError
import os
from pydantic import BaseModel, Field, model_validator
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
import os
from bs4 import BeautifulSoup
import re
from bs4 import BeautifulSoup
import re
from pydantic import Field
# from __future__ import print_function
# from IPython.display import display
from typing import Optional
from utils import Country, ProviderType
import anthropic
import time
from dotenv import load_dotenv
# import traceback

# llm_model = os.environ["LLM_VERSION"]

# HTML Slicing Pipeline Models and Functions
class Slice(BaseModel):
    start: int = Field(..., ge=0)
    end: int = Field(..., ge=0)

class SliceSet(BaseModel):
    slices: List[Slice] = Field(default_factory=list)

# --- Simple, robust JSON extraction and slicing ---

# Logging utilities
def _resolve_base_dir() -> str:
    candidates = [os.getcwd(), os.path.dirname(os.getcwd())]
    for c in candidates:
        # Prefer a direct prompts folder in the current dir
        if os.path.isdir(os.path.join(c, "prompts")):
            return c
    return os.getcwd()

BASE_DIR = _resolve_base_dir()
LOG_DIR = os.path.join(BASE_DIR, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

def _write_log(filename: str, content: str) -> str:
    """Write log content to file, overwriting on each run"""
    path = os.path.join(LOG_DIR, filename)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    return path

def _append_log(filename: str, content: str) -> str:
    """Append log content to file"""
    path = os.path.join(LOG_DIR, filename)
    with open(path, "a", encoding="utf-8") as f:
        f.write(content + "\n")
    return path

def join_slices(html_content: str, slice_set: SliceSet) -> str:
    """Extract slices from HTML content based on SliceSet"""
    if not slice_set.slices:
        return ""
    lines = html_content.split('\n')
    chunks: List[str] = []
    for sl in slice_set.slices:
        a = max(0, sl.start)
        b = min(len(lines) - 1, sl.end)
        if a <= b:
            chunks.append('\n'.join(lines[a:b+1]))
    return '\n\n'.join(chunks)

def get_relevant_html(html_content: str) -> str:
    """Generate relevant HTML by chunking large input, calling the slice model per chunk, and joining slices."""
    model = "claude-3-haiku-20240307"
    max_html_tokens = 9000
    # Approximate max characters per chunk for the model
    max_chars = max_html_tokens * 4
    final_html_parts: List[str] = []

    if not html_content:
        return ""

    content_len = len(html_content)
    start_index = 0

    while start_index < content_len:
        end_index = min(start_index + max_chars, content_len)
        # Prefer to split on a newline near the end of the window to keep line indices stable
        if end_index < content_len:
            cutoff = html_content.rfind('\n', start_index, end_index)
            if cutoff != -1 and cutoff >= start_index + int(max_chars * 0.9):
                end_index = cutoff

        partial_html_content = html_content[start_index:end_index]

        # Number lines for the model to reference when proposing slices
        lines = partial_html_content.splitlines()
        marked = [f"⟦L{i:03}⟧ {line}" for i, line in enumerate(lines)]
        marked_html = "\n".join(marked)

        t1 = time.time()
        prompt_template = ChatPromptTemplate.from_messages([
            ("system", load_prompt("generate_slices")),
            ("human", "Generate a list of slice pairs indicating the relevant lines of the HTML content.")
        ])
        result = call_llm(
            model=model,
            prompt_template=prompt_template,
            input_dict={"html_content": marked_html},
            structured_output=SliceSet,
            temperature=0,
            prompt_name=f"generate_slices_{start_index}-{end_index}",
        )
        dt = time.time() - t1
        print(f"Slice LLM took {dt:.2f}s. Produced {len(result.slices)} slices.")

        # Accumulate slices for this chunk
        final_html_parts.append(join_slices(partial_html_content, result))

        # Advance safely to the next chunk; skip the split newline if present
        if end_index < content_len and html_content[end_index:end_index+1] == "\n":
            start_index = end_index + 1
        else:
            start_index = end_index

        # Safety guard to avoid infinite loops
        if start_index <= 0:
            break

    final_html = "\n\n".join([part for part in final_html_parts if part])
    return final_html

# search_starters = ["Digital Marketing Firm", "Tax Consultancy Firm", "Cybersecurity Firm", "Legal Services Provider", "Healthcare Consultancy"]

# search_starters = ["Company formation service UAE", "Business formation service MENA", "Business registration services Gulf", "Business setup services Dubai", "Investment migration services UAE", "Second passport services", "Citizenship By investment", "economic passport services", "Golden Visa services", "Tax services UAE", "Tax Consultancy Firm Dubai", "Tax Calculation Services Dubai"]

class ResultRelevance(BaseModel):
    explanation: str = Field(description="Be extremely brief and concise with your explanation.")
    link: str

class RelevanceCheckOutput(BaseModel):
    most_relevant_result: Optional[ResultRelevance]

class SqlCommand(BaseModel):
    command: str


class ServiceProviderMemberDetails(BaseModel):
    name: str = Field(description="Very important. Find the name of the staff member from the CLEAN HTML content.")
    role_description: str = Field(description="Find the role description of the staff member from the CLEAN HTML content.")
    telephone: str = Field(description="Find the telephone number of the staff member from the CLEAN HTML content.")
    mobile: str = Field(description="Find the mobile number of the staff member from the CLEAN HTML content.")
    email: str = Field(description="Find the email address of the staff member from the CLEAN HTML content.")
    linkedin: Optional[str]
    facebook: Optional[str]
    instagram: Optional[str]
    twitter: Optional[str]
    additional_info: Optional[str]

class ServiceProviderOutput(BaseModel):
    service_title: Optional[str] = Field(description="Produce a suitable title for the service.")
    service_description: Optional[str] = Field(description="Do your due diligence on summarising the service description based on a holistic consideration of the scattered relevant service info throughout the entirety of the given CLEAN HTML content. Use a string, not a number. Null if unavailable.")
    rating_score: Optional[float] = Field(description="Must adhere to a rating standard out of 5. Must be a numeric rating score of type float. Use a number, not a string. Null if unavailable.")
    rating_description: Optional[str] = Field(description="Do your due diligence on summarising the rating description based on a holistic consideration of the scattered relevant rating info throughout the entirety of the given CLEAN HTML content. Use a string, not a number. Null if unavailable.")
    pricing: Optional[str]
    service_tags: List[str] = Field(description="Do your due diligence on summarising the service tags based on a holistic consideration of the scattered relevant service info throughout the entirety of the given CLEAN HTML content. Null if unavailable.")
    provider_type: ProviderType = Field("Company", description="ProviderType enum. Should be Company if the provider is a company or Agent if the provider is an agent.")
    # provider_type: ProviderType = Field(description="ProviderType enum. Should be Company if the provider is a company or Agent if the provider is an agent.")
    country: Optional[Country] = Field(description="Country enum.")
    name: Optional[str] = Field(description="Very important. Find the name of the company from the CLEAN HTML content.")
    vision: Optional[str] = Field(description="A brief summary of the company/firm's vision statement from the CLEAN HTML content.")
    logo: Optional[str] = Field(description="Mandatory. Find the absolute URL for the company logo from the CLEAN HTML content.")
    website: Optional[str] = Field(description="Very important. Must find the website link of the company from the HTML content or infer it from the base website link address. Null if unavailable.")
    linkedin: Optional[str] = Field(description="Null if unavailable.")
    facebook: Optional[str] = Field(description="Null if unavailable.")
    instagram: Optional[str] = Field(description="Null if unavailable.")
    telephones: List[str] = Field([], description="Mandatory. Must find at least one telephone number. Field must be a list of telephone numbers or empty list.")
    mobiles: List[str] = Field([], description="Mandatory. Must find at least one mobile number. Field must be a list of mobile numbers or empty list.")
    emails: List[str] = Field(
        [], description="Mandatory. Must find at least one email address of the company from the HTML content. Field must be a list of email addresses or empty list."
    )
    # emails: List[str] = Field(description="Very very important. Must find at least one email address of the company from the HTML content. Output must be a list of email addresses.")
    office_locations: Optional[str] = Field(description="Detailed address locations of the company offices eg. 123 Main St, City, Country, or description if address is not available. Null if unavailable.")
    key_individuals: Optional[str] = Field(description="Names and role descriptions of key people of the company. Null if unavailable.")
    representatives: Optional[str] = Field(description="Names and roles and contact details of representatives of the company. Null if unavailable.")
    # service_provider_member_details: Optional[List[ServiceProviderMemberDetails]]

class SearchTermOutput(BaseModel):
    search_term: str

class AboutUsOutput(BaseModel):
    about_us_link: Optional[List[str]]

def call_llm(
    model = "claude-3-5-sonnet-20241022",
    API_key = None,
    prompt_template = None,
    input_dict = None,
    structured_output = None,
    temperature = 0,
    prompt_name = None,
):
    if API_key is None:
        API_key = os.getenv("ANTHROPIC_API_KEY")
    if not API_key:
        raise RuntimeError("Missing ANTHROPIC_API_KEY. Set it or pass API_key.")
    if prompt_template is None:
        raise ValueError("Prompt template is required")
    if input_dict is None:
        raise ValueError("Input dictionary is required")
    if structured_output is None:
        raise ValueError("Structured output is required")
    
    # Log Anthropic prompt
    if prompt_name:
        try:
            # Format the prompt to see what will be sent
            formatted_messages = prompt_template.invoke(input_dict)
            _write_log(f"anthropic_{prompt_name}_prompt.txt", 
                f"MODEL: {model}\nTEMPERATURE: {temperature}\n--- FORMATTED PROMPT ---\n{formatted_messages}")
        except Exception as e:
            print(f"Error logging Anthropic prompt: {e}")
    
    llm = ChatAnthropic(
        model=model,
        anthropic_api_key=API_key, temperature=temperature).with_structured_output(structured_output)
    llm_chain = prompt_template | llm
    llm_chain = llm_chain.with_retry(
        retry_if_exception_type=(OverloadedError,),
        wait_exponential_jitter=True,    
        stop_after_attempt=6
    )
    
    t0 = time.time()
    result = llm_chain.invoke(input_dict)
    dt = time.time() - t0
    
    # Log Anthropic response
    if prompt_name:
        try:
            response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)
            _append_log(f"anthropic_{prompt_name}_prompt.txt", 
                       f"\n--- ANTHROPIC RESPONSE ({dt:.2f}s) ---\n{response_content}")
        except Exception as e:
            _append_log(f"anthropic_{prompt_name}_prompt.txt", 
                       f"\n--- ANTHROPIC RESPONSE ({dt:.2f}s) ---\n{str(result)}")
    
    return result

def search_serper(search_query):
    url = "https://google.serper.dev/search"
    
    payload = json.dumps({
        "q": search_query,
        "gl": "ae", 
        "num": 10,
        # "tbs": "qdr:w"
    })

    headers = {
        'X-API-KEY': os.environ["SERPER_API_KEY"],
        'Content-Type': 'application/json'
    }

    response = requests.request("POST", url, headers=headers, data=payload, timeout=(3,4))
    results = json.loads(response.text)
    results_list = results['organic']

    all_results = []
    for id, result in enumerate(results_list, 1):
        result_dict = {
            'title': result['title'],
            'link': result['link'],
            'snippet': result['snippet'],
            'search_term': search_query,
            'id': id
        }
        all_results.append(result_dict)
    return all_results


def load_prompt(prompt_name):
    with open(f"prompts/{prompt_name}.md", "r") as f:
        return f.read()


def check_search_relevance(search_term: str, search_results: Dict[str, Any]) -> RelevanceCheckOutput:
    prompt = load_prompt("relevance_check_one_link")
    
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", prompt),
        ("human", f"Check relevance of search results to the given search term (but most importantly make sure it is the actual company's official website, not a search facilitator website or other service of that sort): {search_term}")
    ])
    
    return call_llm(model = "claude-3-haiku-20240307", prompt_template = prompt_template, input_dict = {"search_term": search_term, 'search_results': search_results}, structured_output = RelevanceCheckOutput, temperature=0, prompt_name="relevance_check_one_link")


def convert_html_to_markdown(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Headers
    for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        level = int(h.name[1])
        h.replace_with('#' * level + ' ' + h.get_text() + '\n\n')
    
    # Links
    for a in soup.find_all('a'):
        href = a.get('href', '')
        text = a.get_text()
        if href and text:
            a.replace_with(f'[{text}]({href})')
    
    # Bold
    for b in soup.find_all(['b', 'strong']):
        b.replace_with(f'**{b.get_text()}**')
    
    # Italic
    for i in soup.find_all(['i', 'em']):
        i.replace_with(f'*{i.get_text()}*')
    
    # Lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.get_text()}\n')
    
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), 1):
            li.replace_with(f'{i}. {li.get_text()}\n')
    
    # Get text and clean up
    text = soup.get_text()
    
    # Remove excess whitespace/newlines
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = text.strip()
    
    return text

def scrape_and_save_markdown(relevant_results):
    markdown_contents = []
    for result in relevant_results:
        if 'link' in result:
            payload = {
                "api_key": os.environ["SCRAPING_API_KEY"], 
                "url": result['link'],
                "render_js": "true"
            }

            response = requests.get("https://scraping.narf.ai/api/v1/", params=payload)
            if response.status_code == 200:
                markdown_content = convert_html_to_markdown(response.content.decode())
                
                markdown_contents.append({
                    'url': result['link'],
                    'markdown': markdown_content,
                    'title': result.get('title', ''),
                    'id': result.get('id', '')
                })
            else:
                print(f"Failed to fetch {result['link']}: Status code {response.status_code}")

    print(f"Successfully downloaded and saved {len(markdown_contents)} pages as markdown")
    return markdown_contents




from scrapper import HtmlScraper
import tiktoken
import string
import secrets

import os
import django
import sys
from asgiref.sync import sync_to_async
from urllib.parse import urlparse
from django.core.validators import validate_email, URLValidator
from django.core.exceptions import ValidationError

sys.path.insert(0, '../growbal_django')

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "growbal.settings")

django.setup()

from services.models import Service
from services.serializers import ServiceSerializer
from accounts.serializers import ServiceProviderProfileSerializer
from accounts.models import CustomUser, ServiceProviderProfile
from asgiref.sync import sync_to_async
# from django.db import transaction
from scraper.models import Scrape          # adjust import path as needed
from scraper.serializers import ScrapeSerializer
# from django.core.files import File

def validate_emails(email_list):
    valid_emails = []
    for email in email_list:
        try:
            validate_email(email)
            valid_emails.append(email)
        except ValidationError:
            error_message = f"Error validating email {email}"
            print(error_message)
            with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                log_file.write(error_message + "\n")
    return valid_emails


_url_validator = URLValidator()
def validate_url(raw_url: str | None) -> bool:
    if not raw_url:
        return False

    raw_url = raw_url.strip()
    if not urlparse(raw_url).scheme:
        raw_url = "http://" + raw_url

    try:
        _url_validator(raw_url)
        return True
    except ValidationError:
        return False


@sync_to_async
def get_or_create_user(name=None, email=None, username=None, password=None):
    user, created = CustomUser.objects.get_or_create_user(
        name=name,
        email=email,
        username=username,
        password=password
    )
    return user, created

@sync_to_async
def create_service(data):
    serializer = ServiceSerializer(data=data)
    if serializer.is_valid():
        service = serializer.save()
        return service
    else:
        error_message = f"Error validating service data {serializer.errors}"
        print(error_message)
        with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
            log_file.write(error_message + "\n")

@sync_to_async
def create_service_provider_profile(data):
    serializer = ServiceProviderProfileSerializer(data=data)
    if serializer.is_valid():
        profile = serializer.save()
        return profile
    else:
        error_message = f"Error validating service provider profile data {serializer.errors}"
        print(error_message)
        with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
            log_file.write(error_message + "\n")
        # raise ValueError(serializer.errors)

@sync_to_async
def create_scrape(data):
    serializer = ScrapeSerializer(data=data)
    if serializer.is_valid():
        scrape = serializer.save()
        return scrape
    else:
        error_message = f"Error validating scrape data {serializer.errors}"
        print(error_message)
        with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
            log_file.write(error_message + "\n")
        # raise ValueError(serializer.errors)

@sync_to_async
def check_similar_scrapes(base_url):
    return Scrape.check_similar_base_url(base_url=base_url)

def generate_password(length=8):
    characters = string.ascii_letters + string.digits + string.punctuation
    password = ''.join(secrets.choice(characters) for _ in range(length))
    return password

# def count_tokens(text, model="gpt-4"):
#     encoding = tiktoken.encoding_for_model(model)
#     tokens = encoding.encode(text)
#     return len(tokens)

anthropic_client = anthropic.Anthropic()

def count_tokens(text, model="claude-3-5-sonnet-20241022", system_prompt=""):
    messages = [{"role": "user", "content": text}]
    
    response = anthropic_client.messages.count_tokens(
        model=model,
        system=system_prompt,
        messages=messages
    )

    return response.input_tokens

scraper = HtmlScraper(scraping_api_key=os.environ["SCRAPING_API_KEY"])

async def process_relevant_links(relevant_link, row_dict):
    if await check_similar_scrapes(relevant_link.link):
        return
    else:
        print(f"Processing {relevant_link.link}")
        
        nav_html_list = []
        try:
                num_tokens_allowed = 30000
                i = 1
                clean_html_sum_links = []
                clean_html_sum = ""

                try:
                    print("----------BASE PAGE----------")
                    print(relevant_link.link)
                    nav_html_content_base = scraper.scrape_html_base(relevant_link.link)
                    
                    # Log raw scraped HTML for base page
                    if nav_html_content_base:
                        _write_log("base_raw_scraped_base_html.txt", nav_html_content_base)
                    
                    clean_text, clean_html = scraper.clean_html_content(nav_html_content_base)
                    print(f"clean_text: {count_tokens(clean_text)} tokens")
                    print(f"clean_html: {count_tokens(clean_html)} tokens")

                    sliced_html = get_relevant_html(clean_html)
                    _write_log("sliced_html.txt", sliced_html)
                    print(f"sliced_html: {count_tokens(sliced_html)} tokens")
                    
                    clean_html_sum += f"CLEAN HTML PAGE {i}: ({relevant_link.link})\n\n"
                    clean_html_sum += sliced_html + "\n\n"
                    clean_html_sum_links.append(relevant_link.link)


                    if nav_html_content_base:
                        nav_html_list.append(nav_html_content_base)
                except Exception as e:
                    error_message = f"Error processing base page: {e}"
                    print(error_message)
                    with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                        log_file.write(error_message + "\n")

                nav_links = []
                if len(nav_html_list) == 0 or not nav_html_list[0]: 
                    return
                for html_content in nav_html_list:
                    try:
                        nav_links.extend(scraper.get_nav_links(html_content))
                    except Exception as e:
                        error_message = f"Error processing nav links: {e}"
                        print(error_message)
                        with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                            log_file.write(error_message + "\n")
                print("----------NAV LINKS----------")
                print(nav_links)
                nav_links_str = "\n".join([f"{title}: {link}" for title, link, _ in nav_links])

                prompt = load_prompt("generate_about_us_link")
                prompt_template = ChatPromptTemplate.from_messages([
                    ("system", prompt),
                    ("human", f'Filter "about us" page, contact us page, services page, team page, or any similar page.')
                ])
                

                result = call_llm(prompt_template = prompt_template, input_dict = {"nav_links": nav_links_str, "base_url": relevant_link.link}, structured_output = AboutUsOutput, temperature=0, prompt_name="generate_about_us_link")
                print(len(result.about_us_link))
                print(result.about_us_link)
                print("\n")
                for i, link in enumerate(result.about_us_link):
                    if link in clean_html_sum_links:
                        continue
                    if clean_html_sum and count_tokens(clean_html_sum) >= num_tokens_allowed:
                        continue
                    try:
                        html_content = scraper.scrape_html_base(link)
                        
                        # Log raw scraped HTML for additional pages
                        if html_content:
                            _write_log("raw_scraped_additional_html.txt", html_content)
                        
                        clean_text, clean_html = scraper.clean_html_content(html_content)

                        print(f"clean_text: {count_tokens(clean_text)} tokens")
                        print(f"clean_html: {count_tokens(clean_html)} tokens")
                        
                        if clean_html: print(f"count_tokens(clean_html) {i} before approval: {count_tokens(clean_html)}")
                        else: print(f"not clean_text {i} before approval: {clean_html}")

                        sliced_html = get_relevant_html(clean_html)
                        print(f"sliced_html: {count_tokens(sliced_html)} tokens")
                        clean_html_sum += f"CLEAN HTML PAGE {i}: ({link})\n\n"
                        clean_html_sum += sliced_html + "\n\n"
                        clean_html_sum_links.append(link)
                        i += 1
                    except Exception as e:
                        error_message = f"Error processing {link}: {e}"
                        print(error_message)
                        with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                            log_file.write(error_message + "\n")
                
                # print("##########PROPRIETARY DATA RECORD OF THE ORGANIZATION##########\n")
                # print(str(row_dict))
                # with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                #     log_file.write("PROPRIETARY DATA RECORD OF THE ORGANIZATION\n\n" + str(row_dict) + "\n\n")
                # html_appendex = "PROPRIETARY DATA RECORD OF THE ORGANIZATION\n\n" + str(row_dict)
                clean_html_sum = clean_html_sum[:int((num_tokens_allowed*4)/2)]
                if clean_html_sum != "":
                    # Log the final clean HTML sum before field generation
                    _write_log("final_clean_html_sum.txt", clean_html_sum)
                    
                    prompt = load_prompt("generate_fields_with_email")
                    prompt_template = ChatPromptTemplate.from_messages([
                        ("system", prompt),
                        ("human", f"Generate fields from the given HTML content.")
                    ])
                    
                    result = call_llm(prompt_template = prompt_template, input_dict = {"html_content": clean_html_sum, "proprietary_data": str(row_dict)}, structured_output = ServiceProviderOutput, temperature=0, prompt_name="generate_fields_with_email")
                    if result.emails == None:
                        print("result.email is Null")
                        result.emails = []
                    if result.telephones == None:
                        print("result.telephones is Null")
                        result.telephones = []
                    if result.mobiles == None:
                        print("result.mobiles is Null")
                        result.mobiles = []
                    if result.logo:
                        try:
                            result.logo = scraper.save_logo(result.logo)
                        except Exception as e:
                            error_message = f"Error saving logo: {e}"
                            print(error_message)
                            with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                                log_file.write(error_message + "\n")
                            result.logo = None

                result.emails = validate_emails(result.emails)
                if hasattr(result, 'name') and result.name and result.name != "":
                    if hasattr(result, 'name') and result.name and result.name != "":
                        user = None
                        if validate_url(result.website):
                            result.website = result.website.strip()
                            if not urlparse(result.website).scheme:
                                result.website = "http://" + result.website
                            try:
                                profile = await ServiceProviderProfile.objects.aget(website=result.website)
                                if profile:
                                    log_message = f"Profile already exists for this website: {result.website}"
                                    print(log_message)
                                    with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                                        log_file.write(log_message + "\n")
                                    user = profile.user
                            except Exception as e:
                                profile = None
                                error_message = f"Error getting profile from website: {e}"
                                print(error_message)
                                with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                                    log_file.write(error_message + "\n")
                        else:
                            result.website = None
                        
                        if not user and hasattr(result, 'emails') and result.emails and result.emails[0] != "":
                            print("creating user with name and email")
                            user_email = result.emails[0]
                            user_password = generate_password(8)
                            user, created = await get_or_create_user(name=result.name, email=user_email, username=user_email, password=user_password)
                            print("created user with email and name")
                        elif not user:
                            user_password = generate_password(8)
                            user, created = await get_or_create_user(name=result.name, password=user_password)
                            print("created user with name only")                        

                        if user:
                            # if not result.emails: result.emails = []
                            try:
                                profile = await ServiceProviderProfile.objects.aget(user_id=user.id)
                            except ServiceProviderProfile.DoesNotExist:
                                profile = None
                                error_message = f"Error profile does not exist!"
                                print(error_message)
                                with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                                    log_file.write(error_message + "\n")
                            except Exception as e:
                                profile = None
                                error_message = f"Error getting profile: {e}"
                                print(error_message)
                                with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                                    log_file.write(error_message + "\n")
                            
                            if not created and profile:
                                data = {
                                    "provider_type": result.provider_type[:50] if result.provider_type else profile.provider_type,
                                    "country": result.country[:100] if result.country else profile.country,
                                    "session_status": profile.session_status,
                                    "name": result.name[:255] if result.name else profile.name,
                                    "vision": result.vision or profile.vision,
                                    "website": result.website if validate_url(result.website) else profile.website,
                                    "logo": result.logo or profile.logo,
                                    "linkedin": result.linkedin if validate_url(result.linkedin) else profile.linkedin,
                                    "facebook": result.facebook if validate_url(result.facebook) else profile.facebook,
                                    "instagram": result.instagram if validate_url(result.instagram) else profile.instagram,
                                    "telephones": [x[:30] for x in result.telephones] if result.telephones else profile.telephones,
                                    "mobiles": [x[:30] for x in result.mobiles] if result.mobiles else profile.mobiles,
                                    "emails": result.emails if result.emails else profile.emails,
                                    "office_locations": result.office_locations or profile.office_locations,
                                    "key_individuals": result.key_individuals or profile.key_individuals,
                                    "representatives": result.representatives or profile.representatives
                                }
                                saved_profile, created_profile = await ServiceProviderProfile.objects.aupdate_or_create(
                                    user_id=user.id,             # lookup part – must stay unique
                                    defaults=data            # values to update/insert
                                )   
                                print(f"Profile updated: {saved_profile.name}")
                            else:
                                data = {
                                    "user": user.username,
                                    "provider_type": result.provider_type[:50] if result.provider_type else None,
                                    "country": result.country[:100] if result.country else None,
                                    "session_status": "inactive",
                                    "name": result.name[:255] if result.name else None,
                                    "vision": result.vision,
                                    "website": result.website if validate_url(result.website) else None,
                                    "logo": result.logo,
                                    "linkedin": result.linkedin if validate_url(result.linkedin) else None,
                                    "facebook": result.facebook if validate_url(result.facebook) else None,
                                    "instagram": result.instagram if validate_url(result.instagram) else None,
                                    "telephones": [x[:30] for x in result.telephones] if result.telephones else [],
                                    "mobiles": [x[:30] for x in result.mobiles] if result.mobiles else [],
                                    "emails": result.emails,
                                    "office_locations": result.office_locations,
                                    "key_individuals": result.key_individuals,
                                    "representatives": result.representatives
                                }
                                saved_profile = await create_service_provider_profile(data)
                                print(f"Profile created: {saved_profile.name}")

                            service_data_json = {
                                "profile": saved_profile.id,
                                "service_title": result.service_title[:255] if result.service_title else None,
                                "service_description": result.service_description,
                                "service_tags": result.service_tags,
                                "rating_score": result.rating_score,
                                "rating_description": result.rating_description,
                                "pricing": result.pricing
                            }
                            # Direct async call (works in Jupyter/IPython):
                            saved_service = await create_service(service_data_json)
                            print(f"Service created: {saved_service.service_title}")
                            scrape_data = {
                                "base_url": relevant_link.link,
                                "provider": saved_profile.id,
                                "service": saved_service.id,
                                "cleaned_html": clean_html_sum
                            }
                            saved_scrape = await create_scrape(scrape_data)
                            print(f"Scrape created: {saved_scrape.base_url}")
        except Exception as e:
            # raise ValueError(e)
            error_message = f"Error processing {relevant_link.link}: {e}"
            print(error_message)
            with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                log_file.write(error_message + "\n")
                # traceback.print_exc(file=log_file)

import argparse
import asyncio
import pandas as pd

async def main(env_path: str = "../envs/1.env") -> None:
    print(load_dotenv(env_path))
    print(f"Loaded envs from {env_path}")

    with open(os.environ["LOG_FILE_PATH"], "w") as log_file:
        log_file.write("")

    missed_df = pd.DataFrame()
    
    # Create or overwrite the missed entries file with an empty CSV
    missed_df_path = os.environ["MISSED_ENTRIES_PATH"]
    missed_df.to_csv(missed_df_path, index=False)
    
    df = pd.read_csv(os.environ["EMAIL_LIST_PATH"])
    df = df[['ORGANIZATION','INDUSTRY', 'WEBSITE', 'COUNTRY', 'CITY', 'FIRSTNAME', 'LASTNAME', 'EMAIL', 'LINKEDIN', 'DESIGNATION']]
    df['ORGANIZATION_COPY'] = df['ORGANIZATION']
    df = df.set_index('ORGANIZATION_COPY')
    df = df.dropna(axis=1, how='all')
    grouped = (
        df.groupby(df.index)
        .agg(lambda x: x.dropna().iloc[0] if x.notna().any() else None)
    )
    i = 1
    successful_entries = 0
    for organization, group in grouped.iterrows():
        if organization in ["ATN MAC", "ATH Consultants", "ATC Emirates (Ashraf AbdelGhany Tax Consultants)", "	ARA Associates Accountants & Consultants", "AMA Accounting", "	AKA Management Consultancy", "A & M Alansari Auditing"]:
            continue
        log_message = f"Processing Entry {i}/{len(grouped)} Organization: {organization}"
        print(log_message)
        with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
            log_file.write(log_message + "\n")
        row_dict = group.to_dict()
        prompt = load_prompt("generate_a_search_term")
        prompt_template = ChatPromptTemplate.from_messages([
            ("system", prompt),
            ("human", f"Generate only one suitable google search term to fetch the official website of the company/service provider/orginization: {row_dict['ORGANIZATION']}")
        ])

        result = call_llm(prompt_template = prompt_template, input_dict = {"row_dict": row_dict}, structured_output = SearchTermOutput, temperature=0, prompt_name="generate_a_search_term")

        try:
            print(f"Processing search term: {result.search_term}")
            links = search_serper(result.search_term)
            # print(f"######### Found {len(links)} links ###########")
            # print(links)
            
            # Log search results
            _write_log("search_results.txt", json.dumps(links, ensure_ascii=False, indent=2))
            
            most_relevant_result = check_search_relevance(result.search_term, links).most_relevant_result
            if most_relevant_result:
                print(most_relevant_result)
                await process_relevant_links(most_relevant_result, row_dict)
                successful_entries += 1
                log_message = f"Successful processed entries {successful_entries}/{len(grouped)} Organization: {organization}"
                print(log_message)
                with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                    log_file.write(log_message + "\n")
            else:
                missed_df = pd.concat([missed_df, group.to_frame().T], ignore_index=True)
                missed_df_path = os.environ["MISSED_ENTRIES_PATH"]
                missed_df.to_csv(missed_df_path, index=False)
                log_message = f"No relevant link found for search term: {result.search_term}"
                print(log_message)
                with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                    log_file.write(log_message + "\n")
        except Exception as e:
            missed_df = pd.concat([missed_df, group.to_frame().T], ignore_index=True)
            missed_df_path = os.environ["MISSED_ENTRIES_PATH"]
            missed_df.to_csv(missed_df_path, index=False)
            error_message = f"Error processing Entry {i}/{len(grouped)} Organization: {organization}: {e}"
            print(error_message)
            with open(os.environ["LOG_FILE_PATH"], "a") as log_file:
                log_file.write(error_message + "\n")
        i += 1

In [2]:
await main()

True
Loaded envs from ../envs/1.env
Processing Entry 1/632 Organization: A & A Financial Solutions


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: A & A Financial Solutions Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The most relevant search result appears to be the first result, 'AA Financials: Reliable Financial Solutions in Dubai'. This link directly points to the official website of the company 'AA Financials' which provides financial solutions in Dubai, matching the search term. The title and snippet indicate this is the official company website, not a search facilitator or other service." link='https://aafinancials.ae/'
Processing https://aafinancials.ae/
----------BASE PAGE----------
https://aafinancials.ae/
clean_text: 979 tokens
clean_html: 168597 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 10.98s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 10.46s. Produced 2 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.16s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 18.39s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 4.24s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 8.24s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 4.23s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.85s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 8.98s. Produced 6 slices.
sliced_html: 200881 tokens
----------NAV LINKS----------
[('Who We Are', 'https://aafinancials.ae/about-us/', 'about'), ('Our Services', 'https://aafinancials.ae/our-services/', 'services'), ('Corporate Tax Services', 'https://aafinancials.ae/corporate-tax-consultant-dubai/', 'services'), ('Audit Services', 'https://aafinancials.ae/audit-services-dubai/', 'services'), ('Contact', 'https://aafinancials.ae/contact/', 'contact'), ('Services', 'https://www.linkedin.com/company/aa-financial-solutions', 'services'), ('Our Services', 'https://aafinancials.ae/our-service/', 'services'), ('Team Member', 'https://aafinancials.ae/team-details/', 'team')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


6
['https://aafinancials.ae/about-us/', 'https://aafinancials.ae/contact/', 'https://aafinancials.ae/our-services/', 'https://aafinancials.ae/corporate-tax-consultant-dubai/', 'https://aafinancials.ae/audit-services-dubai/', 'https://aafinancials.ae/team-details/']




/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/Color_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: AA Financials
Service created: Financial and Audit Services in Dubai
Scrape created: https://aafinancials.ae/
Successful processed entries 1/632 Organization: A & A Financial Solutions
Processing Entry 2/632 Organization: A G L FINANCIAL CONSULTANCIES EST.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: AGL Financial Consultancies Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation='The most relevant search result appears to be the first result, which is the official website of AGL Financial Consultancies at https://www.aglaccounts.ae/. The title and snippet indicate that this is the official website of the tax consultancy firm AGL Financial Consultancies located in Dubai.' link='https://www.aglaccounts.ae/'
Processing https://www.aglaccounts.ae/
----------BASE PAGE----------
https://www.aglaccounts.ae/
clean_text: 895 tokens
clean_html: 33218 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.49s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 28.28s. Produced 3 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 18.50s. Produced 6 slices.
sliced_html: 9162 tokens
----------NAV LINKS----------
[('services', 'https://www.aglaccounts.ae/services/', 'services'), ('About', 'https://www.aglaccounts.ae/about/', 'about'), ('VAT & Tax Services in UAE', 'https://www.aglaccounts.ae/vat-tax-services-in-uae/', 'services'), ('Accounting Services in UAE', 'https://www.aglaccounts.ae/accounting-services-in-uae/', 'services'), ('Contact', 'https://www.aglaccounts.ae/contact/', 'contact')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


5
['https://www.aglaccounts.ae/about/', 'https://www.aglaccounts.ae/contact/', 'https://www.aglaccounts.ae/services/', 'https://www.aglaccounts.ae/vat-tax-services-in-uae/', 'https://www.aglaccounts.ae/accounting-services-in-uae/']


clean_text: 797 tokens
clean_html: 32231 tokens
count_tokens(clean_html) 0 before approval: 32231


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.87s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 8.17s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.58s. Produced 6 slices.
sliced_html: 10271 tokens
clean_text: 317 tokens
clean_html: 14556 tokens
count_tokens(clean_html) 1 before approval: 14556


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 19.26s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 1.78s. Produced 6 slices.
sliced_html: 11435 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/Untitled-design-30_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: AGL Financial Consultancies
Service created: Tax and Accounting Services in UAE
Scrape created: https://www.aglaccounts.ae/
Successful processed entries 2/632 Organization: A G L FINANCIAL CONSULTANCIES EST.
Processing Entry 3/632 Organization: A K A Management Consultancy L.L.C


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: AKA Management Consultancy Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The first search result, 'AKA: Business Consultants in Dubai | VAT Experts in Dubai', appears to be the official website of AKA Management Consultancy in Dubai. The title and snippet indicate that this is the company's website, which provides consulting services related to the search term." link='https://aka.ae/'
Successful processed entries 3/632 Organization: A K A Management Consultancy L.L.C
Processing Entry 4/632 Organization: A L I F ACCOUNTING AND TAX CONSULTANTS L.L.C


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ALIF Accounting Tax Consultants Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The first search result, 'ALIF Accounting & Tax Consultants', appears to be the official website of the accounting and tax consulting firm based in Dubai. The title and snippet indicate that this is the company's own website, not a search facilitator or other service. This result is the most relevant to the given search term." link='https://alifconsulting.ae/'
Processing https://alifconsulting.ae/
----------BASE PAGE----------
https://alifconsulting.ae/
clean_text: 2418 tokens
clean_html: 98170 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 6.79s. Produced 4 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 18.31s. Produced 5 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.89s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 10.93s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.35s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 7.08s. Produced 4 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 41.86s. Produced 6 slices.
sliced_html: 51088 tokens
----------NAV LINKS----------
[('Services', 'https://www.cookieyes.com/product/cookie-consent', 'services'), ('Our Services', 'https://alifconsulting.ae/service/', 'services'), ('Economic Substance Regulation (ESR) Consulting Services', 'https://alifconsulting.ae/tax-consultancy-planning/economic-substance-regulation-consulting-services/', 'services'), ('Financial Advisory Services', 'https://alifconsulting.ae/business-advisory/financial-advisory-services/', 'services'), ('Banking Services', 'https://alifconsulting.ae/business-advisory/banking-services/', 'services'), ('AML Compliance Services', 'https://alifconsulting.ae/business-advisory/aml-compliance-services/', 'services'), ('About Us', 'https://alifconsulting.ae/about-us/', 'about'), ('Contact', 'https://alifconsulting.ae/contact/', 'contact'), ('Career', 'https://alifconsulting.ae/career/', 'careers'), ('View All Services', 'https://alifconsulting.ae/services/',

/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


4
['https://alifconsulting.ae/about-us/', 'https://alifconsulting.ae/contact/', 'https://alifconsulting.ae/service/', 'https://alifconsulting.ae/services/']




/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/Alif-logo-original-colour-1_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: ALIF Accounting & Tax Consultants
Service created: Accounting, Tax and Business Advisory Services
Scrape created: https://alifconsulting.ae/
Successful processed entries 4/632 Organization: A L I F ACCOUNTING AND TAX CONSULTANTS L.L.C
Processing Entry 5/632 Organization: A M A ACCOUNTING & BOOKKEEPING


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: AMA Accounting Bookkeeping Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The most relevant search result appears to be the first result, which is the official website of AMA Accounting, a company that provides accounting and bookkeeping services in Dubai. The title, link, and snippet all indicate that this is the company's official website and it is directly relevant to the search term." link='https://amaaccounting.ae/'
Successful processed entries 5/632 Organization: A M A ACCOUNTING & BOOKKEEPING
Processing Entry 6/632 Organization: A M J TAX CONSULTANCY AND BOOKS KEEPING - L.L.C -  ...


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: AMJ Tax Consultancy Bookkeeping LLC Abu Dhabi


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The most relevant search result appears to be the first result, 'Financial Services Company Dubai & India | Audit Firm - AMJ'. This link directly points to the official website of AMJ Consultancy, which provides tax consultancy and bookkeeping services in Abu Dhabi. The title and snippet indicate that this is the company's official website, not a search facilitator or other service." link='https://amjconsultancy.ae/'
Processing https://amjconsultancy.ae/
----------BASE PAGE----------
https://amjconsultancy.ae/
clean_text: 5601 tokens
clean_html: 90693 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 8.08s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 34.05s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 19.51s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.99s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.48s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.69s. Produced 1 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 6.88s. Produced 6 slices.
sliced_html: 206468 tokens
----------NAV LINKS----------
[('About', 'https://amjconsultancy.ae/about/', 'about'), ('Accounting Services', 'https://amjconsultancy.ae/accounting-services/', 'services'), ('Bookkeeping Services', 'https://amjconsultancy.ae/bookkeeping-services/', 'services'), ('Compliance Services', 'https://amjconsultancy.ae/compliance-services/', 'services'), ('Contact Us', 'https://amjconsultancy.ae/contact/', 'contact'), ('Read more', 'https://amjconsultancy.ae/services#accounting', 'services'), ('Read more', 'https://amjconsultancy.ae/services#book', 'services'), ('Read more', 'https://amjconsultancy.ae/services#compliance', 'services'), ('Read more', 'https://amjconsultancy.ae/services#registration', 'services'), ('Services', 'https://www.instagram.com/amjconsultancy.services/%20', 'services'), ('Services', 'https://amjconsultancy.ae/services/', 'services'), ('Accounting', 'https://amjconsultancy.ae/services/#accounting', 'ser

/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


6
['https://amjconsultancy.ae/about/', 'https://amjconsultancy.ae/contact/', 'https://amjconsultancy.ae/services/', 'https://amjconsultancy.ae/accounting-services/', 'https://amjconsultancy.ae/bookkeeping-services/', 'https://amjconsultancy.ae/compliance-services/']




/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/final-logo-1_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: AMJ Consultancy
Service created: Professional Accounting & Financial Services
Scrape created: https://amjconsultancy.ae/
Successful processed entries 6/632 Organization: A M J TAX CONSULTANCY AND BOOKS KEEPING - L.L.C -  ...
Processing Entry 7/632 Organization: A R A Associates Accountants And Consultants


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ARA Associates Accountants Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The second search result, 'ARA Associates Accountants & Consultants | Dubai', appears to be the most relevant. The title and snippet indicate that this is the official website of the ARA Associates accounting firm in Dubai, which directly matches the search term." link='https://www.facebook.com/aradubai/'
Successful processed entries 7/632 Organization: A R A Associates Accountants And Consultants
Processing Entry 8/632 Organization: A T C TAX CONSULTANTS


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ATC Tax Consultants Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The third search result, 'ATC – Advisors Tax Consultancy', appears to be the official website of ATC Tax Consultants in Dubai. The title and snippet indicate that this is the company's website, which provides tax consultancy services in the UAE." link='https://atc-uae.com/'
Processing https://atc-uae.com/
----------BASE PAGE----------
https://atc-uae.com/
clean_text: 849 tokens
clean_html: 36443 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 8.95s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.76s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 10.59s. Produced 6 slices.
sliced_html: 10735 tokens
----------NAV LINKS----------
[('About', 'https://atc-uae.com/about/', 'about'), ('Services', 'https://atc-uae.com/offer/', 'services'), ('Contact', 'https://atc-uae.com/contact/', 'contact')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


3
['https://atc-uae.com/about/', 'https://atc-uae.com/contact/', 'https://atc-uae.com/offer/']


clean_text: 421 tokens
clean_html: 5531 tokens
count_tokens(clean_html) 0 before approval: 5531


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.94s. Produced 6 slices.
sliced_html: 1686 tokens
clean_text: 303 tokens
clean_html: 5737 tokens
count_tokens(clean_html) 1 before approval: 5737


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.75s. Produced 6 slices.
sliced_html: 2303 tokens
clean_text: 877 tokens
clean_html: 7487 tokens
count_tokens(clean_html) 2 before approval: 7487


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.31s. Produced 12 slices.
sliced_html: 4640 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/ACT-6666_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: ATC - Advisors Tax Consultancy
Service created: Tax and Accounting Consultancy Services
Scrape created: https://atc-uae.com/
Successful processed entries 8/632 Organization: A T C TAX CONSULTANTS
Processing Entry 9/632 Organization: A T H Accounting & Bookkeeping


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ATH Accounting Bookkeeping Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The second search result, 'ATH Consultants | Audit and assurance, ERP, tax services and ...', appears to be the most relevant. The title and snippet indicate that this is the official website of ATH Consultants, an accounting and business services firm based in Dubai. This aligns well with the search term 'ATH Accounting Bookkeeping Dubai'." link='https://www.athconsultants.com/'
Successful processed entries 9/632 Organization: A T H Accounting & Bookkeeping
Processing Entry 10/632 Organization: A T N-MAC ACCOUNTING & BOOK KEEPING


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ATN-MAC Accounting Bookkeeping Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The first search result, 'ATNMAC', appears to be the official website of the ATN-MAC accounting and bookkeeping firm in Dubai. The title, link, and snippet all indicate that this is the company's official website, which is directly relevant to the search term." link='https://atn-mac.com/'
Successful processed entries 10/632 Organization: A T N-MAC ACCOUNTING & BOOK KEEPING
Processing Entry 11/632 Organization: A T T consultancy LLC


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ATT consultancy LLC Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The first search result, 'ATT Consultancy: Home', appears to be the official website of the ATT Consultancy LLC company based in Dubai. The title and snippet indicate that this is the company's main website, which is highly relevant to the search term." link='https://attconsultancy.com/'
Processing https://attconsultancy.com/
----------BASE PAGE----------
https://attconsultancy.com/
clean_text: 465 tokens
clean_html: 22570 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.90s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.26s. Produced 6 slices.
sliced_html: 5773 tokens
----------NAV LINKS----------
[('About', 'https://attconsultancy.com/about-us/', 'about'), ('Services', 'https://attconsultancy.com/services/', 'services'), ('Contact', 'https://attconsultancy.com/contact-us/', 'contact'), ('Services', 'https://attconsultancy.com/services/corporate-tax-for-free-zone-companies/', 'services'), ('Services', 'https://attconsultancy.com/services/corporate-tax/', 'services'), ('Services', 'https://attconsultancy.com/services/advisory-services/', 'services'), ('Services', 'https://attconsultancy.com/services/auditing/', 'services'), ('Services', 'https://attconsultancy.com/services/value-added-tax/', 'services')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


8
['https://attconsultancy.com/about-us/', 'https://attconsultancy.com/contact-us/', 'https://attconsultancy.com/services/', 'https://attconsultancy.com/services/corporate-tax-for-free-zone-companies/', 'https://attconsultancy.com/services/corporate-tax/', 'https://attconsultancy.com/services/advisory-services/', 'https://attconsultancy.com/services/auditing/', 'https://attconsultancy.com/services/value-added-tax/']


clean_text: 440 tokens
clean_html: 11301 tokens
count_tokens(clean_html) 0 before approval: 11301


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.88s. Produced 6 slices.
sliced_html: 957 tokens
clean_text: 214 tokens
clean_html: 12134 tokens
count_tokens(clean_html) 1 before approval: 12134


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.03s. Produced 6 slices.
sliced_html: 2948 tokens
clean_text: 253 tokens
clean_html: 11921 tokens
count_tokens(clean_html) 2 before approval: 11921


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.75s. Produced 6 slices.
sliced_html: 2041 tokens
clean_text: 264 tokens
clean_html: 10488 tokens
count_tokens(clean_html) 3 before approval: 10488


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.82s. Produced 7 slices.
sliced_html: 3506 tokens
clean_text: 208 tokens
clean_html: 10374 tokens
count_tokens(clean_html) 4 before approval: 10374


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.00s. Produced 6 slices.
sliced_html: 2512 tokens
clean_text: 176 tokens
clean_html: 10339 tokens
count_tokens(clean_html) 5 before approval: 10339


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.74s. Produced 6 slices.
sliced_html: 1345 tokens
clean_text: 173 tokens
clean_html: 10338 tokens
count_tokens(clean_html) 6 before approval: 10338


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.49s. Produced 8 slices.
sliced_html: 3279 tokens
clean_text: 233 tokens
clean_html: 10406 tokens
count_tokens(clean_html) 7 before approval: 10406


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.37s. Produced 5 slices.
sliced_html: 2390 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/background-removed_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: Association of Taxation Technicians Consultancy
Service created: Tax Consultancy and Compliance Services
Scrape created: https://attconsultancy.com/
Successful processed entries 11/632 Organization: A T T consultancy LLC
Processing Entry 12/632 Organization: A T W A ACCOUNTING AND TAXATION L.L.C


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ATWA Accounting and Taxation LLC Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The second search result, 'About us' on the website atwa.ae, appears to be the most relevant. The title and link indicate that this is the official website of ATWA Accounting and Taxation LLC in Dubai, which directly matches the search term." link='https://atwa.ae/about-us/'
Processing https://atwa.ae/about-us/
----------BASE PAGE----------
https://atwa.ae/about-us/
Error processing base page: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'messages.0: all messages must have non-empty content except for the optional final assistant message'}, 'request_id': 'req_011CTHdujVJZsTDceoMG84CG'}
Successful processed entries 12/632 Organization: A T W A ACCOUNTING AND TAXATION L.L.C
Processing Entry 13/632 Organization: A&S TAX CONSULTANT (FZE)


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: A&S Tax Consultant FZE Sharjah


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The first search result appears to be the most relevant, as it directly mentions 'A&S TAX CONSULTANT (FZE)' which seems to be the company name and location specified in the search term. The title and snippet indicate this is likely the official website or listing for this tax consultant firm in Sharjah." link='https://naqood.ae/accountants/sharjah'
Processing https://naqood.ae/accountants/sharjah
----------BASE PAGE----------
https://naqood.ae/accountants/sharjah
clean_text: 1542 tokens
clean_html: 34362 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.95s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 10.63s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 1.92s. Produced 4 slices.
sliced_html: 62470 tokens
----------NAV LINKS----------
[('Product', '../Product-overview', 'services'), ('AB CAPITAL SERVICES  FZESharjahabcapital.aeinfo@abcapital.ae971523655193', './ab-capital-services-fze', 'services'), ('AL RIYADAH TAX & MANAGEMENT CONSULTANCYSharjahalriyadahae.comWaszoubiWalzoubi@alriyadahae.comWaszoubi@gmail.com', './al-riyadah-tax-management-consultancy', 'team'), ('Almulla For Accounting & Tax ServicesSharjahFTATAXAGENT@GMAIL.COM', './almulla-for-accounting-tax-services', 'services'), ('Our Story', '../about', 'about'), ('Terms of Service', '../Terms-of-Service', 'services')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


1
['https://naqood.ae/about']




/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Failed to download logo: 404 Client Error: Not Found for url: https://naqood.ae/logo.svg
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: A&S TAX CONSULTANT (FZE)
Service created: Accounting and Tax Consultancy Services
Scrape created: https://naqood.ae/accountants/sharjah
Successful processed entries 13/632 Organization: A&S TAX CONSULTANT (FZE)
Processing Entry 14/632 Organization: AB CAPITAL SERVICES  FZE


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: AB Capital Services FZE Sharjah


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The most relevant search result appears to be the first result, which is the official website of AB Capital Services FZE located in Sharjah. The title and snippet indicate that this is the company's official website, offering business setup and other services in Dubai and the UAE." link='https://abcapital.ae/'
Processing https://abcapital.ae/
----------BASE PAGE----------
https://abcapital.ae/
clean_text: 3894 tokens
clean_html: 93673 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 6.98s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.52s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 18.66s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.98s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.74s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 4.83s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.31s. Produced 6 slices.
sliced_html: 107573 tokens
----------NAV LINKS----------
[('Low Cost Business Setup in Dubai, UAE by AB Capital Services', 'https://abcapital.ae', 'services'), ('Business Banking Solutions', 'https://abcapital.ae/business-banking-solutions/', 'services'), ('About us', 'https://abcapital.ae/about-us/', 'about'), ('Services', 'https://clutch.co/profile/ab-capital-services-fze', 'services'), ('Contact', 'https://abcapital.ae/contact/', 'contact')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


4
['https://abcapital.ae/about-us/', 'https://abcapital.ae/contact/', 'https://abcapital.ae/business-banking-solutions/', 'https://clutch.co/profile/ab-capital-services-fze']




/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/abcap-logo-e1703014705681_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: AB Capital Services FZE
Service created: Business Setup and Corporate Services in Dubai
Scrape created: https://abcapital.ae/
Successful processed entries 14/632 Organization: AB CAPITAL SERVICES  FZE
Processing Entry 15/632 Organization: AB Tax and Accounting LLC


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: AB Tax and Accounting LLC Sharjah


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The second search result, 'About - Adil Badshah Tax and Accounting (ABTA)', appears to be the most relevant. The title and snippet indicate that this is the official website of the company 'AB Tax & Accounting LLC' located in Sharjah, UAE, which matches the search term." link='https://abta.ae/about/'
Processing https://abta.ae/about/
----------BASE PAGE----------
https://abta.ae/about/
clean_text: 1323 tokens
clean_html: 33191 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.60s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 11.14s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.30s. Produced 5 slices.
sliced_html: 11182 tokens
----------NAV LINKS----------
[('Book Schedule Now', 'https://abta.ae/contact/', 'contact'), ('Services', 'https://abta.ae/services/', 'services'), ('Bookkeeping', 'https://abta.ae/bookkeeping-services-uae/', 'services'), ('Audit & Assurance', 'https://abta.ae/audit-services-in-dubai/', 'services'), ('About', 'https://abta.ae/about/', 'about')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


3
['https://abta.ae/about/', 'https://abta.ae/contact/', 'https://abta.ae/services/']


clean_text: 322 tokens
clean_html: 25159 tokens
count_tokens(clean_html) 1 before approval: 25159


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 7.12s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.51s. Produced 4 slices.
sliced_html: 3774 tokens
clean_text: 1065 tokens
clean_html: 32485 tokens
count_tokens(clean_html) 2 before approval: 32485


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.23s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.64s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 9.06s. Produced 2 slices.
sliced_html: 9568 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/34wrsd43_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: AB Tax & Accounting LLC
Service created: UAE Tax & Accounting Solutions
Scrape created: https://abta.ae/about/
Successful processed entries 15/632 Organization: AB Tax and Accounting LLC
Processing Entry 16/632 Organization: ABACUS ACCOUNTING LLC


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: abacus accounting llc alain uae


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The first search result at http://www.abacusvat.com/ appears to be the official website of Abacus Accounting LLC, a company that provides accounting, bookkeeping, and tax services in the UAE. The title and snippet indicate that this is the company's official website, and it is directly relevant to the search term." link='http://www.abacusvat.com/'
Processing http://www.abacusvat.com/
----------BASE PAGE----------
http://www.abacusvat.com/
Error processing base page: 500 Server Error: Internal Server Error for url: https://scraping.narf.ai/api/v1/?api_key=AlVaU0Htv6s2osyMMZrZCBkG8X62ewQcYDyPA9B76ZXyBgFw1kXmgUx8JwLJMWpIQ1KGmWnOr1UQY3DK8s&url=http%3A%2F%2Fwww.abacusvat.com%2F&js_scenario=%7B%22steps%22%3A+%5B%7B%22wait%22%3A+1500%7D%2C+%7B%22scroll%22%3A+2500%7D%2C+%7B%22wait%22%3A+1200%7D%2C+%7B%22scroll%22%3A+2500%7D%2C+%7B%22wait%22%3A+1200%7D%2C+%7B%22scroll%22%3A+2500%7D%2C+%7B%22wait%22%3A+1200%7D%2C+%7B%22scroll%22%3A+2500%7D%2C+%7B%22wait%22%3A+1200%7D%2C+%7B%22scroll

/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: Abdul Majeed Al Marzooqy Auditing Office Abu Dhabi


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation='The most relevant search result appears to be the first result, which links to the official website of Abdul Majeed Al Marzooqi Auditing at https://www.amauae.com/. The title and snippet indicate that this is the official website of the auditing firm, which is directly relevant to the search query.' link='https://www.amauae.com/'
Processing https://www.amauae.com/
----------BASE PAGE----------
https://www.amauae.com/
clean_text: 1674 tokens
clean_html: 20169 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.66s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.65s. Produced 6 slices.
sliced_html: 9348 tokens
----------NAV LINKS----------
[('Company', 'about.html', 'about'), ('Financial Services', 'financial-services.html', 'services'), ('Accounting Services', 'accounting-services.html', 'services'), ('Human Resource Services', 'hr-services.html', 'services'), ('Contact', 'contact.html', 'contact'), ('Internal/External Auditing', 'service-detail-page.html', 'services'), ('VIew All Services', 'services.html', 'services'), ('Audit Service', 'audit-assuarance.html', 'services')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


7
['https://www.amauae.com/about.html', 'https://www.amauae.com/contact.html', 'https://www.amauae.com/services.html', 'https://www.amauae.com/financial-services.html', 'https://www.amauae.com/accounting-services.html', 'https://www.amauae.com/hr-services.html', 'https://www.amauae.com/audit-assuarance.html']


clean_text: 647 tokens
clean_html: 6384 tokens
count_tokens(clean_html) 0 before approval: 6384


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.16s. Produced 6 slices.
sliced_html: 723 tokens
clean_text: 499 tokens
clean_html: 6611 tokens
count_tokens(clean_html) 1 before approval: 6611


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 8.65s. Produced 6 slices.
sliced_html: 590 tokens
clean_text: 652 tokens
clean_html: 6933 tokens
count_tokens(clean_html) 2 before approval: 6933


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 10.50s. Produced 6 slices.
sliced_html: 5502 tokens
clean_text: 887 tokens
clean_html: 6994 tokens
count_tokens(clean_html) 3 before approval: 6994


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.25s. Produced 6 slices.
sliced_html: 964 tokens
clean_text: 876 tokens
clean_html: 6923 tokens
count_tokens(clean_html) 4 before approval: 6923


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.68s. Produced 6 slices.
sliced_html: 2814 tokens
clean_text: 987 tokens
clean_html: 7120 tokens
count_tokens(clean_html) 5 before approval: 7120


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.50s. Produced 6 slices.
sliced_html: 1556 tokens
clean_text: 869 tokens
clean_html: 6918 tokens
count_tokens(clean_html) 6 before approval: 6918


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.65s. Produced 6 slices.
sliced_html: 6040 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/logo-dark_1.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
created user with name only
Error profile does not exist!
Profile created: Abdul Majeed Al Marzooqi
Service created: Comprehensive Audit & Business Services
Scrape created: https://www.amauae.com/
Successful processed entries 17/632 Organization: ABDUL MAJEED AL MARZOOQY AUDITING OFFICE - SOLE PR ...
Processing Entry 18/632 Organization: ABDULAZIZ PANIS AND SHAH ASSOCIATES CHARTERED ACCO ...


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: APNS Dubai chartered accountants


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The most relevant search result appears to be the link 'https://apnsdubai.com/'. This website seems to be the official website of the APNS Dubai chartered accountants firm, as the title and snippet indicate that it is a firm of experienced professionals providing accounting and advisory services. The other search results do not appear to be the official website, but rather other service providers or listings related to chartered accountants in Dubai." link='https://apnsdubai.com/'
Processing https://apnsdubai.com/
----------BASE PAGE----------
https://apnsdubai.com/
clean_text: 148 tokens
clean_html: 21219 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 4.61s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 1.23s. Produced 2 slices.
sliced_html: 21219 tokens
----------NAV LINKS----------
[('Who We Are', '/who-we-are', 'about'), ('Services', '/services', 'services'), ('Contact Us', '/contact-us', 'contact')]


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


3
['https://apnsdubai.com/who-we-are', 'https://apnsdubai.com/services', 'https://apnsdubai.com/contact-us']


clean_text: 694 tokens
clean_html: 27494 tokens
count_tokens(clean_html) 0 before approval: 27494


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 4.01s. Produced 2 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.59s. Produced 6 slices.
sliced_html: 22248 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/e0dc013e-f767-4eed-b48d-ca04424826da_0.jpg
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: APNS Dubai
Service created: Audit, Tax & Advisory Services
Scrape created: https://apnsdubai.com/
Successful processed entries 18/632 Organization: ABDULAZIZ PANIS AND SHAH ASSOCIATES CHARTERED ACCO ...
Processing Entry 19/632 Organization: ABDULRAHMAN KARMUSTAJI CHARTERED ACCOUNTANTS


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: ABDULRAHMAN KARMUSTAJI CHARTERED ACCOUNTANTS Dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The second search result, 'Abdul Rahman Karmustaji Auditing - Accounting Services', appears to be the most relevant as it directly mentions 'Abdul Rahman Karmustaji Auditing' which seems to be the official website of the chartered accountants firm based in Dubai." link='https://www.hidubai.com/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4'
Processing https://www.hidubai.com/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4
----------BASE PAGE----------
https://www.hidubai.com/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4
clean_text: 1042 tokens
clean_html: 39630 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 4.02s. Produced 4 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.59s. Produced 5 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.43s. Produced 3 slices.
sliced_html: 10586 tokens
----------NAV LINKS----------
[('Auto', 'https://www.hidubai.com/categories/auto-transport-vehicle-services-dubai', 'services'), ('Manufacturing', 'https://www.hidubai.com/categories/manufacturing-b2b-services-dubai', 'services'), ('Beauty & Spa', 'https://deals.hidubai.com/search-page/offer_cat/buy-beauty-spa-salon-services-online-dubai/', 'services'), ('Write a review', '/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4/add-review', 'services'), ('Accounting Services', '/categories/accounting-services-finance-legal-dubai', 'services'), ('Payroll Services', '/search?q=Payroll Services&resource=localBusiness', 'services'), ('Audit & Management Assurance', '/search?q=Audit & Management Assurance&resource=localBusiness', 'team'), ('0Reviews', '/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4/reviews', 'services'), ('Overview', '/busin

/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


5
['https://www.hidubai.com/about-hidubai', 'https://www.hidubai.com/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4#overview', 'https://www.hidubai.com/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4#reviews', 'https://www.hidubai.com/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4/reviews', 'https://www.hidubai.com/support-centre-dubai#/user']


clean_text: 629 tokens
clean_html: 12301 tokens
count_tokens(clean_html) 0 before approval: 12301


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 3.17s. Produced 6 slices.
sliced_html: 24523 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Logo saved to ../growbal_django/media/logos/93b52066-70e5-4bd9-bd9d-fc32575ca3c9-t_0.png
Error getting profile from website: ServiceProviderProfile matching query does not exist.
creating user with name and email
created user with email and name
Error profile does not exist!
Profile created: Abdul Rahman Karmustaji Auditing
Service created: Auditing, Finance, Legal & Accounting Services
Scrape created: https://www.hidubai.com/businesses/abdul-rahman-karmustaji-auditing-finance-legal-accounting-services-al-ras-dubai-4
Successful processed entries 19/632 Organization: ABDULRAHMAN KARMUSTAJI CHARTERED ACCOUNTANTS
Processing Entry 20/632 Organization: ABSTRACT ACCOUNTING & AUDITING


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Processing search term: abstract accounting auditing dubai


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


explanation="The search results indicate that the most relevant result is the website for 'Abstract Accounting & Auditing', which appears to be the official website of an accounting and auditing firm based in Dubai, UAE. The title, snippet, and links all consistently point to this being the company's official website, which is highly relevant to the search term." link='https://abstractauditing.com/'
Processing https://abstractauditing.com/
----------BASE PAGE----------
https://abstractauditing.com/
clean_text: 2857 tokens
clean_html: 54930 tokens


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.98s. Produced 4 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 4.18s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 10.66s. Produced 6 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 18.44s. Produced 4 slices.


/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


Slice LLM took 2.06s. Produced 6 slices.
sliced_html: 29661 tokens
----------NAV LINKS----------
[('About Us', 'https://abstractauditing.com/about-us/', 'about'), ('VAT Services', 'https://abstractauditing.com/vat-services/', 'services'), ('Tax Services', 'https://abstractauditing.com/tax-services/', 'services'), ('Corporate Finance Services', 'https://abstractauditing.com/corporate-finance-services/', 'services'), ('goAML compliance services', 'https://abstractauditing.com/goaml-compliance-services/', 'services'), ('Cash Flow & Forecasting Services', 'https://abstractauditing.com/cash-flow-forecasting-services/', 'services'), ('Audit & Assurance', 'https://abstractauditing.com/audit-assurance-services/', 'services'), ('Business Valuation Services', 'https://abstractauditing.com/business-valuation-services/', 'services'), ('Company Formation Services', 'https://abstractauditing.com/company-formation-services/', 'services'), ('Contact Us', 'https://abstractauditing.com/contact-us/', 'co

/tmp/ipykernel_64118/638342718.py:246: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  response_content = json.dumps(result.dict() if hasattr(result, 'dict') else result.__dict__, ensure_ascii=False, indent=2)


10
['https://abstractauditing.com/about-us/', 'https://abstractauditing.com/contact-us/', 'https://abstractauditing.com/vat-services/', 'https://abstractauditing.com/tax-services/', 'https://abstractauditing.com/corporate-finance-services/', 'https://abstractauditing.com/goaml-compliance-services/', 'https://abstractauditing.com/cash-flow-forecasting-services/', 'https://abstractauditing.com/audit-assurance-services/', 'https://abstractauditing.com/business-valuation-services/', 'https://abstractauditing.com/company-formation-services/']


clean_text: 1760 tokens
clean_html: 45211 tokens
Error processing https://abstractauditing.com/about-us/: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CTHfR5voBGhKS4XmSYdzZ'}
Error processing https://abstractauditing.com/: Error code: 400 - {'type': 'error', 'er

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}, 'request_id': 'req_011CTHfR9EyukuWsBStwovRe'}

In [None]:
# from __future__ import annotations

# import os
# import re
# from pathlib import Path
# from typing import Optional

# import requests

# try:
#     import lxml.etree as _ET  # type: ignore
#     import lxml.html as _LH  # type: ignore
#     _HAS_LXML = True
# except Exception:
#     _ET = None  # type: ignore
#     _LH = None  # type: ignore
#     _HAS_LXML = False

# try:
#     from bs4 import BeautifulSoup  # type: ignore
#     _HAS_BS4 = True
# except Exception:
#     BeautifulSoup = None  # type: ignore
#     _HAS_BS4 = False


# _VOID_TAGS = {
#     "area", "base", "br", "col", "embed", "hr", "img", "input",
#     "link", "meta", "param", "source", "track", "wbr"
# }


# def _simple_format_html(html: str, indent: int = 2) -> str:
#     """Very lightweight HTML formatter if lxml/bs4 are unavailable.

#     - Inserts newlines between adjacent tags
#     - Applies naive indentation based on opening/closing tags
#     """
#     # Normalize line endings and collapse long whitespace between tags
#     text = html.replace("\r\n", "\n").replace("\r", "\n")
#     text = re.sub(r">\s*<", ">\n<", text)

#     lines = text.splitlines()
#     result_lines: list[str] = []
#     depth = 0

#     for raw_line in lines:
#         line = raw_line.strip()
#         if not line:
#             continue

#         # Detect starting/ending tags for indentation adjustment
#         is_closing = bool(re.match(r"^</[A-Za-z0-9]", line))
#         is_comment = line.startswith("<!--") and line.endswith("-->")
#         is_doctype = line.upper().startswith("<!DOCTYPE")

#         if is_closing and depth > 0:
#             depth -= 1

#         # Render with current depth
#         result_lines.append((" " * (indent * depth)) + line)

#         if is_comment or is_doctype:
#             continue

#         # Detect opening tag name
#         m = re.match(r"^<([A-Za-z0-9]+)(\s|>|/>)", line)
#         if m:
#             tag = m.group(1).lower()
#             self_closed = line.endswith("/>") or tag in _VOID_TAGS
#             is_opening = (not line.startswith("</")) and (not self_closed)
#             if is_opening:
#                 depth += 1

#     return "\n".join(result_lines) + "\n"


# def _postprocess(text: str, max_consecutive_blanks: int = 1) -> str:
#     """Normalize whitespace after pretty printing.

#     - Normalize line endings
#     - Limit consecutive blank lines
#     - Strip trailing spaces
#     """
#     text = text.replace("\r\n", "\n").replace("\r", "\n")
#     out_lines: list[str] = []
#     blank_run = 0
#     for line in text.split("\n"):
#         if line.strip() == "":
#             blank_run += 1
#             if blank_run <= max_consecutive_blanks:
#                 out_lines.append("")
#         else:
#             blank_run = 0
#             out_lines.append(line.rstrip())
#     return "\n".join(out_lines).strip() + "\n"


# def format_html(html: str, prefer: str = "auto") -> str:
#     """Format arbitrary HTML into a readable, indented form.

#     Strategy:
#     - Try lxml pretty_print (best indentation/structure)
#     - Fallback to BeautifulSoup.prettify()
#     - Fallback to a simple regex/indent-based formatter

#     Args:
#         html: Raw HTML string
#         prefer: 'auto' | 'lxml' | 'bs4' | 'simple'

#     Returns:
#         Readable, indented HTML as a string
#     """
#     prefer = prefer.lower().strip()

#     if prefer in ("auto", "lxml") and _HAS_LXML:
#         try:
#             # lxml.html can fix up broken markup better than etree.HTML sometimes
#             doc = _LH.fromstring(html)
#             pretty = _LH.tostring(doc, encoding="unicode", method="html", pretty_print=True)
#             return _postprocess(pretty)
#         except Exception:
#             # Fall through to next method
#             pass

#     if prefer in ("auto", "bs4") and _HAS_BS4:
#         try:
#             parser_choice = "lxml" if _HAS_LXML else "html.parser"
#             soup = BeautifulSoup(html, parser_choice)  # type: ignore[arg-type]
#             pretty = soup.prettify()
#             return _postprocess(pretty)
#         except Exception:
#             pass

#     # Simple fallback
#     simple = _simple_format_html(html)
#     return _postprocess(simple)


# def fetch_html(url: str, timeout: int = 20) -> str:
#     """Fetch HTML with a reasonable desktop user-agent and timeouts."""
#     headers = {
#         "User-Agent": (
#             "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
#             "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
#         ),
#         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#         "Accept-Language": "en-US,en;q=0.9",
#         "Cache-Control": "no-cache",
#         "Pragma": "no-cache",
#     }
#     resp = requests.get(url, headers=headers, timeout=timeout)
#     resp.raise_for_status()
#     # requests will decode based on headers/encoding; resp.text is fine here
#     return resp.text


# def save_formatted(html_text: str, destination: Path) -> None:
#     destination.parent.mkdir(parents=True, exist_ok=True)
#     destination.write_text(html_text, encoding="utf-8")


# # --- Demo: fetch and format a couple of real pages ---
# try:
#     project_root = Path("/home/mohammed/Desktop/tech_projects/growbal")
#     logs_dir = project_root / "crawler_v2" / "logs"

#     test_urls = [
#         "https://am-alansari.com/",
#     ]

#     for url in test_urls:
#         try:
#             print(f"\nFetching: {url}")
#             raw = fetch_html(url)
#             formatted = format_html(raw)

#             domain = re.sub(r"^https?://", "", url).split("/")[0]
#             out_path = logs_dir / f"formatted_{domain}.html"
#             save_formatted(formatted, out_path)

#             # Print a short preview to the notebook
#             # preview_lines = formatted.splitlines()[:200]
#             print(f"Saved formatted HTML to: {out_path}")
#             print(f"Preview (first 200 lines):\n{formatted}")
#         except Exception as e:
#             print(f"Error processing {url}: {e}")
# except Exception as outer_e:
#     print(f"Setup error: {outer_e}")

