In [1]:
import requests
import json
# from langchain_openai import ChatOpenAI
import os
from pydantic import BaseModel
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
import os
import pathlib
from bs4 import BeautifulSoup
import re
from bs4 import BeautifulSoup
import re
from langchain_openai import ChatOpenAI
from pydantic import Field
from typing import Literal
from typing import Annotated
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain.schema import HumanMessage, AIMessage
from __future__ import print_function
import sib_api_v3_sdk
from sib_api_v3_sdk.rest import ApiException
from IPython.display import Markdown, display
from typing import Optional

llm_model = os.environ["LLM_VERSION"]

search_starters = ["Digital Marketing Agencies", "Renewable Energy Companies", "Cybersecurity Firms", "Legal Services Providers", "Healthcare Consultancies"]

class ResultRelevance(BaseModel):
    explanation: str
    link: str

class RelevanceCheckOutput(BaseModel):
    relevant_results: List[ResultRelevance]

class SqlCommand(BaseModel):
    command: str

class ServiceProviderMemberDetails(BaseModel):
    name: Optional[str]
    role_description: Optional[str]
    telephone: Optional[str]
    mobile: Optional[str]
    email: Optional[str]
    linkedin: Optional[str]
    facebook: Optional[str]
    instagram: Optional[str]
    twitter: Optional[str]
    additional_info: Optional[str]

class ServiceProviderOutput(BaseModel):
    service_description: Optional[str]
    rating: Optional[str]
    pricing: Optional[str]
    provider_type: Optional[str] = "Company"
    country: Optional[str]
    name: Optional[str]
    logo: Optional[str]
    website: Optional[str]
    linkedin: Optional[str]
    facebook: Optional[str]
    instagram: Optional[str]
    telephone: Optional[str]
    mobile: Optional[str]
    emails: Optional[str]
    office_locations: Optional[str]
    key_individuals: Optional[str]
    service_provider_member_details: Optional[List[ServiceProviderMemberDetails]]

class SearchTermsOutput(BaseModel):
    search_terms: List[str]

class AboutUsOutput(BaseModel):
    about_us_link: Optional[List[str]]

def search_serper(search_query):
    url = "https://google.serper.dev/search"
    
    payload = json.dumps({
        "q": search_query,
        "gl": "ae", 
        "num": 10,
        "tbs": "qdr:d"
    })

    headers = {
        'X-API-KEY': os.environ["SERPER_API_KEY"],
        'Content-Type': 'application/json'
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    results = json.loads(response.text)
    results_list = results['organic']

    all_results = []
    for id, result in enumerate(results_list, 1):
        result_dict = {
            'title': result['title'],
            'link': result['link'],
            'snippet': result['snippet'],
            'search_term': search_query,
            'id': id
        }
        all_results.append(result_dict)
    return all_results


# def load_prompt(prompt_name, html_content=None, sql_schema=None):
#     if prompt_name == "generate_sql":
#         if not html_content or not sql_schema:
#             raise ValueError("html_content and sql_schema must be provided for 'generate_sql' prompt.")

#         prompt = f"""
# You are given HTML webpage content. Extract the details precisely:

# - Business name, description, category, country of origin.
# - Each office location city and country.
# - Staff member details: name, position, email, phone, and social media accounts.

# Assume the PostgreSQL tables (`businesses`, `office_locations`, `staff`, `staff_social_media`) are already created with the following schema:

# {sql_schema}

# Do NOT include any CREATE TABLE commands. Assume `business_id` and `staff_id` are SERIAL PRIMARY KEYS, retrieved using RETURNING clauses in PostgreSQL.

# Provide exactly ONE executable PostgreSQL transaction (wrapped in BEGIN; ... COMMIT;) containing only valid INSERT commands, precisely formatted and ready to run through a Python script executing SQL commands. No additional explanations or formatting outside the SQL code.

# HTML Content:
# {html_content}
# """
#         return prompt

#     else:
#         raise ValueError(f"Unsupported prompt_name '{prompt_name}'")

def load_prompt(prompt_name):
    with open(f"prompts/{prompt_name}.md", "r") as f:
        return f.read()



def check_search_relevance(search_term: str, search_results: Dict[str, Any]) -> RelevanceCheckOutput:
    """
    Analyze search results and determine the most relevant ones.
    
    Args:
        search_results: Dictionary containing search results to analyze
        
    Returns:
        RelevanceCheckOutput containing the most relevant results and explanation
    """
    prompt = load_prompt("relevance_check")
    
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", prompt)
    ])

    llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(RelevanceCheckOutput)
    llm_chain = prompt_template | llm
    
    return llm_chain.invoke({"search_term": search_term, 'search_results': search_results})


def convert_html_to_markdown(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Headers
    for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        level = int(h.name[1])
        h.replace_with('#' * level + ' ' + h.get_text() + '\n\n')
    
    # Links
    for a in soup.find_all('a'):
        href = a.get('href', '')
        text = a.get_text()
        if href and text:
            a.replace_with(f'[{text}]({href})')
    
    # Bold
    for b in soup.find_all(['b', 'strong']):
        b.replace_with(f'**{b.get_text()}**')
    
    # Italic
    for i in soup.find_all(['i', 'em']):
        i.replace_with(f'*{i.get_text()}*')
    
    # Lists
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.get_text()}\n')
    
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), 1):
            li.replace_with(f'{i}. {li.get_text()}\n')
    
    # Get text and clean up
    text = soup.get_text()
    
    # Remove excess whitespace/newlines
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = text.strip()
    
    return text

def scrape_and_save_markdown(relevant_results):
    """
    Scrapes HTML content from URLs in relevant_results and saves as markdown files.
    
    Args:
        relevant_results: List of dictionaries containing search results with URLs
        
    Returns:
        List of dictionaries containing markdown content and metadata
    """
    # Create scraped_html directory if it doesn't exist
    # pathlib.Path("scraped_markdown").mkdir(exist_ok=True)

    markdown_contents = []
    for result in relevant_results:
        if 'link' in result:
            payload = {
                "api_key": os.environ["SCRAPING_API_KEY"], 
                "url": result['link'],
                "render_js": "true"
            }

            response = requests.get("https://scraping.narf.ai/api/v1/", params=payload)
            if response.status_code == 200:
                # Create filename from ID or URL if ID not available
                # filename = f"{result.get('id', hash(result['link']))}.md"
                # filepath = os.path.join("scraped_markdown", filename)
                
                # Convert HTML to markdown
                markdown_content = convert_html_to_markdown(response.content.decode())
                
                # Save markdown content to file
                # with open(filepath, 'w', encoding='utf-8') as f:
                #     f.write(markdown_content)
                
                markdown_contents.append({
                    'url': result['link'],
                    # 'filepath': filepath,
                    'markdown': markdown_content,
                    'title': result.get('title', ''),
                    'id': result.get('id', '')
                })
            else:
                print(f"Failed to fetch {result['link']}: Status code {response.status_code}")

    # print(f"Successfully downloaded and saved {len(markdown_contents)} pages as markdown to scraped_markdown/")
    print(f"Successfully downloaded and saved {len(markdown_contents)} pages as markdown")
    return markdown_contents

In [None]:
relevant_results = []
for search_term in search_terms:
    python_results = search_serper(search_term)
    results = check_search_relevance(python_results)
    
    relevant_ids = [r.id for r in results.relevant_results]
    
    filtered_results = [r for r in python_results if str(r['id']) in relevant_ids]
    
    relevant_results.extend(filtered_results)

markdown_contents = scrape_and_save_markdown(relevant_results)

In [None]:
# def generate_sql(html_content, sql_schema):
#     """
#     Generates SQL for markdown content using GPT-4.
    
#     Args:
#         html_content: HTML content of the webpage
#         sql_schema: SQL schema for the database
        
#     Returns:
#         SQL commands for the database
#     """
#     # Load the summary prompt
#     sql_generation_prompt = load_prompt("generate_sql", html_content, sql_schema)

#     # Create prompt template
#     sql_generation_template = ChatPromptTemplate.from_messages([
#         ("system", sql_generation_prompt)
#     ])

#     # Initialize LLM
#     llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model)
#     summary_chain = sql_generation_template | llm

#     # Generate and save summaries
#     summaries = []
#     for content in markdown_contents:
#         try:
#             # Generate summary, limiting to first 2000 words
#             summary = summary_chain.invoke({
#                 'markdown_input': ' '.join(content['markdown'].split()[:3000])
#                 # 'markdown_input': content['markdown']
#             })
            
#             # Create filename for summary
#             # summary_filename = f"summary_{content['id']}.md"
#             # summary_filepath = os.path.join("markdown_summaries", summary_filename)
            
#             # Save summary to file
#             # with open(summary_filepath, 'w', encoding='utf-8') as f:
#             #     f.write(summary.content)
            
#             # Add to summaries list
#             summaries.append({
#                 'markdown_summary': summary.content,
#                 'url': content['url']
#             })
                
#         except Exception as e:
#             print(f"Failed to summarize {content['url']}: {str(e)}")

#     # print(f"Successfully generated summaries for {len(markdown_contents)} pages in markdown_summaries/")
#     print(f"Successfully generated summaries for {len(markdown_contents)} pages")
#     return summaries

# def generate_field_values(html_content, fields):
#     # Load the summary prompt
#     field_values_prompt = load_prompt("generate_sql", html_content, sql_schema)

#     # Create prompt template
#     field_values_template = ChatPromptTemplate.from_messages([
#         ("system", field_values_prompt)
#     ])

#     # Initialize LLM
#     llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model)
#     field_values_chain = field_values_template | llm

#     # Generate and save summaries
#     summaries = []
#     for content in markdown_contents:
#         try:
#             # Generate summary, limiting to first 2000 words
#             summary = field_values_chain.invoke({
#                 'markdown_input': ' '.join(content['markdown'].split()[:3000])
#                 # 'markdown_input': content['markdown']
#             })
            
#             # Create filename for summary
#             # summary_filename = f"summary_{content['id']}.md"
#             # summary_filepath = os.path.join("markdown_summaries", summary_filename)
            
#             # Save summary to file
#             # with open(summary_filepath, 'w', encoding='utf-8') as f:
#             #     f.write(summary.content)
            
#             # Add to summaries list
#             summaries.append({
#                 'markdown_summary': summary.content,
#                 'url': content['url']
#             })
                
#         except Exception as e:
#             print(f"Failed to summarize {content['url']}: {str(e)}")

#     # print(f"Successfully generated summaries for {len(markdown_contents)} pages in markdown_summaries/")
#     print(f"Successfully generated summaries for {len(markdown_contents)} pages")
#     return summaries

# # summaries = generate_summaries(markdown_contents)

In [3]:
html_content = html_strings[1]
sql_schema = """-- Main table for Service model
CREATE TABLE services_service (
    id SERIAL PRIMARY KEY,
    service_description TEXT NOT NULL DEFAULT '',
    ratings VARCHAR(50) NOT NULL DEFAULT '',
    pricing TEXT NOT NULL DEFAULT ''
);

-- Table for tags
CREATE TABLE taggit_tag (
    id SERIAL PRIMARY KEY,
    name VARCHAR(100) NOT NULL UNIQUE,
    slug VARCHAR(100) NOT NULL UNIQUE
);

-- Through-table for tags relation (TaggableManager)
CREATE TABLE taggit_taggeditem (
    id SERIAL PRIMARY KEY,
    tag_id INTEGER NOT NULL REFERENCES taggit_tag(id) ON DELETE CASCADE,
    content_type_id INTEGER NOT NULL REFERENCES django_content_type(id) ON DELETE CASCADE,
    object_id INTEGER NOT NULL
);

-- Django automatically creates indexes:
CREATE INDEX taggit_taggeditem_content_type_id_object_id_idx ON taggit_taggeditem(content_type_id, object_id);

-- accounts/models.py
CREATE TABLE accounts_serviceproviderprofile (
    id SERIAL PRIMARY KEY,
    user_id INTEGER UNIQUE REFERENCES auth_user(id) ON DELETE CASCADE,
    service_id INTEGER REFERENCES services_service(id) ON DELETE SET NULL,
    provider_type VARCHAR(50) NOT NULL DEFAULT 'Company',
    country VARCHAR(100),
    session_status VARCHAR(8) NOT NULL DEFAULT 'inactive',
    tier VARCHAR(50),
    name VARCHAR(255) NOT NULL DEFAULT 'Default Name',
    logo VARCHAR(100),  -- ImageField stored as path string
    website VARCHAR(200),
    linkedin VARCHAR(200),
    facebook VARCHAR(200),
    instagram VARCHAR(200),
    telephone VARCHAR(30),
    mobile VARCHAR(30),
    emails TEXT,
    office_locations TEXT,
    key_individuals TEXT
);

CREATE TABLE accounts_serviceprovidermemberprofile (
    id SERIAL PRIMARY KEY,
    user_id INTEGER UNIQUE REFERENCES auth_user(id) ON DELETE CASCADE,
    company_id INTEGER REFERENCES accounts_serviceproviderprofile(id) ON DELETE CASCADE,
    role_description VARCHAR(255),
    telephone VARCHAR(30),
    mobile VARCHAR(30),
    email VARCHAR(254),
    linkedin VARCHAR(200),
    facebook VARCHAR(200),
    instagram VARCHAR(200),
    twitter VARCHAR(200),
    additional_info TEXT
);"""

In [1]:
# # Example usage:
# from .django.accounts.management.create_or_get_user import get_or_create_user
# user_email = "jonathon.davidson@example.com"
# user_username = "jonathon.davidson"
# user_password = "securepassword123"

# user_id = get_or_create_user(user_email, user_username, user_password)

# print(f"User ID: {user_id}")

# import sys
# sys.path.insert(0, '/home/mohammed/Desktop/tech_projects/growbal')
# import django
# print(django.__path__)

# try:
from growbal_django.accounts.management.create_or_get_user import get_or_create_user

# Now use the function directly
user_email = "jonathon.davidson@example.com"
user_username = "jonathon.davidson"
user_password = "securepassword123"

user_id = get_or_create_user(email=user_email, username=user_username, password=user_password)
print(f"User ID: {user_id}")

# finally:
#     sys.path.remove('/home/mohammed/Desktop/tech_projects/growbal')


ModuleNotFoundError: No module named 'growbal'

In [6]:
import os
import django
import sys
from asgiref.sync import sync_to_async

# # Path to your Django project root (adjust if necessary)
sys.path.insert(0, '/home/mohammed/Desktop/tech_projects/growbal/growbal_django')

# Set Django settings module (replace with your actual settings module path)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "growbal.settings")

# Initialize Django environment
django.setup()

# from growbal_django.accounts.models import CustomUser
from accounts.models import CustomUser

# Wrap synchronous ORM calls in sync_to_async
@sync_to_async
def get_or_create_user(email, username, password):
    user, created = CustomUser.objects.get_or_create(
        email=email,
        defaults={'username': username}
    )
    if created and password:
        user.set_password(password)
        user.save()
    return user.id

user_email = "jonathon.davidson@example.com"
user_username = "jonathon.davidson"
user_password = "securepassword123"

# If running in Jupyter or an async context, call as follows:
user_id = await get_or_create_user(user_email, user_username, user_password)

print(f"User ID: {user_id}")

User ID: 3


In [3]:
prompt = load_prompt("generate_fields")

prompt_template = ChatPromptTemplate.from_messages([
    ("system", prompt)
])

llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(ServiceProviderOutput)
llm_chain = prompt_template | llm

result = llm_chain.invoke({"html_content": html_strings[1]})

print("prompt:\n")
print(prompt)
print("\n")
# print("sql_schema:\n")
# print(sql_schema)
print("\n")
print("html_content:\n")
print(html_strings[1])
print("\n")
print("result:\n")
print(result)
print("\n")




prompt:

# Role

You are an accurate and experienced data engineer.

# Task

You will receive HTML webpage content. Extract the following fields exactly:

- service_description, rating, pricing.
- provider_type (has to be either Company or Agent), country (has to be one of ('UAE', 'UK', 'USA')), name, logo (logo link), website, linkedin, facebook, instagram, telephone, mobile, emails, office locations, key individuals.
- service_provider_member_details: name, role_description, telephone, mobile, email, linkedin, facebook, instagram, twitter, additional_info.

Provide exactly a JSON object containing the extracted fields.

HTML Content:

{html_content}

# Output

Executable PostgreSQL transaction (wrapped in BEGIN; ... COMMIT;) containing only valid INSERT commands.





html_content:

<div>
<p>Who We Are</p>
<p>The region’s foremost boutique commercial and private client law firm. Our strength lies in providing expert guidance to our clients belonging to an array of industries.</p>
<p>

In [2]:
prompt = load_prompt("generate_search_terms")
service = search_starters[3]

prompt_template = ChatPromptTemplate.from_messages([
    ("system", prompt)
])

llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(SearchTermsOutput)
llm_chain = prompt_template | llm

result = llm_chain.invoke({"service": service})

print("prompt:\n")
print(prompt)
print("\n")
print("result:\n")
print(result)
print("\n")




prompt:

# Role

You are an expert at generating concise, targeted Google search terms specifically intended to discover official websites of companies or service providers within a given industry or those providing specific services.

# Task

Your generated search terms should effectively yield official company websites when entered into Google.

Provide exactly a JSON object containing the extracted fields.

Provided industry or service:
{service}

# Output

A JSON object containing the extracted fields.



result:

search_terms=['Legal Services Providers official website', 'Law firms official site', 'Legal service company website', 'Authentic website of legal counselors', 'Official website attorney services', 'Official Law firm sites', 'Legal service provider online presence', 'Official legal consultancies sites', 'Best legal services company websites', 'Web portal official legal services']




In [4]:
print("prompt:\n")
print(prompt)
print("\n")
print("result:\n")
print(result)
print("\n")

prompt:



For each provided industry or service, generate exactly five targeted Google search queries.


# Role

You are an expert at generating concise, targeted Google search terms specifically intended to discover the "About Us," "Company Information," or general informational pages for companies within a given industry or those providing specific services.

# Task

Your generated search terms should effectively yield company profiles, service descriptions, or about pages when entered into Google.

Provide exactly a JSON object containing the extracted fields.

Provided industry or service:
{service}

# Output

A JSON object containing the extracted fields.



result:

search_terms=['Digital Marketing Agencies company profile', 'About Us Digital Marketing Agencies', 'Digital Marketing services description', 'Company Information of Digital Marketing Agencies', 'Top Digital Marketing Agencies About Us']




In [3]:
# links = []
for search_term in result.search_terms:
    links = search_serper(search_term)
    relevant_links = check_search_relevance(search_term, links)
    # print(relevant_links)
    break

# print(links)




In [4]:
print(relevant_links)

relevant_results=[ResultRelevance(explanation="The title ('Dentons - Home') hints that this is the official site for Dentons, an international legal practice. The snippet further confirms that they offer legal services worldwide. Definitely worth exploring.", link='https://www.dentons.com/en/'), ResultRelevance(explanation="Hill Dickinson's site seems worth visiting as the title reflects that it's an official website of an international law firm and the snippet emphasizes their role as a trusted legal service provider.", link='https://www.hilldickinson.com/'), ResultRelevance(explanation='K&L Gates appear to be a global legal services provider and their title and snippet indicates this is their official website, mentioning both their global presence and client services like legal counsel.', link='https://www.klgates.com/'), ResultRelevance(explanation="Pinsent Masons' site is relevant because the title does not appear to be generic and suggests that this is an official Pinsent Masons w

In [None]:
from scrapper import HtmlScraper
import tiktoken

def count_tokens(text, model="gpt-4"):
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    return len(tokens)

scraper = HtmlScraper(scraping_api_key=os.environ["SCRAPING_API_KEY"])
for relevant_link in relevant_links.relevant_results:
    html_content = scraper.scrape_html(relevant_link.link)
    nav_links = scraper.get_nav_links(html_content)
    nav_links_str = "\n".join([f"{text}: {link}" for text, link in nav_links])

    prompt = load_prompt("generate_about_us_link")
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", prompt)
    ])
    llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(AboutUsOutput)
    llm_chain = prompt_template | llm

    result = llm_chain.invoke({"nav_links": nav_links_str})
    print(result)
    print(result.about_us_link)
    print("\n")
    print("\n")
    clean_html_sum = ""
    i = 1
    num_tokens_allowed = 9000
    for link in result.about_us_link:
        try:
            html_content = scraper.scrape_html(link)
            clean_text, clean_html = scraper.clean_html_content(html_content)
            print(count_tokens(clean_html))
            if count_tokens(clean_html) < num_tokens_allowed/len(result.about_us_link):
                clean_html_sum += f"HTML PAGE {i}: ({link})\n\n"
                clean_html_sum += clean_html + "\n\n"
                print("here")
            i += 1
        except Exception as e:
            print(f"Error processing {link}: {e}")
    
    # clean_html_sum = clean_html_sum[:num_tokens_allowed]
    if clean_html_sum != "":
        prompt = load_prompt("generate_fields")
        prompt_template = ChatPromptTemplate.from_messages([
            ("system", prompt)
        ])
        llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(ServiceProviderOutput)
        llm_chain = prompt_template | llm
        result = llm_chain.invoke({"html_content": clean_html_sum})
        # print(result)
        print(result)
    # print(nav_links)
    break



about_us_link=['https://www.dentons.com/en/about-dentons', 'https://www.dentons.com/en/services-and-solutions']
['https://www.dentons.com/en/about-dentons', 'https://www.dentons.com/en/services-and-solutions']




1297
here
1496
here




AttributeError: 'ServiceProviderOutput' object has no attribute 'ServiceProviderOutput'

In [5]:
print(type(result))
print(result)

<class '__main__.ServiceProviderOutput'>
service_description="Dentons is the world's largest global law firm, offering a variety of services and solutions to help clients grow, protect, operate and finance their businesses. Their content and solutions are organized around the business agenda of the clients rather than their organizational structure. They offer practical toolkits and takeaways on various issues that General Counsels and in-house legal teams may find helpful which are all available in their on-demand offering 'CenterX'. They also stress on their continuous improvement methodology to ensure that the experience the clients have with Dentons is constantly advancing." rating=None pricing=None provider_type='Company' country=None name='Dentons' logo=None website='https://www.dentons.com/' linkedin=None facebook=None instagram=None telephone=None mobile=None emails=None office_locations=None key_individuals=None service_provider_member_details=None


In [None]:
for relevant_link in relevant_links.relevant_results:
    html_content = scraper.scrape_html(relevant_link.link)
    clean_text, clean_html = scraper.clean_html_content(html_content)
    print(clean_html)

    prompt = load_prompt("generate_about_us_link")

    prompt_template = ChatPromptTemplate.from_messages([
        ("system", prompt)
    ])

    llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(AboutUsOutput)
    llm_chain = prompt_template | llm

    result = llm_chain.invoke({"html_content": clean_html})
    print(result)
    print(result.about_us_link)
    print("\n")
    print("\n")
    

    # filtered_links.append(links[int(relevant_result.id)]["link"])

# print(filtered_links)
    
# print(links[int(relevant_links.relevant_results[0].id)]["link"])
# print(relevant_links)
# scrape_and_save_markdown(relevant_results)

<div>
<p>Join over 3,000 of your peers who have already registered for this award-winning global webinar program. Sessions are focused on bringing you practical insights on the biggest challenges and opportunities facing General Counsel and in-house legal teams.</p>
<p>Our outstanding Ukrainian team continues to work with international clients investing in Ukraine. Follow our videos and podcasts with key market participants to hear more about the current business and investment climate. Up to date insights on key legal and market developments can also be found in our hub.</p>
<p>We are consistently hearing from clients that AI is a key topic for in-house legal to address in 2025. Our report highlights the key areas of legal risk and gives a digestible and pragmatic global overview. Anyone with responsibility for their organization's legal or risk agenda will benefit from reviewing this report.</p>
<p>How can you structure your global business in the most tax-efficient manner? This edit



about_us_link=None
None




<div>
<html><body><p>We use cookies to personalise content and ads, to provide social media features and to analyse our traffic. We also share information about your use of our site with our social media, advertising and analytics partners who may combine it with other information that you’ve provided to them or that they’ve collected from your use of their services.</p></body></html>
<html><body><p>optimizely_data$pending_eventsThis cookie is set to make split-tests on the website, which optimizes the website's relevance towards the visitor – the cookie can also be set to improve the visitor's experience on a website.</p></body></html>
<html><body><p>Maximum Storage Duration: PersistentType: HTML Local Storage</p></body></html>
<html><body><p>optimizelyEndUserIdUsed to measure how selected users react to targeted changes to the website's content and functionality, in order to determine what variation is most efficacious in terms of converting users to custo



about_us_link=None
None




<div>
<html><body><p>Our ability to understand people makes us who we are. We work together to build deep and trusted relationships that deliver meaningful value.</p></body></html>
<html><body><p>Some of our clients</p></body></html>
<h2 id="pc-title">Cookie Preference Centre</h2>
<h3 id="privacy-text">Your Privacy</h3>
<h3 id="privacy-text">Your Privacy</h3>
<p class="group-description" id="pc-policy-text">When you visit any website, it may store or retrieve information on your browser, mostly in the form of cookies. This information might be about you, your preferences or your device and is mostly used to make the site work as you expect it to. The information does not usually directly identify you, but it can give you a more personalized web experience. Because we respect your right to privacy, you can choose not to allow some types of cookies. Click on the different category headings to find out more and change our default settings. However, blocking som



about_us_link=None
None




Failed to fetch https://www.klgates.com/: Status code 500


TypeError: Incoming markup is of an invalid type: None. Markup must be a string, a bytestring, or an open filehandle.

In [None]:

scraper = HtmlScraper(scraping_api_key=os.environ["SCRAPING_API_KEY"])
html_strings = scraper.load_saved_html()
for i, html_string in enumerate(html_strings):
    clean_text, clean_html = scraper.clean_html_content(html_string)
    html_strings[i] = clean_html

In [None]:
prompt = load_prompt("generate_formal_site_link")
service = search_starters[3]

prompt_template = ChatPromptTemplate.from_messages([
    ("system", prompt)
])

llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(AboutUsOutput)
llm_chain = prompt_template | llm

result = llm_chain.invoke({"service": service})

print("prompt:\n")
print(prompt)
print("\n")
print("result:\n")
print(result)
print("\n")



relevant_results=[ResultRelevance(explanation='This link is relevant because it is an international commercial law firm which is most likely to offer legal services.', id='2'), ResultRelevance(explanation='This link is relevant as it leads to a global law firm offering various legal services and also allows users to find a lawyer.', id='3'), ResultRelevance(explanation='This link is relevant as it leads to an international and offshore professional services firm with expertise to handle demanding and complex transactions, making it seem like a promising candidate for a legal service provider.', id='4'), ResultRelevance(explanation="This link is an official website for Thomson Reuters' legal resource center, making it relevant as Thomson Reuters is known for its legal services and resources.", id='5'), ResultRelevance(explanation='The link appears to belong to an international law firm, indicating a likelihood of providing legal services and making it relevant.', id='7'), ResultRelevanc