In [1]:
html_content = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Innovatech Solutions Inc.</title>
</head>
<body>
    <header>
        <h1>Innovatech Solutions Inc.</h1>
        <p><strong>Category:</strong> Technology & Software Development</p>
        <p><strong>Country of Origin:</strong> Canada</p>
    </header>

    <section id="description">
        <h2>About Us</h2>
        <p>Innovatech Solutions specializes in custom software, AI integration, cloud solutions, and digital transformation services tailored for businesses aiming for accelerated growth and operational efficiency.</p>
    </section>

    <section id="office-locations">
        <h2>Our Offices</h2>
        <ul>
            <li>Toronto, Canada</li>
            <li>Vancouver, Canada</li>
            <li>Berlin, Germany</li>
        </ul>
    </section>

    <section id="staff">
        <h2>Meet Our Team</h2>
        
        <div class="staff-member">
            <h3>Dr. Emily Porter</h3>
            <p><strong>Position:</strong> Chief Technology Officer (CTO)</p>
            <p><strong>Email:</strong> eporter@innovatech.ca</p>
            <p><strong>Phone:</strong> +1-416-555-0145</p>
            <p><strong>Social Media:</strong>
                <a href="https://linkedin.com/in/emilyporter">LinkedIn</a>,
                <a href="https://twitter.com/emilyporter">Twitter</a>
            </p>
        </div>

        <div class="staff-member">
            <h3>Mark Reynolds</h3>
            <p><strong>Position:</strong> Lead AI Engineer</p>
            <p><strong>Email:</strong> mreynolds@innovatech.ca</p>
            <p><strong>Phone:</strong> +1-604-555-0221</p>
            <p><strong>Social Media:</strong>
                <a href="https://linkedin.com/in/markreynolds">LinkedIn</a>
            </p>
        </div>

        <div class="staff-member">
            <h3>Sophia Khan</h3>
            <p><strong>Position:</strong> Head of Digital Transformation</p>
            <p><strong>Email:</strong> skhan@innovatech.ca</p>
            <p><strong>Phone:</strong> +49-30-555-0877</p>
            <p><strong>Social Media:</strong>
                <a href="https://linkedin.com/in/sophiakhan">LinkedIn</a>,
                <a href="https://twitter.com/sophiakhan">Twitter</a>,
                <a href="https://instagram.com/sophiakhan">Instagram</a>
            </p>
        </div>
    </section>
</body>
</html>
"""
sql_schema = """CREATE TABLE businesses (
    business_id SERIAL PRIMARY KEY,
    name VARCHAR(255) UNIQUE NOT NULL,
    description TEXT,
    category VARCHAR(100),
    country_of_origin VARCHAR(100)
);

CREATE TABLE office_locations (
    location_id SERIAL PRIMARY KEY,
    business_id INTEGER REFERENCES businesses(business_id),
    city VARCHAR(100),
    country VARCHAR(100)
);

CREATE TABLE staff (
    staff_id SERIAL PRIMARY KEY,
    business_id INTEGER REFERENCES businesses(business_id),
    full_name VARCHAR(100),
    position VARCHAR(100),
    email VARCHAR(100),
    phone VARCHAR(30)
);

CREATE TABLE staff_social_media (
    social_id SERIAL PRIMARY KEY,
    staff_id INTEGER REFERENCES staff(staff_id),
    platform VARCHAR(50),
    url VARCHAR(255)
);
"""

# Extraction prompt sent to the model. It interpolates the target table
# definitions (`sql_schema`) and the page markup (`html_content`) and asks for a
# single BEGIN ... COMMIT transaction made only of INSERT statements.
prompt = f"""
You are given HTML webpage content. Extract the details precisely:

- Business name, description, category, country of origin.
- Each office location city and country.
- Staff member details: name, position, email, phone, and social media accounts.

Assume the PostgreSQL tables (`businesses`, `office_locations`, `staff`, `staff_social_media`) are already created with the following schema:

{sql_schema}

Do NOT include any CREATE TABLE commands. Assume `business_id` and `staff_id` are SERIAL PRIMARY KEYS, retrieved using RETURNING clauses in PostgreSQL.

Provide exactly ONE executable PostgreSQL transaction (wrapped in BEGIN; ... COMMIT;) containing only valid INSERT commands, precisely formatted and ready to run through a Python script executing SQL commands. No additional explanations or formatting outside the SQL code.

HTML Content:
{html_content}
"""

In [2]:
import os
import json
import requests
# from pydantic import BaseModel

class Singleton(type):
    """Metaclass that hands back one cached instance per class.

    The first call to a class using this metaclass constructs the instance and
    stores it in ``_instances``; every later call returns that stored object
    and ignores whatever arguments were passed.
    """

    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Guard clause: an instance already exists, so skip construction entirely.
        if cls in cls._instances:
            return cls._instances[cls]
        cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]

class LLMWrapper(metaclass=Singleton):
    """Process-wide client for the HTTP LLM API described by a JSON config file.

    The class uses the ``Singleton`` metaclass, so only the first construction
    reads the config and asks the server to load the model; later calls return
    the same object.

    Config keys read: ``LLM``, ``LOAD_MODEL_API_PATH``, ``CHAT_API_PATH``,
    ``GENERATE_API_PATH``, ``TEMPERATURE`` and, optionally, ``REQUEST_TIMEOUT``
    (seconds; when absent, requests wait indefinitely as before).
    """

    def __init__(self, config_file="ollama_config.json"):
        self.loaded = False
        self.config_data = self.load_config(config_file)
        self.load_response = self.load_model(self.config_data["LLM"])
        # Only a plain 200 from the load endpoint marks the model as usable.
        if self.load_response.status_code == 200:
            self.loaded = True

    def load_config(self, config_file):
        """Read the JSON config, mirror every entry into ``os.environ``, and return it."""
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(config_file, 'r', encoding='utf-8') as f:
            config_data = json.load(f)
        for key, value in config_data.items():
            os.environ[key] = str(value)
        return config_data

    def _request_timeout(self):
        """Return the optional ``REQUEST_TIMEOUT`` config value (``None`` = no timeout)."""
        return self.config_data.get("REQUEST_TIMEOUT")

    def load_model(self, model_name: str):
        """POST ``{"model": model_name}`` to the load endpoint and return the raw response."""
        url = self.config_data["LOAD_MODEL_API_PATH"]
        payload = {"model": model_name}
        return requests.post(url, json=payload, timeout=self._request_timeout())

    def run(self, prompt: str, chat=True):
        """Send ``prompt`` to the chat or generate endpoint.

        Args:
            prompt: Text to send to the model.
            chat: When true (default) use ``CHAT_API_PATH`` with a single user
                message; otherwise use ``GENERATE_API_PATH`` with a raw prompt.

        Returns:
            The decoded JSON response, or ``None`` when the model was not
            loaded or the server answered with an error status.
        """
        if not self.loaded:
            print("Model not loaded.")
            return None

        payload = {
            "model": self.config_data["LLM"],
            "stream": False,
            "options": {"temperature": self.config_data["TEMPERATURE"]}
        }
        if chat:
            url = self.config_data["CHAT_API_PATH"]
            payload["messages"] = [{"role": "user", "content": prompt}]
        else:
            # The generate endpoint takes the text under "prompt"; a chat-style
            # "messages" list would leave it without any prompt to complete.
            url = self.config_data["GENERATE_API_PATH"]
            payload["prompt"] = prompt

        response = requests.post(url, json=payload, timeout=self._request_timeout())
        if response.ok:
            return response.json()
        print(f"Request failed: {response.status_code} - {response.text}")
        return None

# Build the wrapper with its default config file name; construction also asks
# the server to load the model.
llm = LLMWrapper()

# Send the extraction prompt through the chat endpoint (run() defaults to chat=True).
print(f"Prompt: {prompt}")
response = llm.run(prompt)
if response:
    # run() returns the decoded JSON body; the reply text is read from message.content.
    generated_sql = response['message']['content']
    print(f"Generated SQL: {generated_sql}")
else:
    # run() returned None: either the model was never loaded or the request failed.
    print("Failed to get response from LLM.")

Prompt: 
You are given HTML webpage content. Extract the details precisely:

- Business name, description, category, country of origin.
- Each office location city and country.
- Staff member details: name, position, email, phone, and social media accounts.

Assume the PostgreSQL tables (`businesses`, `office_locations`, `staff`, `staff_social_media`) are already created with the following schema:

CREATE TABLE businesses (
    business_id SERIAL PRIMARY KEY,
    name VARCHAR(255) UNIQUE NOT NULL,
    description TEXT,
    category VARCHAR(100),
    country_of_origin VARCHAR(100)
);

CREATE TABLE office_locations (
    location_id SERIAL PRIMARY KEY,
    business_id INTEGER REFERENCES businesses(business_id),
    city VARCHAR(100),
    country VARCHAR(100)
);

CREATE TABLE staff (
    staff_id SERIAL PRIMARY KEY,
    business_id INTEGER REFERENCES businesses(business_id),
    full_name VARCHAR(100),
    position VARCHAR(100),
    email VARCHAR(100),
    phone VARCHAR(30)
);

CREATE TAB

In [3]:
from ollama_wrapper import ChatOllama
import json
import os

def load_config(config_file="ollama_config.json"):
    """Parse a JSON configuration file and return the decoded content.

    Args:
        config_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(config_file, 'r', encoding='utf-8') as f:
        return json.load(f)

ollama_config_data = load_config()

# Before refactor (using OpenAI Chat models):
# from langchain.chat_models import ChatOpenAI
# llm = ChatOpenAI(model_name="gpt-4", temperature=0.7)
# llm_summariser = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0.7)
# llm_reviewer = ChatOpenAI(model_name="gpt-4", temperature=0.7)

# After refactor (using local Ollama models via ChatOllama):

# Initialize the main LLM (e.g., a larger model for primary tasks)
sql_generator = ChatOllama(
    model=ollama_config_data["LLM"],            # name of the model as known to Ollama (e.g., "llama2" if pulled)
    base_url=ollama_config_data["BASE_URL"],
    temperature=ollama_config_data["TEMPERATURE"],
    # max_tokens=1024 
)


from pydantic import BaseModel

# Desired output schema as a Pydantic model: a single string field that is
# expected to carry the generated SQL.
class AnalysisResult(BaseModel):
    sql_command: str

# Wrap the LLM so that its reply is returned as an AnalysisResult
structured_llm = sql_generator.with_structured_output(AnalysisResult)
# Run the extraction prompt through the structured wrapper:
result = structured_llm.invoke(prompt)
print(result)
# -> sql_command="INSERT INTO businesses (...) VALUES (...);"
# NOTE(review): the recorded output contains only the `businesses` INSERT, not
# the full BEGIN ... COMMIT transaction the prompt asks for.

sql_command="INSERT INTO businesses (name, description, category, country_of_origin) VALUES ('Innovatech Solutions Inc.', 'Custom software, AI integration, cloud solutions, and digital transformation services tailored for businesses aiming for accelerated growth and operational efficiency.', 'Technology & Software Development', 'Canada');"


In [5]:
print(result.sql_command)

INSERT INTO businesses (name, description, category, country_of_origin) VALUES ('Innovatech Solutions Inc.', 'Innovatech Solutions specializes in custom software, AI integration, cloud solutions, and digital transformation services tailored for businesses aiming for accelerated growth and operational efficiency.', 'Technology & Software Development', 'Canada');


In [4]:
import requests
import json
# from langchain_openai import ChatOpenAI
import os
from pydantic import BaseModel
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
import os
from bs4 import BeautifulSoup
import re
from bs4 import BeautifulSoup
import re
from langchain_openai import ChatOpenAI
from pydantic import Field
from typing import Literal
from typing import Annotated
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain.schema import HumanMessage, AIMessage
from __future__ import print_function
import sib_api_v3_sdk
from sib_api_v3_sdk.rest import ApiException
from IPython.display import display

llm_model = os.environ["LLM_VERSION"]

# Queries sent to the search API, one request per entry ("philosophical" spelling corrected).
search_terms = ["recent philosophical trends", "new cognitive bias study", "recent studies on intelligence", "New Reinforcement Learning", "Reinforcement Learning LinkedIn", "Agentic Reinforcement"]

# One entry of the structured relevance answer returned by the LLM.
class ResultRelevance(BaseModel):
    # Free-text justification from the model (presumably why the result is
    # relevant — confirm against the relevance_check prompt).
    explanation: str
    # Result identifier as a string; the filtering loop in this cell compares it
    # with str(result['id']) of the search results.
    id: str

# Structured-output schema passed to with_structured_output() in
# check_search_relevance: the list of results the model selected.
class RelevanceCheckOutput(BaseModel):
    relevant_results: List[ResultRelevance]
    

def search_serper(search_query):
    """Query the Serper search API and return simplified organic results.

    Args:
        search_query: Text sent as the ``q`` field of the request.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``snippet``,
        ``search_term`` and ``id``. ``id`` is the 1-based position of the hit
        within this response, so ids repeat across different queries. The list
        is empty when the response has no ``organic`` section.

    Raises:
        KeyError: If ``SERPER_API_KEY`` is missing from the environment.
        requests.HTTPError: If the API answers with an error status.
    """
    url = "https://google.serper.dev/search"

    payload = json.dumps({
        "q": search_query,
        "gl": "ae",
        "num": 4,
        "tbs": "qdr:d"
    })

    headers = {
        'X-API-KEY': os.environ["SERPER_API_KEY"],
        'Content-Type': 'application/json'
    }

    # A timeout keeps one stalled request from hanging the whole cell.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
    # Surface auth/quota errors as HTTP errors instead of a confusing KeyError below.
    response.raise_for_status()
    results_list = response.json().get('organic', [])

    all_results = []
    # `position` rather than `id`, so the builtin id() is not shadowed.
    for position, result in enumerate(results_list, 1):
        all_results.append({
            'title': result['title'],
            'link': result['link'],
            # Tolerate hits that come back without a snippet.
            'snippet': result.get('snippet', ''),
            'search_term': search_query,
            'id': position
        })
    return all_results


def load_prompt(prompt_name):
    """Return the text of ``prompts/<prompt_name>.md`` (path relative to the working directory).

    Raises:
        FileNotFoundError: If the prompt file does not exist.
    """
    # Read as UTF-8 explicitly so prompt text decodes the same on every platform.
    with open(f"prompts/{prompt_name}.md", "r", encoding="utf-8") as file:
        return file.read()


def check_search_relevance(search_results: List[Dict[str, Any]]) -> RelevanceCheckOutput:
    """
    Ask the LLM which of the given search results are relevant.

    The system prompt is loaded from ``prompts/relevance_check.md`` and is
    filled with the module-level ``search_terms`` list and ``search_results``.

    Args:
        search_results: Search results to analyze; the caller in this cell
            passes the list of result dicts returned by ``search_serper``.

    Returns:
        RelevanceCheckOutput with the results the model selected.

    Raises:
        FileNotFoundError: If the prompt file is missing.
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    prompt = load_prompt("relevance_check")

    prompt_template = ChatPromptTemplate.from_messages([
        ("system", prompt)
    ])

    # The structured-output wrapper makes the chain return a RelevanceCheckOutput.
    llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model).with_structured_output(RelevanceCheckOutput)
    llm_chain = prompt_template | llm

    return llm_chain.invoke({"search_terms": search_terms, 'search_results': search_results})



# For every search term: run the search, let the LLM pick the relevant hits,
# and collect only those hits.
relevant_results = []
for search_term in search_terms:
    python_results = search_serper(search_term)
    results = check_search_relevance(python_results)
    
    # The model reports ids as strings (ResultRelevance.id is `str`).
    relevant_ids = [r.id for r in results.relevant_results]
    
    # search_serper assigns integer ids, hence the str() before comparing.
    filtered_results = [r for r in python_results if str(r['id']) in relevant_ids]
    
    # NOTE(review): ids restart at 1 for each query, so 'id' is not unique
    # across the accumulated list.
    relevant_results.extend(filtered_results)
  


def convert_html_to_markdown(html_content):
    """Convert an HTML string to rough markdown-flavoured plain text.

    Headings, links, bold/italic tags and list items are replaced in the
    parsed tree by markdown-style text, then the remaining text of the
    document is extracted and blank-line runs are collapsed.

    Args:
        html_content: HTML markup to convert.

    Returns:
        The converted text with leading/trailing whitespace stripped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Headers
    # The digit in the tag name (h1..h6) gives the number of '#' characters.
    # Only the heading's text is kept, so markup inside a heading is dropped here.
    for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        level = int(h.name[1])
        h.replace_with('#' * level + ' ' + h.get_text() + '\n\n')
    
    # Links
    # Anchors missing either an href or visible text are left untouched.
    for a in soup.find_all('a'):
        href = a.get('href', '')
        text = a.get_text()
        if href and text:
            a.replace_with(f'[{text}]({href})')
    
    # Bold
    for b in soup.find_all(['b', 'strong']):
        b.replace_with(f'**{b.get_text()}**')
    
    # Italic
    for i in soup.find_all(['i', 'em']):
        i.replace_with(f'*{i.get_text()}*')
    
    # Lists
    # Unordered items become "- item" lines.
    for ul in soup.find_all('ul'):
        for li in ul.find_all('li'):
            li.replace_with(f'- {li.get_text()}\n')
    
    # Ordered items are numbered from 1 within each <ol>.
    for ol in soup.find_all('ol'):
        for i, li in enumerate(ol.find_all('li'), 1):
            li.replace_with(f'{i}. {li.get_text()}\n')
    
    # Get text and clean up
    text = soup.get_text()
    
    # Remove excess whitespace/newlines
    # Any run of newline + optional whitespace + newline collapses to one blank line.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = text.strip()
    
    return text

def scrape_and_save_markdown(relevant_results, timeout=60):
    """
    Fetches each result's page through the scraping API and converts it to markdown.

    Despite the name, nothing is written to disk: the converted pages are only
    returned. Pages that cannot be fetched are reported with a printed message
    and skipped, so one bad URL does not abort the batch.

    Args:
        relevant_results: List of dictionaries containing search results;
            entries without a 'link' key are ignored.
        timeout: Seconds to wait for each scraping request.

    Returns:
        List of dictionaries with the keys 'url', 'markdown', 'title' and 'id'.

    Raises:
        KeyError: If ``SCRAPING_API_KEY`` is not set in the environment.
    """
    markdown_contents = []
    for result in relevant_results:
        if 'link' not in result:
            continue

        payload = {
            "api_key": os.environ["SCRAPING_API_KEY"],
            "url": result['link'],
            "render_js": "true"
        }

        try:
            response = requests.get(
                "https://scraping.narf.ai/api/v1/", params=payload, timeout=timeout
            )
        except requests.RequestException as exc:
            # Connection errors and timeouts are reported like HTTP failures.
            print(f"Failed to fetch {result['link']}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch {result['link']}: Status code {response.status_code}")
            continue

        # Decode leniently: a page with stray non-UTF-8 bytes should not raise
        # UnicodeDecodeError and lose every page scraped so far.
        html = response.content.decode("utf-8", errors="replace")

        markdown_contents.append({
            'url': result['link'],
            'markdown': convert_html_to_markdown(html),
            'title': result.get('title', ''),
            'id': result.get('id', '')
        })

    print(f"Successfully downloaded and saved {len(markdown_contents)} pages as markdown")
    return markdown_contents

markdown_contents = scrape_and_save_markdown(relevant_results)

FileNotFoundError: [Errno 2] No such file or directory: 'prompts/relevance_check.md'

In [5]:
import os
import json
import requests
from bs4 import BeautifulSoup

# Load KEY=VALUE pairs from the local .env file into the process environment.
with open(".env", "r") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines, comments, and anything that is not an assignment
        # instead of failing on tuple unpacking.
        if not line or line.startswith("#") or "=" not in line:
            continue
        # Split on the first '=' only: values such as base64 keys may contain '='.
        key, value = line.split("=", 1)
        os.environ[key.strip()] = value.strip()

relevant_results = [
    {
        "id": "1",
        "title": "Raalc About",
        "link": "https://www.raalc.ae/about"
    },
    {
        "id": "2",
        "title": "Hadef Partners About",
        "link": "https://hadefpartners.com/about-us/who-we-are/"
    },
    {
        "id": "3",
        "title": "The Firm Dubai About",
        "link": "https://www.thefirmdubai.com/about#AboutUs"
    },
    {
        "id": "4",
        "title": "Davidson Colaw About",
        "link": "https://davidsoncolaw.com/about-us/"
    }
]

# # Saving HTML pages locally
# def scrape_and_save_html(relevant_results, directory="saved_html"):
#     os.makedirs(directory, exist_ok=True)
#     saved_files = []

#     for result in relevant_results:
#         if 'link' in result:
#             payload = {
#                 "api_key": os.environ["SCRAPING_API_KEY"], 
#                 "url": result['link'],
#                 "render_js": "true"
#             }

#             response = requests.get("https://scraping.narf.ai/api/v1/", params=payload)
#             if response.status_code == 200:
#                 filename = f"{result.get('id', hash(result['link']))}.html"
#                 filepath = os.path.join(directory, filename)
#                 with open(filepath, 'w', encoding='utf-8') as file:
#                     file.write(response.content.decode())
#                 saved_files.append(filepath)
#             else:
#                 print(f"Failed to fetch {result['link']}: Status code {response.status_code}")

#     print(f"Successfully downloaded and saved {len(saved_files)} pages as HTML to '{directory}'/")
#     return saved_files

# # Loading HTML pages back as string objects
# def load_saved_html(filepaths):
#     html_strings = []

#     for filepath in filepaths:
#         with open(filepath, 'r', encoding='utf-8') as file:
#             html_strings.append(file.read())

#     print(f"Successfully loaded {len(html_strings)} HTML pages into strings")
#     return html_strings

# # Example usage:
# saved_file_paths = scrape_and_save_html(relevant_results)
# loaded_html_strings = load_saved_html(saved_file_paths)


In [None]:
# import os
# import glob

# def load_saved_html(directory_path):
#     html_files = glob.glob(os.path.join(directory_path, "*.html"))
#     loaded_html_strings = []
#     for file_path in html_files:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             loaded_html_strings.append(file.read())
#     return loaded_html_strings

# # Example usage:
# directory_path = '/home/mohammed/Desktop/tech_projects/growbal/saved_html'

# loaded_html_strings = load_saved_html(directory_path)

# # Example of printing a preview of each HTML file content
# for idx, content in enumerate(loaded_html_strings, start=1):
#     print(f"--- Content preview of HTML file {idx} ---\n{content[:200]}...\n")


--- Content preview of HTML file 1 ---
<!DOCTYPE html><html class="no-js" lang="en" dir="ltr"><head>
    <meta charset="utf-8">
    <title>Hadef &amp; Partners - A leading independent UAE law firm</title>
    <meta name="viewport" content=...

--- Content preview of HTML file 2 ---
<!DOCTYPE html><html><head><meta charset="utf-8"><script>if(navigator.userAgent.match(/MSIE|Internet Explorer/i)||navigator.userAgent.match(/Trident\/7\..*?rv:11/i)){var href=document.location.href;if...

--- Content preview of HTML file 3 ---
<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" lang="en" class="pointer skrollr skrollr-desktop"><head>
    <style id="customRules"></style><meta charset="utf-8">
    





















...

--- Content preview of HTML file 4 ---
<!DOCTYPE html><html lang="en" class="lenis lenis-smooth"><head><style data-rc-order="prependQueue" data-rc-priority="-999" data-css-hash="1mqvjt9" data-token-hash="fkne30">:where(.css-m4timi) a{color...



In [None]:
# from bs4 import BeautifulSoup
# import justext

# html_content = loaded_html_strings[3]

# # 2. Parse the HTML content with BeautifulSoup (using lxml parser for robustness)
# soup = BeautifulSoup(html_content, "lxml")

# # 3. Remove unwanted elements by tag name (scripts, styles, navbars, footers, etc.)
# for tag in soup(["script", "style"]):
#     tag.decompose()  # remove script and style entirely
# for tag in soup(["header", "footer", "nav", "aside"]):
#     tag.decompose()  # remove common layout sections like header, footer, nav, sidebar

# # Optionally, remove elements by specific class or id patterns (e.g., ads or banners)
# for ad in soup.find_all(attrs={"class": lambda c: c and "advertisement" in c.lower()}):
#     ad.decompose()
# for ad in soup.find_all(id=lambda i: i and i.lower().startswith("ad_")):
#     ad.decompose()
# # (The above are examples; you can adjust the filtering criteria based on the page's HTML structure.)

# # 4. Use jusText to remove boilerplate and keep main textual content
# paragraphs = justext.justext(str(soup), justext.get_stoplist("English"))
# clean_chunks = []
# for para in paragraphs:
#     if not para.is_boilerplate:  # not classified as boilerplate (navigation/menu/etc.)
#         text = para.text.strip()
#         if text:  # if non-empty
#             clean_chunks.append(text)

# # At this point, clean_chunks is a list of textual paragraphs in English that jusText considered content.

# # 5. Output the cleaned content.
# # Option A: Join as plain text paragraphs
# clean_text = "\n\n".join(clean_chunks)
# print("Cleaned Text Content:\\n", clean_text)

# # Option B: Reconstruct minimal HTML with basic structure (e.g., wrap each paragraph in <p> tags)
# clean_html = "<div>\n" + "\n".join(f"<p>{para}</p>" for para in clean_chunks) + "\n</div>"
# print("Cleaned HTML Content:\\n", clean_html)


Cleaned Text Content:\n RAALC - Tradition of excellence

RAALC’s journey began in 2013, when it was co-founded in the Emirate of Sharjah. Our success continued through years and we opened our two newbranches in Ras Al Khaimah and Dubai, which in 2018, became our headquarter.
The tradition of excellence recognised and intended by the founders continues to this very day. RAALC has cultivated a fantastic reputation for excellence over a collective experience of 30 years. Our core determination is creating partnerships that engross the clients in a peaceful mind where we take care of the zealous representation and expert advice.
We started out with core litigation practice, and after that, it grew to encompass the umbrella of experience, such as corporate, commercial, banking and finance, restructuring and restructuring of business, exit strategy, and the list goes on.

We seek to become UAE’s first smart-law firm!

We carry the long-standing tradition of excellence with experience and inn

In [3]:
# NOTE(review): `os` is used below but is not imported in this cell; the cell
# relies on an import made by a previously executed cell.
from scrapper import HtmlScraper
scraper = HtmlScraper(scraping_api_key=os.environ["SCRAPING_API_KEY"])
html_strings = scraper.load_saved_html()
# Overwrite each loaded page, in place, with the cleaned HTML returned by the
# scraper; the cleaned-text half of the returned pair is not used afterwards.
for i, html_string in enumerate(html_strings):
    clean_text, clean_html = scraper.clean_html_content(html_string)
    html_strings[i] = clean_html
    # print(clean_html)

Successfully loaded 4 HTML pages into strings


In [4]:
# Build one extraction prompt per cleaned page. The template text is the same as
# the single-page prompt defined at the top of the notebook.
# NOTE: the loop rebinds the cell-global names `html_content` and `prompt`; after
# it finishes they refer to the last page and its prompt.
prompts = []

for html_content in html_strings:
    prompt = f"""
You are given HTML webpage content. Extract the details precisely:

- Business name, description, category, country of origin.
- Each office location city and country.
- Staff member details: name, position, email, phone, and social media accounts.

Assume the PostgreSQL tables (`businesses`, `office_locations`, `staff`, `staff_social_media`) are already created with the following schema:

{sql_schema}

Do NOT include any CREATE TABLE commands. Assume `business_id` and `staff_id` are SERIAL PRIMARY KEYS, retrieved using RETURNING clauses in PostgreSQL.

Provide exactly ONE executable PostgreSQL transaction (wrapped in BEGIN; ... COMMIT;) containing only valid INSERT commands, precisely formatted and ready to run through a Python script executing SQL commands. No additional explanations or formatting outside the SQL code.

HTML Content:
{html_content}
"""
    prompts.append(prompt)

In [1]:
import threading
from ollama_wrapper import ChatOllama
import json
import os
from pydantic import BaseModel

# Load configuration
def load_config(config_file="ollama_config.json"):
    """Parse a JSON configuration file and return the decoded content.

    Args:
        config_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(config_file, 'r', encoding='utf-8') as f:
        return json.load(f)

ollama_config_data = load_config()

# Initialize the main LLM
sql_generator = ChatOllama(
    model=ollama_config_data["LLM"],
    base_url=ollama_config_data["BASE_URL"],
    temperature=ollama_config_data["TEMPERATURE"],
)

# Desired output schema as a Pydantic model: a single string field, printed by
# process_prompt() as the generated SQL command.
class AnalysisResult(BaseModel):
    sql_command: str

# Structured LLM
structured_llm = sql_generator.with_structured_output(AnalysisResult)

# Lock to ensure proper synchronization when printing
print_lock = threading.Lock()

# Function to process and print each prompt
def process_prompt(prompt):
    """Invoke the structured LLM on ``prompt`` and print the prompt and resulting SQL.

    The printing happens while ``print_lock`` is held so that output from
    concurrent callers is not interleaved.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    result = structured_llm.invoke(prompt)
    with print_lock:
        print("\nPrompt:")
        print(heavy_rule, prompt, heavy_rule, sep="\n")
        print("\nGenerated SQL Command:")
        print(light_rule)
        print(result.sql_command)
        print(light_rule)

# # Using threading to parallelly process prompts
# threads = []
# for prompt in prompts:
#     thread = threading.Thread(target=process_prompt, args=(prompt,))
#     threads.append(thread)
#     thread.start()

# # Wait for all threads to finish
# for thread in threads:
#     thread.join()


In [2]:
industry_or_service = "Financial Consulting"

In [3]:
prompt = f"""
You are tasked with generating optimal candidate google search terms designed to maximize the retrieval of 'About Us' or 'Company Information' webpages specifically for businesses within a specified industry or service sector. 

Given the following target industry/service:

"{industry_or_service}"

Produce a concise, highly relevant list of candidate search terms, each carefully tailored to improve the likelihood of retrieving accurate, informative business pages that typically contain company descriptions, histories, and key organizational information.

The terms should be versatile enough to be used effectively in common web search engines like Google or Bing.
"""
print(prompt)


You are tasked with generating optimal candidate google search terms designed to maximize the retrieval of 'About Us' or 'Company Information' webpages specifically for businesses within a specified industry or service sector. 

Given the following target industry/service:

"Financial Consulting"

Produce a concise, highly relevant list of candidate search terms, each carefully tailored to improve the likelihood of retrieving accurate, informative business pages that typically contain company descriptions, histories, and key organizational information.

The terms should be versatile enough to be used effectively in common web search engines like Google or Bing.



In [4]:
# import threading
# from ollama_wrapper import ChatOllama
# import json
# import os
# from pydantic import BaseModel

# # Load configuration
# def load_config(config_file="ollama_config.json"):
#     with open(config_file, 'r') as f:
#         config_data = json.load(f)
#     return config_data

# ollama_config_data = load_config()

# # Initialize the main LLM
# sql_generator = ChatOllama(
#     model=ollama_config_data["LLM"],
#     base_url=ollama_config_data["BASE_URL"],
#     temperature=ollama_config_data["TEMPERATURE"],
# )

# Desired output schema as a Pydantic model: the model's answer is expected as a
# list of search-term strings.
class SearchTerms(BaseModel):
    search_terms: list[str]
    # Alternative kept from experimentation: a single string instead of a list.
    # search_terms: str

# Structured LLM
structured_llm = sql_generator.with_structured_output(SearchTerms)

# # Lock to ensure proper synchronization when printing
# print_lock = threading.Lock()

# # Function to process and print each prompt
# def process_prompt(prompt):
#     result = structured_llm.invoke(prompt)
#     with print_lock:
#         print("\nPrompt:")
#         print("="*80)
#         print(prompt)
#         print("="*80)
#         print("\nGenerated SQL Command:")
#         print("-"*80)
#         print(result.sql_command)
#         print("-"*80)

# # Using threading to parallelly process prompts
# threads = []
# for prompt in prompts:
#     thread = threading.Thread(target=process_prompt, args=(prompt,))
#     threads.append(thread)
#     thread.start()

# # Wait for all threads to finish
# for thread in threads:
#     thread.join()

# Ask the structured LLM for search terms and echo the prompt next to the result.
result = structured_llm.invoke(prompt)
print("\nPrompt:")
print("="*80)
print(prompt)
print("="*80)
# This cell prints search terms, not SQL, so the heading says so.
print("\nGenerated Search Terms:")
print("-"*80)
print(result.search_terms)
print("-"*80)

JSONDecodeError: Expecting value: line 2229 column 1 (char 177859)

In [6]:
import threading
from ollama_wrapper import ChatOllama
import json
import os
from pydantic import BaseModel

# Load configuration
def load_config(config_file="ollama_config.json"):
    """Parse a JSON configuration file and return the decoded content.

    Args:
        config_file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(config_file, 'r', encoding='utf-8') as f:
        return json.load(f)

ollama_config_data = load_config()

# Initialize the main LLM
search_term_generator = ChatOllama(
    model=ollama_config_data["LLM"],
    base_url=ollama_config_data["BASE_URL"],
    temperature=ollama_config_data["TEMPERATURE"],
)

# Desired output schema as a Pydantic model: process_prompt() prints the
# `search_terms` list of the returned object.
class SearchTermsResult(BaseModel):
    search_terms: list[str]

# Structured LLM
structured_llm = search_term_generator.with_structured_output(SearchTermsResult)

# Lock to ensure proper synchronization when printing
print_lock = threading.Lock()

# Function to process and print each prompt
def process_prompt(prompt):
    """Invoke the structured LLM on ``prompt`` and print the prompt and returned search terms.

    The printing happens while ``print_lock`` is held so that output from
    concurrent callers is not interleaved.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    result = structured_llm.invoke(prompt)
    with print_lock:
        print("\nPrompt:")
        print(heavy_rule, prompt, heavy_rule, sep="\n")
        print("\nGenerated Search Terms:")
        print(light_rule)
        print(result.search_terms)
        print(light_rule)

# Example industries/services
# industries_or_services = ["Renewable Energy", "Legal Services", "Financial Consulting"]
# industries_or_services = ["Financial Consulting"]

# # Using threading to parallelly process prompts
# threads = []
# for item in industries_or_services:
#     thread = threading.Thread(target=process_prompt, args=(item,))
#     threads.append(thread)
#     thread.start()

# # Wait for all threads to finish
# for thread in threads:
#     thread.join()

# process_prompt(prompt)

In [None]:
result = structured_llm.invoke(prompt)
print(result)

In [None]:
def generate_summaries(markdown_contents, max_words=3000):
    """
    Generates one LLM summary per markdown page.

    Each page is truncated to its first ``max_words`` whitespace-separated
    words and sent to the chat model selected by ``llm_model`` together with
    the "summarise_markdown_page" prompt. Summarisation is best effort: a page
    that fails is reported on stdout and skipped.

    Args:
        markdown_contents: List of dictionaries, each providing the page text
            under 'markdown' and its address under 'url'.
        max_words: Maximum number of words of each page sent to the model.

    Returns:
        List of dictionaries with keys 'markdown_summary' and 'url', one per
        page that was summarised successfully (input order is preserved).

    Raises:
        KeyError: If OPENAI_API_KEY is not set and there is at least one page
            to summarise.
    """
    # Nothing to do: avoid loading the prompt and constructing the LLM client.
    if not markdown_contents:
        print("Successfully generated summaries for 0 of 0 pages")
        return []

    # Build the chain once; it is reused for every page.
    summary_prompt = load_prompt("summarise_markdown_page")
    summary_template = ChatPromptTemplate.from_messages([
        ("system", summary_prompt)
    ])
    llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model)
    summary_chain = summary_template | llm

    summaries = []
    for content in markdown_contents:
        try:
            # Limit the input to the first `max_words` words of the page.
            truncated_markdown = ' '.join(content['markdown'].split()[:max_words])
            summary = summary_chain.invoke({'markdown_input': truncated_markdown})

            summaries.append({
                'markdown_summary': summary.content,
                'url': content['url']
            })
        except Exception as e:
            # Use .get so a page without a 'url' key cannot make the error
            # report itself raise and abort the remaining pages.
            print(f"Failed to summarize {content.get('url', '<unknown url>')}: {str(e)}")

    # Report the pages that actually succeeded, not the number attempted.
    print(f"Successfully generated summaries for {len(summaries)} of {len(markdown_contents)} pages")
    return summaries


# Summarise every page in `markdown_contents`; the result is later passed to
# the graph as its `summaries` input.
# NOTE(review): `markdown_contents` is not assigned in the cells visible here —
# presumably produced by an earlier scraping cell; confirm.
summaries = generate_summaries(markdown_contents)



# State dictionary shared by the summariser and reviewer graph nodes.
class State(TypedDict):
    # Conversation between the nodes; `add_messages` is attached as metadata
    # for this key.
    messages: Annotated[list, add_messages]
    # Per-page summaries supplied as the graph input.
    summaries: List[dict]
    # Verdict written by the reviewer node and read by the routing function.
    approved: bool
    # Email summaries written by the summariser node. Each entry is the email
    # text itself, so the element type is `str`.
    created_summaries: Annotated[List[str], Field(description="The summaries that have been created by the summariser")]

# Builder for a graph whose nodes share the `State` schema.
graph_builder = StateGraph(State)

# Initialize components: one chat model, used by both the summariser and the
# reviewer chains defined further down.
llm = ChatOpenAI(openai_api_key=os.environ["OPENAI_API_KEY"], model=llm_model)




# Load the email template that is passed to the summariser as `input_template`.
# The encoding is stated explicitly so the file is decoded the same way on
# every platform instead of using the locale-dependent default.
with open("email_template.md", "r", encoding="utf-8") as f:
    email_template = f.read()

# Structured output requested from the summariser LLM call.
class SummariserOutput(BaseModel):
    # Email text; the summariser node also stores it under `created_summaries`.
    email_summary: str = Field(description="The summary email of the content")
    # Accompanying note; the summariser node posts it as a second message.
    message: str = Field(description="A message to the reviewer, asking for feedback on the summary")

# Prompt for the summariser: a single system message loaded by name.
# The conversation placeholder is disabled, so `messages` only reaches the
# model if the loaded prompt text itself references it — TODO confirm.
summariser_prompt = ChatPromptTemplate.from_messages([
    ("system", load_prompt("summariser")),
    # ("placeholder", "{messages}"),
])

# Chain: prompt piped into the LLM configured to return a SummariserOutput.
llm_summariser = summariser_prompt | llm.with_structured_output(SummariserOutput)

def summariser(state: State):
    """Graph node that drafts the summary email.

    The stored conversation is first role-swapped (AI messages become human
    messages and vice versa, anything else is kept) and written back to
    ``state["messages"]``. The summariser chain is then invoked with the
    messages, the page summaries and the email template.

    Returns:
        A state update with two new AI messages (the email and the note that
        accompanies it) and the email text under ``created_summaries``.
    """
    def swap_role(message):
        # AI <-> human; other message types pass through untouched.
        if isinstance(message, AIMessage):
            return HumanMessage(content=message.content)
        if isinstance(message, HumanMessage):
            return AIMessage(content=message.content)
        return message

    state["messages"] = [swap_role(message) for message in state["messages"]]

    chain_input = {
        "messages": state["messages"],
        "list_of_summaries": state["summaries"],
        "input_template": email_template,
    }
    draft = llm_summariser.invoke(chain_input)

    return {
        "messages": [
            AIMessage(content=draft.email_summary),
            AIMessage(content=draft.message),
        ],
        "created_summaries": [draft.email_summary],
    }




# Structured output requested from the reviewer LLM call.
class ReviewerOutput(BaseModel):
    # Verdict; the routing function ends the graph once this is true.
    approved: bool = Field(description="Whether the summary is approved or not")
    # Feedback produced by the reviewer. The description is part of the schema
    # given to the model, so it must describe the reviewer's own output rather
    # than the summariser's request for feedback.
    message: str = Field(description="A message to the summariser, giving feedback on the summary")

# Prompt for the reviewer: a single system message loaded by name.
# The conversation placeholder is disabled, so `messages` only reaches the
# model if the loaded prompt text itself references it — TODO confirm.
reviewer_prompt = ChatPromptTemplate.from_messages([
    ("system", load_prompt("reviewer")),
    # ("placeholder", "{messages}"),
])


# Chain: prompt piped into the LLM configured to return a ReviewerOutput.
llm_reviewer = reviewer_prompt | llm.with_structured_output(ReviewerOutput)


def reviewer(state: State):
    """Graph node that reviews the latest summary.

    The stored conversation is role-swapped (AI messages become human messages
    and vice versa, anything else is kept) and written back to
    ``state["messages"]`` before the reviewer chain is invoked with it.

    Returns:
        A state update with the reviewer's message as a new AI message and its
        verdict under ``approved``.
    """
    def swap_role(message):
        # AI <-> human; other message types pass through untouched.
        if isinstance(message, AIMessage):
            return HumanMessage(content=message.content)
        if isinstance(message, HumanMessage):
            return AIMessage(content=message.content)
        return message

    state["messages"] = [swap_role(message) for message in state["messages"]]

    verdict = llm_reviewer.invoke({"messages": state["messages"]})

    return {
        "messages": [AIMessage(content=verdict.message)],
        "approved": verdict.approved,
    }




def conditional_edge(state: State) -> Literal["summariser", END]:
    """Route after the reviewer: stop when approved, otherwise go back to the summariser."""
    return END if state["approved"] else "summariser"


# Create and configure the graph:
#   START -> summariser -> reviewer -> (conditional_edge: END or summariser)
graph_builder.add_node("summariser", summariser)
graph_builder.add_node("reviewer", reviewer)
graph_builder.add_edge(START, "summariser")
graph_builder.add_edge("summariser", "reviewer")
# After each review, conditional_edge decides whether to finish or loop back.
graph_builder.add_conditional_edges('reviewer', conditional_edge)

# Compile and run the graph
graph = graph_builder.compile()


# Run the graph with the per-page summaries as the only initial input;
# `output` holds whatever the invocation returns.
output = graph.invoke({"summaries": summaries})



def send_email(email_content: str):
    """Send ``email_content`` as the HTML body of a "Daily Summary" transactional email.

    The API key and both addresses are read from the environment variables
    SENDINGBLUE_API_KEY, SENDER_EMAIL and DESTINATION_EMAIL.

    Args:
        email_content: HTML content of the email.

    Raises:
        ValueError: If the API key, the sender address or the destination
            address is unset or empty.

    An ``ApiException`` raised while sending is printed, not propagated; on
    success the API response is printed.
    """
    sdk_config = sib_api_v3_sdk.Configuration()

    key = os.getenv("SENDINGBLUE_API_KEY")
    if not key:
        raise ValueError("SENDINGBLUE_API_KEY environment variable not set.")
    sdk_config.api_key['api-key'] = key

    sender_address = os.getenv("SENDER_EMAIL")
    recipient_address = os.getenv("DESTINATION_EMAIL")
    if not (sender_address and recipient_address):
        raise ValueError("Sender or Destination email environment variable not set.")

    emails_api = sib_api_v3_sdk.TransactionalEmailsApi(sib_api_v3_sdk.ApiClient(sdk_config))

    message = sib_api_v3_sdk.SendSmtpEmail(
        subject="Daily Summary",
        sender={"name": "Mohammed Elsiddig", "email": sender_address},
        html_content=email_content,
        to=[{"email": recipient_address, "name": "Mohammed Elsiddig"}],
        params={"subject": "Daily Summary"},
    )

    try:
        print(emails_api.send_transac_email(message))
    except ApiException as e:
        print(f"Exception: {e}\n")

# NOTE(review): `final_summary` is not assigned in the cells visible here —
# presumably it should be taken from the graph `output`; confirm it is defined
# before this cell runs on a fresh kernel.
send_email(final_summary)