In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *
from tqdm import tqdm
import glob
import tiktoken
import instructor
from pydantic import BaseModel
import instructor
from openai import OpenAI
import ast


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.schema import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import json

- gpt-4o: "o200k_base",
- gpt-4: "cl100k_base",
- gpt-3.5-turbo: "cl100k_base",
- gpt-3.5: "cl100k_base",  # Common shorthand
- gpt-35-turbo : "cl100k_base",  # Azure deployment name

gpt-4o US$5.00 / 1M input tokens； US$15.00 / 1M output tokens

gpt-4o context length: 128K tokens


In [4]:
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding)

<Encoding 'o200k_base'>


In [18]:
df = pd.read_csv('companies_urls_info.csv')
sample = df[df['url'].isin(['https://www.vertice.one', 
                   'https://www.estimize.com',
                   'https://www.newconstructs.com',
                   'https://www.chargebee.com',
                   'https://www.bennie.com',
                   'https://www.aercompliance.com',
                   'https://www.missionmark.com',
                   'https://www.joinmassive.com',
                   'https://www.hemlane.com',
                   'https://www.vesta.com',
                   'https://www.adaptive.build',
                   'https://www.additive.ai',
                   'https://www.9fin.com',
                   'https://www.niloom.ai',
                   'https://www.nexben.com',
                   'https://www.naturealpha.ai',
                   'https://www.lworks.io',
                   'https://www.infogrid.io',
                   'https://www.harnessproperty.com',
                   'https://www.directsoftware.com',
                   'https://www.dexitcorp.com'])]

sample

Unnamed: 0,company,processed_name,url,related_urls_str,related_urls,num_of_related_urls,all_urls,num_of_all_urls
5,Vertice,vertice,https://www.vertice.one,https://www.vertice.one/product/saas-purchasin...,['https://www.vertice.one/product/saas-purchas...,6,['https://www.vertice.one/explore/cloud-manage...,31
6,Massive,massive,https://www.joinmassive.com,"https://www.joinmassive.com/casestudies,https:...","['https://www.joinmassive.com/casestudies', 'h...",3,"['https://www.joinmassive.com/faq#users', 'htt...",25
14,Additive,additive,https://www.additive.ai,https://www.additive.ai,['https://www.additive.ai'],1,"['https://www.additive.ai', 'https://www.addit...",4
105,Nexben,nexben,https://www.nexben.com,https://www.nexben.com/payment-solutions/ichra...,['https://www.nexben.com/payment-solutions/ich...,16,"['https://www.nexben.com/about/meet-the-team',...",32
142,Direct,direct,https://www.directsoftware.com,"https://www.directsoftware.com/partners,https:...","['https://www.directsoftware.com/partners', 'h...",13,"['https://www.directsoftware.com/partners', 'h...",16
168,Ledger Works,ledger_works,https://www.lworks.io,"https://www.lworks.io/customers-partners,https...","['https://www.lworks.io/customers-partners', '...",10,"['https://www.lworks.io/customers-partners', '...",20
196,Vesta,vesta,https://www.vesta.com,"https://www.vesta.com/partners,https://www.ves...","['https://www.vesta.com/partners', 'https://ww...",3,"['https://www.vesta.com/privacy', 'https://www...",9
197,Niloom.ai,niloom_ai,https://www.niloom.ai,https://www.niloom.ai,['https://www.niloom.ai'],1,"['https://www.niloom.ai', 'https://www.niloom....",7
226,Hemlane,hemlane,https://www.hemlane.com,"https://www.hemlane.com/realtor-partners/,http...","['https://www.hemlane.com/realtor-partners/', ...",2,['https://www.hemlane.com/features/rental-adve...,31
247,Harness Data Intelligence,harness_data_intelligence,https://www.harnessproperty.com,https://www.harnessproperty.com/search/service...,['https://www.harnessproperty.com/search/servi...,2,"['https://www.harnessproperty.com/contact-us',...",31


In [44]:
import os
import time
import json
from dotenv import load_dotenv
from requests.exceptions import HTTPError

def crawl_data(base_url, url_list: list, file_path: str, overwrite: bool = False):
    load_dotenv()
    # Initialize the FirecrawlApp with your API key
    app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_KEY'))
    
    # Load existing data if the file exists
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            result = json.load(file)
    else:
        result = {}

    rate_limit_reset_time = 0
    
    for url in url_list:
        # Determine the endpoint
        if base_url == url:
            endpoint = 'main_page'
        else:
            if base_url in url:
                endpoint = url.replace(base_url, '')
            else:
                endpoint = url
        
        # Check if the endpoint already exists in the result
        if endpoint in result and not overwrite:
            print(f"Skipping {url} as it already exists and overwrite is set to False.")
            continue  # Skip this URL and move to the next one

        # Respect rate limit by waiting until the reset time
        if time.time() < rate_limit_reset_time:
            wait_time = rate_limit_reset_time - time.time()
            print(f"Rate limit exceeded. Waiting for {wait_time} seconds.")
            time.sleep(wait_time)
        
        try:
            # Scrape a single URL
            print(f"Scraping {url}.")
            response = app.scrape_url(url, {'pageOptions': {'onlyMainContent': True}})
            
            try:
                scraped_data = response.json()  # Attempt to parse JSON response
            except ValueError:
                print(f"Failed to decode JSON response for {url}")
                continue  # Skip to the next URL

            # Check if 'markdown' key exists in the scraped data
            if 'markdown' in scraped_data:
                result[endpoint] = scraped_data['markdown']
        
        except HTTPError as e:
            # Handle rate limit exceeded error
            if e.response.status_code == 429:
                rate_limit_reset_time = int(e.response.headers.get('Retry-After', 60)) + time.time()
                print(f"Rate limit exceeded. Retrying after {rate_limit_reset_time - time.time()} seconds.")
                time.sleep(rate_limit_reset_time - time.time())
                continue  # Skip the rest of the code in this iteration and retry scraping the same URL
            else:
                print(f"Unexpected error: {e}")
    
    # Write the updated JSON data back to the file
    with open(file_path, 'w') as file:
        json.dump(result, file, indent=4)
    
    return result

# Example usage:
# result = crawl_data(base_url, url_list, 'scraped_data.json', overwrite=False)


In [19]:
for index, row in sample.iloc[1:].iterrows():
    base_url = row['url']
    url_list = ast.literal_eval(row['related_urls'])
    result = crawl_data(base_url, url_list, f'scraping_output_v2_raw/{row["processed_name"]}.json', overwrite=False)

Skipping https://www.joinmassive.com/casestudies as it already exists and overwrite is set to False.
Skipping https://www.joinmassive.com/partners as it already exists and overwrite is set to False.
Skipping https://www.joinmassive.com as it already exists and overwrite is set to False.
Scraping https://www.additive.ai.


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
res = scrape_data('https://www.vertice.one/')

In [13]:
data = read_json_file('scraping_output_v2_raw/hemlane.json')

print(clean_scraped_content(data['/realtor-partners/']))

Talk to a human: [(866) 387-1629]
[Sign in]
REALTORS®' Partner in Property Management
**The best agents** help their clients get the most out of their rental properties.
![Voted Capterra's Top 20 Property Management Solutions]![Software Advice most recommended Property Management Solution badge]![Software Advice Real Estate Property Management Front Runner Badge]![Software Advice Badge - Best Customer Support for Property Management]![GetApp Badge - Best Functionality and Features]
Back
How do you support your clients with their rental properties?
I want to refer landlords
(and get paid for it)
I offer leasing
(and want free leads and tools)
I offer property management
(and want to eliminate trust accounts)
Check out other REALTORS® partnering with us
![]![Headshot of Timothy Hampson]
Timothy Hampson
License #9008072 (TX)
HP2 RESIDENTIAL
Experience
Leasing
12 years
Management
Real estate
![]![Headshot of Sandy Wickware]
Sandy Wickware
License #253554 (TX)
Fathom Realty, LLC
14 years
2 

In [15]:
for url, content in data.items():
    print(url)
    print(f'Estimated GPT4-o cost: ${calculate_cost(data[url])}')
    print(f'Estimated GPT4-o cost after cleaning: ${calculate_cost(clean_scraped_content(data[url]))}')
    print('------------------------')
    

/realtor-partners/
Estimated GPT4-o cost: $0.08219499999999999
Estimated GPT4-o cost after cleaning: $0.00167
------------------------
main_page
Estimated GPT4-o cost: $0.01288
Estimated GPT4-o cost after cleaning: $0.005535
------------------------


### Exploration of first shorten the page by extracting relevant information
Issue: The output of the content might be shorten too much

In [39]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import os
import json


def llm_summary(text, model_name="gpt-4o"):
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract information 
    from the given text and convert it into a text (string) format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing. Do not hallucinate.
    """

    # Define the extraction prompt
    extraction_prompt = """
    You are provided with a text obtained from a company's webpage. Your task is to extract any sections or paragraphs that are relevant to the specified information of interest.

    ## Information of Interest:

    1. **About Product or Service**:
    - Any details about the products or services the company offers, including their features.

    2. **About Partner or Client**:
    - Any information about the company's partners or clients.
    - Any use cases (case studies) describing how a client is using the company's product or service.
    
    ## Note:
    Sometimes, the company does not explicit describe their clients and the client use case, instead, they will only display clients' logos. 
    You then need to extract client's name from their logos. 
    
    ## Instructions:
    - Do not summarize the content. Extract the raw lines or sections as they are.
    - If you are unsure about the relevance of the information, include it to ensure comprehensive coverage.
    - Output the extracted information in standard text format.

    ## Examples:

    ### Example 1: Product or Service
    If the input text contains:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    The output should be:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    ### Example 2: Client Logos
    If the input text contains:
    "Our platform and service is trusted by these innovative companies:
    ![Nationwide Logo]
    ![Freedom 365 Logo]
    ![Bestow Logo]
    ..."
    
    The output should be:
    "Our platform and service is trusted by these innovative companies: 
    Clients are: Nationwide, Freedom 365, Bestow..."
   
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("system", extraction_prompt),
            ("human", "Use the given text to extract information: {input}"),
            ("human", """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - If no information is provided, return nothing.
                - DO NOT HALLUCINATE.
             """),
        ]
    )
    
    llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                    temperature=0, 
                    model_name=model_name)

    llm_chain = prompt | llm | StrOutputParser()

    response = llm_chain.invoke({'input': text})
    
    return response


def llm_summary_execution(process_company_name:str, overwrite:bool = False):

    scrape_file_path = f'scraping_output_v2_raw/{process_company_name}.json'
    extraction_file_path = f'extraction_summary/{process_company_name}_summary_str.json'

    scrape_data = read_json_file(scrape_file_path)

    # Load existing data if the file exists
    if os.path.exists(extraction_file_path):
        with open(extraction_file_path, 'r') as file:
            extracted_data = json.load(file)
    else:
        extracted_data = {}

    for endpoint, content in tqdm(scrape_data.items(), total=len(scrape_data), desc="Extracting data", position=0, leave=True):
        if endpoint in extracted_data and not overwrite:
            print(f"Company: {process_company_name}; Skipping {endpoint} as it already exists and overwrite is set to False.")
            continue  # Skip this URL and move to the next one
        else:
            clean_content = clean_scraped_content(content)
            extracted_data[endpoint] = llm_summary(clean_content)
            print(f'Company: {process_company_name}; Content in {endpoint} is extracted.')
            
    write_json_file(f'extraction_summary/{process_company_name}_summary_str.json', extracted_data)
    
    return extracted_data

# # Example usage
# extracted_data = {}
# for key, value in data.items():
#     clean_content = clean_scraped_content(value)
#     extracted_data[key] = llm_summary(clean_content)

In [38]:
sample['processed_name'].to_list()

['vertice',
 'massive',
 'vesta',
 'hemlane',
 'missionmark',
 'bennie',
 'new_constructs',
 'aer_compliance']

In [40]:
for company in tqdm(sample['processed_name'].to_list()):
    llm_summary_execution(company)

Extracting data:  20%|██        | 1/5 [00:15<01:01, 15.41s/it]

Content in /product/saas-purchasing is extracted.


Extracting data:  40%|████      | 2/5 [00:35<00:54, 18.20s/it]

Content in /product/cloud-cost-optimization is extracted.


Extracting data:  60%|██████    | 3/5 [00:39<00:22, 11.48s/it]

Content in /partners is extracted.


Extracting data:  80%|████████  | 4/5 [00:51<00:11, 11.78s/it]

Content in /product/saas-cloud-platform is extracted.


Extracting data: 100%|██████████| 5/5 [00:57<00:00, 11.55s/it]
 12%|█▎        | 1/8 [00:57<06:44, 57.77s/it]

Content in /customer-stories is extracted.


Extracting data:  33%|███▎      | 1/3 [00:02<00:04,  2.28s/it]

Content in /casestudies is extracted.


Extracting data:  67%|██████▋   | 2/3 [00:04<00:02,  2.01s/it]

Content in /partners is extracted.


Extracting data: 100%|██████████| 3/3 [00:14<00:00,  4.96s/it]
 25%|██▌       | 2/8 [01:12<03:15, 32.55s/it]

Content in main_page is extracted.


Extracting data:  50%|█████     | 1/2 [00:11<00:11, 11.81s/it]

Content in /partners is extracted.


Extracting data: 100%|██████████| 2/2 [00:22<00:00, 11.43s/it]
 38%|███▊      | 3/8 [01:35<02:20, 28.12s/it]

Content in /product is extracted.


Extracting data: 100%|██████████| 2/2 [00:00<00:00, 25040.62it/s]


Skipping /realtor-partners/ as it already exists and overwrite is set to False.
Skipping main_page as it already exists and overwrite is set to False.


Extracting data:  33%|███▎      | 1/3 [00:05<00:11,  5.84s/it]

Content in /template/resources/product is extracted.


Extracting data:  67%|██████▋   | 2/3 [00:16<00:08,  8.61s/it]

Content in /template/services is extracted.


Extracting data: 100%|██████████| 3/3 [00:22<00:00,  7.55s/it]
 62%|██████▎   | 5/8 [01:58<00:55, 18.61s/it]

Content in main_page is extracted.


Extracting data: 100%|██████████| 3/3 [00:00<00:00, 63872.65it/s]


Skipping /partners as it already exists and overwrite is set to False.
Skipping /customers as it already exists and overwrite is set to False.
Skipping main_page as it already exists and overwrite is set to False.


Extracting data:  33%|███▎      | 1/3 [00:06<00:13,  6.95s/it]

Content in main_page is extracted.


Extracting data:  67%|██████▋   | 2/3 [00:14<00:07,  7.33s/it]

Content in /partnerships/ is extracted.


Extracting data: 100%|██████████| 3/3 [00:38<00:00, 12.78s/it]
 88%|████████▊ | 7/8 [02:36<00:18, 18.86s/it]

Content in /customer-testimonials/ is extracted.


Extracting data:   0%|          | 0/12 [00:00<?, ?it/s]

Skipping /solution/firm-trading as it already exists and overwrite is set to False.
Skipping /solution/artificial-intelligence as it already exists and overwrite is set to False.
Skipping /solution/best-in-class-coverage as it already exists and overwrite is set to False.
Skipping /solution/pre-trade-clearance as it already exists and overwrite is set to False.
Skipping /solutions as it already exists and overwrite is set to False.
Skipping /solution/conflicts-of-interest as it already exists and overwrite is set to False.
Skipping main_page as it already exists and overwrite is set to False.
Skipping /solution/attestations-certifications as it already exists and overwrite is set to False.
Skipping /solution/post-trade-monitoring as it already exists and overwrite is set to False.
Skipping /solution/crypto as it already exists and overwrite is set to False.


Extracting data:  92%|█████████▏| 11/12 [00:07<00:00,  1.42it/s]

Content in /industry/financial-services is extracted.


Extracting data: 100%|██████████| 12/12 [00:12<00:00,  1.04s/it]
100%|██████████| 8/8 [02:48<00:00, 21.12s/it]

Content in /solution/cutting-edge-analytics-dashboards is extracted.





## Instructor

https://github.com/jxnl/instructor

In [3]:
from pydantic import BaseModel, Field
from typing import List, Optional
from langchain_core.prompts import ChatPromptTemplate

class ProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of product')
    description: str = Field(..., alias='concise features description of the product or service')
    
class SummaryProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of the main product offerings of the company')
    description: str = Field(..., alias='summary of product offering of the company')
    
class ClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    product_used: Optional[str] = Field(None, alias='summary of the product or service used by the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ExtractedInformation(BaseModel):
    product_descriptions: Optional[List[ProductDescription]] = None
    # product_offering_summary: str = Field(..., alias='summary of product offering of the company')
    summary_product_description: SummaryProductDescription
    client_descriptions: Optional[List[ClientDescription]] = None
    



In [4]:
def llm_information_extraction(text: str, custom_extraction_prompt: str, model_name: str = 'gpt-4o') -> ExtractedInformation:
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
    from the given text and convert it into a structured format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    extraction_prompt = f"""
    {system_message}
    {custom_extraction_prompt}
    """
    
    # Other models to consider: "gpt-3.5-turbo-0125"
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    response = client.chat.completions.create(
        model=model_name, 
        response_model=ExtractedInformation,
        messages=[
            {"role": "system", "content": extraction_prompt},
            {"role": "user", "content": f"Use the given text to extract information: {text}"},
            {"role": "user", "content": """
                Here are the rules that you need to adhere:
                ## Rules:
                - The aim is to achieve simplicity and clarity in the extracted text.
                - Make sure to answer in the structured format.
                - If no information is provided for any of the fields, return nothing of that field.
                - DO NOT HALLUCINATE.
             """},
        ]
    )
    return response


In [5]:
product_extraction_prompt = """
    Extract the following information from the text extracted from a webpage of a company:

    1. Product Description:
    - What service or product does the company provide?
    - What features does the product or service have?
    Note: If the company has more than one product or service, automatically detect and list each product with its relevant details.
    
    2. Summary of Product Offering:
    - Summary of the description of the service that the company provide, taking into consideration of all the product offerings.
    Note: Do not include any company-specific information in the summary, such as company name and location.
    
    3. Client Description:
    - Name of the client or partner. Note: Only focus on corporate partners or clients, instead of individuals. 
    - Summary of the product or service used by the client or partner.
    - Description of the use case.
    Note: If the product used and description fields are not mentioned, they should be None.

    Output in a structured format.
"""


In [8]:
def llm_extraction_execution(process_company_name:str):

    summary_file_path = f'extraction_summary/{process_company_name}_summary_str.json'
    if os.path.exists(summary_file_path):
        with open(summary_file_path, 'r') as file:
            summary = json.load(file)

        combined_summary = f"## Main Page:\n {summary['main_page']}\n----------------\n"

        for endpoint, text in summary.items():
            if endpoint != "main_page":
                combined_summary += f"## {endpoint}:\n{text}\n----------------\n"
                
        print(f'Cost: ${calculate_cost(combined_summary)}')
        
        # print(combined_summary)
        
        response = llm_information_extraction(combined_summary, product_extraction_prompt)
        # print(response)
        print(response.dict())
        write_json_file(f'extraction_output_v2/{process_company_name}_extraction.json', response.dict())
    else:
        print(f'Summary file: {summary_file_path} does not exist.')

In [17]:
llm_extraction_execution('vesta')

KeyError: 'main_page'

In [58]:
# Example data
text = """
Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.

We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services. For example, TechCorp uses our cloud solutions to improve their data management, resulting in a 30% increase in efficiency.

Our client, SoftInc, has integrated our services into their workflow, leading to significant improvements in their project turnaround times.

Our platform and service are trusted by these innovative companies:
Clients are: Nationwide, Freedom, Bestow...
...
"""

response = llm_information_extraction(text, product_extraction_prompt)
print(response)


product_descriptions=[ProductDescription(name='Cloud Solutions', description='Scalability, security, and ease of use.')] product_offering_summary='Innovative cloud solutions that help businesses streamline their operations with key features including scalability, security, and ease of use.' client_descriptions=[ClientDescription(name='TechCorp', product_used='Cloud solutions', description='Improve data management, resulting in a 30% increase in efficiency.'), ClientDescription(name='SoftInc', product_used='Cloud solutions', description='Integrated services into their workflow, leading to significant improvements in project turnaround times.'), ClientDescription(name='Nationwide', product_used=None, description=None), ClientDescription(name='Freedom', product_used=None, description=None), ClientDescription(name='Bestow', product_used=None, description=None)]


'{"product_descriptions":[{"name":"Cloud Solutions","description":"Scalability, security, and ease of use."}],"product_offering_summary":"Innovative cloud solutions that help businesses streamline their operations with key features including scalability, security, and ease of use.","client_descriptions":[{"name":"TechCorp","product_used":"Cloud solutions","description":"Improve data management, resulting in a 30% increase in efficiency."},{"name":"SoftInc","product_used":"Cloud solutions","description":"Integrated services into their workflow, leading to significant improvements in project turnaround times."},{"name":"Nationwide","product_used":null,"description":null},{"name":"Freedom","product_used":null,"description":null},{"name":"Bestow","product_used":null,"description":null}]}'