In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *
from tqdm import tqdm
import glob
import tiktoken

In [2]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.schema import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import json

- gpt-4o: "o200k_base"
- gpt-4: "cl100k_base"
- gpt-3.5-turbo: "cl100k_base"
- gpt-3.5: "cl100k_base" (common shorthand)
- gpt-35-turbo: "cl100k_base" (Azure deployment name)

gpt-4o pricing: US$5.00 / 1M input tokens; US$15.00 / 1M output tokens

gpt-4o context length: 128K tokens


In [5]:
# Look up the tokenizer that tiktoken associates with the "gpt-4o" model name
# and print its repr to confirm which encoding is in use.
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding)

<Encoding 'o200k_base'>


In [18]:
# Load one company's scraped pages through the project helper read_json_file.
# Later cells index the result by page key (e.g. data['main_page']) and iterate
# data.items(), so it is presumably a dict of page key -> scraped text — TODO confirm.
# NOTE(review): this cell is In [18] but sits above In [4]/In [10]/In [11], whose
# stored outputs show Bennie pages ('/partners', '/customers'), not Aer pages.
# The path was apparently edited and the cell re-run between executions; confirm
# which file a top-to-bottom run is meant to load.
data = read_json_file('scraping_output_v2_raw/aer_compliance.json')

# Bare expression: displays the whole loaded object as the cell output.
data



In [4]:
# Print the raw scraped text stored under the '/partners' key.
print(data['/partners'])

Partner with Bennie

Our partner programs are designed for technology providers, consultants, insurance carriers, and referral partners who want to connect their networks with our benefits platform. Find the program that works best for you.

[Get in Touch](#become-a-partner-form)

![Partners Hero](https://images.ctfassets.net/0xmico1wg6et/R5pwIIpxdPlA7rDo1Ivt5/825113b9b56b2e2d6107a195adfaf531/partners-hero_2x.png?q=85&fm=png&w=750)

*   [Our Marketplace](#section-partners-2021-marketplace)
    
*   [Referral Partners](#section-partners-2021-referral-program)
    

![Benefits Partners Mobiles](https://images.ctfassets.net/0xmico1wg6et/7vAsQo4rbTx9594X1tbXem/60c70429fbb1cee9eed2efa8a0f928b8/benefits-partners-mobiles-v2.png?q=85&fm=png&w=695)

BENEFITS Marketplace

Benefits Partners
-----------------

Our benefits partners are comprised of products and services available for the Employer or Individual level. Learn more about what it takes to be listed as a Benefits Partner with Bennie.

[

In [10]:
# Print the same page after the project helper clean_scraped_content.
# Judging by the stored output, link/image targets in parentheses and blank
# lines no longer appear after cleaning.
print(clean_scraped_content(data['/partners']))

Partner with Bennie
Our partner programs are designed for technology providers, consultants, insurance carriers, and referral partners who want to connect their networks with our benefits platform. Find the program that works best for you.
[Get in Touch]
![Partners Hero]
*   [Our Marketplace]
*   [Referral Partners]
![Benefits Partners Mobiles]
BENEFITS Marketplace
Benefits Partners
Our benefits partners are comprised of products and services available for the Employer or Individual level. Learn more about what it takes to be listed as a Benefits Partner with Bennie.
[Browse Our Partners]
![Benefits Partners Image 2]
HCM Technology Marketplace
HCM Tech Partners
We believe in an open ecosystem of HCM products and data. By partnering with the Bennie platform, you can help improve the experience of our growing mutual client base and their employee populations.
[View All]
![Benefits Partners Image 3]
Expand The Bennie Network
Referral Partners
If you recommend Bennie to your network, we’ll

In [16]:
# Compare calculate_cost (project helper) on the main page before and after cleaning.
print(calculate_cost(data['main_page']))
print(calculate_cost(clean_scraped_content(data['main_page'])))

0.00941


In [11]:
# For every scraped page, report the estimated cost of sending the raw text
# and of sending the text after clean_scraped_content.
for url, content in data.items():
    print(url)
    raw_cost = calculate_cost(content)
    print(f'Estimated GPT4-o cost: ${raw_cost}')
    cleaned_cost = calculate_cost(clean_scraped_content(content))
    print(f'Estimated GPT4-o cost after cleaning: ${cleaned_cost}')
    print('------------------------')
    

/partners
Estimated GPT4-o cost: $0.024155000000000003
Estimated GPT4-o cost after cleaning: $0.00414
------------------------
/customers
Estimated GPT4-o cost: $0.00637
Estimated GPT4-o cost after cleaning: $0.00283
------------------------
main_page
Estimated GPT4-o cost: $0.00965
Estimated GPT4-o cost after cleaning: $0.00417
------------------------


### Exploration: first shorten the page by extracting the relevant information
Issue: the extracted content might be shortened too much.

In [15]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import os
import json


def llm_extraction_json(text, model_name="gpt-4o"):
    """Extract product and client/partner passages from page text as JSON.

    Builds a four-message chat prompt (two system messages, the page text,
    and a closing rules reminder), sends it to a ChatOpenAI model at
    temperature 0, and parses the reply with SimpleJsonOutputParser.

    Args:
        text: Page text substituted into the prompt's ``{input}`` slot.
        model_name: Model name forwarded to ChatOpenAI.

    Returns:
        The output of SimpleJsonOutputParser for the model reply. The prompt
        asks for an object with the keys "about_product" and "about_client".

    Note:
        The API key is read from the ``OPENAI_KEY`` environment variable.
    """
    role_instructions = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract information 
    from the given text and convert it into a pure JSON format. 
    The JSON should contain only the structured data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """

    # Doubled braces in the JSON sample are template escapes for literal { and }.
    task_instructions = """
    You are provided a text obtained from a webpage of a company. 
    Extract any sections or paragraphs that are relevant to the information of interest from the text.
    
    ## Here are the information of interest:
    
    1. About Product or Service: 
    - Any information about the service or product that the company offer and their features

    2. About Partner or Client:
    - Any information about the partners or clients of the company. 
    - Any use cases (case studies) about how a client is using the product or service.
    
    ## Note: 
    - For this task, you do not need to summarise. You just need to extract raw lines from the text. 
    - If you are unsure about whether the information is relevant, just include this information as I want as much information as possible.

    ## Output in JSON format:
    {{
        "about_product": "any information about the product or service",
        "about_client": "any information about the client or partnership"
    }}
    
    """

    # Closing reminder sent as the last human message, after the page text.
    rules_reminder = """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the correct JSON format.
                - If no information is provided for any of the fields, return nothing of that field.
                - DO NOT HALLUCINATE.
             """

    chat_messages = [
        ("system", role_instructions),
        ("system", task_instructions),
        ("human", "Use the given text to extract information: {input}"),
        ("human", rules_reminder),
    ]

    chat_model = ChatOpenAI(
        model_name=model_name,
        temperature=0,
        openai_api_key=os.getenv('OPENAI_KEY'),
    )

    # prompt -> model -> JSON parser, run once on the supplied text.
    pipeline = ChatPromptTemplate.from_messages(chat_messages) | chat_model | SimpleJsonOutputParser()
    return pipeline.invoke({'input': text})

# Run the JSON extraction on the cleaned text of every scraped page,
# storing each result under the same key the page has in `data`.
extracted_data = {}
for key, value in data.items():
    extracted_data[key] = llm_extraction_json(clean_scraped_content(value))

In [25]:
# Display the loaded scrape object again.
data



In [17]:
# Display the per-page extraction results currently held in extracted_data.
extracted_data

{'/partners': {'about_product': 'Our partner programs are designed for technology providers, consultants, insurance carriers, and referral partners who want to connect their networks with our benefits platform. Our benefits partners are comprised of products and services available for the Employer or Individual level. We believe in an open ecosystem of HCM products and data. By partnering with the Bennie platform, you can help improve the experience of our growing mutual client base and their employee populations. If you recommend Bennie to your network, we’ll pay you a referral fee for every qualified lead. We understand that people today face challenges navigating the benefits landscape. This is why we have developed benefit partnerships that solve these unique challenges. Whether it’s Individual, D2C, B2B, or B2C products employees can now access benefits through our suite of preferred partners. By building an integration with Bennie, you can make it seamless for clients to utilize 

In [19]:
# Save the extraction results with the project helper write_json_file.
# NOTE(review): the load cell reads aer_compliance.json but this writes
# bennie_summary.json — confirm `data` held the Bennie scrape when this ran.
write_json_file('extraction_summary/bennie_summary.json', extracted_data)

In [22]:
# Inspect the 'about_product' field extracted for the main page.
# NOTE(review): this needs the dict-shaped results of llm_extraction_json; a later
# cell rebinds extracted_data to the plain strings returned by llm_extraction_str.
extracted_data['main_page']['about_product']

'Our global benefits platform helps employers, employees, and their dependents thrive through a range of modern solutions, including consulting, insurance, coaching, and training. The Bennie App allows employees to view ID cards, search for in-network doctors, explore costs, and chat with a healthcare concierge to get help with questions about claims, coverage, enrollment, and more. Better Health Plan is a level-funded, cost-effective option for your healthcare. Enjoy everything you’d get with fully insured plans, but Better. Better Health Plan allows employers to achieve improved cash flow with lower monthly healthcare payments while providing plan design flexibility and limiting employee out-of-pocket costs. Better Insurance provides consulting for all lines of coverage, including workers’ compensation, general liability, property, auto, and more. Better rates, bigger limits, better policy language, and lower retentions. Sayge is a global coaching platform, matching employees with de

In [23]:
# Try the project helper get_related_urls on another site. The stored output is
# a (set of URLs, list of URLs) tuple — presumably all discovered links and a
# filtered subset; TODO confirm against the helper.
get_related_urls('https://360ofme.com')

https://360ofme.com




({'https://360ofme.com#action',
  'https://360ofme.com#challenge',
  'https://360ofme.com#difference',
  'https://360ofme.com#gdpr_cookie_modal',
  'https://360ofme.com#outcomes',
  'https://360ofme.com#solution',
  'https://360ofme.com/',
  'https://360ofme.com/company/',
  'https://360ofme.com/contact/',
  'https://360ofme.com/cookie-policy/',
  'https://360ofme.com/icwp-wpsf-link-cheese/',
  'https://360ofme.com/privacy-policy/',
  'https://360ofme.com/product/',
  'https://360ofme.com/solution/'},
 ['https://360ofme.com',
  'https://360ofme.com#solution',
  'https://360ofme.com/solution/',
  'https://360ofme.com/product/'])

In [20]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import os
import json


def llm_extraction_str(text, model_name="gpt-4o"):
    """Extract product and client/partner passages from page text as plain text.

    Builds a four-message chat prompt (two system messages, the page text,
    and a closing rules reminder), sends it to a ChatOpenAI model at
    temperature 0, and returns the reply through StrOutputParser.

    Args:
        text: Page text substituted into the prompt's ``{input}`` slot.
        model_name: Model name forwarded to ChatOpenAI.

    Returns:
        The output of StrOutputParser for the model reply.

    Note:
        The API key is read from the ``OPENAI_KEY`` environment variable.
    """
    role_instructions = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract information 
    from the given text and convert it into a text (string) format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing. Do not hallucinate.
    """

    # Task description, including two worked examples (verbatim passage
    # extraction and turning logo alt-text into client names).
    task_instructions = """
    You are provided with a text obtained from a company's webpage. Your task is to extract any sections or paragraphs that are relevant to the specified information of interest.

    ## Information of Interest:

    1. **About Product or Service**:
    - Any details about the products or services the company offers, including their features.

    2. **About Partner or Client**:
    - Any information about the company's partners or clients.
    - Any use cases (case studies) describing how a client is using the company's product or service.
    
    ## Note:
    Sometimes, the company does not explicit describe their clients and the client use case, instead, they will only display clients' logos. 
    You then need to extract client's name from their logos. 
    
    ## Instructions:
    - Do not summarize the content. Extract the raw lines or sections as they are.
    - If you are unsure about the relevance of the information, include it to ensure comprehensive coverage.
    - Output the extracted information in standard text format.

    ## Examples:

    ### Example 1: Product or Service
    If the input text contains:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    The output should be:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    ### Example 2: Client Logos
    If the input text contains:
    "Our platform and service is trusted by these innovative companies:
    ![Nationwide Logo]
    ![Freedom 365 Logo]
    ![Bestow Logo]
    ..."
    
    The output should be:
    "Our platform and service is trusted by these innovative companies: 
    Clients are: Nationwide, Freedom 365, Bestow..."
   
    """

    # Closing reminder sent as the last human message, after the page text.
    rules_reminder = """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - If no information is provided, return nothing.
                - DO NOT HALLUCINATE.
             """

    chat_messages = [
        ("system", role_instructions),
        ("system", task_instructions),
        ("human", "Use the given text to extract information: {input}"),
        ("human", rules_reminder),
    ]

    chat_model = ChatOpenAI(
        model_name=model_name,
        temperature=0,
        openai_api_key=os.getenv('OPENAI_KEY'),
    )

    # prompt -> model -> string parser, run once on the supplied text.
    pipeline = ChatPromptTemplate.from_messages(chat_messages) | chat_model | StrOutputParser()
    return pipeline.invoke({'input': text})

# Run the plain-text extraction on the cleaned text of every scraped page,
# storing each result under the same key the page has in `data`.
extracted_data = {}
for key, value in data.items():
    extracted_data[key] = llm_extraction_str(clean_scraped_content(value))

In [28]:
# Print the raw scraped text stored under 'main_page'.
print(data['main_page'])

Compliance · Simplified · automated

Join us in the next evolution of compliance

Keeping employees compliant doesn’t need to be messy or hard. Let Aer do the manual work by automating your daily Code of Ethics compliance tasks so you can tackle the real issues in your business.

[Request a demo](https://calendly.com/ossr)

![](https://assets-global.website-files.com/64b49d37d09f5d40a796c8e0/64b4a670097020ec3b491054_Frame%20181.webp)

Backed by leading venture firms and investors

![](https://assets-global.website-files.com/64b49d37d09f5d40a796c8e0/64b4a7d54c270b78dff186a8_image%201.png)

![](https://assets-global.website-files.com/64b49d37d09f5d40a796c8e0/64b4a7f877901d548fb41ace_image%203.webp)

Featured in

[![](https://assets-global.website-files.com/64b4b39e7de034a04ef56b95/64b8ad759bd4e117c6b6c4f2_image%204.png)](https://www.wsj.com/articles/crypto-might-have-an-insider-trading-problem-11653084398)

[![](https://assets-global.website-files.com/64b4b39e7de034a04ef56b95/64b8adda775

In [24]:
# Print the main-page entry of extracted_data (a plain string when it was
# produced by llm_extraction_str).
print(extracted_data['main_page'])

Compliance · Simplified · automated
Join us in the next evolution of compliance
Keeping employees compliant doesn’t need to be messy or hard. Let Aer do the manual work by automating your daily Code of Ethics compliance tasks so you can tackle the real issues in your business.

Why Aer?
Buried in employee statements?
Remove the manual review process with seamless connections to 1,000+ exchanges, wallets and brokers.
Run a lean compliance team.
Automatically surface potential violations with real-time notifications.
Constantly working on manual employee trade compliance tasks?
Automate low-value repetitive tasks.
Use automation to remove human error and make it easy for your employees
The easier it is to stick to policy, the more likely employees will do it. Code of Ethics software doesn’t need to be complex and cumbersome. We build great software that removes manual tasks and turbo-charges your team’s efficacy. Go beyond out-dated employee trade compliance technology and meet us **in**

In [17]:
# Save the current extraction results with the project helper write_json_file.
# NOTE(review): execution counts are out of order (this cell is In [17], the
# llm_extraction_str cell is In [20]); re-run top to bottom and confirm the
# saved file holds the string-format results.
write_json_file('extraction_summary/aer_compliance_summary_str.json', extracted_data)

In [19]:
# Print the cleaned text of the '/solutions' page.
# NOTE(review): the stored output still shows trailing backslashes and
# '![]](/solution/...)' fragments, so clean_scraped_content may not strip
# nested image links fully — verify.
print(clean_scraped_content(data['/solutions']))

Solutions
Streamline Code of Ethics
Pre-trade clearance\
Automatically approve and reject employee trades, or surface them for manual review based on your customized company policies.\
Find out more\
![]](/solution/pre-trade-clearance)
Post-trade monitoring\
Seamless integrations where your employees trade. Coverage spanning thousands of exchanges, wallets, and brokerages.\
![]](/solution/post-trade-monitoring)
Best-in-class coverage\
Eliminate headaches for your compliance team and employees. With coverage across the US, UK, EU, Singapore, India, Hong Kong, Canada, Australia, and beyond, let Aer close your compliance gaps and reduce manual reporting.\
![]](/solution/best-in-class-coverage)
Crypto\
No one does crypto like Aer. With full coverage across exchanges and blockchains wherever your employees are trading, Aer enables a frictionless process for the future of finance.\
![]](/solution/crypto)
Conflicts of interest\
Keep employees compliant with reporting workflows that are simple