In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *
from tqdm import tqdm
import glob
import tiktoken
import instructor
from pydantic import BaseModel
import instructor
from openai import OpenAI
import ast
from datetime import datetime
import pytz

current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
print(current_dateTime.strftime(format = "%Y-%m-%d %H:%M"))

2024-07-17 07:53


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.schema import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import json

- gpt-4o: "o200k_base",
- gpt-4: "cl100k_base",
- gpt-3.5-turbo: "cl100k_base",
- gpt-3.5: "cl100k_base",  # Common shorthand
- gpt-35-turbo : "cl100k_base",  # Azure deployment name

gpt-4o US$5.00 / 1M input tokens； US$15.00 / 1M output tokens

gpt-4o context length: 128K tokens


In [4]:
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding)

<Encoding 'o200k_base'>


In [24]:
df_all = pd.read_csv('PitchBook_All_Columns_2024_07_04_14_48_36_accessibility.csv')
df_all = df_all[~df_all['business_status'].isin(['Out of Business', 'Bankruptcy: Liquidation', 'Bankruptcy: Admin/Reorg'])]
df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
df_all = df_all[df_all['is_accessible'] == True]
df_all['processed_name'] = df_all['companies'].apply(process_company_name)
df_all.head()

Unnamed: 0,company_id,companies,company_former_name,company_legal_name,competitors,description,primary_industry_sector,primary_industry_group,primary_industry_code,all_industries,...,first_financing_valuation,first_financing_valuation_status,last_financing_valuation,last_financing_valuation_status,last_known_valuation,last_known_valuation_date,last_known_valuation_deal_type,processed_url,is_accessible,processed_name
0,55185-04,Estimize,,"Estimize, Inc.","Neudata, SigFig, Motif (Financial Software), Y...",Developer of an open financial estimates platf...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,6.34,Actual,,,36.0,16/07/2015,Early Stage VC,www.estimize.com,True,estimize
1,56288-62,New Constructs,,"New Constructs, LLC","Morningstar, CFRA, Finbox (Media and Informati...",Operator of an investment research firm intend...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,2.17,Actual,,,2.17,13/05/2003,Early Stage VC,www.newconstructs.com,True,new_constructs
3,53739-01,Procore Technologies,,"Procore Technologies, Inc.","Projectmates, eBuilder, CMiC",Procore Technologies Inc is a cloud-based cons...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Construction ...",...,4.0,Actual,8585.03,Estimated,8585.03,20/05/2021,IPO,www.procore.com,True,procore_technologies
5,153145-27,Proof,"16 Pins, Notarize","Notarize, Inc.","Templafy, ZorroSign, eOriginal, PandaDoc, Cong...",Developer of an identity-assured transaction m...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,46.5,Actual,,,760.0,25/03/2021,Later Stage VC,www.proof.com,True,proof
6,52304-77,SMS Assist,,"SMS Assist, L.L.C.","ServiceChannel, Divisions Maintenance Group, T...",Provider of business services intended to deli...,Business Products and Services (B2B),Commercial Services,Other Commercial Services,"Buildings and Property, Business/Productivity ...",...,,,950.0,Estimated,950.0,05/01/2023,Merger/Acquisition,www.smsassist.com,True,sms_assist


In [45]:
def get_additional_info(processed_name:str, column_name:str):
    df_all = pd.read_csv('PitchBook_All_Columns_2024_07_04_14_48_36_accessibility.csv')
    df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
    df_all['processed_name'] = df_all['companies'].apply(process_company_name)
    
    df_select = df_all[df_all['processed_name'] == processed_name]
    if len(df_select) > 0:
        return df_select[column_name].iloc[0]
    else:
        return None

In [46]:
df = pd.read_csv('companies_urls_info.csv')
sample = df[df['url'].isin(['https://www.vertice.one', 
                   'https://www.estimize.com',
                   'https://www.newconstructs.com',
                   'https://www.chargebee.com',
                   'https://www.bennie.com',
                   'https://www.aercompliance.com',
                   'https://www.missionmark.com',
                   'https://www.joinmassive.com',
                   'https://www.hemlane.com',
                   'https://www.vesta.com',
                   'https://www.adaptive.build',
                   'https://www.additive.ai',
                   'https://www.9fin.com',
                   'https://www.niloom.ai',
                   'https://www.nexben.com',
                   'https://www.naturealpha.ai',
                   'https://www.lworks.io',
                   'https://www.infogrid.io',
                   'https://www.harnessproperty.com',
                   'https://www.directsoftware.com',
                   'https://www.dexitcorp.com'])]

sample

Unnamed: 0,company,processed_name,url,related_urls_str,related_urls,num_of_related_urls,all_urls,num_of_all_urls
5,Vertice,vertice,https://www.vertice.one,https://www.vertice.one/product/saas-purchasin...,['https://www.vertice.one/product/saas-purchas...,6,['https://www.vertice.one/explore/cloud-manage...,31
6,Massive,massive,https://www.joinmassive.com,"https://www.joinmassive.com/casestudies,https:...","['https://www.joinmassive.com/casestudies', 'h...",3,"['https://www.joinmassive.com/faq#users', 'htt...",25
14,Additive,additive,https://www.additive.ai,https://www.additive.ai,['https://www.additive.ai'],1,"['https://www.additive.ai', 'https://www.addit...",4
105,Nexben,nexben,https://www.nexben.com,https://www.nexben.com/payment-solutions/ichra...,['https://www.nexben.com/payment-solutions/ich...,16,"['https://www.nexben.com/about/meet-the-team',...",32
142,Direct,direct,https://www.directsoftware.com,"https://www.directsoftware.com/partners,https:...","['https://www.directsoftware.com/partners', 'h...",13,"['https://www.directsoftware.com/partners', 'h...",16
168,Ledger Works,ledger_works,https://www.lworks.io,"https://www.lworks.io/customers-partners,https...","['https://www.lworks.io/customers-partners', '...",10,"['https://www.lworks.io/customers-partners', '...",20
196,Vesta,vesta,https://www.vesta.com,"https://www.vesta.com/partners,https://www.ves...","['https://www.vesta.com/partners', 'https://ww...",3,"['https://www.vesta.com/privacy', 'https://www...",9
197,Niloom.ai,niloom_ai,https://www.niloom.ai,https://www.niloom.ai,['https://www.niloom.ai'],1,"['https://www.niloom.ai', 'https://www.niloom....",7
226,Hemlane,hemlane,https://www.hemlane.com,"https://www.hemlane.com/realtor-partners/,http...","['https://www.hemlane.com/realtor-partners/', ...",2,['https://www.hemlane.com/features/rental-adve...,31
247,Harness Data Intelligence,harness_data_intelligence,https://www.harnessproperty.com,https://www.harnessproperty.com/search/service...,['https://www.harnessproperty.com/search/servi...,2,"['https://www.harnessproperty.com/contact-us',...",31


In [55]:
import os
import time
import json
from dotenv import load_dotenv
from requests.exceptions import HTTPError

def crawl_data(base_url, url_list: list, file_path: str, overwrite: bool = False):
    load_dotenv()
    # Initialize the FirecrawlApp with your API key
    app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_KEY'))
    
    # Load existing data if the file exists
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            result = json.load(file)
    else:
        result = {}
        
    current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
    result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'

    rate_limit_reset_time = 0
    
    for url in url_list:
        # Determine the endpoint
        if base_url == url:
            endpoint = 'main_page'
        else:
            if base_url in url:
                endpoint = url.replace(base_url, '')
            else:
                endpoint = url
        
        # Check if the endpoint already exists in the result
        if endpoint in result and not overwrite:
            print(f"Skipping {url} as it already exists and overwrite is set to False.")
            continue  # Skip this URL and move to the next one

        # Respect rate limit by waiting until the reset time
        if time.time() < rate_limit_reset_time:
            wait_time = rate_limit_reset_time - time.time()
            print(f"Rate limit exceeded. Waiting for {wait_time} seconds.")
            time.sleep(wait_time)
        
        try:
            # Scrape a single URL
            print(f"Scraping {url}.")
            response = app.scrape_url(url, {'pageOptions': {'onlyMainContent': True}})
            
            try:
                scraped_data = response.json()  # Attempt to parse JSON response
            except ValueError:
                print(f"Failed to decode JSON response for {url}")
                continue  # Skip to the next URL

            # Check if 'markdown' key exists in the scraped data
            if 'markdown' in scraped_data:
                result[endpoint] = scraped_data['markdown']
        
        except HTTPError as e:
            # Handle rate limit exceeded error
            if e.response.status_code == 429:
                rate_limit_reset_time = int(e.response.headers.get('Retry-After', 60)) + time.time()
                print(f"Rate limit exceeded. Retrying after {rate_limit_reset_time - time.time()} seconds.")
                time.sleep(rate_limit_reset_time - time.time())
                continue  # Skip the rest of the code in this iteration and retry scraping the same URL
            else:
                print(f"Unexpected error: {e}")


    # Write the updated JSON data back to the file
    with open(file_path, 'w') as file:
        json.dump(result, file, indent=4)
    
    return result

# Example usage:
# result = crawl_data(base_url, url_list, 'scraped_data.json', overwrite=False)


In [56]:
for index, row in sample.iterrows():
    base_url = row['url']
    url_list = ast.literal_eval(row['related_urls'])
    result = crawl_data(base_url, url_list, f'scraping_output_v2_raw/{row["processed_name"]}.json', overwrite=False)

Skipping https://www.vertice.one/product/saas-purchasing as it already exists and overwrite is set to False.
Skipping https://www.vertice.one/product/cloud-cost-optimization as it already exists and overwrite is set to False.
Skipping https://www.vertice.one/partners as it already exists and overwrite is set to False.
Skipping https://www.vertice.one/product/saas-cloud-platform as it already exists and overwrite is set to False.
Skipping https://www.vertice.one/customer-stories as it already exists and overwrite is set to False.
Skipping https://www.vertice.one as it already exists and overwrite is set to False.
Skipping https://www.joinmassive.com/casestudies as it already exists and overwrite is set to False.
Skipping https://www.joinmassive.com/partners as it already exists and overwrite is set to False.
Skipping https://www.joinmassive.com as it already exists and overwrite is set to False.
Skipping https://www.additive.ai as it already exists and overwrite is set to False.
Skippin

In [None]:
res = scrape_data('https://www.vertice.one/')

In [13]:
data = read_json_file('scraping_output_v2_raw/hemlane.json')

print(clean_scraped_content(data['/realtor-partners/']))

Talk to a human: [(866) 387-1629]
[Sign in]
REALTORS®' Partner in Property Management
**The best agents** help their clients get the most out of their rental properties.
![Voted Capterra's Top 20 Property Management Solutions]![Software Advice most recommended Property Management Solution badge]![Software Advice Real Estate Property Management Front Runner Badge]![Software Advice Badge - Best Customer Support for Property Management]![GetApp Badge - Best Functionality and Features]
Back
How do you support your clients with their rental properties?
I want to refer landlords
(and get paid for it)
I offer leasing
(and want free leads and tools)
I offer property management
(and want to eliminate trust accounts)
Check out other REALTORS® partnering with us
![]![Headshot of Timothy Hampson]
Timothy Hampson
License #9008072 (TX)
HP2 RESIDENTIAL
Experience
Leasing
12 years
Management
Real estate
![]![Headshot of Sandy Wickware]
Sandy Wickware
License #253554 (TX)
Fathom Realty, LLC
14 years
2 

In [15]:
for url, content in data.items():
    print(url)
    print(f'Estimated GPT4-o cost: ${calculate_cost(data[url])}')
    print(f'Estimated GPT4-o cost after cleaning: ${calculate_cost(clean_scraped_content(data[url]))}')
    print('------------------------')
    

/realtor-partners/
Estimated GPT4-o cost: $0.08219499999999999
Estimated GPT4-o cost after cleaning: $0.00167
------------------------
main_page
Estimated GPT4-o cost: $0.01288
Estimated GPT4-o cost after cleaning: $0.005535
------------------------


### Exploration of first shorten the page by extracting relevant information
Issue: The output of the content might be shorten too much

In [101]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import os
import json


def llm_summary(text, model_name="gpt-4o"):
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract information 
    from the given text and convert it into a text (string) format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing. Do not hallucinate.
    """

    # Define the extraction prompt
    extraction_prompt = """
    You are provided with a text obtained from a company's webpage. Your task is to extract any sections or paragraphs that are relevant to the specified information of interest.

    ## Information of Interest:

    1. **About Product or Service**:
    - Any details about the products or services the company offers, including their features.

    2. **About Partner or Client**:
    - Any information about the company's partners or clients.
    - Any use cases (case studies) describing how a client is using the company's product or service.
    
    ## Note:
    Sometimes, the company does not explicit describe their clients and the client use case, instead, they will only display clients' logos. 
    You then need to extract client's name from their logos. 
    
    ## Instructions:
    - Do not summarize the content. Extract the raw lines or sections as they are.
    - If you are unsure about the relevance of the information, include it to ensure comprehensive coverage.
    - Output the extracted information in standard text format.

    ## Examples:

    ### Example 1: Product or Service
    If the input text contains:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    The output should be:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    ### Example 2: Client Logos
    If the input text contains:
    "Our platform and service is trusted by these innovative companies:
    ![Nationwide Logo]
    ![Freedom 365 Logo]
    ![Bestow Logo]
    ..."
    
    The output should be:
    "Our platform and service is trusted by these innovative companies: 
    Clients are: Nationwide, Freedom 365, Bestow..."
   
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("system", extraction_prompt),
            ("human", "Use the given text to extract information: {input}"),
            ("human", """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - If no information is provided, return nothing.
                - DO NOT HALLUCINATE.
             """),
        ]
    )
    
    llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                    temperature=0, 
                    model_name=model_name)

    llm_chain = prompt | llm | StrOutputParser()

    response = llm_chain.invoke({'input': text})
    
    return response


def llm_summary_execution(processed_name:str, overwrite:bool = False):

    scrape_file_path = f'scraping_output_v2_raw/{processed_name}.json'
    extraction_file_path = f'extraction_summary/{processed_name}_summary_str.json'

    scrape_data = read_json_file(scrape_file_path)

    # Load existing data if the file exists
    if os.path.exists(extraction_file_path):
        with open(extraction_file_path, 'r') as file:
            extracted_data = json.load(file)
    else:
        extracted_data = {}
        
    current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
    extracted_data['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
    extracted_data['processed_company'] = processed_name
    extracted_data['url'] = "https://" + get_additional_info(processed_name, 'processed_url')

    for endpoint, content in tqdm(scrape_data.items(), total=len(scrape_data), desc="Extracting data", position=0, leave=True):
        if endpoint in extracted_data and not overwrite:
            print(f"Company: {processed_name}; Skipping {endpoint} as it already exists and overwrite is set to False.")
            continue  # Skip this URL and move to the next one
        else:
            clean_content = clean_scraped_content(content)
            extracted_data[endpoint] = llm_summary(clean_content)
            print(f'Company: {processed_name}; Content in {endpoint} is extracted.')
            
    write_json_file(f'extraction_summary/{processed_name}_summary_str.json', extracted_data)
    
    return extracted_data

# # Example usage
# extracted_data = {}
# for key, value in data.items():
#     clean_content = clean_scraped_content(value)
#     extracted_data[key] = llm_summary(clean_content)

In [8]:
sample['processed_name'].to_list()

['vertice',
 'massive',
 'additive',
 'nexben',
 'direct',
 'ledger_works',
 'vesta',
 'niloom_ai',
 'hemlane',
 'harness_data_intelligence',
 'dexit',
 'naturealpha',
 'missionmark',
 'bennie',
 'infogrid',
 'new_constructs',
 '9fin',
 'adaptive',
 'estimize',
 'aer_compliance']

In [102]:
for company in tqdm(sample['processed_name'].to_list()):
    llm_summary_execution(company)

Extracting data: 100%|██████████| 7/7 [00:00<00:00, 77672.30it/s]


Company: vertice; Skipping /product/saas-purchasing as it already exists and overwrite is set to False.
Company: vertice; Skipping /product/cloud-cost-optimization as it already exists and overwrite is set to False.
Company: vertice; Skipping /partners as it already exists and overwrite is set to False.
Company: vertice; Skipping /product/saas-cloud-platform as it already exists and overwrite is set to False.
Company: vertice; Skipping /customer-stories as it already exists and overwrite is set to False.
Company: vertice; Skipping main_page as it already exists and overwrite is set to False.
Company: vertice; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 4/4 [00:00<00:00, 68759.08it/s]


Company: massive; Skipping /casestudies as it already exists and overwrite is set to False.
Company: massive; Skipping /partners as it already exists and overwrite is set to False.
Company: massive; Skipping main_page as it already exists and overwrite is set to False.
Company: massive; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 2/2 [00:00<00:00, 37117.73it/s]


Company: additive; Skipping main_page as it already exists and overwrite is set to False.
Company: additive; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 17/17 [00:00<00:00, 128012.87it/s]
 20%|██        | 4/20 [00:00<00:00, 36.46it/s]

Company: nexben; Skipping /payment-solutions/ichra as it already exists and overwrite is set to False.
Company: nexben; Skipping /insurance-solutions/products-and-services/group-services as it already exists and overwrite is set to False.
Company: nexben; Skipping /payment-solutions/education as it already exists and overwrite is set to False.
Company: nexben; Skipping /insurance-solutions/about-general-agency as it already exists and overwrite is set to False.
Company: nexben; Skipping /insurance-solutions as it already exists and overwrite is set to False.
Company: nexben; Skipping /payment-solutions/education/workshops as it already exists and overwrite is set to False.
Company: nexben; Skipping /payment-solutions/education/whats-an-ichra as it already exists and overwrite is set to False.
Company: nexben; Skipping /insurance-solutions/products-and-services as it already exists and overwrite is set to False.
Company: nexben; Skipping /insurance-solutions/products-and-services/ancill

Extracting data: 100%|██████████| 14/14 [00:00<00:00, 102837.58it/s]


Company: direct; Skipping /partners as it already exists and overwrite is set to False.
Company: direct; Skipping /products/reporting-and-business-intelligence as it already exists and overwrite is set to False.
Company: direct; Skipping /products/unified-inbox-and-messaging as it already exists and overwrite is set to False.
Company: direct; Skipping /products/channel-manager as it already exists and overwrite is set to False.
Company: direct; Skipping /products/payments-and-payouts as it already exists and overwrite is set to False.
Company: direct; Skipping /products/user-manager as it already exists and overwrite is set to False.
Company: direct; Skipping /products/housekeeping-maintenance-manager as it already exists and overwrite is set to False.
Company: direct; Skipping /products/direct-booking-websites as it already exists and overwrite is set to False.
Company: direct; Skipping /use-cases as it already exists and overwrite is set to False.
Company: direct; Skipping /products 

Extracting data: 100%|██████████| 11/11 [00:00<00:00, 137313.52it/s]


Company: ledger_works; Skipping /customers-partners as it already exists and overwrite is set to False.
Company: ledger_works; Skipping /customers-partners#partners as it already exists and overwrite is set to False.
Company: ledger_works; Skipping /products#protect as it already exists and overwrite is set to False.
Company: ledger_works; Skipping /products as it already exists and overwrite is set to False.
Company: ledger_works; Skipping /products#predict as it already exists and overwrite is set to False.
Company: ledger_works; Skipping /products#analyze as it already exists and overwrite is set to False.
Company: ledger_works; Skipping /products#what-we-do as it already exists and overwrite is set to False.
Company: ledger_works; Skipping /products#how-we-do-it as it already exists and overwrite is set to False.
Company: ledger_works; Skipping main_page as it already exists and overwrite is set to False.
Company: ledger_works; Skipping #riskops-as-a-service as it already exists an

Extracting data: 100%|██████████| 4/4 [00:00<00:00, 73908.44it/s]


Company: vesta; Skipping /partners as it already exists and overwrite is set to False.
Company: vesta; Skipping /product as it already exists and overwrite is set to False.
Company: vesta; Skipping main_page as it already exists and overwrite is set to False.
Company: vesta; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 2/2 [00:00<00:00, 8551.08it/s]
 40%|████      | 8/20 [00:00<00:00, 38.39it/s]

Company: niloom_ai; Skipping main_page as it already exists and overwrite is set to False.
Company: niloom_ai; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 3/3 [00:00<00:00, 48960.75it/s]


Company: hemlane; Skipping /realtor-partners/ as it already exists and overwrite is set to False.
Company: hemlane; Skipping main_page as it already exists and overwrite is set to False.
Company: hemlane; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 3/3 [00:00<00:00, 45425.68it/s]


Company: harness_data_intelligence; Skipping /search/serviced-offices-to-rent as it already exists and overwrite is set to False.
Company: harness_data_intelligence; Skipping main_page as it already exists and overwrite is set to False.
Company: harness_data_intelligence; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 3/3 [00:00<00:00, 3079.52it/s]


Company: dexit; Skipping /services as it already exists and overwrite is set to False.
Company: dexit; Skipping main_page as it already exists and overwrite is set to False.
Company: dexit; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 3/3 [00:00<00:00, 7521.17it/s]
 60%|██████    | 12/20 [00:00<00:00, 30.85it/s]

Company: naturealpha; Skipping /solutions as it already exists and overwrite is set to False.
Company: naturealpha; Skipping main_page as it already exists and overwrite is set to False.
Company: naturealpha; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 4/4 [00:00<00:00, 15827.56it/s]


Company: missionmark; Skipping /template/resources/product as it already exists and overwrite is set to False.
Company: missionmark; Skipping /template/services as it already exists and overwrite is set to False.
Company: missionmark; Skipping main_page as it already exists and overwrite is set to False.
Company: missionmark; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 4/4 [00:00<00:00, 1304.71it/s]


Company: bennie; Skipping /partners as it already exists and overwrite is set to False.
Company: bennie; Skipping /customers as it already exists and overwrite is set to False.
Company: bennie; Skipping main_page as it already exists and overwrite is set to False.
Company: bennie; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 5/5 [00:00<00:00, 53773.13it/s]


Company: infogrid; Skipping /solutions as it already exists and overwrite is set to False.
Company: infogrid; Skipping /partners as it already exists and overwrite is set to False.
Company: infogrid; Skipping /products as it already exists and overwrite is set to False.
Company: infogrid; Skipping main_page as it already exists and overwrite is set to False.
Company: infogrid; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 4/4 [00:00<00:00, 64527.75it/s]
 80%|████████  | 16/20 [00:00<00:00, 30.47it/s]

Company: new_constructs; Skipping main_page as it already exists and overwrite is set to False.
Company: new_constructs; Skipping /partnerships/ as it already exists and overwrite is set to False.
Company: new_constructs; Skipping /customer-testimonials/ as it already exists and overwrite is set to False.
Company: new_constructs; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 7/7 [00:00<00:00, 84368.18it/s]


Company: 9fin; Skipping /solutions/asset-managers-and-hedge-funds as it already exists and overwrite is set to False.
Company: 9fin; Skipping /insights/categories/Product%20Update as it already exists and overwrite is set to False.
Company: 9fin; Skipping /solutions/sales-and-trading as it already exists and overwrite is set to False.
Company: 9fin; Skipping /solutions/investment-banks as it already exists and overwrite is set to False.
Company: 9fin; Skipping main_page as it already exists and overwrite is set to False.
Company: 9fin; Skipping /solutions/law-firms as it already exists and overwrite is set to False.
Company: 9fin; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 3/3 [00:00<00:00, 49152.00it/s]

Company: adaptive; Skipping /case-studies as it already exists and overwrite is set to False.
Company: adaptive; Skipping main_page as it already exists and overwrite is set to False.
Company: adaptive; Skipping timestamp as it already exists and overwrite is set to False.



Extracting data: 100%|██████████| 2/2 [00:00<00:00, 26630.50it/s]


Company: estimize; Skipping main_page as it already exists and overwrite is set to False.
Company: estimize; Skipping timestamp as it already exists and overwrite is set to False.


Extracting data: 100%|██████████| 13/13 [00:00<00:00, 133970.40it/s]
100%|██████████| 20/20 [00:00<00:00, 32.50it/s]

Company: aer_compliance; Skipping /solution/firm-trading as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping /solution/artificial-intelligence as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping /solution/best-in-class-coverage as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping /solution/pre-trade-clearance as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping /solutions as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping /solution/conflicts-of-interest as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping main_page as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping /solution/attestations-certifications as it already exists and overwrite is set to False.
Company: aer_compliance; Skipping /solution/post-trade-monitoring as it already exists and overwrite 




## Instructor

https://github.com/jxnl/instructor

In [11]:
from pydantic import BaseModel, Field
from typing import List, Optional
from langchain_core.prompts import ChatPromptTemplate

class ProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of product')
    description: str = Field(..., alias='concise features description of the product or service')
    
class SummaryProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of the main product offerings of the company')
    description: str = Field(..., alias='summary of product offering of the company')
    
class ClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    product_used: Optional[str] = Field(None, alias='summary of the product or service used by the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ExtractedInformation(BaseModel):
    product_descriptions: Optional[List[ProductDescription]] = None
    # product_offering_summary: str = Field(..., alias='summary of product offering of the company')
    summary_product_description: SummaryProductDescription
    client_descriptions: Optional[List[ClientDescription]] = None
    



In [93]:
def llm_information_extraction(text: str, custom_extraction_prompt: str, model_name: str = 'gpt-4o', additional_context: str = None) -> ExtractedInformation:
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
    from the given text and convert it into a structured format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    extraction_prompt = f"""
    {system_message}
    {custom_extraction_prompt}
    """
    
    # Other models to consider: "gpt-3.5-turbo-0125"
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    if additional_context:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": f"""Here are some additional descriptions about this company for your reference:
                                                {additional_context}"""},
                {"role": "user", "content": """
                    Here are the rules that you need to adhere:
                    ## Rules:
                    - The aim is to achieve simplicity and clarity in the extracted text.
                    - Make sure to answer in the structured format.
                    - If no information is provided for any of the fields, return nothing of that field.
                    - DO NOT HALLUCINATE.
                """},
            ]
        )
        
    else:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": """
                    Here are the rules that you need to adhere:
                    ## Rules:
                    - The aim is to achieve simplicity and clarity in the extracted text.
                    - Make sure to answer in the structured format.
                    - If no information is provided for any of the fields, return nothing of that field.
                    - DO NOT HALLUCINATE.
                """},
            ]
        )
    return response




In [76]:
product_extraction_prompt = """
    Extract the following information from the text extracted from a webpage of a company:

    1. Product Description:
    - What service or product does the company provide?
    - What features does the product or service have?
    Note: If the company has more than one product or service, automatically detect and list each product with its relevant details.
    
    2. Summary of Product Offering:
    - Summary of the description of the service that the company provide, taking into consideration of all the product offerings.
    Note: Do not include any company-specific information in the summary, such as company name and location.
    
    3. Client Description:
    - Name of the client or partner. Note: Only focus on corporate partners or clients, instead of individuals. 
    - Summary of the product or service used by the client or partner.
    - Description of the use case.
    Note: If the product used and description fields are not mentioned, they should be None.

    Output in a structured format.
"""


In [103]:
def llm_extraction_execution(processed_name:str, include_additional_context:bool = True, overwrite:bool = False):

    summary_file_path = f'extraction_summary/{processed_name}_summary_str.json'
    extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
    
    if not overwrite and os.path.exists(extraction_file_path):
        print(f"Company: {processed_name}; Skipping extraction as the extraction file already exists and overwrite is set to False.")
        return None
    else:
        if os.path.exists(summary_file_path):
            with open(summary_file_path, 'r') as file:
                summary = json.load(file)

            combined_summary = f"## Main Page:\n {summary['main_page']}\n----------------\n"

            for endpoint, text in summary.items():
                if endpoint not in ["main_page", "timestamp", "processed_company", "url"]:
                    combined_summary += f"## {endpoint}:\n{text}\n----------------\n"
                    
            # print(combined_summary)
            
            print(f"Company: {processed_name}; Information extraction begins.")
            if include_additional_context:
                context = get_additional_info(processed_name, 'description')
                
                print(f'Estimated Cost: ${calculate_cost(combined_summary + context)}')
                print(f'Pitchbook description obtained: {context}')
                
                response = llm_information_extraction(text = combined_summary, 
                                                    custom_extraction_prompt = product_extraction_prompt, 
                                                    additional_context = context)
                
                
            else:
                print(f'Estimated Cost: ${calculate_cost(combined_summary)}')
                response = llm_information_extraction(text = combined_summary, 
                                            custom_extraction_prompt = product_extraction_prompt, 
                                            additional_context = None)
            
            # print(response)
            result = response.dict()
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            result['processed_company'] = processed_name
            result['url'] = "https://" + get_additional_info(processed_name, 'processed_url')
            write_json_file(extraction_file_path, result)
            
            return result
        else:
            print(f'Summary file: {summary_file_path} does not exist.')
            return None
            
            

In [104]:
def update_json_file(file_path:str, new_info:dict, update_timestamp:bool = False):
    data = read_json_file(file_path)
    for key, value in new_info.items():
        data[key] = value
        
    if update_timestamp:
        current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
        data['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'

    write_json_file(file_path, data)
    print(f'JSON FILE: {file_path} is updated.')

In [105]:
processed_name = '9fin'
update_json_file('extraction_output_v2/9fin_extraction.json', {'processed_company': processed_name,
                                                               'url': "https://" + get_additional_info(processed_name, 'processed_url')})

In [106]:
for company in tqdm(list(sample['processed_name'])):
    # llm_extraction_execution(company)
    update_json_file(f'extraction_output_v2/{company}_extraction.json', {'processed_company': company,
                                                               'url': "https://" + get_additional_info(company, 'processed_url')})

100%|██████████| 20/20 [00:00<00:00, 35.82it/s]


In [97]:
'https://' + get_additional_info('aer_compliance', 'processed_url')


'https://www.aercompliance.com'

In [58]:
# Example data
text = """
Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.

We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services. For example, TechCorp uses our cloud solutions to improve their data management, resulting in a 30% increase in efficiency.

Our client, SoftInc, has integrated our services into their workflow, leading to significant improvements in their project turnaround times.

Our platform and service are trusted by these innovative companies:
Clients are: Nationwide, Freedom, Bestow...
...
"""

response = llm_information_extraction(text, product_extraction_prompt)
print(response)


product_descriptions=[ProductDescription(name='Cloud Solutions', description='Scalability, security, and ease of use.')] product_offering_summary='Innovative cloud solutions that help businesses streamline their operations with key features including scalability, security, and ease of use.' client_descriptions=[ClientDescription(name='TechCorp', product_used='Cloud solutions', description='Improve data management, resulting in a 30% increase in efficiency.'), ClientDescription(name='SoftInc', product_used='Cloud solutions', description='Integrated services into their workflow, leading to significant improvements in project turnaround times.'), ClientDescription(name='Nationwide', product_used=None, description=None), ClientDescription(name='Freedom', product_used=None, description=None), ClientDescription(name='Bestow', product_used=None, description=None)]


In [88]:
with open('extraction_summary/new_constructs_summary_str.json', 'r') as file:
    summary = json.load(file)

combined_summary = f"## Main Page:\n {summary['main_page']}\n----------------\n"

for endpoint, text in summary.items():
    if endpoint not in ["main_page", "timestamp"]:
        combined_summary += f"## {endpoint}:\n{text}\n----------------\n"

In [89]:
print(combined_summary)

## Main Page:
 Get Instant Access to our 30 "**Zombie Stocks**" you should eliminate from your portfolio!
Plus, you'll get CEO David Trainer's thoughts on the market, investing philosophies, and much more.
Superior Investment Research Powered by AI
Unconflicted, proprietary, novel [alpha]
[Proven-superior]
data, reports, alerts & ratings on [10,000+]
stocks, ETFs, mutual funds & debt issuers based on our [Robo-Analyst]
AI.
The best of the best use our research.
Proof that our research outperforms:
*   **Superior Fundamental Data: [The Journal of Financial Economics]
*   **Superior Financial Models: [Ernst & Young]
*   **Superior Stock Ratings: [Harvard Business School]
> Core Earnings contains information about future performance that is incremental to Street Earnings.”
> HBS & MIT Sloan professors, page 29
> we identified cases where Compustat did not collect information relating to firms’ income that is useful in assessing Core Earnings.
> HBS & MIT Sloan professors, page 16
The prop

### Prompting Chains

In [58]:
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

class ProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of product')
    description: str = Field(..., alias='concise features description of the product or service')

class SummaryProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of the main product offerings of the company')
    description: str = Field(..., alias='summary of product offering of the company')

class ClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ExtractedInformation(BaseModel):
    product_descriptions: Optional[List[ProductDescription]] = None
    summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ClientDescription]] = None
    
class ValidatedClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    entity_type: Literal["person", "company", "general_entity", "other", "school"]
    product_used: Optional[str] = Field(None, alias='summary of the product or service used by the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ValidatedExtractedInformation(BaseModel):
    # product_descriptions: Optional[List[ProductDescription]] = None
    # summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ValidatedClientDescription]] = None


In [72]:
import instructor
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Patch the OpenAI client with Instructor
client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))

def initial_extraction(text: str, model_name: str = 'gpt-4o', additional_context: str = None) -> ExtractedInformation:
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
    from the given text and convert it into a structured format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    custom_extraction_prompt = """
    Extract the following information from the text extracted from a webpage of a company:

    1. Product Description:
    - What service or product does the company provide?
    - What features does the product or service have?
    Note: If the company has more than one product or service, automatically detect and list each product with its relevant details.
    
    2. Summary of Product Offering:
    - Summary of the description of the service that the company provide, taking into consideration of all the product offerings.
    Note: Do not include any company-specific information in the summary, such as company name and location.
    
    3. Client Description:
    - Name of the corporate client or partner. 
    - Description of the use case.
    Note: Focus on the extraction of company's name, instead of individuals.
    Note: If the description of the use case is not mentioned, it should be None.
    

    Output in a structured format.
    """
    
    rule_prompt = """
                Here are the rules that you need to adhere:
                    ## Rules:
                    - The aim is to achieve simplicity and clarity in the extracted text.
                    - Make sure to answer in the structured format.
                    - If no information is provided for any of the fields, return nothing of that field.
                    - DO NOT HALLUCINATE.
                """
    
    extraction_prompt = f"""
    {system_message}
    {custom_extraction_prompt}
    """
    
    if additional_context:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": f"""Here are some additional descriptions about this company for your reference:
                                                {additional_context}"""},
                {"role": "user", "content": rule_prompt}
            ]
        )
        
    else:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": rule_prompt}
            ]
        )
    return response


# Example usage for the first prompt
text = """
Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.

We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services. For example, TechCorp uses our cloud solutions to improve their data management, resulting in a 30% increase in efficiency.

Our client, SoftInc, has integrated our services into their workflow, leading to significant improvements in their project turnaround times.

Our platform and service are trusted by these innovative companies:
![Nationwide Logo]
![Freedom 365 Logo]
![Bestow Logo]
...
"""



In [74]:
def information_validation(products: list, clients: list, summary: dict, model_name: str = 'gpt-4o') -> ValidatedExtractedInformation:
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to validate the client information, classify the client names into different entity types, and determine which product is likely used by the client. 
    The output response should contain only the data validated and assigned, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    product_info = "\n".join([f"Product: {p['name']}; Description: {p['description']}" for p in products])
    client_info = "\n".join([f"Client: {c['name']}; Description: {c['description']}" for c in clients])
    summary_info = f"{summary['name']}: {summary['description']}"
    
    few_shot_examples = """
        ## Example 1:
        Client Name: Mike Johnson, CEO of TechCorp
        Entity_type: person
        - Reason: Mike Johnson is the name of a person. 
        
        ## Example 2:
        Client Name: Government
        Entity_type: general_entity
        - Reason: "Government" is a general entity, not a specific company.

        ## Example 3:
        Client Name: Innovative Solutions LLC
        Entity_type: company
        - Reason: Innovative Solutions LLC is a specific company name.
        
        ## Example 4:
        Client Name: A US resort
        Entity_type: general_entity
        - Reason: "A US resort" is a general description, not a specific company name.
    
        ## Example 5: 
        Client Name: University College London
        Entity_type: school
        - Reason: University College London is a specific school name.
    """

    validation_prompt = f"""
    {system_message}
    Here is the product information extracted:
    {product_info}
    
    Here is the summary of product offerings of the company:
    {summary_info}
    
    Here are the clients and their use cases:
    {client_info}
    
    Your task is to:
    1. Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
       Note: the entity type "company" should be given to specific companies, with company names.
    2. Based on the product descriptions and client use cases, assign the most likely product used by each client. 
       If you are not confident about which product the client uses, return None for that field.

    Here are some examples regarding the classifying clients into different entity types:
    {few_shot_examples}

    Output in a structured format.
    """
    
    response = client.chat.completions.create(
        model=model_name,
        response_model=ValidatedExtractedInformation,
        messages=[
            {"role": "system", "content": validation_prompt},
            {"role": "user", "content": """
                Here are the rules that you need to adhere:
                ## Rules:
                - Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
                - Assign the most likely product used by each client based on the provided product descriptions and use cases.
                - If the product used is not clear, return None for that field.
                - Make sure to answer in the structured format.
                - DO NOT HALLUCINATE.
            """},
        ]
    )
    return response


In [77]:
initial_response = initial_extraction(text).dict()

# Example usage for the second prompt

products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
summary = initial_response['summary_product_description']

validated_response = information_validation(products, clients, summary)
print(validated_response.dict())

result = initial_response
result['validated_client_description'] = validated_response.dict()['client_descriptions']

result

{'client_descriptions': [{'name': 'TechCorp', 'entity_type': 'company', 'product_used': 'Cloud Solutions', 'description': 'Uses cloud solutions to improve data management, resulting in a 30% increase in efficiency.'}, {'name': 'SoftInc', 'entity_type': 'company', 'product_used': 'Cloud Solutions', 'description': 'Integrated cloud services into their workflow, leading to significant improvements in their project turnaround times.'}, {'name': 'Nationwide', 'entity_type': 'company', 'product_used': None, 'description': None}, {'name': 'Freedom 365', 'entity_type': 'company', 'product_used': None, 'description': None}, {'name': 'Bestow', 'entity_type': 'company', 'product_used': None, 'description': None}]}


{'product_descriptions': [{'name': 'Cloud Solutions',
   'description': 'Scalability, security, and ease of use.'}],
 'summary_product_description': {'name': 'Cloud Solutions',
  'description': 'Innovative cloud solutions that help businesses streamline their operations with features such as scalability, security, and ease of use.'},
 'client_descriptions': [{'name': 'TechCorp',
   'description': 'Uses cloud solutions to improve data management, resulting in a 30% increase in efficiency.'},
  {'name': 'SoftInc',
   'description': 'Integrated cloud services into their workflow, leading to significant improvements in their project turnaround times.'},
  {'name': 'Nationwide', 'description': None},
  {'name': 'Freedom 365', 'description': None},
  {'name': 'Bestow', 'description': None}],
 'validated_client_description': [{'name': 'TechCorp',
   'entity_type': 'company',
   'product_used': 'Cloud Solutions',
   'description': 'Uses cloud solutions to improve data management, resulting in 

In [66]:
def llm_extraction_execution(processed_name:str, include_additional_context:bool = True, overwrite:bool = False):

    summary_file_path = f'extraction_summary/{processed_name}_summary_str.json'
    extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
    
    if not overwrite and os.path.exists(extraction_file_path):
        print(f"Company: {processed_name}; Skipping extraction as the extraction file already exists and overwrite is set to False.")
        return None
    else:
        if os.path.exists(summary_file_path):
            with open(summary_file_path, 'r') as file:
                summary = json.load(file)

            combined_summary = f"## Main Page:\n {summary['main_page']}\n----------------\n"

            for endpoint, text in summary.items():
                if endpoint not in ["main_page", "timestamp", "processed_company", "url"]:
                    combined_summary += f"## {endpoint}:\n{text}\n----------------\n"
            
            print(f"Company: {processed_name}; Information extraction begins.")
            if include_additional_context:
                context = get_additional_info(processed_name, 'description')
                
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary + context)}')
                print(f'Company: {processed_name}; Pitchbook description obtained: {context}')
                
                initial_response = initial_extraction(text = combined_summary, 
                                                additional_context = context)
                
            else:
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary)}')
                initial_response = initial_extraction(text = combined_summary, 
                                            additional_context = None)
            
            print(f'Company: {processed_name}; PART 1 - Initial extraction is completed.')
            # Example usage for the second prompt
            
            result = initial_response.dict()
            
            if initial_response.client_descriptions:
                products = initial_response.product_descriptions if initial_response.product_descriptions else []
                clients = initial_response.client_descriptions if initial_response.client_descriptions else []
                summary = initial_response.summary_product_description

                validated_response = information_validation(products, clients, summary)
                print(f'Company: {processed_name}; PART 2 - Information validation is completed.')
                result['validated_client_descriptions'] = validated_response.dict()['client_descriptions']
                
            else:
                print(f'Company: {processed_name}; PART 2 - Skipped, due to lack of client information.')
                result['validated_client_descriptions'] = None
            
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            result['processed_company'] = processed_name
            result['url'] = "https://" + get_additional_info(processed_name, 'processed_url')
            write_json_file(extraction_file_path, result)
            
            return result
        else:
            print(f'Summary file: {summary_file_path} does not exist.')
            return None
            
            

In [61]:
llm_extraction_execution(processed_name = '9fin', 
                         include_additional_context = True, 
                         overwrite = True)

Company: 9fin; Information extraction begins.
Company: 9fin; Estimated Cost: $0.01919
Company: 9fin; Pitchbook description obtained: Developer of a data analytics platform designed to provide AI-powered financial data. The company's platform uses computer vision and machine learning to find key data on high-yield bonds and the companies who issue them along with up-to-date tracking and monitoring of deals, real-time market news and events, and price data on bonds displayed as charts for the fixed-income market, enabling income professionals to save time and make proper investment decisions.
Company: 9fin; PART 1 - Initial extraction is completed.
Company: 9fin; PART 2 - Information validation is completed.


{'product_descriptions': [{'name': 'Data & Analytics Platform',
   'description': 'Provides AI-powered financial data and analytics. Features real-time market news, key data on high-yield bonds, deal tracking, financial profiles, predictive analytics, search functionality, and ESG data.'},
  {'name': 'Comparables',
   'description': 'Benchmark prior transactions, bonds, loans, or company profiles using over 300 credit metrics.'},
  {'name': 'Earnings',
   'description': 'AI transcripts and instant analysis for earnings reports.'},
  {'name': 'Search',
   'description': 'Powerful search tool for thousands of documents text-searchable by any keyword or phrase.'},
  {'name': 'ESG',
   'description': 'A full suite of Environmental, Social, and Governance data and analysis.'},
  {'name': 'Distressed and Restructuring',
   'description': 'Tools to spot undervalued credits and potential future restructurings.'},
  {'name': 'News',
   'description': 'Aggregates news from 2,000 sources using AI

In [62]:
for company in list(sample.processed_name):
    if company in ['9fin']:
        continue
    llm_extraction_execution(processed_name = company, 
                        include_additional_context = True, 
                        overwrite = True)
        

Company: vertice; Information extraction begins.
Company: vertice; Estimated Cost: $0.016800000000000002
Company: vertice; Pitchbook description obtained: Developer a spend optimization platform designed to help save on annual software expenditure. The company's platform empowers companies of every size and industry to get more visibility and control of their software and cloud spending and leverages automation to deliver guaranteed cost savings, enabling companies to view, control, and save on both Software as a service (SaaS) and cloud costs with an integrated unified offering.


Company: vertice; PART 1 - Initial extraction is completed.
Company: vertice; PART 2 - Information validation is completed.
Company: massive; Information extraction begins.
Company: massive; Estimated Cost: $0.004515
Company: massive; Pitchbook description obtained: Developer of an online monetization platform designed to serve as an alternative to advertisements or charged subscriptions. The company's platform utilizes unused computing resources like a central processing unit, graphics processing unit, and bandwidth to establish a new way to make money and pay online, enabling companies to earn revenue without inconvenience to the user.
Company: massive; PART 1 - Initial extraction is completed.
Company: massive; PART 2 - Information validation is completed.
Company: additive; Information extraction begins.
Company: additive; Estimated Cost: $0.000315
Company: additive; Pitchbook description obtained: Developer of a tax workflow management platform designed to help tax professionals t

In [80]:

for processed_name in tqdm(list(sample.processed_name)):
    
    extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
    initial_response = read_json_file(extraction_file_path)
    if initial_response['client_descriptions']:
        products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
        clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
        summary = initial_response['summary_product_description']

        validated_response = information_validation(products, clients, summary)
        print(f'Company: {processed_name}; PART 2 - Information validation is completed.')
        result['validated_client_descriptions'] = validated_response.dict()['client_descriptions']
        
    else:
        print(f'Company: {processed_name}; PART 2 - Skipped, due to lack of client information.')
        result['validated_client_descriptions'] = None
    
    current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
    result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
    result['processed_company'] = processed_name
    result['url'] = "https://" + get_additional_info(processed_name, 'processed_url')
    write_json_file(extraction_file_path, result)

  5%|▌         | 1/20 [00:11<03:34, 11.30s/it]

Company: vertice; PART 2 - Information validation is completed.


 10%|█         | 2/20 [00:15<02:13,  7.41s/it]

Company: massive; PART 2 - Information validation is completed.
Company: additive; PART 2 - Skipped, due to lack of client information.


 20%|██        | 4/20 [00:18<00:58,  3.65s/it]

Company: nexben; PART 2 - Information validation is completed.


 25%|██▌       | 5/20 [00:32<01:42,  6.84s/it]

Company: direct; PART 2 - Information validation is completed.


 30%|███       | 6/20 [00:46<02:06,  9.03s/it]

Company: ledger_works; PART 2 - Information validation is completed.


 35%|███▌      | 7/20 [01:19<03:31, 16.27s/it]

Company: vesta; PART 2 - Information validation is completed.
Company: niloom_ai; PART 2 - Skipped, due to lack of client information.


 45%|████▌     | 9/20 [01:22<01:42,  9.31s/it]

Company: hemlane; PART 2 - Information validation is completed.


 50%|█████     | 10/20 [01:27<01:23,  8.38s/it]

Company: harness_data_intelligence; PART 2 - Information validation is completed.
Company: dexit; PART 2 - Skipped, due to lack of client information.


 60%|██████    | 12/20 [01:34<00:50,  6.26s/it]

Company: naturealpha; PART 2 - Information validation is completed.


 65%|██████▌   | 13/20 [01:36<00:36,  5.23s/it]

Company: missionmark; PART 2 - Information validation is completed.


 70%|███████   | 14/20 [01:59<00:57,  9.65s/it]

Company: bennie; PART 2 - Information validation is completed.


 75%|███████▌  | 15/20 [02:12<00:52, 10.53s/it]

Company: infogrid; PART 2 - Information validation is completed.


 80%|████████  | 16/20 [02:33<00:53, 13.34s/it]

Company: new_constructs; PART 2 - Information validation is completed.


 85%|████████▌ | 17/20 [02:51<00:43, 14.62s/it]

Company: 9fin; PART 2 - Information validation is completed.


 90%|█████████ | 18/20 [02:56<00:24, 12.05s/it]

Company: adaptive; PART 2 - Information validation is completed.


 95%|█████████▌| 19/20 [03:17<00:14, 14.66s/it]

Company: estimize; PART 2 - Information validation is completed.


100%|██████████| 20/20 [03:18<00:00,  9.93s/it]

Company: aer_compliance; PART 2 - Information validation is completed.





In [79]:
sample

Unnamed: 0,company,processed_name,url,related_urls_str,related_urls,num_of_related_urls,all_urls,num_of_all_urls
5,Vertice,vertice,https://www.vertice.one,https://www.vertice.one/product/saas-purchasin...,['https://www.vertice.one/product/saas-purchas...,6,['https://www.vertice.one/explore/cloud-manage...,31
6,Massive,massive,https://www.joinmassive.com,"https://www.joinmassive.com/casestudies,https:...","['https://www.joinmassive.com/casestudies', 'h...",3,"['https://www.joinmassive.com/faq#users', 'htt...",25
14,Additive,additive,https://www.additive.ai,https://www.additive.ai,['https://www.additive.ai'],1,"['https://www.additive.ai', 'https://www.addit...",4
105,Nexben,nexben,https://www.nexben.com,https://www.nexben.com/payment-solutions/ichra...,['https://www.nexben.com/payment-solutions/ich...,16,"['https://www.nexben.com/about/meet-the-team',...",32
142,Direct,direct,https://www.directsoftware.com,"https://www.directsoftware.com/partners,https:...","['https://www.directsoftware.com/partners', 'h...",13,"['https://www.directsoftware.com/partners', 'h...",16
168,Ledger Works,ledger_works,https://www.lworks.io,"https://www.lworks.io/customers-partners,https...","['https://www.lworks.io/customers-partners', '...",10,"['https://www.lworks.io/customers-partners', '...",20
196,Vesta,vesta,https://www.vesta.com,"https://www.vesta.com/partners,https://www.ves...","['https://www.vesta.com/partners', 'https://ww...",3,"['https://www.vesta.com/privacy', 'https://www...",9
197,Niloom.ai,niloom_ai,https://www.niloom.ai,https://www.niloom.ai,['https://www.niloom.ai'],1,"['https://www.niloom.ai', 'https://www.niloom....",7
226,Hemlane,hemlane,https://www.hemlane.com,"https://www.hemlane.com/realtor-partners/,http...","['https://www.hemlane.com/realtor-partners/', ...",2,['https://www.hemlane.com/features/rental-adve...,31
247,Harness Data Intelligence,harness_data_intelligence,https://www.harnessproperty.com,https://www.harnessproperty.com/search/service...,['https://www.harnessproperty.com/search/servi...,2,"['https://www.harnessproperty.com/contact-us',...",31


In [68]:
extraction_file_path = f'extraction_output_v2/9fin_extraction.json'
initial_response = read_json_file(extraction_file_path)

In [71]:
products = initial_response['product_descriptions']
product_info = "\n".join([f"Product: {p['name']}; Description: {p['description']}" for p in products])

product_info

'Product: Data & Analytics Platform; Description: Provides AI-powered financial data and analytics. Features real-time market news, key data on high-yield bonds, deal tracking, financial profiles, predictive analytics, search functionality, and ESG data.\nProduct: Comparables; Description: Benchmark prior transactions, bonds, loans, or company profiles using over 300 credit metrics.\nProduct: Earnings; Description: AI transcripts and instant analysis for earnings reports.\nProduct: Search; Description: Powerful search tool for thousands of documents text-searchable by any keyword or phrase.\nProduct: ESG; Description: A full suite of Environmental, Social, and Governance data and analysis.\nProduct: Distressed and Restructuring; Description: Tools to spot undervalued credits and potential future restructurings.\nProduct: News; Description: Aggregates news from 2,000 sources using AI and delivers it quickly.\nProduct: Financials; Description: Full financial profiles with 3 statements, K

In [73]:
products

[{'name': 'Data & Analytics Platform',
  'description': 'Provides AI-powered financial data and analytics. Features real-time market news, key data on high-yield bonds, deal tracking, financial profiles, predictive analytics, search functionality, and ESG data.'},
 {'name': 'Comparables',
  'description': 'Benchmark prior transactions, bonds, loans, or company profiles using over 300 credit metrics.'},
 {'name': 'Earnings',
  'description': 'AI transcripts and instant analysis for earnings reports.'},
 {'name': 'Search',
  'description': 'Powerful search tool for thousands of documents text-searchable by any keyword or phrase.'},
 {'name': 'ESG',
  'description': 'A full suite of Environmental, Social, and Governance data and analysis.'},
 {'name': 'Distressed and Restructuring',
  'description': 'Tools to spot undervalued credits and potential future restructurings.'},
 {'name': 'News',
  'description': 'Aggregates news from 2,000 sources using AI and delivers it quickly.'},
 {'name':