In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *
from tqdm import tqdm
import glob
import tiktoken as tiktoken
import instructor
from pydantic import BaseModel
import instructor
from openai import OpenAI
import ast
from datetime import datetime
import pytz

current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
print(current_dateTime.strftime(format = "%Y-%m-%d %H:%M"))

2024-07-28 18:39


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.schema import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import json

- gpt-4o: "o200k_base",
- gpt-4: "cl100k_base",
- gpt-3.5-turbo: "cl100k_base",
- gpt-3.5: "cl100k_base",  # Common shorthand
- gpt-35-turbo : "cl100k_base",  # Azure deployment name

gpt-4o US$5.00 / 1M input tokens； US$15.00 / 1M output tokens

gpt-4o context length: 128K tokens


In [5]:
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding)

<Encoding 'o200k_base'>


In [14]:
df_all = pd.read_csv('data/PitchBook_All_Columns_2024_07_04_14_48_36_accessibility.csv')
df_all = df_all[~df_all['business_status'].isin(['Out of Business', 'Bankruptcy: Liquidation', 'Bankruptcy: Admin/Reorg'])]
df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
df_all = df_all[df_all['is_accessible'] == True]
df_all['processed_name'] = df_all['companies'].apply(process_company_name)
df_all.head()

Unnamed: 0,company_id,companies,company_former_name,company_legal_name,competitors,description,primary_industry_sector,primary_industry_group,primary_industry_code,all_industries,...,first_financing_valuation,first_financing_valuation_status,last_financing_valuation,last_financing_valuation_status,last_known_valuation,last_known_valuation_date,last_known_valuation_deal_type,processed_url,is_accessible,processed_name
0,55185-04,Estimize,,"Estimize, Inc.","Neudata, SigFig, Motif (Financial Software), Y...",Developer of an open financial estimates platf...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,6.34,Actual,,,36.0,16/07/2015,Early Stage VC,www.estimize.com,True,estimize
1,56288-62,New Constructs,,"New Constructs, LLC","Morningstar, CFRA, Finbox (Media and Informati...",Operator of an investment research firm intend...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,2.17,Actual,,,2.17,13/05/2003,Early Stage VC,www.newconstructs.com,True,new_constructs
3,53739-01,Procore Technologies,,"Procore Technologies, Inc.","Projectmates, eBuilder, CMiC",Procore Technologies Inc is a cloud-based cons...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Construction ...",...,4.0,Actual,8585.03,Estimated,8585.03,20/05/2021,IPO,www.procore.com,True,procore_technologies
5,153145-27,Proof,"16 Pins, Notarize","Notarize, Inc.","Templafy, ZorroSign, eOriginal, PandaDoc, Cong...",Developer of an identity-assured transaction m...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,46.5,Actual,,,760.0,25/03/2021,Later Stage VC,www.proof.com,True,proof
6,52304-77,SMS Assist,,"SMS Assist, L.L.C.","ServiceChannel, Divisions Maintenance Group, T...",Provider of business services intended to deli...,Business Products and Services (B2B),Commercial Services,Other Commercial Services,"Buildings and Property, Business/Productivity ...",...,,,950.0,Estimated,950.0,05/01/2023,Merger/Acquisition,www.smsassist.com,True,sms_assist


In [8]:
def get_additional_info(processed_name:str, column_name:str):
    df_all = pd.read_csv('data/PitchBook_All_Columns_2024_07_04_14_48_36_accessibility.csv')
    df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
    df_all['processed_name'] = df_all['companies'].apply(process_company_name)
    
    df_select = df_all[df_all['processed_name'] == processed_name]
    if len(df_select) > 0:
        return df_select[column_name].iloc[0]
    else:
        return None

In [33]:
df = pd.read_csv('companies_urls_info.csv')
sample = df[~df['url'].isin(['https://www.vertice.one', 
                    'https://www.estimize.com',
                    'https://www.newconstructs.com',
                    'https://www.chargebee.com',
                    'https://www.bennie.com',
                    'https://www.aercompliance.com',
                    'https://www.missionmark.com',
                    'https://www.joinmassive.com',
                    'https://www.hemlane.com',
                    'https://www.vesta.com',
                    'https://www.adaptive.build',
                    'https://www.additive.ai',
                    'https://www.9fin.com',
                    'https://www.niloom.ai',
                    'https://www.nexben.com',
                    'https://www.naturealpha.ai',
                    'https://www.lworks.io',
                    'https://www.infogrid.io',
                    'https://www.harnessproperty.com',
                    'https://www.directsoftware.com',
                    'https://www.dexitcorp.com',
                    'https://www.bankerslab.com',
                    'https://www.avyst.com',
                    'https://www.aggregion.com',
                    'https://www.validifi.com',
                    'https://www.revvin.com',
                    'https://www.gotyou.co',
                    'https://www.credenza3.com',
                    'https://www.concertocard.com',
                    'https://www.element.io',
                    'https://www.deepview.com',
                    'https://www.realgrader.com',
                    'https://www.fanpage.com',
                    'https://www.insurgrid.com',
                    'https://www.cobalt.pe',
                    'https://www.soundout.com',
                    'https://www.imoto.com',
                    'https://www.ontheupper.com',
                    'https://www.getzorba.com',
                    'https://www.paralian.io',
                    'https://www.gzi.finance',
                    'https://www.retailmarketpoint.com',
                    'https://www.yardikube.com',
                    'https://www.getwats.com',
                    'https://www.truelytics.com',
                    'https://www.trykintsugi.com',
                    'https://www.veruna.com',
                    'https://www.tailpath.com',
                    'https://www.worksmith.com',
                    'https://www.go-maestro.com',
                    'https://www.goblueswipe.com',
                    'https://www.useink.com',
                    'https://www.verdata.com',
                    'https://www.beauhurst.com',
                    'https://www.saltmine.com',
                    'https://www.nammu21.com',
                    'https://www.alaffiahealth.com',
                    'https://www.bookingpal.com',
                    'https://www.metrika.co',
                    'https://www.accumula.com',
                    'https://www.flowfi.com',
                    'https://www.prodeal360.com',
                    'https://www.krowdit.com',
                    'https://www.jibtechnologies.com',
                    'https://www.fieldwire.com',
                    'https://www.commercesync.com',
                    'https://www.arcana.io',
                    'https://www.copernicspace.com',
                    'https://www.youattest.com',
                    'https://www.pilotbird.com',
                    'https://www.pocketbook.tech',
                    'https://www.chainlinklabs.com',
                    'https://www.proper.ai',
                    'https://www.layer.team',
                    'https://www.veriphyanalytics.com',
                    'https://www.wearegroov.io',
                    'https://www.buildstock.com',
                    'https://www.scriptainsights.com',
                    'https://www.solisolutions.net',
                    'https://www.titanpay.ai',
                    'https://www.herondata.io',
                    'https://www.locatestrategy.com',
                    'https://www.hopemacy.com',
                    'https://www.smartpayllc.com',
                    'https://www.prospectnow.com',
                    'https://www.hashku.com',
                    'https://www.prismdata.com',
                    'https://www.taxometry.com',
                    'https://www.r3vl.xyz',
                    'https://www.avantarisk.com',
                    'https://www.every.io',
                    'https://www.joot.io',
                    'https://www.buildops.com',
                    'https://www.downtobid.com',
                    'https://www.plural.ai',
                    'https://www.bitwage.com',
                    'https://www.gorodeo.app',
                    'https://www.ledgible.io',
                    'https://www.artd.ai',
                    'https://www.acumatica.com',
                    'https://www.carby.cc',
                    'https://www.shibuya.film',
                    'https://www.trelora.com',
                    'https://www.regeo.co',
                    'https://www.sustainround.com',
                    'https://www.cherre.com',
                    'https://www.yottled.com',
                    'https://www.singularities.com',
                    'https://www.domoticsre.com',
                    'https://www.dexfreight.io',
                    'https://www.nue.io',
                    'https://www.atto.co '
                    ])]

sample = sample.iloc[:180]

In [36]:
sample.tail(10)

Unnamed: 0,company,processed_name,url,related_urls_str,related_urls,num_of_related_urls,all_urls,num_of_all_urls
269,Solidspac3,solidspac3,https://www.solidspac3.com,https://www.solidspac3.com,['https://www.solidspac3.com'],1,"['https://www.solidspac3.com', 'https://www.so...",2
270,Gravity Software,gravity_software,https://www.gogravity.com,https://www.gogravity.com/product/functionalit...,['https://www.gogravity.com/product/functional...,44,['https://www.gogravity.com/industries/investm...,77
271,IPGen,ipgen,https://www.ipgen.io,https://www.ipgen.io,['https://www.ipgen.io'],1,"['https://www.ipgen.io/law-firms/', 'https://w...",18
272,The Booking Factory,the_booking_factory,https://www.thebookingfactory.com,https://www.thebookingfactory.com/services#par...,['https://www.thebookingfactory.com/services#p...,10,['https://www.thebookingfactory.com/white-labe...,32
273,Combat IQ,combat_iq,https://www.combatiq.io,https://www.combatiq.io,['https://www.combatiq.io'],1,"['https://www.combatiq.io/schedule-demo', 'htt...",7
274,Anybill,anybill,https://www.anybill.com,"https://www.anybill.com/services,https://www.a...","['https://www.anybill.com/services', 'https://...",4,"['https://www.anybill.com/careers', 'https://w...",12
275,Caligotech,caligotech,https://www.caligotech.com,https://www.caligotech.com,['https://www.caligotech.com'],1,"['https://www.caligotech.com/careers', 'https:...",9
276,iLumen,ilumen,https://www.ilumen.com,"https://www.ilumen.com/case-studies,https://ww...","['https://www.ilumen.com/case-studies', 'https...",5,"['https://www.ilumen.com/case-studies', 'https...",19
277,iknowa,iknowa,https://www.iknowa.com,https://www.iknowa.com,['https://www.iknowa.com'],1,"['https://www.iknowa.com/', 'https://www.iknow...",2
278,Loanbase,loanbase,https://www.loanbase.com,"https://www.loanbase.com/case-studies/,https:/...","['https://www.loanbase.com/case-studies/', 'ht...",2,"['https://www.loanbase.com#brokers', 'https://...",16


In [12]:
sample[sample['num_of_related_urls']==1]

Unnamed: 0,company,processed_name,url,related_urls_str,related_urls,num_of_related_urls,all_urls,num_of_all_urls
91,Rental Beast,rental_beast,https://www.rentalbeast.com,https://www.rentalbeast.com,['https://www.rentalbeast.com'],1,['https://www.rentalbeast.com/about-rental-bea...,25
92,Nophin,nophin,https://www.nophin.com,https://www.nophin.com,['https://www.nophin.com'],1,"['https://www.nophin.com/terms-of-use', 'https...",4
93,Candidly,candidly,https://www.getcandidly.com,https://www.getcandidly.com,['https://www.getcandidly.com'],1,"['https://www.getcandidly.com', 'https://www.g...",2
96,Mango REIX,mango_reix,https://www.mangoreix.com,https://www.mangoreix.com,['https://www.mangoreix.com'],1,['https://www.mangoreix.com/_files/ugd/b3a289_...,3
98,Next Quarter,next_quarter,https://www.nextq.ai,https://www.nextq.ai,['https://www.nextq.ai'],1,"['https://www.nextq.ai', 'https://www.nextq.ai...",5
...,...,...,...,...,...,...,...,...
256,Howsy,howsy,https://www.howsy.com,https://www.howsy.com,['https://www.howsy.com'],1,['https://www.howsy.com'],1
258,Terrene Labs,terrene_labs,https://www.terrenelabs.com,https://www.terrenelabs.com,['https://www.terrenelabs.com'],1,['https://www.terrenelabs.com'],1
262,AuthorLoyalty,authorloyalty,https://www.authorloyalty.com,https://www.authorloyalty.com,['https://www.authorloyalty.com'],1,['https://www.authorloyalty.com'],1
269,Solidspac3,solidspac3,https://www.solidspac3.com,https://www.solidspac3.com,['https://www.solidspac3.com'],1,"['https://www.solidspac3.com', 'https://www.so...",2


In [23]:
doc_list = os.listdir('scraping_output_v2_raw')
for doc in doc_list:
    
    try: 
        data = read_json_file(f'scraping_output_v2_raw/{doc}')
        if 'timestamp' not in data:
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            data['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M")
            
        if "processed_company" not in data:
            process_name = doc.replace('.json', '')
            data["processed_company"] = process_name
            
        if "url" not in data:
            data["url"] = "https://" + get_additional_info(process_name, 'processed_url')
        
        write_json_file(f'scraping_output_v2_raw/{doc}', data)
    except Exception as e:
        print(f'Error occurs on {doc}: {e}')

Error occurs on .DS_Store: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte


In [None]:
for index, row in tqdm(sample.iterrows(), total=len(sample), desc="Scraping data", position=0, leave=True):
    base_url = row['url']
    url_list = ast.literal_eval(row['related_urls'])
    result = crawl_data(base_url, url_list, f'scraping_output_v2_raw/{row["processed_name"]}.json', overwrite=False)

In [15]:
for url, content in data.items():
    print(url)
    print(f'Estimated GPT4-o cost: ${calculate_cost(data[url])}')
    print(f'Estimated GPT4-o cost after cleaning: ${calculate_cost(clean_scraped_content(data[url]))}')
    print('------------------------')
    

/realtor-partners/
Estimated GPT4-o cost: $0.08219499999999999
Estimated GPT4-o cost after cleaning: $0.00167
------------------------
main_page
Estimated GPT4-o cost: $0.01288
Estimated GPT4-o cost after cleaning: $0.005535
------------------------


### Exploration of first shorten the page by extracting relevant information
Issue: The output of the content might be shorten too much

In [9]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import os
import json


def llm_summary(text, model_name="gpt-4o"):
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract information 
    from the given text and convert it into a text (string) format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing. Do not hallucinate.
    """

    # Define the extraction prompt
    extraction_prompt = """
    You are provided with a text obtained from a company's webpage. Your task is to extract any sections or paragraphs that are relevant to the specified information of interest.

    ## Information of Interest:

    1. **About Product or Service**:
    - Any details about the products or services the company offers, including their features.

    2. **About Partner or Client**:
    - Any information about the company's partners or clients.
    - Any use cases (case studies) describing how a client is using the company's product or service.
    
    ## Note:
    Sometimes, the company does not explicit describe their clients and the client use case, instead, they will only display clients' logos. 
    You then need to extract client's name from their logos. 
    
    ## Instructions:
    - Do not summarize the content. Extract the raw lines or sections as they are.
    - If you are unsure about the relevance of the information, include it to ensure comprehensive coverage.
    - Output the extracted information in standard text format.

    ## Examples:

    ### Example 1: Product or Service
    If the input text contains:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    The output should be:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    ### Example 2: Client Logos
    If the input text contains:
    "Our platform and service is trusted by these innovative companies:
    ![Nationwide Logo]
    ![Freedom 365 Logo]
    ![Bestow Logo]
    ..."
    
    The output should be:
    "Our platform and service is trusted by these innovative companies: 
    Clients are: Nationwide, Freedom 365, Bestow..."
   
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("system", extraction_prompt),
            ("human", "Use the given text to extract information: {input}"),
            ("human", """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - If no information is provided, return nothing.
                - DO NOT HALLUCINATE.
             """),
        ]
    )
    
    llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                    temperature=0, 
                    model_name=model_name)

    llm_chain = prompt | llm | StrOutputParser()

    response = llm_chain.invoke({'input': text})
    
    return response


def llm_summary_execution(processed_name:str, 
                          scrape_file_path:str,
                          summary_file_path:str,
                          overwrite:bool = False, 
                          model_name:str = 'gpt-4o-mini'):

    scrape_data = read_json_file(scrape_file_path)
    
    file_modified = False

    # Load existing data if the file exists
    if os.path.exists(summary_file_path):
        with open(summary_file_path, 'r') as file:
            extracted_data = json.load(file)
    else:
        extracted_data = {}

    for endpoint, content in tqdm(scrape_data.items(), total=len(scrape_data), desc="Extracting data", position=0, leave=True):
        if endpoint in ['timestamp', 'processed_company', 'url']:
            continue
        if endpoint in extracted_data and not overwrite:
            print(f"Company: {processed_name}; Skipping {endpoint} as it already exists and overwrite is set to False.")
            continue  # Skip this URL and move to the next one
        else:
            clean_content = clean_scraped_content(content)
            extracted_data[endpoint] = llm_summary(text = clean_content, model_name = model_name)
            print(f'Company: {processed_name}; Content in {endpoint} is extracted.')
            
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            extracted_data['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            file_modified = True
    
    if file_modified:
        extracted_data['processed_company'] = processed_name
        extracted_data['url'] = "https://" + get_additional_info(processed_name, 'processed_url')
        write_json_file(summary_file_path, extracted_data)
        
    return extracted_data


In [10]:
# Example usage
processed_name = 'the_booking_factory'
scrape_file_path = f'scraping_output_v2_raw/{processed_name}.json'
summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'

response = llm_summary_execution(processed_name = processed_name,
                                 scrape_file_path = scrape_file_path,
                                 summary_file_path = summary_file_path)

Extracting data:  31%|███       | 4/13 [00:09<00:22,  2.49s/it]

Company: the_booking_factory; Content in /services#partner-plan is extracted.


Extracting data:  38%|███▊      | 5/13 [00:15<00:26,  3.37s/it]

Company: the_booking_factory; Content in /services#rev-plus is extracted.


Extracting data:  46%|████▌     | 6/13 [00:20<00:26,  3.79s/it]

Company: the_booking_factory; Content in /customer-agreement is extracted.


Extracting data:  54%|█████▍    | 7/13 [00:26<00:26,  4.34s/it]

Company: the_booking_factory; Content in /services#hotel-it is extracted.


Extracting data:  62%|██████▏   | 8/13 [00:32<00:24,  4.96s/it]

Company: the_booking_factory; Content in /services is extracted.


Extracting data:  69%|██████▉   | 9/13 [00:39<00:21,  5.37s/it]

Company: the_booking_factory; Content in /services#accounting-services is extracted.


Extracting data:  77%|███████▋  | 10/13 [00:45<00:16,  5.55s/it]

Company: the_booking_factory; Content in /services#basic-plan is extracted.


Extracting data:  85%|████████▍ | 11/13 [00:50<00:11,  5.55s/it]

Company: the_booking_factory; Content in /partner-plan is extracted.


Extracting data:  92%|█████████▏| 12/13 [00:58<00:06,  6.09s/it]

Company: the_booking_factory; Content in main_page is extracted.


Extracting data: 100%|██████████| 13/13 [01:04<00:00,  4.97s/it]

Company: the_booking_factory; Content in /services#bf-web is extracted.





## Instructor

https://github.com/jxnl/instructor

### Prompting Chains

In [3]:
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

class ProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of product')
    description: str = Field(..., alias='concise features description of the product or service')

class SummaryProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of the main product offerings of the company')
    description: str = Field(..., alias='summary of product offering of the company')

class ClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ExtractedInformation(BaseModel):
    product_descriptions: Optional[List[ProductDescription]] = None
    summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ClientDescription]] = None
    
class ValidatedClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    entity_type: Literal["person", "company", "general_entity", "other", "school"]
    product_used: Optional[str] = Field(None, alias='summary of the product or service used by the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ValidatedExtractedInformation(BaseModel):
    # product_descriptions: Optional[List[ProductDescription]] = None
    # summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ValidatedClientDescription]] = None


In [4]:
import instructor
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


def initial_extraction(text: str, model_name: str = 'gpt-4o', additional_context: str = None) -> ExtractedInformation:
    
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
    from the given text and convert it into a structured format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    custom_extraction_prompt = """
    Extract the following information from the text extracted from a webpage of a company:

    1. Product Description:
    - What service or product does the company provide?
    - What features does the product or service have?
    Note: If the company has more than one product or service, automatically detect and list each product with its relevant details.
    
    2. Summary of Product Offering:
    - Summary of the description of the service that the company provide, taking into consideration of all the product offerings.
    Note: Do not include any company-specific information in the summary, such as company name and location.
    
    3. Client Description:
    - Name of the corporate client or partner. 
    - Description of the use case.
    Note: Focus on the extraction of company's name, instead of individuals.
    Note: If the description of the use case is not mentioned, it should be None.
    

    Output in a structured format.
    """
    
    rule_prompt = """
                Here are the rules that you need to adhere:
                    ## Rules:
                    - The aim is to achieve simplicity and clarity in the extracted text.
                    - Make sure to answer in the structured format.
                    - If no information is provided for any of the fields, return nothing of that field.
                    - DO NOT HALLUCINATE.
                """
    
    extraction_prompt = f"""
    {system_message}
    {custom_extraction_prompt}
    """
    
    if additional_context:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": f"""Here are some additional descriptions about this company for your reference:
                                                {additional_context}"""},
                {"role": "user", "content": rule_prompt}
            ]
        )
        
    else:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": rule_prompt}
            ]
        )
    return response



In [5]:
def information_validation(products: list, clients: list, summary: dict, model_name: str = 'gpt-4o') -> ValidatedExtractedInformation:
    
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to validate the client information, classify the client names into different entity types, and determine which product is likely used by the client. 
    The output response should contain only the data validated and assigned, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    product_info = "\n".join([f"Product: {p['name']}; Description: {p['description']}" for p in products])
    client_info = "\n".join([f"Client: {c['name']}; Description: {c['description']}" for c in clients])
    summary_info = f"{summary['name']}: {summary['description']}"
    
    few_shot_examples = """
        ## Example 1:
        Client Name: Mike Johnson, CEO of TechCorp
        Entity_type: person
        - Reason: Mike Johnson is the name of a person. 
        
        ## Example 2:
        Client Name: Government
        Entity_type: general_entity
        - Reason: "Government" is a general entity, not a specific company.

        ## Example 3:
        Client Name: Innovative Solutions LLC
        Entity_type: company
        - Reason: Innovative Solutions LLC is a specific company name.
        
        ## Example 4:
        Client Name: A US resort
        Entity_type: general_entity
        - Reason: "A US resort" is a general description, not a specific company name.
    
        ## Example 5: 
        Client Name: University College London
        Entity_type: school
        - Reason: University College London is a specific school name.
    """

    validation_prompt = f"""
    {system_message}
    Here is the product information extracted:
    {product_info}
    
    Here is the summary of product offerings of the company:
    {summary_info}
    
    Here are the clients and their use cases:
    {client_info}
    
    Your task is to:
    1. Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
       Note: the entity type "company" should be given to specific companies, with company names.
    2. Based on the product descriptions and client use cases, assign the most likely product used by each client. 
       If you are not confident about which product the client uses, return None for that field.

    Here are some examples regarding the classifying clients into different entity types:
    {few_shot_examples}

    Output in a structured format.
    """
    
    response = client.chat.completions.create(
        model=model_name,
        response_model=ValidatedExtractedInformation,
        messages=[
            {"role": "system", "content": validation_prompt},
            {"role": "user", "content": """
                Here are the rules that you need to adhere:
                ## Rules:
                - Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
                - Assign the most likely product used by each client based on the provided product descriptions and use cases.
                - If the product used is not clear, return None for that field.
                - Make sure to answer in the structured format.
                - DO NOT HALLUCINATE.
            """},
        ]
    )
    return response


In [6]:
def llm_extraction_execution(processed_name:str, 
                             summary_file_path:str,
                             extraction_file_path:str, 
                             include_additional_context:bool = True, 
                             overwrite:bool = False):
    
    if not overwrite and os.path.exists(extraction_file_path):
        print(f"Company: {processed_name}; Skipping extraction as the extraction file already exists and overwrite is set to False.")
        return None
    else:
        if os.path.exists(summary_file_path):
            with open(summary_file_path, 'r') as file:
                summary = json.load(file)

            combined_summary = f"## Main Page:\n {summary['main_page']}\n----------------\n"

            for endpoint, text in summary.items():
                if endpoint not in ["main_page", "timestamp", "processed_company", "url"]:
                    combined_summary += f"## {endpoint}:\n{text}\n----------------\n"
            
            print(f"Company: {processed_name}; Information extraction begins.")
            if include_additional_context:
                context = get_additional_info(processed_name, 'description')
                
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary + context)}')
                print(f'Company: {processed_name}; Pitchbook description obtained: {context}')
                
                initial_response = initial_extraction(text = combined_summary, 
                                                additional_context = context).dict()
                
            else:
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary)}')
                initial_response = initial_extraction(text = combined_summary, 
                                            additional_context = None).dict()
            
            print(f'Company: {processed_name}; PART 1 - Initial extraction is completed.')
            
            result = initial_response
            
            if initial_response['client_descriptions']:
                products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
                clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
                summary = initial_response['summary_product_description']

                validated_response = information_validation(products, clients, summary)
                print(f'Company: {processed_name}; PART 2 - Information validation is completed.')
                result['validated_client_descriptions'] = validated_response.dict()['client_descriptions']
                
            else:
                print(f'Company: {processed_name}; PART 2 - Skipped, due to lack of client information.')
                result['validated_client_descriptions'] = None
            
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            result['processed_company'] = processed_name
            result['url'] = "https://" + get_additional_info(processed_name, 'processed_url')

            write_json_file(extraction_file_path, result)
            
            return result
        else:
            print(f'Summary file: {summary_file_path} does not exist.')
            return None

def add_client_url_to_extraction_output(processed_name:str, extraction_file_path:str, verbose:bool = False):
    data = read_json_file(extraction_file_path)
    
    if data['validated_client_descriptions']:
        for client in data['validated_client_descriptions']:
            if client['entity_type'] != 'company':
                client['url'] = None
            else:
                url = get_and_verify_client_link(client['name'], verbose = verbose)
                client['url'] = url
        print(f"Company: {processed_name}; Client is extracted.")
    else:
        print(f"Company: {processed_name}; No clients' information.")
    write_json_file(extraction_file_path, data)    

    
def get_embedding(text:str, embedding_model:str="text-embedding-3-small"):
   client_openai = OpenAI(api_key=os.getenv('OPENAI_KEY'))
   
   text = text.replace("\n", " ")
   return client_openai.embeddings.create(input = [text], model=embedding_model).data[0].embedding


def get_product_embedding(processed_name:str, extraction_file_path:str, embedding_model:str="text-embedding-3-small"):
    
    data = read_json_file(extraction_file_path)
    # Check wheather embedding has already been done
    if 'name_embedding' in data['summary_product_description']:
        print(f'Company: {processed_name}; Embedding has already been done.')
        pass
    else:
        product_lst = data['product_descriptions']
        for product in product_lst:
            product['description_embedding'] = get_embedding(text = product['description'],
                                                                embedding_model = embedding_model)
            product['name_embedding'] = get_embedding(text = product['name'],
                                                                embedding_model = embedding_model)

        summary_product = data['summary_product_description']
        summary_product['description_embedding'] = get_embedding(text = summary_product['description'],
                                                                embedding_model = embedding_model)
        summary_product['name_embedding'] = get_embedding(text = summary_product['name'],
                                                                embedding_model = embedding_model)
        print(f'Company: {processed_name}; Embedding is completed.')
        write_json_file(extraction_file_path, data)
    
    return data

def update_client_list(processed_name:str, extraction_file_path:str, client_file_path:str = 'data/client_info.json', verbose:bool = False):
    
    data = read_json_file(extraction_file_path)
    client_info = read_json_file(client_file_path)
        
    if data['validated_client_descriptions']:
        try:        
            for client in data['validated_client_descriptions']:
                if client['entity_type'] != 'company':
                    continue
                # If a company's name already exists in the dictionary and the url is unchanged
                if client['name'] in client_info and client['url'] == client_info[client['name']]['url'] :
                    # If its service provider does not appear in the saved list, then append it
                    if processed_name not in client_info[client['name']]['service_provider_processed']:
                        client_info[client['name']]['service_provider_processed'].append(processed_name)
                        client_info[client['name']]['service_provider'].append(get_additional_info(processed_name, 'companies'))
                        client_info[client['name']]['service_provider_url'].append('https://' + get_additional_info(processed_name, 'processed_url'))
                    else:
                        if verbose:
                            print(f'Company {client["name"]} has already been recorded.')
                
                # If a company's name already does not exist, add the new company
                else:
                    client_info[client['name']] = {'processed_name': process_company_name(client['name']),
                                        'url': client['url'],
                                        'service_provider_processed': [processed_name],
                                        'service_provider': [get_additional_info(processed_name, 'companies')],
                                        'service_provider_url': ['https://' + get_additional_info(processed_name, 'processed_url')]
                                        }
            print(f"Company: {data['processed_company']}; Clients information is updated.")
            write_json_file(client_file_path, client_info)
        except Exception as e:
            print(f'Company: {processed_name}; Error occurred: {e}')
    else:
        print(f'Company: {processed_name}; No clients to be updated')
    

In [9]:
# Example usage
processed_name = 'the_booking_factory'
summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'

llm_extraction_execution(processed_name = processed_name,
                         summary_file_path = summary_file_path,
                         extraction_file_path = extraction_file_path, 
                         include_additional_context = True, 
                         overwrite = False)

add_client_url_to_extraction_output(processed_name = processed_name,
                                    extraction_file_path = extraction_file_path)

get_product_embedding(processed_name = processed_name,
                      extraction_file_path = extraction_file_path)

update_client_list(processed_name = processed_name,
                   extraction_file_path = extraction_file_path,
                   client_file_path = 'data/client_info.json')

Company: the_booking_factory; Information extraction begins.
Company: the_booking_factory; Estimated Cost: $0.024290000000000003
Company: the_booking_factory; Pitchbook description obtained: Developer of a property management platform designed to advertise and conduct hotel operations. The company's platform helps businesses to advertise their hotel rooms for bookings, develop websites for customers with a friendly user interface as well as systems to manage hotel properties, enabling large resorts and small hotel owners to manage and improve their hotel business in an efficient manner.
Company: the_booking_factory; PART 1 - Initial extraction is completed.
Company: the_booking_factory; PART 2 - Information validation is completed.
Company: the_booking_factory; Client is extracted.
Company: the_booking_factory; Embedding is completed.
Company: the_booking_factory; Clients information is updated.


In [10]:
doc_list = os.listdir('extraction_summary_v2')
for doc in doc_list:
    processed_name = doc.replace('_summary_str.json', '')
    try:
        llm_extraction_execution(processed_name = processed_name, 
                        include_additional_context = True, 
                        overwrite = False)
        add_client_url_to_extraction_output(processed_name = processed_name)
        get_product_embedding(processed_name = processed_name)
        update_client_list(processed_name = processed_name)
        
    except Exception as e:
        print(f'Error occured on company {processed_name}: {e}')

Company: bankerslab; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: bankerslab; No clients' information.
Company: bankerslab; Embedding has already been done.
Company: bankerslab; No clients to be updated
Company: estimize; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: estimize; Client is extracted.
Company: estimize; Embedding has already been done.
Company: estimize; Clients information is updated.
Company: avyst; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: avyst; Client is extracted.
Company: avyst; Embedding has already been done.
Company: avyst; Clients information is updated.
Company: bennie; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: bennie; Client is extracted.
Company: bennie; Embedding has already been done.
Company: bennie; Clients information is updated.
Company: ch

In [11]:
update_client_list('hemlane')

Company: hemlane; Clients information is updated.


In [None]:
# Code used to re-validate the client description:

processed_name = 'vertice'
    
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
initial_response = read_json_file(extraction_file_path)
if initial_response['client_descriptions']:
    products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
    clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
    summary = initial_response['summary_product_description']

    validated_response = information_validation(products, clients, summary)
    print(f'Company: {processed_name}; PART 2 - Information validation is completed.')
    result['validated_client_descriptions'] = validated_response.dict()['client_descriptions']
    
else:
    print(f'Company: {processed_name}; PART 2 - Skipped, due to lack of client information.')
    result['validated_client_descriptions'] = None

current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
result['processed_company'] = processed_name
result['url'] = "https://" + get_additional_info(processed_name, 'processed_url')
write_json_file(extraction_file_path, result)

In [68]:
extraction_file_path = f'extraction_output_v2/9fin_extraction.json'
initial_response = read_json_file(extraction_file_path)

In [71]:
products = initial_response['product_descriptions']
product_info = "\n".join([f"Product: {p['name']}; Description: {p['description']}" for p in products])

product_info

'Product: Data & Analytics Platform; Description: Provides AI-powered financial data and analytics. Features real-time market news, key data on high-yield bonds, deal tracking, financial profiles, predictive analytics, search functionality, and ESG data.\nProduct: Comparables; Description: Benchmark prior transactions, bonds, loans, or company profiles using over 300 credit metrics.\nProduct: Earnings; Description: AI transcripts and instant analysis for earnings reports.\nProduct: Search; Description: Powerful search tool for thousands of documents text-searchable by any keyword or phrase.\nProduct: ESG; Description: A full suite of Environmental, Social, and Governance data and analysis.\nProduct: Distressed and Restructuring; Description: Tools to spot undervalued credits and potential future restructurings.\nProduct: News; Description: Aggregates news from 2,000 sources using AI and delivers it quickly.\nProduct: Financials; Description: Full financial profiles with 3 statements, K

In [73]:
products

[{'name': 'Data & Analytics Platform',
  'description': 'Provides AI-powered financial data and analytics. Features real-time market news, key data on high-yield bonds, deal tracking, financial profiles, predictive analytics, search functionality, and ESG data.'},
 {'name': 'Comparables',
  'description': 'Benchmark prior transactions, bonds, loans, or company profiles using over 300 credit metrics.'},
 {'name': 'Earnings',
  'description': 'AI transcripts and instant analysis for earnings reports.'},
 {'name': 'Search',
  'description': 'Powerful search tool for thousands of documents text-searchable by any keyword or phrase.'},
 {'name': 'ESG',
  'description': 'A full suite of Environmental, Social, and Governance data and analysis.'},
 {'name': 'Distressed and Restructuring',
  'description': 'Tools to spot undervalued credits and potential future restructurings.'},
 {'name': 'News',
  'description': 'Aggregates news from 2,000 sources using AI and delivers it quickly.'},
 {'name':

## Get client

In [12]:
client_info = read_json_file('data/client_info.json')

doc_list = os.listdir('extraction_output_v2')

for doc in tqdm(doc_list):
    try:
        processed_name = doc.replace('_extraction.json', '')
        data = read_json_file(f'extraction_output_v2/{doc}')

        for client in data['validated_client_descriptions']:
            if client['entity_type'] != 'company':
                continue
            # If a company's name already exists in the dictionary
            if client['name'] in client_info:
                
                # If its service provider does not appear in the saved list, then append it
                if processed_name not in client_info[client['name']]['service_provider_processed']:
                    client_info[client['name']]['service_provider_processed'].append(processed_name)
                    client_info[client['name']]['service_provider'].append(get_additional_info(processed_name, 'companies'))
                    client_info[client['name']]['service_provider_url'].append('https://' + get_additional_info(processed_name, 'processed_url'))
                else:
                    print(f'Company {client["name"]} has already been recorded.')
            
            # If a company's name already does not exist, add the new company
            else:
                client_info[client['name']] = {'processed_name': process_company_name(client['name']),
                                    'url': client['url'],
                                    'service_provider_processed': [processed_name],
                                    'service_provider': [get_additional_info(processed_name, 'companies')],
                                    'service_provider_url': ['https://' + get_additional_info(processed_name, 'processed_url')]
                                    }
        print(f"Company {data['processed_company']}'s clients are recorded.")
    except Exception as e:
        print(f'Company {processed_name} has error: {e}')
        
write_json_file('data/client_info.json', client_info)

100%|██████████| 28/28 [00:00<00:00, 355.27it/s]

Company 9fin's clients are recorded.
Company Bloomberg has already been recorded.
Company Apex Clearing has already been recorded.
Company IEX Cloud has already been recorded.
Company WisdomTree has already been recorded.
Company Viola Risk Advisors has already been recorded.
Company new_constructs's clients are recorded.
Company .DS_Store has error: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Company Green Way Homes has already been recorded.
Company Revent Builds has already been recorded.
Company Veldhouse Companies has already been recorded.
Company Drake Construction Services has already been recorded.
Company Riverside Homes has already been recorded.
Company Joseph Design & Build has already been recorded.
Company adaptive's clients are recorded.
Company HP2 RESIDENTIAL has already been recorded.
Company Fathom Realty, LLC has already been recorded.
Company B Wright At Home LLC has already been recorded.
Company hemlane's clients are recorded.
Compa




In [None]:
client_data = read_json_file('data/client_info.json')
for company, company_info in tqdm(client_data.items(), desc="Scraping data", position=0, leave=True):
    base_url = company_info['url']
    if not base_url:
        continue
    try:
        all_urls, related_urls = get_related_urls(base_url)
        if len(related_urls) > 10:
            related_urls = select_urls(related_urls, 10)
        result = crawl_data(base_url, related_urls, f'client_scraping_output/{company_info["processed_name"]}.json', overwrite=False)
    except Exception as e:
        print(f'Company {company} has error: {e}')

In [20]:
import shutil
client_data = read_json_file('data/client_info.json')
doc_list = os.listdir('scraping_output_v2_raw')

for company, company_info in client_data.items():
    filename = f"{company_info['processed_name']}.json"
    source_path = f'scraping_output_v2_raw/{filename}'
    destination_path = f'client_scraping_output/{filename}'
    if filename in doc_list:
        shutil.move(source_path, destination_path)
    


In [18]:
doc_list = os.listdir('scraping_output_v2_raw')
'bloomberg.json' in doc_list

True