In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *
from tqdm import tqdm
import glob
import tiktoken as tiktoken
import instructor
from pydantic import BaseModel
import instructor
from openai import OpenAI
import ast
from datetime import datetime
import pytz

current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
print(current_dateTime.strftime(format = "%Y-%m-%d %H:%M"))

2024-07-28 11:30


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.schema import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import json

- gpt-4o: "o200k_base",
- gpt-4: "cl100k_base",
- gpt-3.5-turbo: "cl100k_base",
- gpt-3.5: "cl100k_base",  # Common shorthand
- gpt-35-turbo : "cl100k_base",  # Azure deployment name

gpt-4o US$5.00 / 1M input tokens； US$15.00 / 1M output tokens

gpt-4o context length: 128K tokens


In [5]:
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding)

<Encoding 'o200k_base'>


In [14]:
df_all = pd.read_csv('data/PitchBook_All_Columns_2024_07_04_14_48_36_accessibility.csv')
df_all = df_all[~df_all['business_status'].isin(['Out of Business', 'Bankruptcy: Liquidation', 'Bankruptcy: Admin/Reorg'])]
df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
df_all = df_all[df_all['is_accessible'] == True]
df_all['processed_name'] = df_all['companies'].apply(process_company_name)
df_all.head()

Unnamed: 0,company_id,companies,company_former_name,company_legal_name,competitors,description,primary_industry_sector,primary_industry_group,primary_industry_code,all_industries,...,first_financing_valuation,first_financing_valuation_status,last_financing_valuation,last_financing_valuation_status,last_known_valuation,last_known_valuation_date,last_known_valuation_deal_type,processed_url,is_accessible,processed_name
0,55185-04,Estimize,,"Estimize, Inc.","Neudata, SigFig, Motif (Financial Software), Y...",Developer of an open financial estimates platf...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,6.34,Actual,,,36.0,16/07/2015,Early Stage VC,www.estimize.com,True,estimize
1,56288-62,New Constructs,,"New Constructs, LLC","Morningstar, CFRA, Finbox (Media and Informati...",Operator of an investment research firm intend...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,2.17,Actual,,,2.17,13/05/2003,Early Stage VC,www.newconstructs.com,True,new_constructs
3,53739-01,Procore Technologies,,"Procore Technologies, Inc.","Projectmates, eBuilder, CMiC",Procore Technologies Inc is a cloud-based cons...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Construction ...",...,4.0,Actual,8585.03,Estimated,8585.03,20/05/2021,IPO,www.procore.com,True,procore_technologies
5,153145-27,Proof,"16 Pins, Notarize","Notarize, Inc.","Templafy, ZorroSign, eOriginal, PandaDoc, Cong...",Developer of an identity-assured transaction m...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,46.5,Actual,,,760.0,25/03/2021,Later Stage VC,www.proof.com,True,proof
6,52304-77,SMS Assist,,"SMS Assist, L.L.C.","ServiceChannel, Divisions Maintenance Group, T...",Provider of business services intended to deli...,Business Products and Services (B2B),Commercial Services,Other Commercial Services,"Buildings and Property, Business/Productivity ...",...,,,950.0,Estimated,950.0,05/01/2023,Merger/Acquisition,www.smsassist.com,True,sms_assist


In [13]:
def get_additional_info(processed_name:str, column_name:str):
    df_all = pd.read_csv('data/PitchBook_All_Columns_2024_07_04_14_48_36_accessibility.csv')
    df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
    df_all['processed_name'] = df_all['companies'].apply(process_company_name)
    
    df_select = df_all[df_all['processed_name'] == processed_name]
    if len(df_select) > 0:
        return df_select[column_name].iloc[0]
    else:
        return None

In [33]:
df = pd.read_csv('companies_urls_info.csv')
sample = df[~df['url'].isin(['https://www.vertice.one', 
                    'https://www.estimize.com',
                    'https://www.newconstructs.com',
                    'https://www.chargebee.com',
                    'https://www.bennie.com',
                    'https://www.aercompliance.com',
                    'https://www.missionmark.com',
                    'https://www.joinmassive.com',
                    'https://www.hemlane.com',
                    'https://www.vesta.com',
                    'https://www.adaptive.build',
                    'https://www.additive.ai',
                    'https://www.9fin.com',
                    'https://www.niloom.ai',
                    'https://www.nexben.com',
                    'https://www.naturealpha.ai',
                    'https://www.lworks.io',
                    'https://www.infogrid.io',
                    'https://www.harnessproperty.com',
                    'https://www.directsoftware.com',
                    'https://www.dexitcorp.com',
                    'https://www.bankerslab.com',
                    'https://www.avyst.com',
                    'https://www.aggregion.com',
                    'https://www.validifi.com',
                    'https://www.revvin.com',
                    'https://www.gotyou.co',
                    'https://www.credenza3.com',
                    'https://www.concertocard.com',
                    'https://www.element.io',
                    'https://www.deepview.com',
                    'https://www.realgrader.com',
                    'https://www.fanpage.com',
                    'https://www.insurgrid.com',
                    'https://www.cobalt.pe',
                    'https://www.soundout.com',
                    'https://www.imoto.com',
                    'https://www.ontheupper.com',
                    'https://www.getzorba.com',
                    'https://www.paralian.io',
                    'https://www.gzi.finance',
                    'https://www.retailmarketpoint.com',
                    'https://www.yardikube.com',
                    'https://www.getwats.com',
                    'https://www.truelytics.com',
                    'https://www.trykintsugi.com',
                    'https://www.veruna.com',
                    'https://www.tailpath.com',
                    'https://www.worksmith.com',
                    'https://www.go-maestro.com',
                    'https://www.goblueswipe.com',
                    'https://www.useink.com',
                    'https://www.verdata.com',
                    'https://www.beauhurst.com',
                    'https://www.saltmine.com',
                    'https://www.nammu21.com',
                    'https://www.alaffiahealth.com',
                    'https://www.bookingpal.com',
                    'https://www.metrika.co',
                    'https://www.accumula.com',
                    'https://www.flowfi.com',
                    'https://www.prodeal360.com',
                    'https://www.krowdit.com',
                    'https://www.jibtechnologies.com',
                    'https://www.fieldwire.com',
                    'https://www.commercesync.com',
                    'https://www.arcana.io',
                    'https://www.copernicspace.com',
                    'https://www.youattest.com',
                    'https://www.pilotbird.com',
                    'https://www.pocketbook.tech',
                    'https://www.chainlinklabs.com',
                    'https://www.proper.ai',
                    'https://www.layer.team',
                    'https://www.veriphyanalytics.com',
                    'https://www.wearegroov.io',
                    'https://www.buildstock.com',
                    'https://www.scriptainsights.com',
                    'https://www.solisolutions.net',
                    'https://www.titanpay.ai',
                    'https://www.herondata.io',
                    'https://www.locatestrategy.com',
                    'https://www.hopemacy.com',
                    'https://www.smartpayllc.com',
                    'https://www.prospectnow.com',
                    'https://www.hashku.com',
                    'https://www.prismdata.com',
                    'https://www.taxometry.com',
                    'https://www.r3vl.xyz',
                    'https://www.avantarisk.com',
                    'https://www.every.io',
                    'https://www.joot.io',
                    'https://www.buildops.com',
                    'https://www.downtobid.com',
                    'https://www.plural.ai',
                    'https://www.bitwage.com',
                    'https://www.gorodeo.app',
                    'https://www.ledgible.io',
                    'https://www.artd.ai',
                    'https://www.acumatica.com',
                    'https://www.carby.cc',
                    'https://www.shibuya.film',
                    'https://www.trelora.com',
                    'https://www.regeo.co',
                    'https://www.sustainround.com',
                    'https://www.cherre.com',
                    'https://www.yottled.com',
                    'https://www.singularities.com',
                    'https://www.domoticsre.com',
                    'https://www.dexfreight.io',
                    'https://www.nue.io',
                    'https://www.atto.co '
                    ])]

sample = sample.iloc[:180]

In [36]:
sample.tail(10)

Unnamed: 0,company,processed_name,url,related_urls_str,related_urls,num_of_related_urls,all_urls,num_of_all_urls
269,Solidspac3,solidspac3,https://www.solidspac3.com,https://www.solidspac3.com,['https://www.solidspac3.com'],1,"['https://www.solidspac3.com', 'https://www.so...",2
270,Gravity Software,gravity_software,https://www.gogravity.com,https://www.gogravity.com/product/functionalit...,['https://www.gogravity.com/product/functional...,44,['https://www.gogravity.com/industries/investm...,77
271,IPGen,ipgen,https://www.ipgen.io,https://www.ipgen.io,['https://www.ipgen.io'],1,"['https://www.ipgen.io/law-firms/', 'https://w...",18
272,The Booking Factory,the_booking_factory,https://www.thebookingfactory.com,https://www.thebookingfactory.com/services#par...,['https://www.thebookingfactory.com/services#p...,10,['https://www.thebookingfactory.com/white-labe...,32
273,Combat IQ,combat_iq,https://www.combatiq.io,https://www.combatiq.io,['https://www.combatiq.io'],1,"['https://www.combatiq.io/schedule-demo', 'htt...",7
274,Anybill,anybill,https://www.anybill.com,"https://www.anybill.com/services,https://www.a...","['https://www.anybill.com/services', 'https://...",4,"['https://www.anybill.com/careers', 'https://w...",12
275,Caligotech,caligotech,https://www.caligotech.com,https://www.caligotech.com,['https://www.caligotech.com'],1,"['https://www.caligotech.com/careers', 'https:...",9
276,iLumen,ilumen,https://www.ilumen.com,"https://www.ilumen.com/case-studies,https://ww...","['https://www.ilumen.com/case-studies', 'https...",5,"['https://www.ilumen.com/case-studies', 'https...",19
277,iknowa,iknowa,https://www.iknowa.com,https://www.iknowa.com,['https://www.iknowa.com'],1,"['https://www.iknowa.com/', 'https://www.iknow...",2
278,Loanbase,loanbase,https://www.loanbase.com,"https://www.loanbase.com/case-studies/,https:/...","['https://www.loanbase.com/case-studies/', 'ht...",2,"['https://www.loanbase.com#brokers', 'https://...",16


In [12]:
sample[sample['num_of_related_urls']==1]

Unnamed: 0,company,processed_name,url,related_urls_str,related_urls,num_of_related_urls,all_urls,num_of_all_urls
91,Rental Beast,rental_beast,https://www.rentalbeast.com,https://www.rentalbeast.com,['https://www.rentalbeast.com'],1,['https://www.rentalbeast.com/about-rental-bea...,25
92,Nophin,nophin,https://www.nophin.com,https://www.nophin.com,['https://www.nophin.com'],1,"['https://www.nophin.com/terms-of-use', 'https...",4
93,Candidly,candidly,https://www.getcandidly.com,https://www.getcandidly.com,['https://www.getcandidly.com'],1,"['https://www.getcandidly.com', 'https://www.g...",2
96,Mango REIX,mango_reix,https://www.mangoreix.com,https://www.mangoreix.com,['https://www.mangoreix.com'],1,['https://www.mangoreix.com/_files/ugd/b3a289_...,3
98,Next Quarter,next_quarter,https://www.nextq.ai,https://www.nextq.ai,['https://www.nextq.ai'],1,"['https://www.nextq.ai', 'https://www.nextq.ai...",5
...,...,...,...,...,...,...,...,...
256,Howsy,howsy,https://www.howsy.com,https://www.howsy.com,['https://www.howsy.com'],1,['https://www.howsy.com'],1
258,Terrene Labs,terrene_labs,https://www.terrenelabs.com,https://www.terrenelabs.com,['https://www.terrenelabs.com'],1,['https://www.terrenelabs.com'],1
262,AuthorLoyalty,authorloyalty,https://www.authorloyalty.com,https://www.authorloyalty.com,['https://www.authorloyalty.com'],1,['https://www.authorloyalty.com'],1
269,Solidspac3,solidspac3,https://www.solidspac3.com,https://www.solidspac3.com,['https://www.solidspac3.com'],1,"['https://www.solidspac3.com', 'https://www.so...",2


In [23]:
doc_list = os.listdir('scraping_output_v2_raw')
for doc in doc_list:
    
    try: 
        data = read_json_file(f'scraping_output_v2_raw/{doc}')
        if 'timestamp' not in data:
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            data['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M")
            
        if "processed_company" not in data:
            process_name = doc.replace('.json', '')
            data["processed_company"] = process_name
            
        if "url" not in data:
            data["url"] = "https://" + get_additional_info(process_name, 'processed_url')
        
        write_json_file(f'scraping_output_v2_raw/{doc}', data)
    except Exception as e:
        print(f'Error occurs on {doc}: {e}')

Error occurs on .DS_Store: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte


In [None]:
for index, row in tqdm(sample.iterrows(), total=len(sample), desc="Scraping data", position=0, leave=True):
    base_url = row['url']
    url_list = ast.literal_eval(row['related_urls'])
    result = crawl_data(base_url, url_list, f'scraping_output_v2_raw/{row["processed_name"]}.json', overwrite=False)

In [13]:
data = read_json_file('scraping_output_v2_raw/hemlane.json')

print(clean_scraped_content(data['/realtor-partners/']))

Talk to a human: [(866) 387-1629]
[Sign in]
REALTORS®' Partner in Property Management
**The best agents** help their clients get the most out of their rental properties.
![Voted Capterra's Top 20 Property Management Solutions]![Software Advice most recommended Property Management Solution badge]![Software Advice Real Estate Property Management Front Runner Badge]![Software Advice Badge - Best Customer Support for Property Management]![GetApp Badge - Best Functionality and Features]
Back
How do you support your clients with their rental properties?
I want to refer landlords
(and get paid for it)
I offer leasing
(and want free leads and tools)
I offer property management
(and want to eliminate trust accounts)
Check out other REALTORS® partnering with us
![]![Headshot of Timothy Hampson]
Timothy Hampson
License #9008072 (TX)
HP2 RESIDENTIAL
Experience
Leasing
12 years
Management
Real estate
![]![Headshot of Sandy Wickware]
Sandy Wickware
License #253554 (TX)
Fathom Realty, LLC
14 years
2 

In [15]:
for url, content in data.items():
    print(url)
    print(f'Estimated GPT4-o cost: ${calculate_cost(data[url])}')
    print(f'Estimated GPT4-o cost after cleaning: ${calculate_cost(clean_scraped_content(data[url]))}')
    print('------------------------')
    

/realtor-partners/
Estimated GPT4-o cost: $0.08219499999999999
Estimated GPT4-o cost after cleaning: $0.00167
------------------------
main_page
Estimated GPT4-o cost: $0.01288
Estimated GPT4-o cost after cleaning: $0.005535
------------------------


### Exploration of first shorten the page by extracting relevant information
Issue: The output of the content might be shorten too much

In [24]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import os
import json


def llm_summary(text, model_name="gpt-4o"):
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract information 
    from the given text and convert it into a text (string) format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing. Do not hallucinate.
    """

    # Define the extraction prompt
    extraction_prompt = """
    You are provided with a text obtained from a company's webpage. Your task is to extract any sections or paragraphs that are relevant to the specified information of interest.

    ## Information of Interest:

    1. **About Product or Service**:
    - Any details about the products or services the company offers, including their features.

    2. **About Partner or Client**:
    - Any information about the company's partners or clients.
    - Any use cases (case studies) describing how a client is using the company's product or service.
    
    ## Note:
    Sometimes, the company does not explicit describe their clients and the client use case, instead, they will only display clients' logos. 
    You then need to extract client's name from their logos. 
    
    ## Instructions:
    - Do not summarize the content. Extract the raw lines or sections as they are.
    - If you are unsure about the relevance of the information, include it to ensure comprehensive coverage.
    - Output the extracted information in standard text format.

    ## Examples:

    ### Example 1: Product or Service
    If the input text contains:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    The output should be:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    ### Example 2: Client Logos
    If the input text contains:
    "Our platform and service is trusted by these innovative companies:
    ![Nationwide Logo]
    ![Freedom 365 Logo]
    ![Bestow Logo]
    ..."
    
    The output should be:
    "Our platform and service is trusted by these innovative companies: 
    Clients are: Nationwide, Freedom 365, Bestow..."
   
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("system", extraction_prompt),
            ("human", "Use the given text to extract information: {input}"),
            ("human", """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - If no information is provided, return nothing.
                - DO NOT HALLUCINATE.
             """),
        ]
    )
    
    llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                    temperature=0, 
                    model_name=model_name)

    llm_chain = prompt | llm | StrOutputParser()

    response = llm_chain.invoke({'input': text})
    
    return response


def llm_summary_execution(processed_name:str, overwrite:bool = False, model_name:str = 'gpt-4o-mini'):

    scrape_file_path = f'scraping_output_v2_raw/{processed_name}.json'
    extraction_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'

    scrape_data = read_json_file(scrape_file_path)
    
    file_modified = False

    # Load existing data if the file exists
    if os.path.exists(extraction_file_path):
        with open(extraction_file_path, 'r') as file:
            extracted_data = json.load(file)
    else:
        extracted_data = {}

    for endpoint, content in tqdm(scrape_data.items(), total=len(scrape_data), desc="Extracting data", position=0, leave=True):
        if endpoint in ['timestamp', 'processed_company', 'url']:
            continue
        if endpoint in extracted_data and not overwrite:
            print(f"Company: {processed_name}; Skipping {endpoint} as it already exists and overwrite is set to False.")
            continue  # Skip this URL and move to the next one
        else:
            clean_content = clean_scraped_content(content)
            extracted_data[endpoint] = llm_summary(text = clean_content, model_name = model_name)
            print(f'Company: {processed_name}; Content in {endpoint} is extracted.')
            
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            extracted_data['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            file_modified = True
    
    if file_modified:
        extracted_data['processed_company'] = processed_name
        extracted_data['url'] = "https://" + get_additional_info(processed_name, 'processed_url')
        write_json_file(extraction_file_path, extracted_data)
        
    return extracted_data


In [None]:
doc_list = os.listdir('extraction_summary')
for doc in doc_list:
    processed_name = doc.replace('_summary_str.json', '')
    response = llm_summary_execution(processed_name)

In [7]:
data = read_json_file('scraping_output_v2_raw/vertice.json')
print(clean_scraped_content(data['/product/saas-purchasing']))

[Explore Vendors]
[Contact]
[Log In]
[Start Saving]
Unified SaaS Purchasing
Purchasing, renewing and streamlining your SaaS stack just became a whole lot easier
[Watch the video]
![Contracts List Panel]
![Existing Contract Button]
![Search Applications Field]
![All Departments Field]
![SaaS Renewals Pipeline]
![Autocad Fusion Card]
![Total Contracts Card]
![In Progress Card]
![Awaiting Approval Card]
![Empty Dashboard Background]
Trusted by Finance and Procurement Leaders
[![MotorK Logo]](#)
[![Ebury Logo]](#)
[![ba&sh Logo]](#)
[![Lincoln Investment Logo]](#)
[![Wallbox Logo]](#)
[![Matillion Logo]](#)
[![PageUp Logo]](#)
[![Omio Logo]](#)
[![Coronado Logo]](#)
[![Revel Logo]](#)
[![Podimo Logo]](#)
[![Haiilo Logo]](#)
[![Le Collectionist Logo]](#)
[![Euronext Logo]](#)
[![Choco Logo]](#)
[![Futureverse Logo]](#)
[![Multiplica Logo]](#)
Benefits  
Your SaaS purchasing has never been so under control
Every corner of our product is designed to help you understand your SaaS stack better


In [10]:
text = clean_scraped_content(data['/product/saas-purchasing'])
response = llm_summary(text = text, model_name='gpt-4o-mini')
print(response)

"Purchasing, renewing and streamlining your SaaS stack just became a whole lot easier
Trusted by Finance and Procurement Leaders
Clients are: MotorK, Ebury, ba&sh, Lincoln Investment, Wallbox, Matillion, PageUp, Omio, Coronado, Revel, Podimo, Haiilo, Le Collectionist, Euronext, Choco, Futureverse, Multiplica
Benefits  
Your SaaS purchasing has never been so under control
Every corner of our product is designed to help you understand your SaaS stack better
Full Stack Visibility
Proactively keep track of your software renewals, requests, and tail spend.
SaaS Visibility
Usage and Analytics
Track your license usage, helping you spot overspend and tool under-utilization.
Analytics and Benchmarking
Vendor Benchmarking
Data on 16,000+ vendors, showing you the real price you ought to be paying.
Browse Vendors
Expert Buyers
Our specialists partner with you to secure you the best possible deal, typically achieving savings of 20-30%.
The Proof  
Lose the pounds. Cut the sprawl.
Our tech-enabled s

In [8]:
data = read_json_file('extraction_summary/vertice_summary_str.json')
print(clean_scraped_content(data['/product/saas-purchasing']))

Unified SaaS Purchasing
Purchasing, renewing and streamlining your SaaS stack just became a whole lot easier
Trusted by Finance and Procurement Leaders
Clients are: MotorK, Ebury, ba&sh, Lincoln Investment, Wallbox, Matillion, PageUp, Omio, Coronado, Revel, Podimo, Haiilo, Le Collectionist, Euronext, Choco, Futureverse, Multiplica
Benefits  
Your SaaS purchasing has never been so under control
Every corner of our product is designed to help you understand your SaaS stack better
Full Stack Visibility
Proactively keep track of your software renewals, requests, and tail spend.
SaaS Visibility
Usage and Analytics
Track your license usage, helping you spot overspend and tool under-utilization.
Analytics and Benchmarking
Vendor Benchmarking
Data on 16,000+ vendors, showing you the real price you ought to be paying.
Browse Vendors
Expert Buyers
Our specialists partner with you to secure you the best possible deal, typically achieving savings of 20-30%.
The Proof  
Lose the pounds. Cut the spr

In [22]:
for company in tqdm(sample['processed_name'].to_list()):
    llm_summary_execution(company)

  0%|          | 0/110 [00:00<?, ?it/s]


NameError: name 'llm_summary_execution' is not defined

## Instructor

https://github.com/jxnl/instructor

### Prompting Chains

In [30]:
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

class ProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of product')
    description: str = Field(..., alias='concise features description of the product or service')

class SummaryProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of the main product offerings of the company')
    description: str = Field(..., alias='summary of product offering of the company')

class ClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ExtractedInformation(BaseModel):
    product_descriptions: Optional[List[ProductDescription]] = None
    summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ClientDescription]] = None
    
class ValidatedClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    entity_type: Literal["person", "company", "general_entity", "other", "school"]
    product_used: Optional[str] = Field(None, alias='summary of the product or service used by the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ValidatedExtractedInformation(BaseModel):
    # product_descriptions: Optional[List[ProductDescription]] = None
    # summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ValidatedClientDescription]] = None


In [40]:
import instructor
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


def initial_extraction(text: str, model_name: str = 'gpt-4o', additional_context: str = None) -> ExtractedInformation:
    
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
    from the given text and convert it into a structured format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    custom_extraction_prompt = """
    Extract the following information from the text extracted from a webpage of a company:

    1. Product Description:
    - What service or product does the company provide?
    - What features does the product or service have?
    Note: If the company has more than one product or service, automatically detect and list each product with its relevant details.
    
    2. Summary of Product Offering:
    - Summary of the description of the service that the company provide, taking into consideration of all the product offerings.
    Note: Do not include any company-specific information in the summary, such as company name and location.
    
    3. Client Description:
    - Name of the corporate client or partner. 
    - Description of the use case.
    Note: Focus on the extraction of company's name, instead of individuals.
    Note: If the description of the use case is not mentioned, it should be None.
    

    Output in a structured format.
    """
    
    rule_prompt = """
                Here are the rules that you need to adhere:
                    ## Rules:
                    - The aim is to achieve simplicity and clarity in the extracted text.
                    - Make sure to answer in the structured format.
                    - If no information is provided for any of the fields, return nothing of that field.
                    - DO NOT HALLUCINATE.
                """
    
    extraction_prompt = f"""
    {system_message}
    {custom_extraction_prompt}
    """
    
    if additional_context:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": f"""Here are some additional descriptions about this company for your reference:
                                                {additional_context}"""},
                {"role": "user", "content": rule_prompt}
            ]
        )
        
    else:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": rule_prompt}
            ]
        )
    return response



In [41]:
def information_validation(products: list, clients: list, summary: dict, model_name: str = 'gpt-4o') -> ValidatedExtractedInformation:
    
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to validate the client information, classify the client names into different entity types, and determine which product is likely used by the client. 
    The output response should contain only the data validated and assigned, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    product_info = "\n".join([f"Product: {p['name']}; Description: {p['description']}" for p in products])
    client_info = "\n".join([f"Client: {c['name']}; Description: {c['description']}" for c in clients])
    summary_info = f"{summary['name']}: {summary['description']}"
    
    few_shot_examples = """
        ## Example 1:
        Client Name: Mike Johnson, CEO of TechCorp
        Entity_type: person
        - Reason: Mike Johnson is the name of a person. 
        
        ## Example 2:
        Client Name: Government
        Entity_type: general_entity
        - Reason: "Government" is a general entity, not a specific company.

        ## Example 3:
        Client Name: Innovative Solutions LLC
        Entity_type: company
        - Reason: Innovative Solutions LLC is a specific company name.
        
        ## Example 4:
        Client Name: A US resort
        Entity_type: general_entity
        - Reason: "A US resort" is a general description, not a specific company name.
    
        ## Example 5: 
        Client Name: University College London
        Entity_type: school
        - Reason: University College London is a specific school name.
    """

    validation_prompt = f"""
    {system_message}
    Here is the product information extracted:
    {product_info}
    
    Here is the summary of product offerings of the company:
    {summary_info}
    
    Here are the clients and their use cases:
    {client_info}
    
    Your task is to:
    1. Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
       Note: the entity type "company" should be given to specific companies, with company names.
    2. Based on the product descriptions and client use cases, assign the most likely product used by each client. 
       If you are not confident about which product the client uses, return None for that field.

    Here are some examples regarding the classifying clients into different entity types:
    {few_shot_examples}

    Output in a structured format.
    """
    
    response = client.chat.completions.create(
        model=model_name,
        response_model=ValidatedExtractedInformation,
        messages=[
            {"role": "system", "content": validation_prompt},
            {"role": "user", "content": """
                Here are the rules that you need to adhere:
                ## Rules:
                - Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
                - Assign the most likely product used by each client based on the provided product descriptions and use cases.
                - If the product used is not clear, return None for that field.
                - Make sure to answer in the structured format.
                - DO NOT HALLUCINATE.
            """},
        ]
    )
    return response


In [None]:
# Sample run

# Example usage for the first prompt
text = """
Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.

We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services. For example, TechCorp uses our cloud solutions to improve their data management, resulting in a 30% increase in efficiency.

Our client, SoftInc, has integrated our services into their workflow, leading to significant improvements in their project turnaround times.

Our platform and service are trusted by these innovative companies:
![Nationwide Logo]
![Freedom 365 Logo]
![Bestow Logo]
...
"""

initial_response = initial_extraction(text).dict()

# Example usage for the second prompt

products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
summary = initial_response['summary_product_description']

validated_response = information_validation(products, clients, summary)
print(validated_response.dict())

result = initial_response
result['validated_client_description'] = validated_response.dict()['client_descriptions']

result

In [42]:
def llm_extraction_execution(processed_name:str, include_additional_context:bool = True, overwrite:bool = False):

    summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'
    extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
    
    if not overwrite and os.path.exists(extraction_file_path):
        print(f"Company: {processed_name}; Skipping extraction as the extraction file already exists and overwrite is set to False.")
        return None
    else:
        if os.path.exists(summary_file_path):
            with open(summary_file_path, 'r') as file:
                summary = json.load(file)

            combined_summary = f"## Main Page:\n {summary['main_page']}\n----------------\n"

            for endpoint, text in summary.items():
                if endpoint not in ["main_page", "timestamp", "processed_company", "url"]:
                    combined_summary += f"## {endpoint}:\n{text}\n----------------\n"
            
            print(f"Company: {processed_name}; Information extraction begins.")
            if include_additional_context:
                context = get_additional_info(processed_name, 'description')
                
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary + context)}')
                print(f'Company: {processed_name}; Pitchbook description obtained: {context}')
                
                initial_response = initial_extraction(text = combined_summary, 
                                                additional_context = context).dict()
                
            else:
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary)}')
                initial_response = initial_extraction(text = combined_summary, 
                                            additional_context = None).dict()
            
            print(f'Company: {processed_name}; PART 1 - Initial extraction is completed.')
            
            result = initial_response
            
            if initial_response['client_descriptions']:
                products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
                clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
                summary = initial_response['summary_product_description']

                validated_response = information_validation(products, clients, summary)
                print(f'Company: {processed_name}; PART 2 - Information validation is completed.')
                result['validated_client_descriptions'] = validated_response.dict()['client_descriptions']
                
            else:
                print(f'Company: {processed_name}; PART 2 - Skipped, due to lack of client information.')
                result['validated_client_descriptions'] = None
            
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            result['processed_company'] = processed_name
            result['url'] = "https://" + get_additional_info(processed_name, 'processed_url')

            write_json_file(extraction_file_path, result)
            
            return result
        else:
            print(f'Summary file: {summary_file_path} does not exist.')
            return None

def add_client_url_to_extraction_output(processed_name:str, file_path:str = 'extraction_output_v2'):
    data = read_json_file(f'{file_path}/{processed_name}_extraction.json')
    
    if data['validated_client_descriptions']:
        for client in data['validated_client_descriptions']:
            if client['entity_type'] != 'company':
                client['url'] = None
            else:
                url = get_and_verify_client_link(client['name'], verbose = False)
                client['url'] = url
            
        write_json_file(f'{file_path}/{processed_name}_extraction.json', data)
        print(f"Company: {processed_name}; Client is extracted.")
    else:
        print(f"Company: {processed_name}; No clients' information.")
        

def get_embedding(text:str, embedding_model:str="text-embedding-3-small"):
   client_openai = OpenAI(api_key=os.getenv('OPENAI_KEY'))
   
   text = text.replace("\n", " ")
   return client_openai.embeddings.create(input = [text], model=embedding_model).data[0].embedding


def get_product_embedding(processed_name:str, embedding_model:str="text-embedding-3-small"):
    file_path = f'extraction_output_v2/{processed_name}_extraction.json'
    
    data = read_json_file(file_path)
    # Check wheather embedding has already been done
    if 'name_embedding' in data['summary_product_description']:
        print(f'Company: {processed_name}; Embedding has already been done.')
        pass
    else:
        product_lst = data['product_descriptions']
        for product in product_lst:
            product['description_embedding'] = get_embedding(text = product['description'],
                                                                embedding_model = embedding_model)
            product['name_embedding'] = get_embedding(text = product['name'],
                                                                embedding_model = embedding_model)


        summary_product = data['summary_product_description']
        summary_product['description_embedding'] = get_embedding(text = summary_product['description'],
                                                                embedding_model = embedding_model)
        summary_product['name_embedding'] = get_embedding(text = summary_product['name'],
                                                                embedding_model = embedding_model)
        print(f'Company: {processed_name}; Embedding is completed.')
        write_json_file(file_path, data)
    
    return data
    
            

In [None]:
llm_extraction_execution(processed_name = 'vertice', 
                         include_additional_context = True, 
                         overwrite = False)

In [43]:
doc_list = os.listdir('extraction_summary_v2')
for doc in doc_list:
    processed_name = doc.replace('_summary_str.json', '')
    try:
        llm_extraction_execution(processed_name = processed_name, 
                        include_additional_context = True, 
                        overwrite = False)
        add_client_url_to_extraction_output(processed_name = processed_name)
        get_product_embedding(processed_name = processed_name)
        
    except Exception as e:
        print(f'Error occured on company {processed_name}: e')

Company: bankerslab; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: bankerslab; No clients' information.
Company: bankerslab; Embedding has already been done.
Company: estimize; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: estimize; Client is extracted.
Company: estimize; Embedding has already been done.
Company: avyst; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: avyst; Client is extracted.
Company: avyst; Embedding has already been done.
Company: bennie; Skipping extraction as the extraction file already exists and overwrite is set to False.
Company: bennie; Client is extracted.
Company: bennie; Embedding has already been done.
Company: vertice; Information extraction begins.
Company: vertice; Estimated Cost: $0.013940000000000001
Company: vertice; Pitchbook description obtained: Developer a spend optimization platform designed t

In [None]:
# Code used to re-validate the client description:

processed_name = 'vertice'
    
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
initial_response = read_json_file(extraction_file_path)
if initial_response['client_descriptions']:
    products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
    clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
    summary = initial_response['summary_product_description']

    validated_response = information_validation(products, clients, summary)
    print(f'Company: {processed_name}; PART 2 - Information validation is completed.')
    result['validated_client_descriptions'] = validated_response.dict()['client_descriptions']
    
else:
    print(f'Company: {processed_name}; PART 2 - Skipped, due to lack of client information.')
    result['validated_client_descriptions'] = None

current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
result['processed_company'] = processed_name
result['url'] = "https://" + get_additional_info(processed_name, 'processed_url')
write_json_file(extraction_file_path, result)

In [68]:
extraction_file_path = f'extraction_output_v2/9fin_extraction.json'
initial_response = read_json_file(extraction_file_path)

In [71]:
products = initial_response['product_descriptions']
product_info = "\n".join([f"Product: {p['name']}; Description: {p['description']}" for p in products])

product_info

'Product: Data & Analytics Platform; Description: Provides AI-powered financial data and analytics. Features real-time market news, key data on high-yield bonds, deal tracking, financial profiles, predictive analytics, search functionality, and ESG data.\nProduct: Comparables; Description: Benchmark prior transactions, bonds, loans, or company profiles using over 300 credit metrics.\nProduct: Earnings; Description: AI transcripts and instant analysis for earnings reports.\nProduct: Search; Description: Powerful search tool for thousands of documents text-searchable by any keyword or phrase.\nProduct: ESG; Description: A full suite of Environmental, Social, and Governance data and analysis.\nProduct: Distressed and Restructuring; Description: Tools to spot undervalued credits and potential future restructurings.\nProduct: News; Description: Aggregates news from 2,000 sources using AI and delivers it quickly.\nProduct: Financials; Description: Full financial profiles with 3 statements, K

In [73]:
products

[{'name': 'Data & Analytics Platform',
  'description': 'Provides AI-powered financial data and analytics. Features real-time market news, key data on high-yield bonds, deal tracking, financial profiles, predictive analytics, search functionality, and ESG data.'},
 {'name': 'Comparables',
  'description': 'Benchmark prior transactions, bonds, loans, or company profiles using over 300 credit metrics.'},
 {'name': 'Earnings',
  'description': 'AI transcripts and instant analysis for earnings reports.'},
 {'name': 'Search',
  'description': 'Powerful search tool for thousands of documents text-searchable by any keyword or phrase.'},
 {'name': 'ESG',
  'description': 'A full suite of Environmental, Social, and Governance data and analysis.'},
 {'name': 'Distressed and Restructuring',
  'description': 'Tools to spot undervalued credits and potential future restructurings.'},
 {'name': 'News',
  'description': 'Aggregates news from 2,000 sources using AI and delivers it quickly.'},
 {'name':

## Get client

In [38]:
client_info = read_json_file('data/client_info.json')
# client_info = {}

doc_list = os.listdir('extraction_output_v2')

for doc in tqdm(doc_list):
    try:
        processed_name = doc.replace('_extraction.json', '')
        data = read_json_file(f'extraction_output_v2/{doc}')

        for client in data['validated_client_descriptions']:
            if client['entity_type'] != 'company':
                continue
            # If a company's name already exists in the dictionary
            if client['name'] in client_info:
                
                # If its service provider does not appear in the saved list, then append it
                if processed_name not in client_info[client['name']]['service_provider_processed']:
                    client_info[client['name']]['service_provider_processed'].append(processed_name)
                    client_info[client['name']]['service_provider'].append(get_additional_info(processed_name, 'companies'))
                    client_info[client['name']]['service_provider_url'].append('https://' + get_additional_info(processed_name, 'processed_url'))
                else:
                    print(f'Company {client["name"]} has already been recorded.')
            
            # If a company's name already does not exist, add the new company
            else:
                client_info[client['name']] = {'processed_name': process_company_name(client['name']),
                                    'url': client['url'],
                                    'service_provider_processed': [processed_name],
                                    'service_provider': [get_additional_info(processed_name, 'companies')],
                                    'service_provider_url': ['https://' + get_additional_info(processed_name, 'processed_url')]
                                    }
        print(f"Company {data['processed_company']}'s clients are recorded.")
    except Exception as e:
        print(f'Company {processed_name} has error: {e}')
        
write_json_file('data/client_info.json', client_info)

  0%|          | 0/24 [00:00<?, ?it/s]

Company 9fin's clients are recorded.


  8%|▊         | 2/24 [00:00<00:02,  9.67it/s]

Company new_constructs's clients are recorded.
Company .DS_Store has error: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte


 17%|█▋        | 4/24 [00:00<00:02,  9.20it/s]

Company adaptive's clients are recorded.
Company hemlane's clients are recorded.
Company niloom_ai has error: 'NoneType' object is not iterable


 33%|███▎      | 8/24 [00:01<00:02,  6.50it/s]

Company bennie's clients are recorded.
Company infogrid's clients are recorded.
Company additive has error: 'NoneType' object is not iterable


 46%|████▌     | 11/24 [00:01<00:02,  5.32it/s]

Company vertice's clients are recorded.
Company avyst's clients are recorded.
Company estimize's clients are recorded.
Company missionmark's clients are recorded.


 67%|██████▋   | 16/24 [00:02<00:01,  5.77it/s]

Company vesta's clients are recorded.
Company aer_compliance has error: 'NoneType' object is not iterable
Company naturealpha's clients are recorded.


 71%|███████   | 17/24 [00:03<00:01,  5.17it/s]

Company aggregion's clients are recorded.
Company dexit has error: 'NoneType' object is not iterable
Company harness_data_intelligence's clients are recorded.


 92%|█████████▏| 22/24 [00:03<00:00,  6.93it/s]

Company direct's clients are recorded.
Company nexben has error: 'NoneType' object is not iterable
Company massive's clients are recorded.


100%|██████████| 24/24 [00:03<00:00,  6.03it/s]

Company Sceptre has already been recorded.
Company ledger_works's clients are recorded.
Company bankerslab has error: 'NoneType' object is not iterable





In [18]:
client_data = read_json_file('client_info.json')
for company, company_info in tqdm(client_data.items(), desc="Scraping data", position=0, leave=True):
    base_url = company_info['url']
    if not base_url:
        continue
    try:
        all_urls, related_urls = get_related_urls(base_url)
        if len(related_urls) > 10:
            related_urls = select_urls(related_urls, 10)
        result = crawl_data(base_url, related_urls, f'scraping_output_v2_raw/{company_info["processed_name"]}.json', overwrite=False)
    except Exception as e:
        print(f'Company {company} has error: {e}')



https://www.bloomberg.com/professional


Scraping data:   1%|          | 1/104 [00:00<01:06,  1.54it/s]

Skipping https://www.bloomberg.com/professional as it already exists and overwrite is set to False.
https://www.iexcloud.io


Scraping data:   3%|▎         | 3/104 [00:02<01:34,  1.07it/s]

Skipping https://www.iexcloud.io as it already exists and overwrite is set to False.
Skipping https://www.iexcloud.io/product-bulletin as it already exists and overwrite is set to False.
https://www.wisdomtree.com


Scraping data:   4%|▍         | 4/104 [00:03<01:21,  1.23it/s]

Skipping https://www.wisdomtree.com as it already exists and overwrite is set to False.
https://www.reventbuilds.com


Scraping data:   7%|▋         | 7/104 [00:06<01:24,  1.15it/s]

Skipping https://www.reventbuilds.com as it already exists and overwrite is set to False.
https://www.drakeconstructionservices.com




Skipping https://www.drakeconstructionservices.com as it already exists and overwrite is set to False.
Skipping https://www.drakeconstructionservices.com/index.asp as it already exists and overwrite is set to False.
Skipping https://www.drakeconstructionservices.com/project_details.asp?id=111 as it already exists and overwrite is set to False.
Skipping https://www.drakeconstructionservices.com/project_details.asp?id=136 as it already exists and overwrite is set to False.
Skipping https://www.drakeconstructionservices.com/project_details.asp?id=52 as it already exists and overwrite is set to False.
Skipping https://www.drakeconstructionservices.com/contact-us.asp as it already exists and overwrite is set to False.
Skipping https://www.drakeconstructionservices.com/project_details.asp?id=103 as it already exists and overwrite is set to False.
Skipping https://www.drakeconstructionservices.com/current-projects.asp as it already exists and overwrite is set to False.
Scraping https://www.dr



https://www.airbnb.com


Scraping data:  14%|█▍        | 15/104 [00:16<01:27,  1.02it/s]

Skipping https://www.airbnb.com as it already exists and overwrite is set to False.
https://www.thoropass.com


Scraping data:  15%|█▌        | 16/104 [00:17<01:31,  1.04s/it]

Skipping https://www.thoropass.com as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/solutions/ as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/company/become-a-partner/ as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/platform/service-partnerships/ as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/platform/penetration-testing/ as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/customers/monit/ as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/platform/due-diligence-questionnaire/ as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/solutions/maintain-compliance/ as it already exists and overwrite is set to False.
Skipping https://www.thoropass.com/customers/elestio/ as it already exists and overwrite is set to False.
Skipping https

Scraping data:  16%|█▋        | 17/104 [00:19<01:42,  1.18s/it]

Skipping https://www.fahertybrand.com as it already exists and overwrite is set to False.
Skipping https://www.fahertybrand.com/products/isha-dress-blue-mini-stripe-3 as it already exists and overwrite is set to False.
Skipping https://www.fahertybrand.com/products/inlet-knit-blazer-deep-navy-melange as it already exists and overwrite is set to False.
Skipping https://www.fahertybrand.com/products/sunwashed-tee-white as it already exists and overwrite is set to False.
Skipping https://www.fahertybrand.com/products/sunwashed-pocket-tee-white-2 as it already exists and overwrite is set to False.
Skipping https://www.fahertybrand.com/products/belt-loop-all-day-shorts-7-in7-in-olive as it already exists and overwrite is set to False.
Skipping https://www.fahertybrand.com/products/belt-loop-all-day-shorts-7-stone as it already exists and overwrite is set to False.
Skipping https://www.fahertybrand.com/pages/native-partnerships as it already exists and overwrite is set to False.
Skipping htt



Skipping https://www.formstack.com as it already exists and overwrite is set to False.
Skipping https://www.formstack.com/partners as it already exists and overwrite is set to False.
Skipping https://www.formstack.com/products as it already exists and overwrite is set to False.
Skipping https://www.formstack.com/customer-stories as it already exists and overwrite is set to False.
Skipping https://www.formstack.com/solutions as it already exists and overwrite is set to False.
Skipping https://www.formstack.com/find-a-partner as it already exists and overwrite is set to False.
Scraping https://www.formstack.com/products/online-forms.


Scraping data:  17%|█▋        | 18/104 [00:27<03:24,  2.38s/it]

Skipping https://www.formstack.com/legal/ccpa-information-for-customers as it already exists and overwrite is set to False.
Skipping https://www.formstack.com/solutions/employee-onboarding as it already exists and overwrite is set to False.
Skipping https://www.formstack.com/solutions/customer-satisfaction-surveys as it already exists and overwrite is set to False.
https://www.rokt.com


Scraping data:  19%|█▉        | 20/104 [00:27<02:16,  1.62s/it]

Skipping https://www.rokt.com as it already exists and overwrite is set to False.
Skipping https://www.rokt.com/partners as it already exists and overwrite is set to False.
Skipping https://www.rokt.com/case-studies/rokt-ecommerce-delivers-8x-more-monthly-revenue-than-google-adsense/ as it already exists and overwrite is set to False.
Skipping https://www.rokt.com/case-studies/schwans-home-delivery-speeds-up-their-customer-acquisition-with-rokt-ads/ as it already exists and overwrite is set to False.
Skipping https://www.rokt.com/case-studies/flamingo-shaves-off-their-cpa-by-60-with-rokt-ads/ as it already exists and overwrite is set to False.
Skipping https://www.rokt.com/customers as it already exists and overwrite is set to False.
Skipping https://www.rokt.com/case-studies as it already exists and overwrite is set to False.
Skipping https://www.rokt.com/case-studies/the-vitamin-shoppe-adds-profit-while-improving-their-customer-experience-with-rokt/ as it already exists and overwrite



Skipping https://www.nylas.com as it already exists and overwrite is set to False.
Skipping https://www.nylas.com/partners/ as it already exists and overwrite is set to False.
Skipping https://www.nylas.com/solutions/ as it already exists and overwrite is set to False.
Skipping https://www.nylas.com/case-studies/ as it already exists and overwrite is set to False.
Skipping https://www.nylas.com/products/ as it already exists and overwrite is set to False.
Skipping https://www.nylas.com/products/calendar-api/ as it already exists and overwrite is set to False.
Skipping https://www.nylas.com/solutions/travel-and-hospitality/ as it already exists and overwrite is set to False.
Scraping https://www.nylas.com/solutions/financial-technology/.
Scraping https://www.nylas.com/case-study/dialpad/.


Scraping data:  21%|██        | 22/104 [00:37<03:28,  2.55s/it]

Skipping https://www.nylas.com/company/contact-platform-specialist/ as it already exists and overwrite is set to False.
https://www.warbyparker.com
Failed to access https://www.warbyparker.com. Status code: 403
Company Warby Parker has error: object of type 'NoneType' has no len()
https://www.bonusly.com




Skipping https://www.bonusly.com as it already exists and overwrite is set to False.
Scraping https://www.bonusly.com/features/product-overview.
Rate limit exceeded. Retrying after 59.999998807907104 seconds.
Skipping https://www.bonusly.com/customers/solugenix as it already exists and overwrite is set to False.
Scraping https://www.bonusly.com/customers/rechat-elevates-engagement-and-culture-with-bonusly.
Skipping https://www.bonusly.com/customers/surveymonkey as it already exists and overwrite is set to False.
Scraping https://www.bonusly.com/product/achieve.
Scraping https://www.bonusly.com/product/appreciate.


Scraping data:  22%|██▏       | 23/104 [01:52<26:35, 19.70s/it]

Skipping https://www.bonusly.com/customers/nexthink as it already exists and overwrite is set to False.
Skipping https://www.bonusly.com/customers/toast as it already exists and overwrite is set to False.
Skipping https://www.bonusly.com/customers/headspace as it already exists and overwrite is set to False.
https://www.ltse.com


Scraping data:  23%|██▎       | 24/104 [01:58<21:40, 16.25s/it]

Skipping https://www.ltse.com as it already exists and overwrite is set to False.
Skipping https://www.ltse.com/trading/customer-notices as it already exists and overwrite is set to False.
Skipping https://www.ltse.com/listings/partner-with-ltse as it already exists and overwrite is set to False.
https://www.superhuman.com




Scraping https://www.superhuman.com.




https://www.pliancy.com


Scraping data:  25%|██▌       | 26/104 [02:07<13:40, 10.53s/it]

Skipping https://www.pliancy.com as it already exists and overwrite is set to False.
Skipping https://www.pliancy.com/clients/satellite-bio/ as it already exists and overwrite is set to False.
Skipping https://www.pliancy.com/services as it already exists and overwrite is set to False.
Skipping https://www.pliancy.com/clients/photys-therapeutics as it already exists and overwrite is set to False.
Skipping https://www.pliancy.com/clients/eclipse as it already exists and overwrite is set to False.
https://www.himarley.com




Scraping https://www.himarley.com.
Skipping https://www.himarley.com/solutions-overview/ as it already exists and overwrite is set to False.
Skipping https://www.himarley.com/resource-center/?resource-type=case-study as it already exists and overwrite is set to False.
Skipping https://www.himarley.com/customers as it already exists and overwrite is set to False.
Skipping https://www.himarley.com/service/ as it already exists and overwrite is set to False.
Skipping https://www.himarley.com/customers/ as it already exists and overwrite is set to False.
Scraping https://www.himarley.com/platform-overview.
Rate limit exceeded. Retrying after 59.999995946884155 seconds.
Skipping https://www.himarley.com/partners as it already exists and overwrite is set to False.
Skipping https://www.himarley.com/customer-success/ as it already exists and overwrite is set to False.
Scraping https://www.himarley.com/platform-overview/.




https://www.rippling.com
Skipping https://www.rippling.com as it already exists and overwrite is set to False.
Skipping https://www.rippling.com/platform as it already exists and overwrite is set to False.
Scraping https://www.rippling.com/customers.
Skipping https://www.rippling.com/global-benefit-solution as it already exists and overwrite is set to False.
Scraping https://www.rippling.com/customers/morning-consult.
Skipping https://www.rippling.com/industries/financial-services as it already exists and overwrite is set to False.
Scraping https://www.rippling.com/resources/pitfalls-modern-spend-solutions.
Scraping https://www.rippling.com/customers/harver.
Skipping https://www.rippling.com/customers/appcues as it already exists and overwrite is set to False.
Scraping https://www.rippling.com/customers/andros.
Rate limit exceeded. Retrying after 59.999998807907104 seconds.


Scraping data:  27%|██▋       | 28/104 [04:32<52:12, 41.21s/it]

https://www.crisistextline.org


Scraping data:  28%|██▊       | 29/104 [04:33<36:52, 29.50s/it]

Skipping https://www.crisistextline.org as it already exists and overwrite is set to False.
Skipping https://www.crisistextline.org/partnerships/ as it already exists and overwrite is set to False.
https://www.interpretek.com


Scraping data:  29%|██▉       | 30/104 [04:35<26:16, 21.30s/it]

Skipping https://www.interpretek.com as it already exists and overwrite is set to False.
Skipping https://www.interpretek.com/case-studies/ as it already exists and overwrite is set to False.
https://www.softbankrobotics.com




Scraping https://www.softbankrobotics.com.


Scraping data:  30%|██▉       | 31/104 [04:38<19:15, 15.83s/it]

Skipping https://www.softbankrobotics.com/solution/ as it already exists and overwrite is set to False.
https://www.holded.com


Scraping data:  37%|███▋      | 38/104 [04:38<03:56,  3.58s/it]

Skipping https://www.holded.com as it already exists and overwrite is set to False.
Skipping https://www.holded.com/professional-services as it already exists and overwrite is set to False.
Skipping https://www.holded.com/success-cases/ as it already exists and overwrite is set to False.
Skipping https://www.holded.com/success-cases as it already exists and overwrite is set to False.
https://www.encompass.com
Failed to access https://www.encompass.com. Status code: 403
Company Encompass has error: object of type 'NoneType' has no len()
https://www.vertafore.com




Scraping https://www.vertafore.com.
Skipping https://www.vertafore.com/solutions-mgas as it already exists and overwrite is set to False.
Skipping https://www.vertafore.com/products as it already exists and overwrite is set to False.
Skipping https://www.vertafore.com/solutions-agencies as it already exists and overwrite is set to False.
Skipping https://www.vertafore.com/solutions-carriers as it already exists and overwrite is set to False.
Scraping https://www.vertafore.com/products/insurlink.
Scraping https://www.vertafore.com/products/mga-systems.
Scraping https://www.vertafore.com/products/sircon-onboarding-and-self-service.
Scraping https://www.vertafore.com/why-vertafore/orange-partner-program.
Rate limit exceeded. Retrying after 59.99999690055847 seconds.
Scraping https://www.vertafore.com/products/vertafore-client-communications.




https://www.piiac.com


Scraping data:  39%|███▉      | 41/104 [06:01<11:43, 11.17s/it]

Skipping https://www.piiac.com as it already exists and overwrite is set to False.
https://www.iianc.com




Scraping https://www.iianc.com.
Scraping https://www.iianc.com/marketing-solutions.
Scraping https://www.iianc.com/staffing-solutions.
Scraping https://www.iianc.com/advertising-solutions.
Scraping https://www.iianc.com/market-access-solutions.
Rate limit exceeded. Retrying after 59.999999046325684 seconds.
Scraping https://www.iianc.com/independent-market-solutions.
Scraping https://www.iianc.com/eo-insurance-professional-liability-products.
Scraping https://www.iianc.com/technology-solutions.
Scraping https://www.iianc.com/marketing-solutions/real-grader.
Scraping https://www.iianc.com/staffing-solutions/insuracademy-graduates.


Scraping data:  40%|████      | 42/104 [07:39<28:10, 27.26s/it]

https://www.wolferesearch.com




Scraping https://www.wolferesearch.com.
Rate limit exceeded. Retrying after 59.999999046325684 seconds.
Scraping https://www.wolferesearch.com/wolfeservices/.
Scraping https://www.wolferesearch.com/executionservices/.


Scraping data:  42%|████▏     | 44/104 [08:56<31:04, 31.08s/it]

https://www.loanpass.io




Scraping https://www.loanpass.io.
Scraping https://www.loanpass.io/partners.
Scraping https://www.loanpass.io/solutions.
Scraping https://www.loanpass.io/services.
Rate limit exceeded. Retrying after 59.999999046325684 seconds.
Scraping https://www.loanpass.io/product-and-pricing-engine.


Scraping data:  45%|████▌     | 47/104 [10:16<27:39, 29.12s/it]

https://www.docmagic.com




Scraping https://www.docmagic.com.
Scraping https://www.docmagic.com/product-training.
Scraping https://www.docmagic.com/professional-services.
Scraping https://www.docmagic.com/data-services.
Scraping https://www.docmagic.com/third-party-services.
Rate limit exceeded. Retrying after 59.999996185302734 seconds.
Scraping https://www.docmagic.com/integration-services.
Scraping https://www.docmagic.com/esignature-platforms.
Scraping https://www.docmagic.com/settlement-services.
Scraping https://www.docmagic.com/esign-platform.
Scraping https://www.docmagic.com/partners.


Scraping data:  47%|████▋     | 49/104 [11:59<30:49, 33.63s/it]

https://www.upstart.com
Failed to access https://www.upstart.com. Status code: 403
Company Upstart has error: object of type 'NoneType' has no len()
https://www.willowservicing.com




Scraping https://www.willowservicing.com.
Rate limit exceeded. Retrying after 59.99999737739563 seconds.


Scraping data:  47%|████▋     | 49/104 [12:51<14:26, 15.75s/it]


KeyboardInterrupt: 

In [6]:
get_related_urls('https://www.bloomberg.com/professional')

https://www.bloomberg.com/professional




({'https://www.bloomberg.com/feedback',
  'https://www.bloomberg.com/notices/tos'},
 ['https://www.bloomberg.com/professional'])

In [19]:
related_urls

['https://www.hubsync.com',
 'https://www.hubsync.com/solutions#engagement-letter-wizard',
 'https://www.hubsync.com/solutions#efile',
 'https://www.hubsync.com/solutions#esign',
 'https://www.hubsync.com/solutions#workflow-tracking',
 'https://www.hubsync.com/solutions#client-portal-delivery',
 'https://www.hubsync.com/solutions',
 'https://www.hubsync.com/solutions#planning-analytics',
 'https://www.hubsync.com/solutions',
 'https://www.hubsync.com/solutions#hubsync-gateway']