In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *
from tqdm import tqdm
import glob
import tiktoken as tiktoken
import instructor
from pydantic import BaseModel
from openai import OpenAI
import ast
from datetime import datetime
import pytz

current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
print(current_dateTime.strftime(format = "%Y-%m-%d %H:%M"))

2024-08-26 13:32


In [9]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.schema import StrOutputParser
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
import json

Encoding
- gpt-4o: "o200k_base",
- gpt-4: "cl100k_base",
- gpt-3.5-turbo: "cl100k_base",
- gpt-3.5: "cl100k_base",  # Common shorthand
- gpt-35-turbo : "cl100k_base",  # Azure deployment name

gpt-4o US$5.00 / 1M input tokens； US$15.00 / 1M output tokens

gpt-4o context length: 128K tokens


In [10]:
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding)

<Encoding 'o200k_base'>


In [11]:
df_all = pd.read_csv('data/PitchBook_All_Columns_2024_07_04_14_48_36_accessibility.csv')
df_all = df_all[~df_all['business_status'].isin(['Out of Business', 'Bankruptcy: Liquidation', 'Bankruptcy: Admin/Reorg'])]
df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
df_all = df_all[df_all['is_accessible'] == True]
df_all['processed_name'] = df_all['companies'].apply(process_company_name)
df_all.head()

Unnamed: 0,company_id,companies,company_former_name,company_legal_name,competitors,description,primary_industry_sector,primary_industry_group,primary_industry_code,all_industries,...,first_financing_valuation,first_financing_valuation_status,last_financing_valuation,last_financing_valuation_status,last_known_valuation,last_known_valuation_date,last_known_valuation_deal_type,processed_url,is_accessible,processed_name
0,55185-04,Estimize,,"Estimize, Inc.","Neudata, SigFig, Motif (Financial Software), Y...",Developer of an open financial estimates platf...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,6.34,Actual,,,36.0,16/07/2015,Early Stage VC,www.estimize.com,True,estimize
1,56288-62,New Constructs,,"New Constructs, LLC","Morningstar, CFRA, Finbox (Media and Informati...",Operator of an investment research firm intend...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,2.17,Actual,,,2.17,13/05/2003,Early Stage VC,www.newconstructs.com,True,new_constructs
3,53739-01,Procore Technologies,,"Procore Technologies, Inc.","Projectmates, eBuilder, CMiC",Procore Technologies Inc is a cloud-based cons...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Construction ...",...,4.0,Actual,8585.03,Estimated,8585.03,20/05/2021,IPO,www.procore.com,True,procore_technologies
5,153145-27,Proof,"16 Pins, Notarize","Notarize, Inc.","Templafy, ZorroSign, eOriginal, PandaDoc, Cong...",Developer of an identity-assured transaction m...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,46.5,Actual,,,760.0,25/03/2021,Later Stage VC,www.proof.com,True,proof
6,52304-77,SMS Assist,,"SMS Assist, L.L.C.","ServiceChannel, Divisions Maintenance Group, T...",Provider of business services intended to deli...,Business Products and Services (B2B),Commercial Services,Other Commercial Services,"Buildings and Property, Business/Productivity ...",...,,,950.0,Estimated,950.0,05/01/2023,Merger/Acquisition,www.smsassist.com,True,sms_assist


### Exploration of first shorten the page by extracting relevant information
Issue: The output of the content might be shorten too much

In [34]:

def llm_summary(text, model_name="gpt-4o"):
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract information 
    from the given text and convert it into a text (string) format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing. Do not hallucinate.
    """

    # Define the extraction prompt
    extraction_prompt = """
    You are provided with a text obtained from a company's webpage. Your task is to extract any sections or paragraphs that are relevant to the specified information of interest.

    ## Information of Interest:

    1. **About Product or Service**:
    - Any details about the products or services the company offers, including their features.

    2. **About Partner or Client**:
    - Any information about the company's partners or clients.
    - Any use cases (case studies) describing how a client is using the company's product or service.
    
    ## Note:
    Sometimes, the company does not explicit describe their clients and the client use case, instead, they will only display clients' logos. 
    You then need to extract client's name from their logos. 
    
    ## Instructions:
    - Do not summarize the content. Extract the raw lines or sections as they are.
    - If you are unsure about the relevance of the information, include it to ensure comprehensive coverage.
    - Output the extracted information in standard text format.

    ## Examples:

    ### Example 1: Product or Service
    If the input text contains:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    The output should be:
    "Our company offers innovative cloud solutions that help businesses streamline their operations. Our key features include scalability, security, and ease of use.
    We partner with leading firms such as TechCorp and SoftInc to deliver top-notch services."

    ### Example 2: Client Logos
    If the input text contains:
    "Our platform and service is trusted by these innovative companies:
    ![Nationwide Logo]
    ![Freedom 365 Logo]
    ![Bestow Logo]
    ..."
    
    The output should be:
    "Our platform and service is trusted by these innovative companies: 
    Clients are: Nationwide, Freedom 365, Bestow..."
   
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_message),
            ("system", extraction_prompt),
            ("human", "Use the given text to extract information: {input}"),
            ("human", """
                Here are the rules that you need to adhere:
                ## Rules:
                - Make sure to answer in the standard text format.
                - If no information is provided, return nothing.
                - DO NOT HALLUCINATE.
             """),
        ]
    )
    
    llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                    temperature=0, 
                    model_name=model_name)

    llm_chain = prompt | llm | StrOutputParser()

    response = llm_chain.invoke({'input': text})
    
    return response

def llm_summary_execution(processed_name:str, 
                          scrape_file_path:str,
                          summary_file_path:str,
                          overwrite:bool = False, 
                          model_name:str = 'gpt-4o-mini'):

    scrape_data = read_json_file(scrape_file_path)
    
    file_modified = False

    # Load existing data if the file exists
    if os.path.exists(summary_file_path):
        with open(summary_file_path, 'r') as file:
            extracted_data = json.load(file)
    else:
        extracted_data = {}

    for endpoint, content in tqdm(scrape_data.items(), total=len(scrape_data), desc="Extracting data", position=0, leave=True):
        if endpoint in ['timestamp', 'processed_company', 'url']:
            continue
        if endpoint in extracted_data and not overwrite:
            print(f"Company: {processed_name}; Skipping {endpoint} as it already exists and overwrite is set to False.")
            continue  # Skip this URL and move to the next one
        else:
            clean_content = clean_scraped_content(content)
            extracted_data[endpoint] = llm_summary(text = clean_content, model_name = model_name)
            print(f'Company: {processed_name}; Content in {endpoint} is extracted.')
            
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            extracted_data['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            file_modified = True
    
    if file_modified:
        extracted_data['processed_company'] = processed_name
        extracted_data['url'] = scrape_data['url']
        write_json_file(summary_file_path, extracted_data)
        
    return extracted_data


In [10]:
# Example usage
processed_name = 'the_booking_factory'
scrape_file_path = f'scraping_output_v2_raw/{processed_name}.json'
summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'

response = llm_summary_execution(processed_name = processed_name,
                                 scrape_file_path = scrape_file_path,
                                 summary_file_path = summary_file_path)

Extracting data:  31%|███       | 4/13 [00:09<00:22,  2.49s/it]

Company: the_booking_factory; Content in /services#partner-plan is extracted.


Extracting data:  38%|███▊      | 5/13 [00:15<00:26,  3.37s/it]

Company: the_booking_factory; Content in /services#rev-plus is extracted.


Extracting data:  46%|████▌     | 6/13 [00:20<00:26,  3.79s/it]

Company: the_booking_factory; Content in /customer-agreement is extracted.


Extracting data:  54%|█████▍    | 7/13 [00:26<00:26,  4.34s/it]

Company: the_booking_factory; Content in /services#hotel-it is extracted.


Extracting data:  62%|██████▏   | 8/13 [00:32<00:24,  4.96s/it]

Company: the_booking_factory; Content in /services is extracted.


Extracting data:  69%|██████▉   | 9/13 [00:39<00:21,  5.37s/it]

Company: the_booking_factory; Content in /services#accounting-services is extracted.


Extracting data:  77%|███████▋  | 10/13 [00:45<00:16,  5.55s/it]

Company: the_booking_factory; Content in /services#basic-plan is extracted.


Extracting data:  85%|████████▍ | 11/13 [00:50<00:11,  5.55s/it]

Company: the_booking_factory; Content in /partner-plan is extracted.


Extracting data:  92%|█████████▏| 12/13 [00:58<00:06,  6.09s/it]

Company: the_booking_factory; Content in main_page is extracted.


Extracting data: 100%|██████████| 13/13 [01:04<00:00,  4.97s/it]

Company: the_booking_factory; Content in /services#bf-web is extracted.





## Instructor

https://github.com/jxnl/instructor

### Prompting Chains

In [25]:
from pydantic import BaseModel, Field
from typing import List, Optional, Literal

class ProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of product')
    description: str = Field(..., alias='concise features description of the product or service')

class SummaryProductDescription(BaseModel):
    name: str = Field(..., alias='summarised name of the main product offerings of the company')
    description: str = Field(..., alias='summary of product offering of the company')

class ClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ExtractedInformation(BaseModel):
    product_descriptions: Optional[List[ProductDescription]] = None
    summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ClientDescription]] = None
    
class ValidatedClientDescription(BaseModel):
    name: str = Field(..., alias='name of the client or partner')
    entity_type: Literal["person", "company", "general_entity", "other", "school"]
    product_used: Optional[str] = Field(None, alias='summary of the product or service used by the client or partner')
    description: Optional[str] = Field(None, alias='description of the usecase')

class ValidatedExtractedInformation(BaseModel):
    # product_descriptions: Optional[List[ProductDescription]] = None
    # summary_product_description: Optional[SummaryProductDescription] = None
    client_descriptions: Optional[List[ValidatedClientDescription]] = None


In [26]:
import instructor
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()


def initial_extraction(text: str, model_name: str = 'gpt-4o', additional_context: str = None) -> ExtractedInformation:
    
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
    from the given text and convert it into a structured format. 
    The output response should contain only the data extracted from the text, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    custom_extraction_prompt = """
    Extract the following information from the text extracted from a webpage of a company:

    1. Product Description:
    - What service or product does the company provide?
    - What features does the product or service have?
    Note: If the company has more than one product or service, automatically detect and list each product with its relevant details.
    
    2. Summary of Product Offering:
    - Summary of the description of the service that the company provide, taking into consideration of all the product offerings.
    Note: Do not include any company-specific information in the summary, such as company name and location.
    
    3. Client Description:
    - Name of the corporate client or partner. 
    - Description of the use case.
    Note: Focus on the extraction of company's name, instead of individuals.
    Note: If the description of the use case is not mentioned, it should be None.
    

    Output in a structured format.
    """
    
    rule_prompt = """
                Here are the rules that you need to adhere:
                    ## Rules:
                    - The aim is to achieve simplicity and clarity in the extracted text.
                    - Make sure to answer in the structured format.
                    - If no information is provided for any of the fields, return nothing of that field.
                    - DO NOT HALLUCINATE.
                """
    
    extraction_prompt = f"""
    {system_message}
    {custom_extraction_prompt}
    """
    
    if additional_context:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": f"""Here are some additional descriptions about this company for your reference:
                                                {additional_context}"""},
                {"role": "user", "content": rule_prompt}
            ]
        )
        
    else:
        response = client.chat.completions.create(
            model=model_name, 
            response_model=ExtractedInformation,
            messages=[
                {"role": "system", "content": extraction_prompt},
                {"role": "user", "content": f"Use the given text to extract information: {text}"},
                {"role": "user", "content": rule_prompt}
            ]
        )
    return response



In [27]:
def information_validation(products: list, clients: list, summary: dict, model_name: str = 'gpt-4o') -> ValidatedExtractedInformation:
    
    # Patch the OpenAI client with Instructor
    client = instructor.from_openai(OpenAI(api_key=os.getenv('OPENAI_KEY')))
    
    system_message = """
    You are an intelligent text extraction and conversion assistant. Your task is to validate the client information, classify the client names into different entity types, and determine which product is likely used by the client. 
    The output response should contain only the data validated and assigned, with no additional commentary, explanations, or extraneous information.
    If the required information could not be found from the given source, return nothing for that field. Do not hallucinate.
    """
    
    product_info = "\n".join([f"Product: {p['name']}; Description: {p['description']}" for p in products])
    client_info = "\n".join([f"Client: {c['name']}; Description: {c['description']}" for c in clients])
    summary_info = f"{summary['name']}: {summary['description']}"
    
    few_shot_examples = """
        ## Example 1:
        Client Name: Mike Johnson, CEO of TechCorp
        Entity_type: person
        - Reason: Mike Johnson is the name of a person. 
        
        ## Example 2:
        Client Name: Government
        Entity_type: general_entity
        - Reason: "Government" is a general entity, not a specific company.

        ## Example 3:
        Client Name: Innovative Solutions LLC
        Entity_type: company
        - Reason: Innovative Solutions LLC is a specific company name.
        
        ## Example 4:
        Client Name: A US resort
        Entity_type: general_entity
        - Reason: "A US resort" is a general description, not a specific company name.
    
        ## Example 5: 
        Client Name: University College London
        Entity_type: school
        - Reason: University College London is a specific school name.
    """

    validation_prompt = f"""
    {system_message}
    Here is the product information extracted:
    {product_info}
    
    Here is the summary of product offerings of the company:
    {summary_info}
    
    Here are the clients and their use cases:
    {client_info}
    
    Your task is to:
    1. Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
       Note: the entity type "company" should be given to specific companies, with company names.
    2. Based on the product descriptions and client use cases, assign the most likely product used by each client. 
       If you are not confident about which product the client uses, return None for that field.

    Here are some examples regarding the classifying clients into different entity types:
    {few_shot_examples}

    Output in a structured format.
    """
    
    response = client.chat.completions.create(
        model=model_name,
        response_model=ValidatedExtractedInformation,
        messages=[
            {"role": "system", "content": validation_prompt},
            {"role": "user", "content": """
                Here are the rules that you need to adhere:
                ## Rules:
                - Classify each client name into one of the following entity types: person, company, general_entity, school, or other.
                - Assign the most likely product used by each client based on the provided product descriptions and use cases.
                - If the product used is not clear, return None for that field.
                - Make sure to answer in the structured format.
                - DO NOT HALLUCINATE.
            """},
        ]
    )
    return response


In [41]:
def llm_extraction_execution(processed_name:str, 
                             summary_file_path:str,
                             extraction_file_path:str, 
                             include_additional_context:bool = True, 
                             overwrite:bool = False):
    
    if not overwrite and os.path.exists(extraction_file_path):
        print(f"Company: {processed_name}; Skipping extraction as the extraction file already exists and overwrite is set to False.")
        return None
    else:
        if os.path.exists(summary_file_path):
            summary = read_json_file(summary_file_path)

            combined_summary = f"## Main Page:\n {summary['main_page']}\n----------------\n"

            for endpoint, text in summary.items():
                if endpoint not in ["main_page", "timestamp", "processed_company", "url"]:
                    combined_summary += f"## {endpoint}:\n{text}\n----------------\n"
            
            print(f"Company: {processed_name}; Information extraction begins.")
            if include_additional_context:
                context = get_additional_info(processed_name, 'description')
                
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary + context)}')
                print(f'Company: {processed_name}; Pitchbook description obtained: {context}')
                
                initial_response = initial_extraction(text = combined_summary, 
                                                additional_context = context).dict()
                
            else:
                print(f'Company: {processed_name}; Estimated Cost: ${calculate_cost(combined_summary)}')
                initial_response = initial_extraction(text = combined_summary, 
                                            additional_context = None).dict()
            
            print(f'Company: {processed_name}; PART 1 - Initial extraction is completed.')
            
            result = initial_response
            
            if initial_response['client_descriptions']:
                products = initial_response['product_descriptions'] if initial_response['product_descriptions'] else []
                clients = initial_response['client_descriptions'] if initial_response['client_descriptions'] else []
                summary = initial_response['summary_product_description']

                validated_response = information_validation(products, clients, summary)
                print(f'Company: {processed_name}; PART 2 - Information validation is completed.')
                result['validated_client_descriptions'] = validated_response.dict()['client_descriptions']
                
            else:
                print(f'Company: {processed_name}; PART 2 - Skipped, due to lack of client information.')
                result['validated_client_descriptions'] = None
            
            current_dateTime = datetime.now(pytz.timezone('Etc/GMT'))
            result['timestamp'] = current_dateTime.strftime(format = "%Y-%m-%d %H:%M") + ' Etc/GMT'
            result['processed_company'] = processed_name
            result['url'] = read_json_file(summary_file_path)['url']

            write_json_file(extraction_file_path, result)
            
            return result
        else:
            print(f'Summary file: {summary_file_path} does not exist.')
            return None

def add_client_url_to_extraction_output(processed_name:str, extraction_file_path:str, verbose:bool = False, overwrite:bool = False):
    data = read_json_file(extraction_file_path)
    
    # Check if the company has any clients
    if data['validated_client_descriptions']:
        
        # If the url key already exists, meaning the urls have been added and overwrite is False
        if 'url' in data['validated_client_descriptions'][0].keys() and not overwrite:
            print(f"Company: {processed_name}; Skipping as the clients' URLs have been added and overwrite is set to False.")
        else:
            for client in data['validated_client_descriptions']:
                if client['entity_type'] != 'company':
                    client['url'] = None
                else:
                    url = get_and_verify_company_link(client['name'], verbose = verbose)
                    client['url'] = url
            print(f"Company: {processed_name}; Client is extracted.")
    else:
        print(f"Company: {processed_name}; No clients' information.")
    write_json_file(extraction_file_path, data)
    return None

    
def get_embedding(text:str, embedding_model:str="text-embedding-3-small"):
   client_openai = OpenAI(api_key=os.getenv('OPENAI_KEY'))
   
   text = text.replace("\n", " ")
   return client_openai.embeddings.create(input = [text], model=embedding_model).data[0].embedding


def get_product_embedding(processed_name:str, extraction_file_path:str, embedding_model:str="text-embedding-3-small"):
    
    data = read_json_file(extraction_file_path)
    # Check wheather embedding has already been done
    if 'name_embedding' in data['summary_product_description']:
        print(f'Company: {processed_name}; Embedding has already been done.')
        pass
    else:
        product_lst = data['product_descriptions']
        for product in product_lst:
            product['description_embedding'] = get_embedding(text = product['description'],
                                                                embedding_model = embedding_model)
            product['name_embedding'] = get_embedding(text = product['name'],
                                                                embedding_model = embedding_model)

        summary_product = data['summary_product_description']
        summary_product['description_embedding'] = get_embedding(text = summary_product['description'],
                                                                embedding_model = embedding_model)
        summary_product['name_embedding'] = get_embedding(text = summary_product['name'],
                                                                embedding_model = embedding_model)
        print(f'Company: {processed_name}; Embedding is completed.')
        write_json_file(extraction_file_path, data)
    
    return data

def update_client_list(processed_name:str, extraction_file_path:str, client_file_path:str = 'data/client_info.json', verbose:bool = False):
    
    data = read_json_file(extraction_file_path)
    client_info = read_json_file(client_file_path)
        
    if data['validated_client_descriptions']:
        try:        
            for client in data['validated_client_descriptions']:
                if client['entity_type'] != 'company':
                    continue
                # If a company's name already exists in the dictionary and the url is unchanged
                if client['name'] in client_info and client['url'] == client_info[client['name']]['url'] :
                    # If its service provider does not appear in the saved list, then append it
                    if processed_name not in client_info[client['name']]['service_provider_processed']:
                        client_info[client['name']]['service_provider_processed'].append(processed_name)
                        client_info[client['name']]['service_provider'].append(get_additional_info(processed_name, 'companies'))
                        client_info[client['name']]['service_provider_url'].append('https://' + get_additional_info(processed_name, 'processed_url'))
                    else:
                        if verbose:
                            print(f'Company {client["name"]} has already been recorded.')
                
                # If a company's name already does not exist, add the new company
                else:
                    client_info[client['name']] = {'processed_name': process_company_name(client['name']),
                                        'url': client['url'],
                                        'service_provider_processed': [processed_name],
                                        'service_provider': [get_additional_info(processed_name, 'companies')],
                                        'service_provider_url': ['https://' + get_additional_info(processed_name, 'processed_url')]
                                        }
            print(f"Company: {data['processed_company']}; Clients information is updated.")
            write_json_file(client_file_path, client_info)
        except Exception as e:
            print(f'Company: {processed_name}; Error occurred: {e}')
    else:
        print(f'Company: {processed_name}; No clients to be updated')
    

In [43]:
# Example usage
processed_name = 'auquan'
scrape_file_path = f'scraping_output_v2_raw/{processed_name}.json'
summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'

_ = llm_summary_execution(processed_name = processed_name,
                                 scrape_file_path = scrape_file_path,
                                 summary_file_path = summary_file_path)

_ = llm_extraction_execution(processed_name = processed_name,
                         summary_file_path = summary_file_path,
                         extraction_file_path = extraction_file_path, 
                         include_additional_context = True, 
                         overwrite = False)

_ = add_client_url_to_extraction_output(processed_name = processed_name,
                                    extraction_file_path = extraction_file_path)

_ = get_product_embedding(processed_name = processed_name,
                      extraction_file_path = extraction_file_path)

_ = update_client_list(processed_name = processed_name,
                   extraction_file_path = extraction_file_path,
                   client_file_path = 'data/client_info.json')

Extracting data:   0%|          | 0/7 [00:00<?, ?it/s]

Extracting data:  57%|█████▋    | 4/7 [00:05<00:04,  1.36s/it]

Company: auquan; Content in main_page is extracted.


Extracting data:  71%|███████▏  | 5/7 [00:10<00:04,  2.38s/it]

Company: auquan; Content in /professional-services is extracted.


Extracting data:  86%|████████▌ | 6/7 [00:19<00:04,  4.09s/it]

Company: auquan; Content in /success-cases/ is extracted.


Extracting data: 100%|██████████| 7/7 [00:26<00:00,  3.78s/it]

Company: auquan; Content in /success-cases is extracted.
Company: auquan; Information extraction begins.
Company: auquan; Estimated Cost: $0.010124999999999999
Company: auquan; Pitchbook description obtained: Developer of a data science platform intended to discover and implement newer trading ideas. The company's platform develops high-quality trading strategies and bridges the gap between data science and finance, enabling clients to translate the analytical skills of talented people into trading profits.





Company: auquan; PART 1 - Initial extraction is completed.
Company: auquan; PART 2 - Information validation is completed.
Company: auquan; Client is extracted.
Company: auquan; Embedding is completed.
Company: auquan; Clients information is updated.


In [None]:
doc_list = os.listdir('extraction_summary_v2')
for doc in doc_list:
    processed_name = doc.replace('_summary_str.json', '')
    summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'
    extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
    try:
        _ = llm_extraction_execution(processed_name = processed_name,
                                summary_file_path = summary_file_path,
                                extraction_file_path = extraction_file_path, 
                                include_additional_context = True, 
                                overwrite = False)

        _ = get_product_embedding(processed_name = processed_name,
                            extraction_file_path = extraction_file_path)

        _ = add_client_url_to_extraction_output(processed_name = processed_name,
                                            extraction_file_path = extraction_file_path, overwrite = False)
        _ = update_client_list(processed_name = processed_name,
                        extraction_file_path = extraction_file_path,
                        client_file_path = 'data/client_info.json')
        
    except Exception as e:
        print(f'Error occured on company {processed_name}: {e}')

## Get client

In [None]:
# Client data crawling

client_data = read_json_file('data/client_info.json')
for base_url in tqdm(list(client_data.keys())[700:900], desc="Scraping data", position=0, leave=True):
    if not base_url:
        continue
    if base_url in ['https://www.autodesk.com']:
        continue
    try:
        all_urls, related_urls = get_related_urls(base_url)
        if len(related_urls) > 10:
            related_urls = select_urls(related_urls, 10)
            
        processed_name = client_data[base_url]["processed_name"]
        
        if os.path.exists(f'client_scraping_output/{processed_name}.json'):
            data = read_json_file(f'client_scraping_output/{processed_name}.json')
            scraped_urls = [i for i in list(data.keys()) if i not in ["processed_company", "url", "timestamp"]]
            if len(scraped_urls) >= 10 and 'main_page' in scraped_urls:
                print(f'Company {processed_name} has already collected 10 pages')
                continue
            
        result = crawl_data(base_url, related_urls, f'client_scraping_output/{client_data[base_url]["processed_name"]}.json', overwrite=False)
    except Exception as e:
        print(f'Company {client_data[base_url]["processed_name"]} has error: {e}')

In [None]:
import os
import signal
from tqdm import tqdm

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

# Set the timeout limit (in seconds)
TIMEOUT_LIMIT = 180  # 3 minutes

client_data = read_json_file('data/client_info.json')
for base_url in tqdm(list(client_data.keys())[1000:1600], desc="Scraping data", position=0, leave=True):
    if not base_url:
        continue
    if base_url in ['https://www.autodesk.com', 'https://www.docuflow.co.uk']:
        continue

    try:
        # Set the signal handler and a timeout limit
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(TIMEOUT_LIMIT)

        all_urls, related_urls = get_related_urls(base_url)
        if len(related_urls) > 10:
            related_urls = select_urls(related_urls, 10)

        processed_name = client_data[base_url]["processed_name"]

        if os.path.exists(f'client_scraping_output/{processed_name}.json'):
            data = read_json_file(f'client_scraping_output/{processed_name}.json')
            scraped_urls = [i for i in list(data.keys()) if i not in ["processed_company", "url", "timestamp"]]
            if len(scraped_urls) >= 10 and 'main_page' in scraped_urls:
                print(f'Company {processed_name} has already collected 10 pages')
                continue

        result = crawl_data(base_url, related_urls, f'client_scraping_output/{client_data[base_url]["processed_name"]}.json', overwrite=False)

        # Disable the alarm after successful completion
        signal.alarm(0)

    except TimeoutException:
        print(f'Timed out scraping {base_url}, moving to the next URL.')
    except Exception as e:
        print(f'Company {client_data[base_url]["processed_name"]} encountered an error: {e}')

    finally:
        # Ensure the alarm is disabled in case of other exceptions
        signal.alarm(0)


## Full pipeline for clients

In [42]:
# Example usage
processed_name = 'holded'
scrape_file_path = f'client_scraping_output/{processed_name}.json'
summary_file_path = f'client_extraction_summary/{processed_name}_summary.json'
extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'

_ = llm_summary_execution(processed_name = processed_name,
                                 scrape_file_path = scrape_file_path,
                                 summary_file_path = summary_file_path,
                                 overwrite = False)

_ = llm_extraction_execution(processed_name = processed_name,
                         summary_file_path = summary_file_path,
                         extraction_file_path = extraction_file_path, 
                         include_additional_context = False, 
                         overwrite = False)

_ = add_client_url_to_extraction_output(processed_name = processed_name,
                                    extraction_file_path = extraction_file_path)

_ = get_product_embedding(processed_name = processed_name,
                      extraction_file_path = extraction_file_path)



Extracting data: 100%|██████████| 7/7 [00:00<00:00, 85349.21it/s]


Company: holded; Skipping main_page as it already exists and overwrite is set to False.
Company: holded; Skipping /professional-services as it already exists and overwrite is set to False.
Company: holded; Skipping /success-cases/ as it already exists and overwrite is set to False.
Company: holded; Skipping /success-cases as it already exists and overwrite is set to False.
Company: holded; Information extraction begins.
Company: holded; Estimated Cost: $0.01019
Company: holded; PART 1 - Initial extraction is completed.
Company: holded; PART 2 - Information validation is completed.
Company: holded; Client is extracted.
Company: holded; Embedding is completed.
Company: holded; Error occurred: can only concatenate str (not "NoneType") to str


## Token reduction from post processing raw scraped contents

In [11]:
doc_list = os.listdir('scraping_output_v2_raw')

count = 0
doc_index = 0

original_tok_lst = []
clean_tok_lst = []
proportion_lst = []


for index in range(500):
    
    doc = doc_list[index]
    if doc == '.DS_Store':
        continue
    processed_name = doc.replace('.json', '')
    data = read_json_file(f'scraping_output_v2_raw/{processed_name}.json')
    original_tok = count_tokens(data['main_page'])
    clean_tok = count_tokens(clean_scraped_content(data['main_page']))
    
    original_tok_lst.append(original_tok)
    clean_tok_lst.append(clean_tok)
    proportion = clean_tok/(original_tok+0.001)
    proportion_lst.append(proportion)
    
    

In [14]:
1-sum(proportion_lst)/len(proportion_lst)

0.5672967198317319

In [13]:
proportion_array = np.array(proportion_lst)
proportion_array.mean()

0.432703280168268