In [1]:
from neo4j_utility import *
from llm_extraction import *
from firecrawl_scraping import *
from utility import *
import os
from tqdm import tqdm
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import dotenv
import os
from neo4j import GraphDatabase


URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_INSTANCE_PASSWORD"))

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    print("Connection established.")


Connection established.


In [None]:
doc_list = os.listdir('extraction_output_v2')
for doc in tqdm(doc_list):
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        
        extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
        
        kg_construction(processed_name, extraction_file_path)
        
    except Exception as e:
        print(f'Error: {e}')

In [None]:
doc_list = os.listdir('client_extraction_output')
for doc in tqdm(doc_list):
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        
        extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'
        
        kg_construction(processed_name, extraction_file_path)
        
    except Exception as e:
        print(f'Error: {e}')

In [4]:
processed_name = 'bennie'
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
kg_construction(processed_name, extraction_file_path)

Company bennie is added to the graph.


In [2]:
def get_additional_info(processed_name:str, column_name:str, verbose:bool = False):
    
    try: 
        df_all = pd.read_csv('data/merge_url_companies.csv')
        df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
        df_all['processed_name'] = df_all['companies'].apply(process_company_name)
        
        df_select = df_all[df_all['processed_name'] == processed_name]
        if len(df_select) > 0:
            return df_select[column_name].iloc[0]
        else:
            return None
    except Exception as e:
        if verbose:
            print(f'Error has occured when searching {processed_name}: {e}')
        return None

In [3]:
get_additional_info('guesty', 'companies')

In [10]:
data = read_json_file('data/client_info.json')

1572

In [26]:
df = pd.read_csv('data/merge_url_companies.csv')
print(f"Total number of companies: {len(df)}")
print(f"Total number of relevant pages: {df['num_of_related_urls'].sum()}")
print(f"Total number of relevant pages per company: {df['num_of_related_urls'].sum()/len(df)}")

Total number of companies: 791
Total number of relevant pages: 3217
Total number of relevant pages per company: 4.067003792667509


In [34]:
doc_list = os.listdir('client_extraction_summary')
num_companies = 0
num_pages = 0
for doc in doc_list:
    try:
        data = read_json_file(f'client_extraction_summary/{doc}')
        pages = data.keys()
        num_companies += 1
        valid_pages = [i for i in pages if i not in ['processed_name', 'timestamp', 'url', 'model_name']]
        num_pages += len(valid_pages)
    except Exception as e:
        print(f'Errors found at {doc}: {e}')
        
print(f"Total number of companies: {num_companies}")
print(f"Total number of relevant pages: {num_pages}")
print(f"Total number of relevant pages per company: {num_pages/num_companies}")

Errors found at .DS_Store: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
Total number of companies: 1413
Total number of relevant pages: 7545
Total number of relevant pages per company: 5.339702760084926


### Trouble shooting

In [None]:
def troubleshoot_llm_output(processed_name:str, extraction_file_path:str):
    
    try:
        data = read_json_file(extraction_file_path)
        clients = data['validated_client_descriptions']
        modified = False
        product_list = [product['name'] for product in data['product_descriptions']] + [data['summary_product_description']['name']]
        
        if clients:
            for client in clients:
                # Troubleshoot 1: If the client's URL == company's URL
                if client['url']:
                    if client['url'] == data["url"]:
                        print(f'Issue found: Client URL == Company URL: Company {processed_name}; Client {client["name"]}; {client["url"]}')
                        client['url'] = None
                        modified = True
                        
                # Troubleshoot 2: If product_used is not in the product list previously extracted
                if client['product_used'] not in product_list:
                    print(f'Issue found: Company {processed_name}; Client {client["name"]}; Product used {client["product_used"]}')
                    client['product_used'] = data['summary_product_description']['name']
                    modified = True
                    
        if modified:
            write_json_file(extraction_file_path, data)
    except Exception as e:
        print(f'Error found at {processed_name}: {e}')
            
    

### Troubleshooting 1
Issues encountered when client's URL = company's URL

Solution: Set those URLs to None

In [5]:
doc_list = os.listdir('extraction_output_v2_original')
for doc in doc_list:
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        modified = False
        extraction_file_path = f'extraction_output_v2_original/{processed_name}_extraction.json'
        data = read_json_file(extraction_file_path)
        
        clients = data['validated_client_descriptions']
        
        if clients:
            for client in clients:
                if client['url']:
                    if client['url'] == data["url"]:
                        print(f'Issue found: Company {processed_name}; Client {client["name"]}; {client["url"]}')
        #                 client['url'] = None
        #                 modified = True
                        
        # if modified:
        #     write_json_file(extraction_file_path, data)
    except Exception as e:
        print(f'Error found at {processed_name}: {e}')
    

Issue found: Company goodfolio; Client GOODFOLIO; https://www.goodfolio.com
Issue found: Company client_hub; Client Logic Accounting Solutions, Inc; https://www.clienthub.app
Error found at .DS_Store: [Errno 2] No such file or directory: 'extraction_output_v2_original/.DS_Store_extraction.json'
Issue found: Company hemlane; Client HP2 RESIDENTIAL; https://www.hemlane.com
Issue found: Company per_diem; Client Masala Wok and Tikka Shack; https://www.tryperdiem.com
Issue found: Company nami_ml; Client Toronto App Factory; https://www.namiml.com
Issue found: Company likely_ai; Client 1000calls; https://www.likely.ai
Issue found: Company homesearch; Client Peaksons Property Limited; https://www.homesearch.co.uk
Issue found: Company workmarket; Client Quantum Installations Group; https://www.workmarket.com
Error found at celebrity_agent: 'url'
Issue found: Company pf_nexus; Client Debt Capital Advisor Firm; https://www.pfnexus.com
Issue found: Company pf_nexus; Client Basso Group Capital Adv

### Troubleshooting 2
Issues encountered when product used is not in the product list (including NaN)

Solution: use the summary product node

In [6]:
doc_list = os.listdir('extraction_output_v2_original')
for doc in doc_list:
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        modified = False
        extraction_file_path = f'extraction_output_v2_original/{processed_name}_extraction.json'
        data = read_json_file(extraction_file_path)
        
        product_list = [product['name'] for product in data['product_descriptions']] + [data['summary_product_description']['name']]
        
        clients = data['validated_client_descriptions']
        
        if clients:
            for client in clients:
                if client['product_used'] not in product_list:
                    print(f'Issue found: Company {processed_name}; Client {client["name"]}; Product used {client["product_used"]}; URL: {data["url"]}')
        #             client['product_used'] = data['summary_product_description']['name']
        #             modified = True
        
        # if modified:
        #     write_json_file(extraction_file_path, data)
                        
    except Exception as e:
        print(f'Error found at {processed_name}: {e}')

Issue found: Company raken; Client Level 10; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client Webcor; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client Barton Malow; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client Danforth; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client Wadman Corp; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client Fortis Construction; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client D.L. Henricksen; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client Central Plumbing; Product used None; URL: https://www.rakenapp.com
Issue found: Company raken; Client Kyne Construction; Product used None; URL: https://www.rakenapp.com
Issue found: Company crowdbureau; Client CoreLogic; Product used None; URL: https://www.crowdbureau.com
Issue foun

In [None]:
doclist = os.listdir('client_scraping_output')
for doc in tqdm(doclist):
    try:
        processed_name = doc.replace('.json', '')
        scrape_file_path = f'client_scraping_output/{processed_name}.json'
        summary_file_path = f'client_extraction_summary/{processed_name}_summary_str.json'
        extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'

        # _ = llm_summary_execution(processed_name = processed_name,
        #                     scrape_file_path = scrape_file_path,
        #                     summary_file_path = summary_file_path,
        #                     model_name = 'gpt-4o-mini',
        #                     overwrite = False)

        # _ = llm_extraction_execution(processed_name = processed_name,
        #                                 summary_file_path = summary_file_path,
        #                                 extraction_file_path = extraction_file_path, 
        #                                 include_additional_context = True, 
        #                                 model_name = 'gpt-4o',
        #                                 overwrite = False)

        _ = get_product_embedding(processed_name = processed_name,
                            extraction_file_path = extraction_file_path)

        _ = add_client_url_to_extraction_output(processed_name = processed_name,
                                    extraction_file_path = extraction_file_path)

        # _ = update_client_list(processed_name = processed_name,
        #                 extraction_file_path = extraction_file_path,
        #                 client_file_path = 'data/client_info.json')
    except Exception as e:
        print(f'Error at {processed_name}: {e}')
        

In [12]:
processed_name = 'booking_automation'
scrape_file_path = f'client_scraping_output/{processed_name}.json'
summary_file_path = f'client_extraction_summary/{processed_name}_summary_str.json'
extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'

_ = get_product_embedding(processed_name = processed_name,
                    extraction_file_path = extraction_file_path)


Company: booking_automation; Product embedding is completed.
Company: booking_automation; Summary product embedding is completed.


In [None]:
doclist = os.listdir('client_scraping_output')
for doc in tqdm(doclist):

    try:
        processed_name = doc.replace('.json', '')
        scrape_file_path = f'client_scraping_output/{processed_name}.json'
        summary_file_path = f'client_extraction_summary/{processed_name}_summary_str.json'
        extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'
        
        _ = llm_summary_execution(processed_name = processed_name,
                        scrape_file_path = scrape_file_path,
                        summary_file_path = summary_file_path,
                        model_name= 'gpt-4o-mini',
                        overwrite = False)
        
        _ = llm_extraction_execution(processed_name = processed_name,
                                summary_file_path = summary_file_path,
                                extraction_file_path = extraction_file_path, 
                                include_additional_context = False, 
                                model_name = 'gpt-4o',
                                overwrite = False)
        
        _ = get_product_embedding(processed_name = processed_name,
                            extraction_file_path = extraction_file_path)

        _ = add_client_url_to_extraction_output(processed_name = processed_name,
                                    extraction_file_path = extraction_file_path)
        
    except Exception as e:
        print(f'Error at {processed_name}: {e}')
    

In [14]:
processed_name = 'crunch_'
    
scrape_file_path = f'scraping_output_v2_raw/{processed_name}.json'
summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'

_ = llm_summary_execution(processed_name = processed_name,
                      scrape_file_path = scrape_file_path,
                      summary_file_path = summary_file_path,
                      model_name = 'gpt-4o-mini',
                      overwrite = True)

_ = llm_extraction_execution(processed_name = processed_name,
                                summary_file_path = summary_file_path,
                                extraction_file_path = extraction_file_path, 
                                include_additional_context = True, 
                                overwrite = True)

_ = get_product_embedding(processed_name = processed_name,
                    extraction_file_path = extraction_file_path)

_ = add_client_url_to_extraction_output(processed_name = processed_name,
                            extraction_file_path = extraction_file_path)

_ = update_client_list(processed_name = processed_name,
                extraction_file_path = extraction_file_path,
                client_file_path = 'data/client_info.json')

Company: crunch_; Content in /partners is extracted.
Company: crunch_; Content in main_page is extracted.
Company: crunch_; Information extraction begins.
Company: crunch_; Estimated Cost: $0.003855
Company: crunch_; Pitchbook description obtained: Developer of financial business management software designed to serve organizations with all their accounting needs. The company's software is designed for the business owner and includes features like real-time dashboards to get an immediate overview of how the business is performing, enabling them to get the absolute most out of running their own business.
Company: crunch_; PART 1 - Initial extraction is completed.
Company: crunch_; PART 2 - Skipped, due to lack of client information.
Company: crunch_; Embedding is completed.
Company: crunch_; No clients' information.
Company: crunch_; No clients to be updated


Issue page:
- zenefits

In [None]:

def inspect_page(processed_name, endpoint = 'main_page'):
   
    scraping_data = read_json_file(f'scraping_output_v2_raw/{processed_name}.json')
    print('---------------------------------------------')
    print(f'Before cleaning: {endpoint}')
    print('---------------------------------------------')
    print(scraping_data[endpoint])
    
    print('---------------------------------------------')
    print(f'After cleaning: {endpoint}')
    print('---------------------------------------------')
    print(clean_scraped_content(scraping_data[endpoint]))
    
    print('---------------------------------------------')
    print(f'Summary of page: {endpoint}')
    print('---------------------------------------------')
    summary_data = read_json_file(f'extraction_summary_v2/{processed_name}_summary_str.json')
    print(summary_data[endpoint])

 

In [11]:
import pandas as pd
import re

# Path to the log file
log_file_path = 'pipeline.log'

# Read the log file
with open(log_file_path, 'r') as file:
    log_lines = file.readlines()

# Regex to extract information
time_pattern = re.compile(r":Function (\w+) executed in ([\d\.]+) seconds")
page_pattern = re.compile(r":Function (\w+) executed on company (\w+) with (\d+) pages")

# Lists to hold the data
function_times = []
function_pages = []

# Parse the log file
for line in log_lines:
    time_match = time_pattern.search(line)
    page_match = page_pattern.search(line)

    if time_match:
        function, time = time_match.groups()
        time = float(time)
        function_times.append({'function': function, 'time': time})

    if page_match:
        function, company, pages = page_match.groups()
        pages = int(pages)
        function_pages.append({'function': function, 'company': company, 'pages': pages})

# Convert lists to DataFrames
df_times = pd.DataFrame(function_times)
df_pages = pd.DataFrame(function_pages)




2193.3076

In [18]:
print(f"Total time spent: {df_times['time'].sum()}s -- Total no. companies: {len(df_pages)} -- Total no. pages: {df_pages['pages'].sum()}")
print(f"Each company roughly have {df_pages['pages'].sum()/len(df_pages)} pages")
print(f"Total time / Number of pages: {df_times['time'].sum()/len(df_pages)}s per company")
print(f"Total time / Number of pages: {df_times['time'].sum()/df_pages['pages'].sum()}s per page")


Total time spent: 2193.3076s -- Total no. companies: 25 -- Total no. pages: 127
Each company roughly have 5.08 pages
Total time / Number of pages: 87.732304s per company
Total time / Number of pages: 17.270138582677166s per page
