In [1]:
from neo4j_utility import *
from llm_extraction import *
from firecrawl_scraping import *
from utility import *
import os
from tqdm import tqdm
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import dotenv
import os
from neo4j import GraphDatabase


URI = os.getenv("NEO4J_URI")
AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_INSTANCE_PASSWORD"))

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    print("Connection established.")


Connection established.


In [None]:
doc_list = os.listdir('extraction_output_v2')
for doc in tqdm(doc_list):
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        
        extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
        
        kg_construction(processed_name, extraction_file_path)
        
    except Exception as e:
        print(f'Error: {e}')

In [None]:
doc_list = os.listdir('client_extraction_output')
for doc in tqdm(doc_list):
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        
        extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'
        
        kg_construction(processed_name, extraction_file_path)
        
    except Exception as e:
        print(f'Error: {e}')

In [4]:
processed_name = 'bennie'
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
kg_construction(processed_name, extraction_file_path)

Company bennie is added to the graph.


In [2]:
def get_additional_info(processed_name:str, column_name:str, verbose:bool = False):
    
    try: 
        df_all = pd.read_csv('data/merge_url_companies.csv')
        df_all['companies'] = df_all['companies'].str.replace(r'\s*\(.*?\)\s*', '', regex=True)
        df_all['processed_name'] = df_all['companies'].apply(process_company_name)
        
        df_select = df_all[df_all['processed_name'] == processed_name]
        if len(df_select) > 0:
            return df_select[column_name].iloc[0]
        else:
            return None
    except Exception as e:
        if verbose:
            print(f'Error has occured when searching {processed_name}: {e}')
        return None

In [3]:
get_additional_info('guesty', 'companies')

In [10]:
data = read_json_file('data/client_info.json')

1572

### Troubleshooting 1
Issues encountered when client's URL = company's URL

Solution: Set those URLs to None

In [33]:
doc_list = os.listdir('extraction_output_v2')
for doc in doc_list:
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        modified = False
        extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
        data = read_json_file(extraction_file_path)
        
        clients = data['validated_client_descriptions']
        
        if clients:
            for client in clients:
                if client['url']:
                    if client['url'] == data["url"]:
                        print(f'Issue found: Company {processed_name}; Client {client["name"]}')
                        client['url'] = None
                        modified = True
                        
        if modified:
            write_json_file(extraction_file_path, data)
    except Exception as e:
        print(f'Error found at {processed_name}: {e}')
    

Error found at .DS_Store: [Errno 2] No such file or directory: 'extraction_output_v2/.DS_Store_extraction.json'
Error found at celebrity_agent: 'url'


### Troubleshooting 2
Issues encountered when product used is not in the product list (including NaN)

Solution: use the summary product node

In [35]:
doc_list = os.listdir('extraction_output_v2')
for doc in doc_list:
    
    try:
        processed_name = doc.replace('_extraction.json', '')
        modified = False
        extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'
        data = read_json_file(extraction_file_path)
        
        product_list = [product['name'] for product in data['product_descriptions']] + [data['summary_product_description']['name']]
        
        clients = data['validated_client_descriptions']
        
        if clients:
            for client in clients:
                if client['product_used'] not in product_list:
                    print(f'Issue found: Company {processed_name}; Client {client["name"]}; Product used {client["product_used"]}')
                    client['product_used'] = data['summary_product_description']['name']
                    modified = True
        
        if modified:
            write_json_file(extraction_file_path, data)
                        
    except Exception as e:
        print(f'Error found at {processed_name}: {e}')

Error found at .DS_Store: [Errno 2] No such file or directory: 'extraction_output_v2/.DS_Store_extraction.json'
Error found at stacker_news: 'NoneType' object is not iterable
Error found at celebrity_agent: 'NoneType' object is not iterable


In [None]:
doclist = os.listdir('client_scraping_output')
for doc in tqdm(doclist):
    try:
        processed_name = doc.replace('.json', '')
        scrape_file_path = f'client_scraping_output/{processed_name}.json'
        summary_file_path = f'client_extraction_summary/{processed_name}_summary_str.json'
        extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'

        # _ = llm_summary_execution(processed_name = processed_name,
        #                     scrape_file_path = scrape_file_path,
        #                     summary_file_path = summary_file_path,
        #                     model_name = 'gpt-4o-mini',
        #                     overwrite = False)

        # _ = llm_extraction_execution(processed_name = processed_name,
        #                                 summary_file_path = summary_file_path,
        #                                 extraction_file_path = extraction_file_path, 
        #                                 include_additional_context = True, 
        #                                 model_name = 'gpt-4o',
        #                                 overwrite = False)

        _ = get_product_embedding(processed_name = processed_name,
                            extraction_file_path = extraction_file_path)

        _ = add_client_url_to_extraction_output(processed_name = processed_name,
                                    extraction_file_path = extraction_file_path)

        # _ = update_client_list(processed_name = processed_name,
        #                 extraction_file_path = extraction_file_path,
        #                 client_file_path = 'data/client_info.json')
    except Exception as e:
        print(f'Error at {processed_name}: {e}')
        

In [12]:
processed_name = 'booking_automation'
scrape_file_path = f'client_scraping_output/{processed_name}.json'
summary_file_path = f'client_extraction_summary/{processed_name}_summary_str.json'
extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'

_ = get_product_embedding(processed_name = processed_name,
                    extraction_file_path = extraction_file_path)


Company: booking_automation; Product embedding is completed.
Company: booking_automation; Summary product embedding is completed.


In [None]:
doclist = os.listdir('client_scraping_output')
for doc in tqdm(doclist):

    try:
        processed_name = doc.replace('.json', '')
        scrape_file_path = f'client_scraping_output/{processed_name}.json'
        summary_file_path = f'client_extraction_summary/{processed_name}_summary_str.json'
        extraction_file_path = f'client_extraction_output/{processed_name}_extraction.json'
        
        _ = llm_summary_execution(processed_name = processed_name,
                        scrape_file_path = scrape_file_path,
                        summary_file_path = summary_file_path,
                        model_name= 'gpt-4o-mini',
                        overwrite = False)
        
        _ = llm_extraction_execution(processed_name = processed_name,
                                summary_file_path = summary_file_path,
                                extraction_file_path = extraction_file_path, 
                                include_additional_context = False, 
                                model_name = 'gpt-4o',
                                overwrite = False)
        
        _ = get_product_embedding(processed_name = processed_name,
                            extraction_file_path = extraction_file_path)

        _ = add_client_url_to_extraction_output(processed_name = processed_name,
                                    extraction_file_path = extraction_file_path)
        
    except Exception as e:
        print(f'Error at {processed_name}: {e}')
    

In [14]:
processed_name = 'crunch_'
    
scrape_file_path = f'scraping_output_v2_raw/{processed_name}.json'
summary_file_path = f'extraction_summary_v2/{processed_name}_summary_str.json'
extraction_file_path = f'extraction_output_v2/{processed_name}_extraction.json'

_ = llm_summary_execution(processed_name = processed_name,
                      scrape_file_path = scrape_file_path,
                      summary_file_path = summary_file_path,
                      model_name = 'gpt-4o-mini',
                      overwrite = True)

_ = llm_extraction_execution(processed_name = processed_name,
                                summary_file_path = summary_file_path,
                                extraction_file_path = extraction_file_path, 
                                include_additional_context = True, 
                                overwrite = True)

_ = get_product_embedding(processed_name = processed_name,
                    extraction_file_path = extraction_file_path)

_ = add_client_url_to_extraction_output(processed_name = processed_name,
                            extraction_file_path = extraction_file_path)

_ = update_client_list(processed_name = processed_name,
                extraction_file_path = extraction_file_path,
                client_file_path = 'data/client_info.json')

Company: crunch_; Content in /partners is extracted.
Company: crunch_; Content in main_page is extracted.
Company: crunch_; Information extraction begins.
Company: crunch_; Estimated Cost: $0.003855
Company: crunch_; Pitchbook description obtained: Developer of financial business management software designed to serve organizations with all their accounting needs. The company's software is designed for the business owner and includes features like real-time dashboards to get an immediate overview of how the business is performing, enabling them to get the absolute most out of running their own business.
Company: crunch_; PART 1 - Initial extraction is completed.
Company: crunch_; PART 2 - Skipped, due to lack of client information.
Company: crunch_; Embedding is completed.
Company: crunch_; No clients' information.
Company: crunch_; No clients to be updated


Issue page:
- zenefits

In [None]:

def inspect_page(processed_name, endpoint = 'main_page'):
   
    scraping_data = read_json_file(f'scraping_output_v2_raw/{processed_name}.json')
    print('---------------------------------------------')
    print(f'Before cleaning: {endpoint}')
    print('---------------------------------------------')
    print(scraping_data[endpoint])
    
    print('---------------------------------------------')
    print(f'After cleaning: {endpoint}')
    print('---------------------------------------------')
    print(clean_scraped_content(scraping_data[endpoint]))
    
    print('---------------------------------------------')
    print(f'Summary of page: {endpoint}')
    print('---------------------------------------------')
    summary_data = read_json_file(f'extraction_summary_v2/{processed_name}_summary_str.json')
    print(summary_data[endpoint])

 

In [None]:
inspect_page('truelytics')

---------------------------------------------
Before cleaning: main_page
---------------------------------------------
[![truelytics-envestnet-logo-white](https://www.truelytics.com/hubfs/truelytics-envestnet-logo-white.svg "truelytics-envestnet-logo-white")](https://www.truelytics.com)

*   [SOFTWARE](https://www.truelytics.com/software)
    *   [PLATFORM OVERVIEW](https://www.truelytics.com/software)
        *   [TrueRecruit](https://www.truelytics.com/truerecruit)
            
        *   [TruePerformance](https://www.truelytics.com/trueperformance)
            
        *   [TrueContinuity](https://www.truelytics.com/truecontinuity)
            
        *   [TrueMatch](https://www.truelytics.com/truematch)
            
    *   [NOTABLE FEATURES](https://www.truelytics.com/software)
        *   [Data & Benchmarking](https://www.truelytics.com/data-and-benchmarking)
            
        *   [Enterprise Functionality](https://www.truelytics.com/enterprise-functionality)
            
  