In [1]:
import pandas as pd
import numpy as np
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *
import glob


### PitchBook Data Preprocessing

In [22]:
# Sanitize property names
def process_column_name(name):
    """Turn a raw column header into a lowercase, underscore-separated property name.

    Spaces, '/', ';' and '-' each become '_', '#' becomes the word 'number',
    and commas are dropped. Every other character (for example '&') is kept.
    """
    substitutions = str.maketrans({
        ' ': '_',
        '/': '_',
        ';': '_',
        '-': '_',
        '#': 'number',
        ',': None,  # None deletes the character
    })
    return name.lower().translate(substitutions)
    
def preprocess_url(url):
    """Normalize a website value into the scheme-less ``www.<host>`` form.

    Args:
        url: Raw website cell. Missing values may arrive as ``NaN`` (a float),
            ``None`` or an empty/blank string.

    Returns:
        The trimmed URL without a leading ``http://``/``https://`` and with a
        ``www.`` prefix added when absent, or ``None`` when there is no usable
        value (so the row can be dropped with ``dropna``).
    """
    # NaN, None and any other non-string cell carry no URL.
    if not isinstance(url, str):
        return None

    url = url.strip()
    if not url:
        return None

    # Drop an explicit scheme first; otherwise the 'www.' prefix below would
    # produce values such as 'www.https://example.com'.
    lowered = url.lower()
    for scheme in ('http://', 'https://'):
        if lowered.startswith(scheme):
            url = url[len(scheme):]
            break

    # Prefix check is case-insensitive so 'WWW.example.com' is left alone.
    if not url.lower().startswith('www.'):
        url = 'www.' + url

    return url

In [23]:
# Read the raw PitchBook export; header=8 takes the column names from the
# row at zero-based position 8 of the sheet.
pitchbook_export_path = 'PitchBook_All_Columns_2024_07_04_14_48_36.xlsx'
df = pd.read_excel(pitchbook_export_path, header=8)

KeyboardInterrupt: 

In [None]:
# Raw PitchBook columns kept for the rest of the notebook.
columns_of_interest = [
    'Company ID', 'Companies', 'Company Former Name', 'Company Legal Name', 'Competitors',
    'Description', 'Primary Industry Sector', 'Primary Industry Group', 'Primary Industry Code',
    'All Industries', 'Verticals',
    'Keywords', 'Company Financing Status', 'Total Raised', 'Business Status', 'Ownership Status',
    'Universe', 'Website', 'Employees',
    'Year Founded', 'Parent Company', 'Market Cap', 'Revenue', 'Gross Profit', 'Net Income',
    'Enterprise Value', 'HQ Location', 'HQ Country/Territory/Region',
    'HQ City', 'HQ State/Province', 'HQ Global Region', 'HQ Global Sub Region', 'Active Investors',
    '# Active Investors', 'Acquirers', 'IPO Probability', 'M&A Probability',
    'First Financing Valuation', 'First Financing Valuation Status', 'Last Financing Valuation',
    'Last Financing Valuation Status',
]

# Raw header -> sanitized property name.
processed_column_names = [process_column_name(raw_name) for raw_name in columns_of_interest]
column_mapper = dict(zip(columns_of_interest, processed_column_names))

# Business statuses that exclude a company from the working set.
excluded_statuses = ['Out of Business', 'Bankruptcy: Liquidation', 'Bankruptcy: Admin/Reorg']

df = (
    df[columns_of_interest]
    # rows without a Company ID are dropped
    .loc[lambda frame: frame['Company ID'].notna()]
    .rename(columns=column_mapper)
    .assign(processed_url=lambda frame: frame['website'].apply(preprocess_url))
    # preprocess_url yields None for a missing website
    .dropna(subset=['processed_url'])
    .loc[lambda frame: ~frame['business_status'].isin(excluded_statuses)]
)

df

Unnamed: 0,company_id,companies,company_former_name,company_legal_name,competitors,description,primary_industry_sector,primary_industry_group,primary_industry_code,all_industries,...,active_investors,number_active_investors,acquirers,ipo_probability,m&a_probability,first_financing_valuation,first_financing_valuation_status,last_financing_valuation,last_financing_valuation_status,processed_url
0,55185-04,Estimize,,"Estimize, Inc.","Neudata, SigFig, Motif (Financial Software), Y...",Developer of an open financial estimates platf...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,,,ExtractAlpha,,,6.34,Actual,,,www.estimize.com
2,56288-62,New Constructs,,"New Constructs, LLC","Morningstar, CFRA, Finbox (Media and Informati...",Operator of an investment research firm intend...,Information Technology,Software,Financial Software,"Financial Software*, Media and Information Ser...",...,SixThirty Ventures,1.0,,,,2.17,Actual,,,www.newconstructs.com
3,59990-50,Justworks,Clockwork,"Justworks, Inc.","UKG, Personio, Zenefits, Namely, Flock (Busine...",Developer of a human resource management platf...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Human Capital...",...,"Bain Capital Ventures, Capital Factory, Daring...",16.0,,33.0,52.0,,,,,www.justworks.com
4,53739-01,Procore Technologies (NYS: PCOR),,"Procore Technologies, Inc.","Projectmates, eBuilder, CMiC",Procore Technologies Inc is a cloud-based cons...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Construction ...",...,,,,,,4.00,Actual,8585.03,Estimated,www.procore.com
5,58716-37,Driveway (Business/Productivity Software),,Driveway Software Corporation,"Viaduct (Business/Productivity Software), Tour...",Developer of a telematics technology designed ...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,,,Earnix,,,,,,,www.driveway.ai
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1057,180356-77,Your Front Desk,,VR Front Desk Corporation,,Developer of a virtual property management pla...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,Blue Startups,1.0,,,,,,,,www.yourfrontdesk.co
1058,54093-61,Zanbato,,"Zanbato, Inc.","Axial, Forge Global, Netcapital, Nasdaq Privat...",Developer of an alternative trading system des...,Information Technology,Software,Financial Software,"Brokerage, Financial Software*, Media and Info...",...,"8VC, Alberta Investment Management, Altriarch,...",27.0,,1.0,41.0,,,120.50,Actual,www.zanbato.com
1060,171336-07,Zenplace,,"Zenplace, Inc.","Guesty, Poplar Homes, Hometime, Different (Bus...",Developer of a rental management and leasing p...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,"Catapult VC, Focal (VC), Oriza Ventures, Plug ...",4.0,,,,,,,,www.zenplace.com
1061,459585-01,Zorba (Business/Productivity Software),,"Zorba, Inc.",,Developer of a property management platform in...,Information Technology,Software,Business/Productivity Software,"Business/Productivity Software*, Media and Inf...",...,"Fresh.Fund, Goodwater Capital, Heroic Ventures...",7.0,,,,,,12.20,Estimated,www.getzorba.com


In [None]:
# Write the cleaned table to CSV without the row index.
processed_csv_path = 'PitchBook_All_Columns_2024_07_04_14_48_36.csv'
df.to_csv(processed_csv_path, index=False)

### Exploratory Analysis

In [None]:
# Load the processed general-information export and list the statuses it contains.
df = pd.read_csv('PitchBook_Company_General_Information_2024_06_28_12_35_25_processed.csv')
print(df['company_business_status'].unique())

# Drop companies whose status marks them as out of business or bankrupt.
defunct_statuses = ['Out of Business', 'Bankruptcy: Liquidation', 'Bankruptcy: Admin/Reorg']
is_defunct = df['company_business_status'].isin(defunct_statuses)
df = df[~is_defunct]
df


['Generating Revenue' 'Out of Business' 'Profitable'
 'Generating Revenue/Not Profitable' 'Startup' 'Stealth'
 'Product In Beta Test' 'Clinical Trials - Phase 1'
 'Bankruptcy: Liquidation' 'Restart' 'Bankruptcy: Admin/Reorg']


Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
0,55185-04,Estimize,,Developer of an open financial estimates platf...,The company was acquired by ExtractAlpha for a...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Other Financial Services;M...,...,+1 (212) 634-9085,,info@estimize.com,Vinesh Jha,Chief Executive Officer,,,,www.estimize.com,True
2,56288-62,New Constructs,,Operator of an investment research firm intend...,"Solidus, Lucius Burch, Lhoist North America of...",Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Bi...",Financial Software*;Media and Information Serv...,...,+1 (615) 377-0443,,info@newconstructs.com,David Trainer,Co-Founder & Chief Executive Officer,,,,www.newconstructs.com,True
3,59990-50,Justworks,Clockwork,Developer of a human resource management platf...,The company raised $16.84 million of venture f...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, SaaS",Business/Productivity Software*;Human Capital ...,...,+1 (888) 534-1711,,hello@justworks.com,Isaac Oates,Co-Founder & Executive Chairman,,,,www.justworks.com,False
4,53739-01,Procore Technologies,,Procore Technologies Inc is a cloud-based cons...,The company raised $634.49 million in its init...,Information Technology,Software,Business/Productivity Software,"Construction Technology, Real Estate Technolog...",Business/Productivity Software*;Media and Info...,...,+1 (866) 477-6267,,info@procore.com,Paul Lyandres,President,Craig Courtemanche Jr.,,,www.procore.com,True
5,58716-37,Driveway (Business/Productivity Software),,Developer of a telematics technology designed ...,The company was acquired by Earnix for an undi...,Information Technology,Software,Business/Productivity Software,"Artificial Intelligence & Machine Learning, Fi...",Business/Productivity Software*;Media and Info...,...,,,,,,,,,www.driveway.ai,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,180356-77,Your Front Desk,,Developer of a virtual property management pla...,The company joined Blue Startups as part of it...,Information Technology,Software,Business/Productivity Software,"Real Estate Technology, SaaS",Business/Productivity Software*;Media and Info...,...,,,info@vrfrontdesk.com,Aaron MacDaniel,Founder & Chief Executive Officer,,,,www.yourfrontdesk.co,True
1051,54093-61,Zanbato,,Developer of an alternative trading system des...,The company raised $20.50 million of Series C ...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Media and Information Serv...,...,+1 (866) 926-2286,,,Knut Sand,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.zanbato.com,True
1053,171336-07,Zenplace,,Developer of a rental management and leasing p...,"The company raised venture funding from Focal,...",Information Technology,Software,Business/Productivity Software,"Artificial Intelligence & Machine Learning, Bi...",Business/Productivity Software*;Real Estate Se...,...,+1 (888) 936-7522,,contactus@zenplace.com,Eric Holly,Chief Executive Officer & President,,,,www.zenplace.com,True
1054,459585-01,Zorba (Business/Productivity Software),,Developer of a property management platform in...,The company raised $2.2 million of venture fun...,Information Technology,Software,Business/Productivity Software,"Real Estate Technology, SaaS",Business/Productivity Software*;Media and Info...,...,+1 (813) 212-6319,,hello@getzorba.com,Or Preiss,Co-Founder & Chief Executive Officer,,,,www.getzorba.com,True


In [None]:
# Companies tagged with the FinTech vertical. na=False treats a missing
# `verticals` cell as "no match"; without it the mask contains NaN and
# boolean indexing raises.
df_fintech = df[df['verticals'].str.contains('FinTech', na=False)]

# Subset whose website was flagged as accessible.
df_fintech_filter = df_fintech[df_fintech['is_accessible'] == True]

df_fintech_filter.head()

Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
0,55185-04,Estimize,,Developer of an open financial estimates platf...,The company was acquired by ExtractAlpha for a...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Other Financial Services;M...,...,+1 (212) 634-9085,,info@estimize.com,Vinesh Jha,Chief Executive Officer,,,,www.estimize.com,True
2,56288-62,New Constructs,,Operator of an investment research firm intend...,"Solidus, Lucius Burch, Lhoist North America of...",Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Bi...",Financial Software*;Media and Information Serv...,...,+1 (615) 377-0443,,info@newconstructs.com,David Trainer,Co-Founder & Chief Executive Officer,,,,www.newconstructs.com,True
8,54118-36,Magnite,"The Rubicon Project, Inc., Rubicon Project, Inc.",Magnite is one of the largest supply-side plat...,The company completed a $540 million of debt r...,Information Technology,Software,Business/Productivity Software,"AdTech, FinTech, SaaS, TMT",Business/Productivity Software*;Media and Info...,...,+1 (212) 243-2769,+1 (212) 414-8748,contact@magnite.com,David Day,Chief Financial Officer,,,,www.magnite.com,True
14,50851-45,Cardlytics,,Cardlytics Inc operates an advertising platfor...,The company raised $70.2 million in its initia...,Business Products and Services (B2B),Commercial Services,Media and Information Services (B2B),"Artificial Intelligence & Machine Learning, Ad...",Media and Information Services (B2B)*;Business...,...,+1 (888) 798-5802,,info@cardlytics.com,Alexis DeSieno,"Chief Financial Officer, Finance & Chief Accou...",,,,www.cardlytics.com,True
16,40672-81,Mindbody,,Developer of business management software inte...,The company received an undisclosed amount of ...,Information Technology,Software,Business/Productivity Software,"FinTech, Marketing Tech, SaaS",Business/Productivity Software*;Media and Info...,...,+1 (877) 755-4279,,,Tom Aveston,Chief Financial Officer,,,,www.mindbodyonline.com,True


In [None]:
# FinTech companies whose `is_accessible` flag is anything other than True.
not_accessible_mask = df_fintech['is_accessible'].ne(True)
df_not_accessible = df_fintech[not_accessible_mask]
df_not_accessible.head()

Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
3,59990-50,Justworks,Clockwork,Developer of a human resource management platf...,The company raised $16.84 million of venture f...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, SaaS",Business/Productivity Software*;Human Capital ...,...,+1 (888) 534-1711,,hello@justworks.com,Isaac Oates,Co-Founder & Executive Chairman,,,,www.justworks.com,False
5,58716-37,Driveway (Business/Productivity Software),,Developer of a telematics technology designed ...,The company was acquired by Earnix for an undi...,Information Technology,Software,Business/Productivity Software,"Artificial Intelligence & Machine Learning, Fi...",Business/Productivity Software*;Media and Info...,...,,,,,,,,,www.driveway.ai,False
10,55758-97,Gusto,"Switchboard Labs, ZenPayroll","Developer of cloud-based payroll, benefits, an...",The company raised $230 million of Series E ve...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, Industrials, SaaS, TMT",Business/Productivity Software*;Human Capital ...,...,+1 (800) 936-0383,,info@gusto.com,Joshua Reeves,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.gusto.com,False
12,56265-94,Carta,"eShares, Eshares Securities",Developer of an equity management platform des...,"Kima Ventures, Anthemis, NKM Capital, SV Angel...",Information Technology,Software,Financial Software,"FinTech, Mobile, SaaS, TMT",Financial Software*;Media and Information Serv...,...,+1 (650) 669-8381,,info@carta.com,Henry Ward,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.carta.com,False
13,52260-85,YapStone,,Developer of a global payments platform design...,The company reached a definitive agreement to ...,Information Technology,Software,Financial Software,"B2B Payments, FinTech, Mobile Commerce, Mobile...",Financial Software*;Media and Information Serv...,...,+1 (866) 289-5977,,info@yapstone.com,Frank Mastrangelo,Co-Chief Executive Officer & Board Member,,,,www.yapstone.com,False


In [None]:
# Company name -> processed URL for the accessible FinTech companies.
{
    name: url
    for name, url in zip(df_fintech_filter['company_name'], df_fintech_filter['processed_url'])
}

{'Estimize': 'www.estimize.com',
 'New Constructs': 'www.newconstructs.com',
 'Magnite': 'www.magnite.com',
 'Cardlytics': 'www.cardlytics.com',
 'Mindbody': 'www.mindbodyonline.com',
 'Plaid': 'www.plaid.com',
 'Human Interest': 'www.humaninterest.com',
 'Agro.Club': 'www.agro.club',
 'ChargeBee': 'www.chargebee.com',
 'WorkMarket': 'www.workmarket.com',
 'Xactly': 'www.xactlycorp.com',
 'Catalis (Alpharetta)': 'www.catalisgov.com',
 'FloQast': 'www.floqast.com',
 'Zenefits': 'www.zenefits.com',
 'Pinnacle Realty Advisors': 'www.pinnaclera.com',
 'Quantexa': 'www.quantexa.com',
 'Ooyala': 'www.ooyala.com',
 'Tradeshift': 'www.tradeshift.com',
 'Landbay': 'www.landbay.co.uk',
 'Acumatica': 'www.acumatica.com',
 'Ocrolus': 'www.ocrolus.com',
 'Softheon': 'www.softheon.com',
 'Side': 'www.side.com',
 'Karbon': 'www.karbonhq.com',
 'GoSite': 'www.gosite.com',
 'ShowingTime': 'www.showingtime.com',
 'Tango Card': 'www.tangocard.com',
 'Beacon (London)': 'www.beacon.com',
 'Axial': 'www.axi

In [None]:
# Company name -> processed URL for the FinTech companies not flagged accessible.
{
    name: url
    for name, url in zip(df_not_accessible['company_name'], df_not_accessible['processed_url'])
}

{'Justworks': 'www.justworks.com',
 'Driveway (Business/Productivity Software)': 'www.driveway.ai',
 'Gusto': 'www.gusto.com',
 'Carta': 'www.carta.com',
 'YapStone': 'www.yapstone.com',
 'PitchBook Data': 'www.pitchbook.com',
 'Shyft': 'www.shyftmoving.com',
 'SuperRewards': 'www.superrewards.com',
 'Finaloop': 'www.finaloop.com',
 'Stem (Financial Software)': 'www.stem.is',
 'idaciti': 'www.hello.idaciti.com',
 'FixtHub': 'www.fixthub.com',
 'DiliVer': 'www.diliver.com',
 '6fusion': 'www.6fusion.com',
 'Adapt Ready': 'www.adaptready.com',
 'AddZest': 'www.addzest.ai',
 'AuthorityData': 'www.authoritydata.com',
 'BeneStream': 'www.benestream.com',
 'Betterfin': 'www.betterfin.com',
 'Capdesk': 'www.capdesk.com',
 'CareerGig': 'www.careergig.com',
 'CodexDF': 'www.codexdf.com',
 'CodiPark': 'www.codipark.us',
 'COG Network': 'www.cog.network',
 'Concert (Financial Software)': 'www.concertfinance.com',
 'CrowdTwist': 'www.crowdtwist.com',
 'CxO Analytics': 'www.cxo-analytics.com',
 'Cyo

In [None]:
# 'Ocrolus': 'www.ocrolus.com',
# 'Softheon': 'www.softheon.com',
# 'Side': 'www.side.com',


# Run the extraction agent on a single company as a smoke test.
sample_company = {'company_name': 'Softheon', 'url': 'www.softheon.com'}
LLM_extraction_agent(**sample_company)

https://www.softheon.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/softheon_20240708.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/softheon.json


{'product_offering': {'Compliance': 'Grow membership by over 2,000% while reducing the proportion of HICS cases.',
  'Shopping & Enrollment': 'Offer Phase III Enhanced Direct Enrollment, accounting for 17% of all ACA enrollments.',
  'Premium Billing': 'Consolidate vendors for a smoother handoff from enrollment to billing.'},
 'partners': {'Hometown Health': 'Softheon’s expert knowledge of the ACA marketplace coupled with their leading enrollment and billing technology has made them the perfect partner during Hometown Health’s first open enrollment period on the exchange.',
  'CVS/Aetna': '[Softheon has] a fantastic solution and its performance has been great… the strength of the vendor relationship isn’t whether you have problems or not, it’s how you go about solving them and those issues.'},
 'logos': ['https://www.softheon.com/wp-content/uploads/2023/03/amerihealth-norm.webp',
  'https://www.softheon.com/wp-content/uploads/2023/03/scott-white-health.webp',
  'https://www.softheon.co

In [None]:
# Parse each extraction result once and reuse the parsed payload below,
# instead of re-reading the same file for every derived list.
extraction_dir = 'extraction_output'
files = os.listdir(extraction_dir)
parsed_by_file = {file: read_json_file(os.path.join(extraction_dir, file)) for file in files}

# Valid files are those with a non-empty payload, i.e. companies that have an
# accessible webpage.
valid_files = [file for file in files if len(parsed_by_file[file]) > 0]
companies_data = [parsed_by_file[file] for file in valid_files]
companies = [payload['company_name'] for payload in companies_data]

# FinTech rows for which an extraction result exists.
sample_df = df_fintech[df_fintech['company_name'].isin(companies)]
sample_df.head()

Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
0,55185-04,Estimize,,Developer of an open financial estimates platf...,The company was acquired by ExtractAlpha for a...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Other Financial Services;M...,...,+1 (212) 634-9085,,info@estimize.com,Vinesh Jha,Chief Executive Officer,,,,www.estimize.com,True
2,56288-62,New Constructs,,Operator of an investment research firm intend...,"Solidus, Lucius Burch, Lhoist North America of...",Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Bi...",Financial Software*;Media and Information Serv...,...,+1 (615) 377-0443,,info@newconstructs.com,David Trainer,Co-Founder & Chief Executive Officer,,,,www.newconstructs.com,True
3,59990-50,Justworks,Clockwork,Developer of a human resource management platf...,The company raised $16.84 million of venture f...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, SaaS",Business/Productivity Software*;Human Capital ...,...,+1 (888) 534-1711,,hello@justworks.com,Isaac Oates,Co-Founder & Executive Chairman,,,,www.justworks.com,False
8,54118-36,Magnite,"The Rubicon Project, Inc., Rubicon Project, Inc.",Magnite is one of the largest supply-side plat...,The company completed a $540 million of debt r...,Information Technology,Software,Business/Productivity Software,"AdTech, FinTech, SaaS, TMT",Business/Productivity Software*;Media and Info...,...,+1 (212) 243-2769,+1 (212) 414-8748,contact@magnite.com,David Day,Chief Financial Officer,,,,www.magnite.com,True
10,55758-97,Gusto,"Switchboard Labs, ZenPayroll","Developer of cloud-based payroll, benefits, an...",The company raised $230 million of Series E ve...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, Industrials, SaaS, TMT",Business/Productivity Software*;Human Capital ...,...,+1 (800) 936-0383,,info@gusto.com,Joshua Reeves,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.gusto.com,False


## Use Neomodel

In [None]:
from neomodel import (config, StructuredNode, StringProperty, IntegerProperty,
    UniqueIdProperty, RelationshipTo)

# Point neomodel at the local Neo4j instance; the password is read from the
# NEO4J_PASSWORD environment variable.
neo4j_password = os.getenv('NEO4J_PASSWORD')
config.DATABASE_URL = f"bolt://neo4j:{neo4j_password}@localhost:7687"

class Company(StructuredNode):
    """Graph node for a company and its outgoing relationships.

    Every property is declared as a string; ``name`` carries a unique index.
    """
    # Descriptive properties. Note that the employee count and founding year
    # are declared as strings here, not integers.
    name = StringProperty(unique_index=True)
    company_former_name = StringProperty()
    description = StringProperty()
    financing_status_note = StringProperty()
    primary_industry_sector = StringProperty()
    primary_industry_group = StringProperty()
    primary_industry_code = StringProperty()
    verticals = StringProperty()
    all_industries = StringProperty()
    website_url = StringProperty()
    number_of_employees_at_company = StringProperty()
    year_founded = StringProperty()
    
    # Outgoing relationships: (target node class, relationship type).
    partners = RelationshipTo("Partner", 'PARTNERS_WITH')
    country = RelationshipTo("Country", "HQ_IN")
    products = RelationshipTo("Product", "OFFERS")

class Partner(StructuredNode):
    """Graph node for a partner, targeted by ``Company.partners`` (PARTNERS_WITH)."""
    # NOTE(review): `name` is uniquely indexed, so partners with the same name
    # reported by different companies would share one node - confirm intended.
    name = StringProperty(unique_index=True)
    description = StringProperty()
    
class Product(StructuredNode):
    """Graph node for a product, targeted by ``Company.products`` (OFFERS)."""
    # NOTE(review): `name` is uniquely indexed, so identically named products
    # from different companies would share one node - confirm intended.
    name = StringProperty(unique_index=True)
    description = StringProperty()

class Country(StructuredNode):
    """Graph node for a country, targeted by ``Company.country`` (HQ_IN)."""
    # Unique index: one node per distinct country name.
    name = StringProperty(unique_index=True)



In [None]:
def load_json_file(directory, company_name):
    """Load ``<directory>/<company_name>.json`` and return its parsed content.

    The path is resolved through ``glob`` without any wildcard added, so this
    is an exact-name lookup unless the arguments themselves contain glob
    metacharacters. Prints a notice and returns ``None`` when nothing matches.
    """
    pattern = os.path.join(directory, f"{company_name}.json")
    hits = glob.glob(pattern)

    if not hits:
        print(f"No JSON file found for company {company_name}.")
        return None

    # Should several paths match, the first one reported by glob is used.
    with open(hits[0], 'r') as handle:
        return json.load(handle)

def create_nodes_and_relationships(df):
    """Create a Company node per row of ``df`` and attach its extracted products.

    For every row a new ``Company`` node is saved from the row's columns. The
    company's extraction JSON is then looked up in ``extraction_output`` under
    the processed company name, and each entry of its ``product_offering``
    mapping (product name -> description) is saved as a ``Product`` node and
    connected through ``Company.products``.

    Args:
        df: DataFrame providing the columns read below (``company_name``,
            ``description``, ``processed_url``, ...).

    NOTE(review): nodes are created with ``save()`` rather than looked up
    first, so re-running this on the same data presumably conflicts with the
    unique index on ``name`` - confirm before re-running.
    """
    for _, row in df.iterrows():
        company = Company(name=row['company_name'],
                          company_former_name=row['company_former_name'],
                          description=row['description'],
                          financing_status_note=row['financing_status_note'],
                          primary_industry_sector=row['primary_industry_sector'],
                          primary_industry_group=row['primary_industry_group'],
                          primary_industry_code=row['primary_industry_code'],
                          verticals=row['verticals'],
                          all_industries=row['all_industries'],
                          website_url=row['processed_url'],
                          number_of_employees_at_company=row['number_of_employees_at_company'],
                          year_founded=row['year_founded']).save()
        company_name = process_company_name(row['company_name'])

        json_data = load_json_file('extraction_output', company_name)
        if json_data:
            # Process the loaded JSON data
            print(f"Loaded data for {company_name}: {json_data}")

            if 'product_offering' in json_data:
                for product, description in json_data['product_offering'].items():
                    product_node = Product(name=product,
                                           description=description).save()
                    # `products` is the relationship declared on Company;
                    # there is no `product` attribute.
                    company.products.connect(product_node)
    
    

In [None]:
# Populate the graph from the sampled FinTech companies.
create_nodes_and_relationships(sample_df)

Loaded data for estimize: {'company_name': 'Estimize', 'url': 'www.estimize.com', 'product_offering': {'Earnings Estimates Dataset': 'Crowdsources earnings and macroeconomic estimates from over 120,000 contributors. Features include a 70% win rate, 2x deeper estimates, and over 10 years of data.', 'Estimize Platform': 'Collects opinions from a wide range of contributors, maintains high-quality data through advanced algorithms, and offers features like anonymous contribution, give-to-get access, quality control, smart consensus, alerts, screening, and Excel files.'}, 'partners': {'University of Pennsylvania': 'Research into the accuracy of Estimize-covered firms meeting or beating analyst earnings expectations.', 'Temple University': 'Research on improving consensus forecast accuracy through crowdsourcing.', 'University of Kentucky': 'Research on the reduction in consensus bias and increase in consensus accuracy.', 'George Washington University': 'Research on providing a less biased and

In [None]:
# Quick check: does the first entry listed in scraping_output start with 'quantexa'?
os.listdir('scraping_output')[0].startswith('quantexa')

True

In [None]:




# Look up the extraction output of every company in the frame and show what was found.
for company_name in df['company_name']:
    json_data = load_json_file('extraction_output', process_company_name(company_name))

    if not json_data:
        continue

    # Process the loaded JSON data
    print(f"Loaded data for {company_name}: {json_data}")


No JSON file found for company companya.
No JSON file found for company companyb.
No JSON file found for company companyc.


In [None]:
# Spot-check the extraction output stored for a single company.
load_json_file('extraction_output', process_company_name('agro.club'))

{'company_name': 'Agro.Club',
 'url': 'www.agro.club',
 'product_offering': {'Grain Trading': 'Simplified grain trading with a commitment to reliability and trustworthiness.',
  'Contract Types': 'Offers Spot, Cash, and Basis contracts for grain trading with specific terms and payment upon receipt of scale ticket.',
  'Logistics': 'Defines the best way to move grain in collaboration with partners, allowing for own or provided transport.'},
 'partners': {'Farmers': 'Trusted by over 50,000 farmers for grain trading.',
  'Grain Companies': 'Used by thousands of grain companies for efficient trading and logistics.'},
 'logos': ['https://www.agro.club/us/_next/image?url=%2Fus%2F_next%2Fstatic%2Fmedia%2Fus-globe.dd3e22cd.png&w=1080&q=75',
  'https://www.agro.club/us/_next/image?url=%2Fus%2F_next%2Fstatic%2Fmedia%2Fus-handshake.ea302834.png&w=640&q=75']}

In [None]:
import os
import json
import glob
import pandas as pd
from neomodel import (config, StructuredNode, StringProperty, RelationshipTo)

# Point neomodel at the local Neo4j instance; the password is read from the
# NEO4J_PASSWORD environment variable.
neo4j_password = os.getenv('NEO4J_PASSWORD')
config.DATABASE_URL = f"bolt://neo4j:{neo4j_password}@localhost:7687"

# class Company(StructuredNode):
#     name = StringProperty(unique_index=True)
#     company_former_name = StringProperty()
#     description = StringProperty()
#     financing_status_note = StringProperty()
#     primary_industry_sector = StringProperty()
#     primary_industry_group = StringProperty()
#     primary_industry_code = StringProperty()
#     verticals = StringProperty()
#     all_industries = StringProperty()
#     website_url = StringProperty()
#     number_of_employees_at_company = StringProperty()
#     year_founded = StringProperty()
    
#     partners = RelationshipTo("Partner", 'PARTNERS_WITH')
#     country = RelationshipTo("Country", "HQ_IN")
#     products = RelationshipTo("Product", "OFFERS")

# class Partner(StructuredNode):
#     name = StringProperty(unique_index=True)
#     description = StringProperty()
    
# class Product(StructuredNode):
#     name = StringProperty(unique_index=True)
#     description = StringProperty()

# class Country(StructuredNode):
#     name = StringProperty(unique_index=True)

def load_json_file(directory, company_name):
    """Load the extraction JSON stored for ``company_name`` in ``directory``.

    Lookup order:
      1. the exact file ``<company_name>.json``;
      2. otherwise any ``<company_name>*.json`` file (a name followed by a
         suffix), taking the alphabetically first match so the choice is
         deterministic.

    Args:
        directory: Folder that holds the JSON files.
        company_name: File-name stem to look up. It is treated literally, so
            glob metacharacters in it are not expanded.

    Returns:
        The parsed JSON content, or ``None`` (after printing a notice) when no
        file matches.
    """
    exact_path = os.path.join(directory, f"{company_name}.json")
    if os.path.isfile(exact_path):
        # An exact hit must win over longer names that merely share the
        # prefix (e.g. 'side' vs 'sidekick').
        matching_files = [exact_path]
    else:
        search_pattern = os.path.join(directory, f"{glob.escape(company_name)}*.json")
        # glob returns matches in arbitrary order; sort for a stable pick.
        matching_files = sorted(glob.glob(search_pattern))

    if not matching_files:
        print(f"No JSON file found for company {company_name}.")
        return None

    with open(matching_files[0], 'r') as file:
        return json.load(file)

def create_nodes_and_relationships(df):
    """Upsert a Company node per row of ``df`` and link products, partners and HQ country.

    For each row:
      * a ``Company`` is fetched or created via ``get_or_create`` from the
        row's columns; cells that are missing (NaN/None) are left out so they
        are not written into the string properties;
      * the company's extraction JSON is looked up in ``extraction_output``
        under the processed company name; its ``product_offering`` and
        ``partners`` mappings (name -> description) become ``Product`` /
        ``Partner`` nodes connected to the company;
      * when the row has a non-missing ``hq_country_territory``, the matching
        ``Country`` node is connected. This depends only on the row, so it is
        done whether or not an extraction JSON exists.

    Relationships are only added when not already connected, so the function
    can be re-run on the same data.

    Args:
        df: DataFrame providing the columns read below.
    """
    # Company property name -> DataFrame column it is read from.
    optional_company_columns = {
        'company_former_name': 'company_former_name',
        'description': 'description',
        'financing_status_note': 'financing_status_note',
        'primary_industry_sector': 'primary_industry_sector',
        'primary_industry_group': 'primary_industry_group',
        'primary_industry_code': 'primary_industry_code',
        'verticals': 'verticals',
        'all_industries': 'all_industries',
        'website_url': 'processed_url',
        'number_of_employees_at_company': 'number_of_employees_at_company',
        'year_founded': 'year_founded',
    }

    for _, row in df.iterrows():
        company_props = {'name': row['company_name']}
        for prop, column in optional_company_columns.items():
            value = row[column]
            # pandas marks empty cells with NaN (a float); passing that to a
            # StringProperty would store a placeholder string, not a real value.
            if pd.notna(value):
                company_props[prop] = value
        company = Company.get_or_create(company_props)[0]

        company_name = process_company_name(row['company_name'])
        json_data = load_json_file('extraction_output', company_name)

        if json_data:
            # Process the loaded JSON data
            print(f"Loaded data for {company_name}: {json_data}")

            if 'product_offering' in json_data:
                for product_name, product_description in json_data['product_offering'].items():
                    product_node = Product.get_or_create({
                        'name': product_name,
                        'description': product_description
                    })[0]
                    if not company.products.is_connected(product_node):
                        company.products.connect(product_node)

            if 'partners' in json_data:
                for partner_name, partner_description in json_data['partners'].items():
                    partner_node = Partner.get_or_create({
                        'name': partner_name,
                        'description': partner_description
                    })[0]
                    if not company.partners.is_connected(partner_node):
                        company.partners.connect(partner_node)

        # NaN is truthy, so missing values have to be ruled out explicitly
        # before the emptiness check.
        country_name = row.get('hq_country_territory')
        if pd.notna(country_name) and country_name:
            country_node = Country.get_or_create({'name': country_name})[0]
            if not company.country.is_connected(country_node):
                company.country.connect(country_node)


# Create nodes and relationships for the sampled FinTech companies.
create_nodes_and_relationships(sample_df)


Loaded data for estimize: {'company_name': 'Estimize', 'url': 'www.estimize.com', 'product_offering': {'Earnings Estimates Dataset': 'Crowdsources earnings and macroeconomic estimates from over 120,000 contributors. Features include a 70% win rate, 2x deeper estimates, and over 10 years of data.', 'Estimize Platform': 'Collects opinions from a wide range of contributors, maintains high-quality data through advanced algorithms, and offers features like anonymous contribution, give-to-get access, quality control, smart consensus, alerts, screening, and Excel files.'}, 'partners': {'University of Pennsylvania': 'Research into the accuracy of Estimize-covered firms meeting or beating analyst earnings expectations.', 'Temple University': 'Research on improving consensus forecast accuracy through crowdsourcing.', 'University of Kentucky': 'Research on the reduction in consensus bias and increase in consensus accuracy.', 'George Washington University': 'Research on providing a less biased and

In [None]:
# Inspect the HQ country values used for the Country nodes.
sample_df['hq_country_territory']

0      United States
2      United States
3      United States
8      United States
10     United States
12     United States
14     United States
15     United States
16     United States
17     United States
22     United States
23     United States
24     United States
25     United States
26     United States
28     United States
31     United States
32     United States
33    United Kingdom
35     United States
40     United States
Name: hq_country_territory, dtype: object

## Tavily Search

In [None]:
from tavily import TavilyClient

# Tavily search client; the API key is read from the TAVILY_KEY environment variable
# (None is passed through when the variable is unset).
client = TavilyClient(api_key=os.environ.get('TAVILY_KEY'))

In [None]:
# Ask Tavily (advanced depth) for the company's official website and display the raw response.
# NOTE(review): the recorded response below has no answer and its first hit is an unrelated
# press release, so this query does not reliably surface the official site.
client.search(query="What is official website of company Approveme.com?", search_depth="advanced")

{'query': 'What is official website of company Approveme.com?',
 'follow_up_questions': None,
 'answer': None,
 'images': [],
 'results': [{'url': 'https://markets.businessinsider.com/news/stocks/say-goodbye-to-fake-reviews-revvy-is-focused-on-keeping-it-real-1033182619',
   'published date': 'Thu, 21 Mar 2024 05:22:35 GMT',
   'title': 'Say Goodbye to Fake Reviews: Revvy is Focused on Keeping it Real. - Markets Insider',
   'content': "It’s a community where users can connect over shared interests and help each other discover the best products on the market, whether it's the latest tech, a beauty must-have, or a game-changing tool that you should absolutely know about.  +1 650 557 3889Website: https://www.gorevvy.com Video URL: https://youtu.be/d4o0267V08c?feature=shared Release ID: 89124862 If you come across any problems, discrepancies, or concerns related to the content contained within this press release that necessitate action or if a press release requires takedown, we strongly 

In [27]:
from firecrawl import FirecrawlApp

# Firecrawl client authenticated with the key held in the FIRECRAWL_KEY environment variable.
app = FirecrawlApp(api_key=os.environ.get("FIRECRAWL_KEY"))

# Search the web for the company's official site; the raw response is kept in `result`
# so the following cell can display it.
result = app.search(query="What is official website of company Clarks?")

In [28]:
# Display the Firecrawl search response stored in `result` (page metadata per hit).
result

  'metadata': {'title': 'Clarks Shoes & Footwear | Sandals, Shoes, Boots & Accessories',
   'description': 'Discover the latest shoe trends & footwear styles at Clarks. Explore our range of fashionable shoes, trendy sandals, casual trainers & iconic boots.',
   'robots': 'index, follow',
   'ogLocaleAlternate': [],
   'sourceURL': 'http://clarks.com/',
   'pageStatusCode': 200}},
  'metadata': {'title': 'Just a moment...',
   'robots': 'noindex,nofollow',
   'ogLocaleAlternate': [],
   'sourceURL': 'https://www.clarks.com/en-us/about-us',
   'pageStatusCode': 403,
   'pageError': 'Forbidden'}},
  'metadata': {'title': 'Contact Clarks Customer Service - Clarks® Shoes Official Site',
   'description': 'Need Help? Contact Clarks Customer Service team on weekdays from 9:00am to 6:00pm closed Saturday and Sunday',
   'robots': 'index, follow',
   'ogLocaleAlternate': [],
   'sourceURL': 'https://www.clarks.com/en-us/contact-us',
   'pageStatusCode': 200}},
  'metadata': {'title': 'Order and

In [None]:
# Display the current value of `result`.
# NOTE(review): the recorded output is for Pendo, which no visible cell queries — presumably
# the search cell was edited and re-run; a fresh Restart & Run All will not reproduce it.
result

  'metadata': {'title': 'Pendo.io - Product Experience and Digital Adoption Solutions',
   'description': 'Pendo’s product experience and digital adoption solutions help companies become product led and deliver digital experiences users love.',
   'robots': 'index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1',
   'ogTitle': 'Pendo.io - Product Experience and Digital Adoption Solutions',
   'ogDescription': 'Pendo’s product experience and digital adoption solutions help companies become product led and deliver digital experiences users love.',
   'ogUrl': 'https://www.pendo.io/',
   'ogImage': 'https://www.pendo.io/wp-content/uploads/2024/02/Pendo_HomeBtest_Users_SocialCard_1200x628.png',
   'ogLocale': 'en_US',
   'ogLocaleAlternate': [],
   'ogSiteName': 'Pendo.io',
   'modifiedTime': '2024-05-14T14:13:10+00:00',
   'sourceURL': 'https://www.pendo.io/',
   'pageStatusCode': 200}},
  'metadata': {'title': 'About us | Pendo.io',
   'description': "We're on a mi

In [None]:
# Display the current value of `result`.
# NOTE(review): the recorded output is for Docusign, which no visible cell queries — presumably
# left over from an edited search cell; a fresh Restart & Run All will not reproduce it.
result

  'metadata': {'title': 'Docusign | #1 in Electronic Signature and Intelligent Agreement Management',
   'description': 'Create, commit to, and manage your agreements all in one platform with Docusign IAM. Electronically sign for free.',
   'ogTitle': 'Docusign | #1 in Electronic Signature and Intelligent Agreement Management',
   'ogDescription': 'Create, commit to, and manage your agreements all in one platform with Docusign IAM. Electronically sign for free.',
   'ogImage': 'https://images.ctfassets.net/0jnmtsdzg6p5/4GSPcvcpIZpsXsZ7F0EXTW/48109b007c69fc72233c35234157ad0d/OG_Image.png',
   'ogLocaleAlternate': [],
   'sourceURL': 'https://www.docusign.com/',
   'pageStatusCode': 200}},
  'metadata': {'title': 'Docusign - Upload & Sign Docs - Apps on Google Play',
   'description': 'Fill, sign & send docs, forms, agreements & contracts via electronic signature.',
   'robots': 'NOODP',
   'ogTitle': 'Docusign - Upload & Sign Docs - Apps on Google Play',
   'ogDescription': 'Fill, sign 

In [4]:
import nest_asyncio
from scrapegraphai.graphs import SmartScraperGraph
from playwright.async_api import async_playwright

# OpenAI credential shared by the scraping graphs, read from the OPENAI_KEY environment variable.
OPENAI_API_KEY = os.environ.get('OPENAI_KEY')

# LLM settings handed to the graph.
graph_config = dict(
    llm=dict(
        api_key=OPENAI_API_KEY,
        model="gpt-3.5-turbo",
    ),
)

# Scraper graph for one source URL, driven by a single free-form prompt.
smart_scraper_graph = SmartScraperGraph(
    prompt="Return me everything that you can scrape from this webpage",
    source="https://www.chargebee.com",
    config=graph_config,
)

# Allow nested event loops; presumably needed because the notebook kernel already runs one.
nest_asyncio.apply()

# Run the graph and print the extracted structure.
result = smart_scraper_graph.run()
print(result)

{'Title': 'Chargebee: SaaS for Effective Revenue Growth Management', 'Body': {'Login Demo Select Language Open main menu': None, 'Billing': {'Overview': {'Manage subscriptions, billing, and invoicing at scale.': None}, 'Features': {'RECURRING BILLING': {'Overview': 'Usage-based billing, Custom Quotes, Proration, Billing APIs, Pricing Models, Account Hierarchy'}}}, 'Receivables': {'Overview': {'Recover failed payments and get paid faster.': None}}, 'RevRec': {'Overview': {'Automate GAAP compliant revenue recognition.': None}}, 'Retention': {'Overview': {'Reduce churn and grow customer lifetime value.': None}}, 'Platform': {'Integrations': {'Payment Gateways': None}}, 'Resources': {'Learning Hub': None, 'Blog': None, 'Webinars': None, 'Podcast': None, 'Guides': None, 'Glossaries': None}, 'Documentation': {'Product Documentation': None, 'API Documentation': None}, 'Support': {'Services & Implementation': None, 'Help & Support': None}, 'Company': {'About us': None, 'Contact us': None, 'Car

In [5]:
# Display `result` with the notebook's pretty-printed repr (easier to read than the print() above).
result

{'Title': 'Chargebee: SaaS for Effective Revenue Growth Management',
 'Body': {'Login Demo Select Language Open main menu': None,
  'Billing': {'Overview': {'Manage subscriptions, billing, and invoicing at scale.': None},
   'Features': {'RECURRING BILLING': {'Overview': 'Usage-based billing, Custom Quotes, Proration, Billing APIs, Pricing Models, Account Hierarchy'}}},
  'Receivables': {'Overview': {'Recover failed payments and get paid faster.': None}},
  'RevRec': {'Overview': {'Automate GAAP compliant revenue recognition.': None}},
  'Retention': {'Overview': {'Reduce churn and grow customer lifetime value.': None}},
  'Platform': {'Integrations': {'Payment Gateways': None}},
  'Resources': {'Learning Hub': None,
   'Blog': None,
   'Webinars': None,
   'Podcast': None,
   'Guides': None,
   'Glossaries': None},
  'Documentation': {'Product Documentation': None, 'API Documentation': None},
  'Support': {'Services & Implementation': None, 'Help & Support': None},
  'Company': {'Abou

In [11]:
import asyncio
from scrapegraphai.docloaders.chromium import ChromiumLoader

async def scrape_content(url):
    """Load ``url`` with ``ChromiumLoader`` and return the first document it yields.

    Args:
        url: Address of the page to load.

    Returns:
        The first document produced by ``alazy_load()``, or ``None`` when the
        loader yields nothing.
    """
    page_loader = ChromiumLoader(urls=[url])
    documents = page_loader.alazy_load()
    async for first_document in documents:
        # Only the first yielded document is wanted, so stop iterating right away.
        return first_document
    return None


# Scrape the Chargebee home page and print the returned document.
# Top-level `await` works only because IPython/Jupyter runs cells inside an event loop;
# in a plain script this call would need asyncio.run(...).
# NOTE(review): the global `url` is reassigned to the Google API endpoint in a later cell,
# so this cell must be re-run before `url` can be trusted to hold the page address again.
url = "https://www.chargebee.com"
content = await scrape_content(url)
print(content)






In [12]:
# Display the object returned by scrape_content.
content



In [13]:
# Check what kind of object the loader produced.
type(content)

langchain_core.documents.base.Document

In [17]:
# Preview only the start of the scraped HTML: the full markup is one huge string that
# floods the cell output and bloats the saved notebook.
print(content.page_content[:2000])

<!DOCTYPE html><html lang="en-US" class="  " style=""><head>
    <meta data-n-head="ssr" http-equiv="Content-Type" content="text/html; charset=utf-8"><meta data-n-head="ssr" name="theme-color" content="#012A38"><meta data-n-head="ssr" name="google-site-verification" content="F53LIq9XK0wpbOxoXVy_etqAoHNIuhvC2S8wt46cwcE"><meta data-n-head="ssr" property="og:site_name" content="Chargebee"><meta data-n-head="ssr" property="og:type" content="website"><meta data-n-head="ssr" name="twitter:account_id" content="321192908"><meta data-n-head="ssr" name="twitter:card" content="summary"><meta data-n-head="ssr" name="twitter:site" content="@Chargebee"><meta data-n-head="ssr" name="twitter:creator" content="@Chargebee"><meta data-n-head="ssr" name="twitter:domain" content="https://www.chargebee.com"><meta data-n-head="ssr" name="msvalidate.01" content="2BDFBE565908FC3899BBA4BD498C57C2"><meta data-n-head="ssr" name="facebook-domain-verification" content="yhgkto5dv1ri6zf2l02alev36wa58j"><meta data-n-h

In [26]:
from scrapegraphai.graphs import SearchGraph

# Graph settings: OpenAI model plus `max_results` set to 1.
graph_config = dict(
    llm=dict(
        api_key=OPENAI_API_KEY,
        model="gpt-3.5-turbo",
    ),
    max_results=1,
)

# Graph that answers the prompt from web search results rather than from a fixed source URL.
# NOTE(review): the recorded run returned the Japanese regional site (clarks.co.jp), so treat
# the answer as unverified.
search_graph = SearchGraph(
    prompt="What is official website of company Clarks?",
    config=graph_config,
)

# Execute the graph and print its answer.
result = search_graph.run()
print(result)

{'answer': 'https://www.clarks.co.jp/'}


In [23]:
from scrapegraphai.graphs import SmartScraperMultiGraph

# LLM settings for the graph.
graph_config = dict(
    llm=dict(
        api_key=OPENAI_API_KEY,
        model="gpt-3.5-turbo",
    ),
)

# Create the SmartScraperMultiGraph instance: one prompt applied to a list of source URLs.
# NOTE(review): the variable is still called `search_graph` although this is not a SearchGraph.
search_graph = SmartScraperMultiGraph(
    prompt="""
            - What service or product does the company provide?
            - What features does the product or service have?
            """,
    source=["https://www.chargebee.com"],
    config=graph_config,
)

# Run the graph and print the answers keyed by question.
result = search_graph.run()
print(result)

[2024-07-08 23:28:32 - asyncio:1744 - ERROR] Task exception was never retrieved
future: <Task finished name='Task-73' coro=<tqdm_asyncio.gather.<locals>.wrap_awaitable() done, defined at /Users/chenkangan/Documents/UCL DSML/Thesis/project/ucl_project/lib/python3.10/site-packages/tqdm/asyncio.py:75> exception=ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: .")>
Traceback (most recent call last):
  File "/Users/chenkangan/miniforge3/lib/python3.10/asyncio/tasks.py", line 234, in __step
    result = coro.throw(exc)
  File "/Users/chenkangan/Documents/UCL DSML/Thesis/project/ucl_project/lib/python3.10/site-packages/tqdm/asyncio.py", line 76, in wrap_awaitable
    return i, await f
  File "/Users/chenkangan/Documents/UCL DSML/Thesis/project/ucl_project/lib/python3.10/site-packages/scrapegraphai/nodes/graph_iterator_node.py", line 123, in _async_run
    return await asyncio.to_thread(graph.run)
  File "/Users/c

{'What service or product does the company provide?': 'Chargebee provides a Revenue Growth Management platform for subscription businesses.', 'What features does the product or service have?': {'Recurring Billing': ['Usage-based billing', 'Custom Quotes', 'Proration', 'Billing APIs', 'Pricing Models', 'Account Hierarchy'], 'Subscription Management': ['Product Catalog', 'Feature Provisioning', 'Trial & Self-Serve Management', 'Email Notifications', 'Reporting & Analytics'], 'Payment Management': ['Chargebacks Management', 'Dunning Management', 'Checkout', 'In-app purchases', 'Payment methods']}}


In [24]:
# Display `result` with the notebook's pretty-printed repr (easier to read than the print() above).
result

{'What service or product does the company provide?': 'Chargebee provides a Revenue Growth Management platform for subscription businesses.',
 'What features does the product or service have?': {'Recurring Billing': ['Usage-based billing',
   'Custom Quotes',
   'Proration',
   'Billing APIs',
   'Pricing Models',
   'Account Hierarchy'],
  'Subscription Management': ['Product Catalog',
   'Feature Provisioning',
   'Trial & Self-Serve Management',
   'Email Notifications',
   'Reporting & Analytics'],
  'Payment Management': ['Chargebacks Management',
   'Dunning Management',
   'Checkout',
   'In-app purchases',
   'Payment methods']}}

In [2]:
# Company whose website is looked up; interpolated into the Google search query in the next cell.
company_name = 'Clarks'

In [3]:
import requests

# Credentials for the Google Custom Search JSON API, read from the environment.
SEARCH_API = os.getenv("GOOGLE_SEARCH_KEY")
SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

# Query string built from the company name set in the previous cell.
search_query = f"Website of company {company_name}"
url = "https://www.googleapis.com/customsearch/v1"
params = {
    "q": search_query,
    "key": SEARCH_API,
    "cx": SEARCH_ENGINE_ID
}

# requests applies no timeout by default, so a stalled connection would block the kernel
# indefinitely; bound the wait to 30 seconds.
response = requests.get(url, params=params, timeout=30)

In [7]:
# Parse the Custom Search response and print the URL of the top hit.
result = response.json()

# `.get` treats a missing 'items' key and an empty list alike, so `[0]` can never raise IndexError.
if result.get('items'):
    print(result['items'][0]['link'])
else:
    # Report the failure instead of silently printing nothing; an 'error' entry, when the
    # payload has one, explains what went wrong.
    print(f"No search results returned: {result.get('error', 'response contained no items')}")

http://clarks.com/
