In [1]:
import pandas as pd
import numpy as np
import os
from neo4j import GraphDatabase
from dotenv import load_dotenv
from firecrawl_scraping import *
from utility import *
from llm_extraction import *


### PitchBook Data Preprocessing

In [2]:
# Sanitize property names
def process_column_name(name):
    name = name.lower()
    return name.replace(' ', '_').replace('#', 'number').replace('/', '_').replace(';', '_').replace('-', '_').replace(',', '')
    
def preprocess_url(url):
    # Check for NaN (missing URL)
    if isinstance(url, float):
        return None

    # Ensure the URL has 'http://' or 'https://' at the start
    if not url.startswith('www.'):
        url = 'www.' + url
    
    return url

In [3]:
df = pd.read_excel('PitchBook_Company_General_Information_2024_06_28_12_35_25.xlsx', header=7)
df = df[~df['Company Name'].isna()]

column_names = list(df.columns)
print(column_names)
df.head()

['Company ID', 'Company Name', 'Company Former Name', 'Description', 'Financing Status Note', 'Primary Industry Sector', 'Primary Industry Group', 'Primary Industry Code', 'Verticals', 'All Industries', 'Website', '# of Employees at Company', 'Former Listing', 'Company Business Status', 'Company Financing Status', 'Year Founded', 'Parent Company', 'HQ Address Line 1', 'HQ Address Line 2', 'HQ City', 'HQ State/Province', 'HQ Post Code', 'HQ Country/Territory', 'HQ Phone', 'HQ Fax', 'HQ Email', 'Primary Contact', 'Primary Contact Title', 'Secondary Contact', 'PitchBook Link', 'LinkedIn URL']


Unnamed: 0,Company ID,Company Name,Company Former Name,Description,Financing Status Note,Primary Industry Sector,Primary Industry Group,Primary Industry Code,Verticals,All Industries,...,HQ Post Code,HQ Country/Territory,HQ Phone,HQ Fax,HQ Email,Primary Contact,Primary Contact Title,Secondary Contact,PitchBook Link,LinkedIn URL
0,55185-04,Estimize,,Developer of an open financial estimates platf...,The company was acquired by ExtractAlpha for a...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Other Financial Services;M...,...,10001,United States,+1 (212) 634-9085,,info@estimize.com,Vinesh Jha,Chief Executive Officer,,,
1,167096-35,JetClosing,,Developer of a real estate settlement applicat...,The company is no longer actively in business....,Consumer Products and Services (B2C),Services (Non-Financial),Real Estate Services (B2C),"Mobile, Real Estate Technology, SaaS, TMT",Real Estate Services (B2C)*;Other Commercial S...,...,98154,United States,+1 (866) 538-1000,,,,,,,
2,56288-62,New Constructs,,Operator of an investment research firm intend...,"Solidus, Lucius Burch, Lhoist North America of...",Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Bi...",Financial Software*;Media and Information Serv...,...,37027,United States,+1 (615) 377-0443,,info@newconstructs.com,David Trainer,Co-Founder & Chief Executive Officer,,,
3,59990-50,Justworks,Clockwork,Developer of a human resource management platf...,The company raised $16.84 million of venture f...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, SaaS",Business/Productivity Software*;Human Capital ...,...,10008-7119,United States,+1 (888) 534-1711,,hello@justworks.com,Isaac Oates,Co-Founder & Executive Chairman,,,
4,53739-01,Procore Technologies,,Procore Technologies Inc is a cloud-based cons...,The company raised $634.49 million in its init...,Information Technology,Software,Business/Productivity Software,"Construction Technology, Real Estate Technolog...",Business/Productivity Software*;Media and Info...,...,93013,United States,+1 (866) 477-6267,,info@procore.com,Paul Lyandres,President,Craig Courtemanche Jr.,,


In [None]:
processed_column_names = [process_column_name(i) for i in column_names]
column_mapper = dict(zip(column_names, processed_column_names))
df = df.rename(column_mapper, axis=1)
df['processed_url'] = df['website'].apply(preprocess_url)
df = df.dropna(subset=['processed_url'])
# df['is_accessible'] = df['processed_url'].apply(is_webpage_accessible)
# df.to_csv('PitchBook_Company_General_Information_2024_06_28_12_35_25_processed.csv', index = False)

### Exploratory Analysis

In [22]:
df = pd.read_csv('PitchBook_Company_General_Information_2024_06_28_12_35_25_processed.csv')
print(df.company_business_status.unique())
df = df[~df['company_business_status'].isin(['Out of Business', 'Bankruptcy: Liquidation', 'Bankruptcy: Admin/Reorg'])]
df


['Generating Revenue' 'Out of Business' 'Profitable'
 'Generating Revenue/Not Profitable' 'Startup' 'Stealth'
 'Product In Beta Test' 'Clinical Trials - Phase 1'
 'Bankruptcy: Liquidation' 'Restart' 'Bankruptcy: Admin/Reorg']


Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
0,55185-04,Estimize,,Developer of an open financial estimates platf...,The company was acquired by ExtractAlpha for a...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Other Financial Services;M...,...,+1 (212) 634-9085,,info@estimize.com,Vinesh Jha,Chief Executive Officer,,,,www.estimize.com,True
2,56288-62,New Constructs,,Operator of an investment research firm intend...,"Solidus, Lucius Burch, Lhoist North America of...",Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Bi...",Financial Software*;Media and Information Serv...,...,+1 (615) 377-0443,,info@newconstructs.com,David Trainer,Co-Founder & Chief Executive Officer,,,,www.newconstructs.com,True
3,59990-50,Justworks,Clockwork,Developer of a human resource management platf...,The company raised $16.84 million of venture f...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, SaaS",Business/Productivity Software*;Human Capital ...,...,+1 (888) 534-1711,,hello@justworks.com,Isaac Oates,Co-Founder & Executive Chairman,,,,www.justworks.com,False
4,53739-01,Procore Technologies,,Procore Technologies Inc is a cloud-based cons...,The company raised $634.49 million in its init...,Information Technology,Software,Business/Productivity Software,"Construction Technology, Real Estate Technolog...",Business/Productivity Software*;Media and Info...,...,+1 (866) 477-6267,,info@procore.com,Paul Lyandres,President,Craig Courtemanche Jr.,,,www.procore.com,True
5,58716-37,Driveway (Business/Productivity Software),,Developer of a telematics technology designed ...,The company was acquired by Earnix for an undi...,Information Technology,Software,Business/Productivity Software,"Artificial Intelligence & Machine Learning, Fi...",Business/Productivity Software*;Media and Info...,...,,,,,,,,,www.driveway.ai,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,180356-77,Your Front Desk,,Developer of a virtual property management pla...,The company joined Blue Startups as part of it...,Information Technology,Software,Business/Productivity Software,"Real Estate Technology, SaaS",Business/Productivity Software*;Media and Info...,...,,,info@vrfrontdesk.com,Aaron MacDaniel,Founder & Chief Executive Officer,,,,www.yourfrontdesk.co,True
1051,54093-61,Zanbato,,Developer of an alternative trading system des...,The company raised $20.50 million of Series C ...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Media and Information Serv...,...,+1 (866) 926-2286,,,Knut Sand,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.zanbato.com,True
1053,171336-07,Zenplace,,Developer of a rental management and leasing p...,"The company raised venture funding from Focal,...",Information Technology,Software,Business/Productivity Software,"Artificial Intelligence & Machine Learning, Bi...",Business/Productivity Software*;Real Estate Se...,...,+1 (888) 936-7522,,contactus@zenplace.com,Eric Holly,Chief Executive Officer & President,,,,www.zenplace.com,True
1054,459585-01,Zorba (Business/Productivity Software),,Developer of a property management platform in...,The company raised $2.2 million of venture fun...,Information Technology,Software,Business/Productivity Software,"Real Estate Technology, SaaS",Business/Productivity Software*;Media and Info...,...,+1 (813) 212-6319,,hello@getzorba.com,Or Preiss,Co-Founder & Chief Executive Officer,,,,www.getzorba.com,True


In [23]:
df_fintech = df[df['verticals'].str.contains('FinTech')]

df_fintech_filter = df_fintech[df_fintech['is_accessible'] == True]

df_fintech_filter

Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
0,55185-04,Estimize,,Developer of an open financial estimates platf...,The company was acquired by ExtractAlpha for a...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Other Financial Services;M...,...,+1 (212) 634-9085,,info@estimize.com,Vinesh Jha,Chief Executive Officer,,,,www.estimize.com,True
2,56288-62,New Constructs,,Operator of an investment research firm intend...,"Solidus, Lucius Burch, Lhoist North America of...",Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Bi...",Financial Software*;Media and Information Serv...,...,+1 (615) 377-0443,,info@newconstructs.com,David Trainer,Co-Founder & Chief Executive Officer,,,,www.newconstructs.com,True
8,54118-36,Magnite,"The Rubicon Project, Inc., Rubicon Project, Inc.",Magnite is one of the largest supply-side plat...,The company completed a $540 million of debt r...,Information Technology,Software,Business/Productivity Software,"AdTech, FinTech, SaaS, TMT",Business/Productivity Software*;Media and Info...,...,+1 (212) 243-2769,+1 (212) 414-8748,contact@magnite.com,David Day,Chief Financial Officer,,,,www.magnite.com,True
14,50851-45,Cardlytics,,Cardlytics Inc operates an advertising platfor...,The company raised $70.2 million in its initia...,Business Products and Services (B2B),Commercial Services,Media and Information Services (B2B),"Artificial Intelligence & Machine Learning, Ad...",Media and Information Services (B2B)*;Business...,...,+1 (888) 798-5802,,info@cardlytics.com,Alexis DeSieno,"Chief Financial Officer, Finance & Chief Accou...",,,,www.cardlytics.com,True
16,40672-81,Mindbody,,Developer of business management software inte...,The company received an undisclosed amount of ...,Information Technology,Software,Business/Productivity Software,"FinTech, Marketing Tech, SaaS",Business/Productivity Software*;Media and Info...,...,+1 (877) 755-4279,,,Tom Aveston,Chief Financial Officer,,,,www.mindbodyonline.com,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040,104511-79,XtremeMortgageWorx,,Developer of cloud-based automation marketing ...,An undisclosed investor sold a stake in the co...,Business Products and Services (B2B),Commercial Services,Media and Information Services (B2B),"FinTech, SaaS, TMT",Media and Information Services (B2B)*;Business...,...,+1 (866) 855-5070,+1 (866) 855-5070,,Rhett Broussard,"Founder, Chief Executive Officer & President",,,,www.xtrememortgageworx.com,True
1046,471280-69,YesRef,,Developer of a referee payment portal designed...,"The company raised GBP 50,000 of venture fundi...",Information Technology,Software,Financial Software,"FinTech, Mobile, SaaS, TMT",Financial Software*;Business/Productivity Soft...,...,+44 (0)33 0124 6844,,,Oliver Ballinger,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.yesref.com,True
1048,459859-42,Yottled,,Developer of a client management platform desi...,M25 sold a stake in the company to an undisclo...,Information Technology,Software,Business/Productivity Software,"FinTech, Mobile, SaaS",Business/Productivity Software*;Application So...,...,+1 (831) 228-6756,,hello@yottled.com,,,,,,www.yottled.com,True
1049,435817-09,YouAttest,,Developer of a cloud-based IGA engine designed...,"The company raised $655,000 of venture funding...",Business Products and Services (B2B),Commercial Services,"Accounting, Audit and Tax Services (B2B)","Artificial Intelligence & Machine Learning, Bi...","Accounting, Audit and Tax Services (B2B)*;Netw...",...,+1 (714) 658-0765,,info@youattest.com,Garret Grajek,"Co-Founder, Chief Executive Officer, President...",,,,www.youattest.com,True


In [24]:
df_not_accessible = df_fintech[df_fintech['is_accessible'] != True]
df_not_accessible

Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
3,59990-50,Justworks,Clockwork,Developer of a human resource management platf...,The company raised $16.84 million of venture f...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, SaaS",Business/Productivity Software*;Human Capital ...,...,+1 (888) 534-1711,,hello@justworks.com,Isaac Oates,Co-Founder & Executive Chairman,,,,www.justworks.com,False
5,58716-37,Driveway (Business/Productivity Software),,Developer of a telematics technology designed ...,The company was acquired by Earnix for an undi...,Information Technology,Software,Business/Productivity Software,"Artificial Intelligence & Machine Learning, Fi...",Business/Productivity Software*;Media and Info...,...,,,,,,,,,www.driveway.ai,False
10,55758-97,Gusto,"Switchboard Labs, ZenPayroll","Developer of cloud-based payroll, benefits, an...",The company raised $230 million of Series E ve...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, Industrials, SaaS, TMT",Business/Productivity Software*;Human Capital ...,...,+1 (800) 936-0383,,info@gusto.com,Joshua Reeves,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.gusto.com,False
12,56265-94,Carta,"eShares, Eshares Securities",Developer of an equity management platform des...,"Kima Ventures, Anthemis, NKM Capital, SV Angel...",Information Technology,Software,Financial Software,"FinTech, Mobile, SaaS, TMT",Financial Software*;Media and Information Serv...,...,+1 (650) 669-8381,,info@carta.com,Henry Ward,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.carta.com,False
13,52260-85,YapStone,,Developer of a global payments platform design...,The company reached a definitive agreement to ...,Information Technology,Software,Financial Software,"B2B Payments, FinTech, Mobile Commerce, Mobile...",Financial Software*;Media and Information Serv...,...,+1 (866) 289-5977,,info@yapstone.com,Frank Mastrangelo,Co-Chief Executive Officer & Board Member,,,,www.yapstone.com,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,466148-08,V-Art,,Developer of an automated IP (internet protoco...,The company is on the process of raising $2 mi...,Information Technology,Software,Business/Productivity Software,"Artificial Intelligence & Machine Learning, Bi...",Business/Productivity Software*;Other Financia...,...,+1 (302) 608-0768,,info@v-art.digital,Anastasiia Gliebova,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.v-art.digital,False
1007,234159-58,Valspresso,,Developer of funding assistance software desig...,The company raised an undisclosed amount of an...,Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Cl...",Financial Software*;Media and Information Serv...,...,+1 (877) 694-2209,,info@valspresso.com,Reginald Nosegbe,"Founder, President & Chief Executive Officer",,,,www.valspresso.com,False
1015,233366-23,VeriLedger,,Operator of a cryptocurrency accounting platfo...,The company joined WTIA Startup Program as a p...,Information Technology,Software,Financial Software,"Cryptocurrency/Blockchain, FinTech, SaaS",Financial Software*;Other Financial Services;A...,...,,,contact@veriledger.io,,,,,,www.veriledger.io,False
1041,52852-06,XTRM,XBux,Provider of an API-powered global payment plat...,Artiman Ventures sold its 14.56% stake in the ...,Business Products and Services (B2B),Commercial Services,Media and Information Services (B2B),"FinTech, SaaS, TMT",Media and Information Services (B2B)*;Financia...,...,+1 (866) 367-9289,,support@xtrm.com,Richard Grogan-Crane,Founder & Chief Executive Officer,,,,www.xtrm.com,False


In [25]:
dict(zip(list(df_fintech_filter.company_name), list(df_fintech_filter.processed_url)))

{'Estimize': 'www.estimize.com',
 'New Constructs': 'www.newconstructs.com',
 'Magnite': 'www.magnite.com',
 'Cardlytics': 'www.cardlytics.com',
 'Mindbody': 'www.mindbodyonline.com',
 'Plaid': 'www.plaid.com',
 'Human Interest': 'www.humaninterest.com',
 'Agro.Club': 'www.agro.club',
 'ChargeBee': 'www.chargebee.com',
 'WorkMarket': 'www.workmarket.com',
 'Xactly': 'www.xactlycorp.com',
 'Catalis (Alpharetta)': 'www.catalisgov.com',
 'FloQast': 'www.floqast.com',
 'Zenefits': 'www.zenefits.com',
 'Pinnacle Realty Advisors': 'www.pinnaclera.com',
 'Quantexa': 'www.quantexa.com',
 'Ooyala': 'www.ooyala.com',
 'Tradeshift': 'www.tradeshift.com',
 'Landbay': 'www.landbay.co.uk',
 'Acumatica': 'www.acumatica.com',
 'Ocrolus': 'www.ocrolus.com',
 'Softheon': 'www.softheon.com',
 'Side': 'www.side.com',
 'Karbon': 'www.karbonhq.com',
 'GoSite': 'www.gosite.com',
 'ShowingTime': 'www.showingtime.com',
 'Tango Card': 'www.tangocard.com',
 'Beacon (London)': 'www.beacon.com',
 'Axial': 'www.axi

In [26]:
dict(zip(list(df_not_accessible.company_name), list(df_not_accessible.processed_url)))

{'Justworks': 'www.justworks.com',
 'Driveway (Business/Productivity Software)': 'www.driveway.ai',
 'Gusto': 'www.gusto.com',
 'Carta': 'www.carta.com',
 'YapStone': 'www.yapstone.com',
 'PitchBook Data': 'www.pitchbook.com',
 'Shyft': 'www.shyftmoving.com',
 'SuperRewards': 'www.superrewards.com',
 'Finaloop': 'www.finaloop.com',
 'Stem (Financial Software)': 'www.stem.is',
 'idaciti': 'www.hello.idaciti.com',
 'FixtHub': 'www.fixthub.com',
 'DiliVer': 'www.diliver.com',
 '6fusion': 'www.6fusion.com',
 'Adapt Ready': 'www.adaptready.com',
 'AddZest': 'www.addzest.ai',
 'AuthorityData': 'www.authoritydata.com',
 'BeneStream': 'www.benestream.com',
 'Betterfin': 'www.betterfin.com',
 'Capdesk': 'www.capdesk.com',
 'CareerGig': 'www.careergig.com',
 'CodexDF': 'www.codexdf.com',
 'CodiPark': 'www.codipark.us',
 'COG Network': 'www.cog.network',
 'Concert (Financial Software)': 'www.concertfinance.com',
 'CrowdTwist': 'www.crowdtwist.com',
 'CxO Analytics': 'www.cxo-analytics.com',
 'Cyo

In [27]:
sample = df_fintech_filter.iloc[0:30]
sample

Unnamed: 0,company_id,company_name,company_former_name,description,financing_status_note,primary_industry_sector,primary_industry_group,primary_industry_code,verticals,all_industries,...,hq_phone,hq_fax,hq_email,primary_contact,primary_contact_title,secondary_contact,pitchbook_link,linkedin_url,processed_url,is_accessible
0,55185-04,Estimize,,Developer of an open financial estimates platf...,The company was acquired by ExtractAlpha for a...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Other Financial Services;M...,...,+1 (212) 634-9085,,info@estimize.com,Vinesh Jha,Chief Executive Officer,,,,www.estimize.com,True
2,56288-62,New Constructs,,Operator of an investment research firm intend...,"Solidus, Lucius Burch, Lhoist North America of...",Information Technology,Software,Financial Software,"Artificial Intelligence & Machine Learning, Bi...",Financial Software*;Media and Information Serv...,...,+1 (615) 377-0443,,info@newconstructs.com,David Trainer,Co-Founder & Chief Executive Officer,,,,www.newconstructs.com,True
8,54118-36,Magnite,"The Rubicon Project, Inc., Rubicon Project, Inc.",Magnite is one of the largest supply-side plat...,The company completed a $540 million of debt r...,Information Technology,Software,Business/Productivity Software,"AdTech, FinTech, SaaS, TMT",Business/Productivity Software*;Media and Info...,...,+1 (212) 243-2769,+1 (212) 414-8748,contact@magnite.com,David Day,Chief Financial Officer,,,,www.magnite.com,True
14,50851-45,Cardlytics,,Cardlytics Inc operates an advertising platfor...,The company raised $70.2 million in its initia...,Business Products and Services (B2B),Commercial Services,Media and Information Services (B2B),"Artificial Intelligence & Machine Learning, Ad...",Media and Information Services (B2B)*;Business...,...,+1 (888) 798-5802,,info@cardlytics.com,Alexis DeSieno,"Chief Financial Officer, Finance & Chief Accou...",,,,www.cardlytics.com,True
16,40672-81,Mindbody,,Developer of business management software inte...,The company received an undisclosed amount of ...,Information Technology,Software,Business/Productivity Software,"FinTech, Marketing Tech, SaaS",Business/Productivity Software*;Media and Info...,...,+1 (877) 755-4279,,,Tom Aveston,Chief Financial Officer,,,,www.mindbodyonline.com,True
17,59128-48,Plaid,Plaid Technologies,Developer of a data network programming platfo...,The company raised an undisclosed amount of ve...,Information Technology,Software,Financial Software,"FinTech, SaaS, TMT",Financial Software*;Media and Information Serv...,...,+1 (415) 799-1354,,info@plaid.com,Zachary Perret,"Co-Founder, President & Chief Executive Officer",William Hockey,,,www.plaid.com,True
18,120927-25,Human Interest,Captain401,Operator of an administration and advising pla...,The company raised an estimated $71.40 million...,Financial Services,Other Financial Services,Other Financial Services,"FinTech, SaaS",Other Financial Services*;Media and Informatio...,...,+1 (855) 622-7824,,contact@humaninterest.com,Jeff Schneble Ph.D,Chief Executive Officer & Board Member,,,,www.humaninterest.com,True
22,267018-58,Agro.Club,,Developer of a full-stack B2B marketplace desi...,The company is reportedly seeking Series B ven...,Information Technology,Software,Business/Productivity Software,"AgTech, FinTech, Mobile, SaaS, Supply Chain Tech",Business/Productivity Software*;Other Agricult...,...,,,hello@agro.club,Egor Kirin,Founder & Chief Executive Officer,,,,www.agro.club,True
23,55782-55,ChargeBee,BubblePath,Developer of a revenue growth management platf...,The company raised $250 million of Series H ve...,Information Technology,Software,Financial Software,"FinTech, SaaS",Financial Software*;Media and Information Serv...,...,+1 (877) 900-1818,,info@chargebee.com,Krishnamoorthy Subramanian,"Co-Founder, Chief Executive Officer & Board Me...",,,,www.chargebee.com,True
24,43071-49,WorkMarket,,Developer of a SaaS labor automation platform ...,The company was acquired by Automatic Data Pro...,Information Technology,Software,Business/Productivity Software,"FinTech, HR Tech, SaaS, TMT",Business/Productivity Software*;Automation/Wor...,...,,,,Alejandro CEO,Chief Executive Officer,,,,www.workmarket.com,True


In [15]:
sample_dict = dict(zip(list(sample.company_name), list(sample.website)))
sample_dict

{'Estimize': 'www.estimize.com',
 'New Constructs': 'www.newconstructs.com',
 'Magnite': 'www.magnite.com',
 'Cardlytics': 'www.cardlytics.com',
 'Mindbody': 'www.mindbodyonline.com',
 'Plaid': 'www.plaid.com',
 'Human Interest': 'www.humaninterest.com',
 'Agro.Club': 'www.agro.club',
 'ChargeBee': 'www.chargebee.com',
 'WorkMarket': 'www.workmarket.com',
 'Xactly': 'www.xactlycorp.com',
 'Catalis (Alpharetta)': 'www.catalisgov.com',
 'CoinFlip': 'www.coinflip.network',
 'FloQast': 'www.floqast.com',
 'Zenefits': 'www.zenefits.com',
 'Pinnacle Realty Advisors': 'www.pinnaclera.com',
 'Quantexa': 'www.quantexa.com',
 'Ooyala': 'www.ooyala.com',
 'Tradeshift': 'www.tradeshift.com',
 'Landbay': 'www.landbay.co.uk',
 'Acumatica': 'www.acumatica.com',
 'Ocrolus': 'www.ocrolus.com',
 'Softheon': 'www.softheon.com',
 'Side': 'www.side.com',
 'Karbon': 'www.karbonhq.com',
 'GoSite': 'www.gosite.com',
 'ShowingTime': 'www.showingtime.com',
 'Tango Card': 'www.tangocard.com',
 'Beacon (London)':

In [9]:
for key, value in sample_dict.items():
    filename = process_company_name(key)

    LLM_extraction_agent(filename = filename, 
                        url = sample_dict[key])

https://www.xactlycorp.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/xactly_20240702.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/xactly.json
https://www.catalisgov.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/catalis_alpharetta_20240702.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/catalis_alpharetta.json
https://www.coinflip.network




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/coinflip_20240702.md
3: Clean scraped contents
4: Extract information using LLM
The MD file is empty
5: Save as an empty JSON file
Empty JSON file created at extraction_output/coinflip.json
https://www.floqast.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/floqast_20240702.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/floqast.json
https://www.zenefits.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/zenefits_20240702.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/zenefits.json
https://www.pinnaclera.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/pinnacle_realty_advisors_20240702.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/pinnacle_realty_advisors.json
https://www.quantexa.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/quantexa_20240702.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/quantexa.json
https://www.ooyala.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/ooyala_20240702.md
3: Clean scraped contents
4: Extract information using LLM
The MD file is empty
5: Save as an empty JSON file
Empty JSON file created at extraction_output/ooyala.json
https://www.tradeshift.com




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/tradeshift_20240702.md
3: Clean scraped contents
4: Extract information using LLM
5: Save extracted information as a JSON file
Output saved to extraction_output/tradeshift.json
https://www.landbay.co.uk




1. Scrape URL with Firecrawl
2: Save scraped contents as a MD file
Raw data saved to scraping_output/landbay_20240702.md
3: Clean scraped contents
4: Extract information using LLM
The MD file is empty
5: Save as an empty JSON file
Empty JSON file created at extraction_output/landbay.json


In [17]:
import json
import pandas as pd
from neo4j import GraphDatabase

# Escape single quotes for Cypher queries
def escape_quotes_for_cypher(value):
    return value.replace("'", "\\'")

# Sanitize property names
def sanitize_property_name(name):
    return name.replace(' ', '_').replace('#', 'number_of').replace('/', '_').replace(';', '_').replace('-', '_').replace(',', '')

# Generate Cypher CREATE statements from DataFrame
def df_to_cypher_create_nodes(df, node_label_column, attribute_columns):
    """
    Generates Cypher CREATE statements for nodes from a DataFrame.

    Args:
    df (pd.DataFrame): DataFrame containing the data.
    node_label_column (str): Column name for node labels.
    attribute_columns (list): List of column names for node attributes.

    Returns:
    list: List of Cypher CREATE statements.
    """
    cypher_statements = []
    
    for _, row in df.iterrows():
        properties = []
        node_label = 'Company'
        properties.append(f"name: '{escape_quotes_for_cypher(row[node_label_column])}'")  # Ensure 'name' property is set

        for col in attribute_columns:
            value = row[col]
            if isinstance(value, (dict, list)):
                json_value = json.dumps(value)
                escaped_value = escape_quotes_for_cypher(json_value)
                value = f"'{escaped_value}'"
            elif isinstance(value, bool):
                value = str(value).lower()
            elif isinstance(value, str):
                escaped_value = escape_quotes_for_cypher(value)
                value = f"'{escaped_value}'"
            else:
                value = str(value)
            properties.append(f"{col}: {value}")

        properties_str = ", ".join(properties)
        create_statement = f"CREATE (:{sanitize_property_name(node_label)} {{{properties_str}}})"
        cypher_statements.append(create_statement)
    
    return cypher_statements


In [18]:
cypher_statements = df_to_cypher_create_nodes(sample, 'company_name', ['company_name', 'company_former_name', 'description',
       'financing_status_note', 'primary_industry_sector',
       'primary_industry_group', 'primary_industry_code', 'verticals',
       'all_industries', 'processed_url', 'number_of_employees_at_company'])

print(cypher_statements)

["CREATE (:Company {name: 'Estimize', company_name: 'Estimize', company_former_name: nan, description: 'Developer of an open financial estimates platform designed to provide an accurate, more timely and more representative view of expectations. The company\\'s platform collects forward-looking financial estimates from independent, buy-side, and sell-side analysts, along with those of private investors and academics as well as offers a view of market expectations on stocks and economic indicators, enabling investors to analyze the market before investing.', financing_status_note: 'The company was acquired by ExtractAlpha for an undisclosed amount on May 11, 2021.', primary_industry_sector: 'Information Technology', primary_industry_group: 'Software', primary_industry_code: 'Financial Software', verticals: 'FinTech, SaaS, TMT', all_industries: 'Financial Software*;Other Financial Services;Media and Information Services (B2B)', processed_url: 'www.estimize.com', number_of_employees_at_com

In [19]:
# Upload nodes to Neo4j
def upload_nodes_to_neo4j(tx, statements):
    for statement in statements:
        tx.run(statement)

from neo4j import GraphDatabase

# Establish a connection to the Neo4j database
uri = "bolt://localhost:7687"
username = "neo4j"
password = os.getenv('NEO4J_PASSWORD')

driver = GraphDatabase.driver(uri, auth=(username, password))

with driver.session() as session:
        session.write_transaction(upload_nodes_to_neo4j, cypher_statements)
driver.close()



  session.write_transaction(upload_nodes_to_neo4j, cypher_statements)


In [13]:
# Connect to a Neo4j instance
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser


graph = Neo4jGraph(
    url="bolt://localhost:7687",
    username="neo4j",
    password=os.getenv('NEO4J_PASSWORD') 
)

result = graph.query("""
MATCH (n) 
RETURN n
""")

print(result)

ValueError: Could not use APOC procedures. Please ensure the APOC plugin is installed in Neo4j and that 'apoc.meta.data()' is allowed in Neo4j configuration 

In [42]:



llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                 temperature = 0, 
                 model_name = "gpt-4o")
llm_transformer = LLMGraphTransformer(llm)

text = """
"Gusto's platform offers features including payroll management, benefits management, team management, time tracking, health insurance, dental insurance, and vision insurance management, enabling small businesses to take care of their team's requirements."
"""

# Create a template to help guide the LLM towards the type of graph we want to build
template = ChatPromptTemplate.from_messages(
    [
        (
            "system", 
            """
            ## 1. Overview
            You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
            Given a parapraph, you need to extract the company and its services. The relationship between company and its service is OFFER
            """
        )
    ]
)

documents = [Document(page_content=text)]


llm_transformer_props = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=["Company", "Product", "Service"],
    allowed_relationships=["OFFER"],
    # node_properties=["born_year"],
    # prompt = template
)
graph_documents_props = llm_transformer_props.convert_to_graph_documents(documents)
print(f"Nodes:{graph_documents_props[0].nodes}")
print(f"Relationships:{graph_documents_props[0].relationships}")



Nodes:[Node(id='Gusto', type='Company'), Node(id='Payroll Management', type='Service'), Node(id='Benefits Management', type='Service'), Node(id='Team Management', type='Service'), Node(id='Time Tracking', type='Service'), Node(id='Health Insurance', type='Service'), Node(id='Dental Insurance', type='Service'), Node(id='Vision Insurance Management', type='Service')]
Relationships:[Relationship(source=Node(id='Gusto', type='Company'), target=Node(id='Payroll Management', type='Service'), type='OFFER'), Relationship(source=Node(id='Gusto', type='Company'), target=Node(id='Benefits Management', type='Service'), type='OFFER'), Relationship(source=Node(id='Gusto', type='Company'), target=Node(id='Team Management', type='Service'), type='OFFER'), Relationship(source=Node(id='Gusto', type='Company'), target=Node(id='Time Tracking', type='Service'), type='OFFER'), Relationship(source=Node(id='Gusto', type='Company'), target=Node(id='Health Insurance', type='Service'), type='OFFER'), Relationshi

In [43]:
graph.add_graph_documents(graph_documents_props)

In [None]:
from langchain.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship
)
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )
    

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
    [(
      "system",
      f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.  
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial. 
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination."""),
        ("human", "Use the given format to extract information from the following input: {input}"),
        ("human", "Tip: Make sure to answer in the correct format"),
    ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)


def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.run(document.page_content)
    # Construct a graph document
    graph_document = GraphDocument(
      nodes = [map_to_base_node(node) for node in data.nodes],
      relationships = [map_to_base_relationship(rel) for rel in data.rels],
      source = document
    )
    # Store information into a graph
    graph.add_graph_documents([graph_document])