In [57]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from os import getenv
from openai import OpenAI
import pathlib
import textwrap
import google.generativeai as genai
import time
import requests
import ast
import json
from sklearn.metrics.pairwise import cosine_similarity
import re


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.precision', 2)

load_dotenv("../../.env",override=True)
GOOGLE_API_KEY = getenv("GEMINI_API_KEY")
OPENAI_API_KEY = getenv("OPENAI_API_KEY")


In [39]:

class chatGPT():
    def __init__(self):
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        
    # def ask(self, q):
    #     stream = self.client.chat.completions.create(
    #         model="gpt-4",
    #         messages=[{"role": "user", "content": q}],
    #         stream=True,
    #         temperature=0
    #     )
    #     response = ""
    #     for chunk in stream:
    #         if chunk.choices[0].delta.content is not None:
    #             response += chunk.choices[0].delta.content

    #     self.response = response
    #     return response

    def get_embedding(self,text, model="text-embedding-3-large"):
       text = text.replace("\n", " ")
       return self.client.embeddings.create(input = [text], model=model).data[0].embedding

x_chat = chatGPT()


In [58]:
onet_df = pd.read_csv("../output/onet_df.csv")
onet_df = onet_df[(onet_df.task_similarity > .47) & (onet_df.job_title_similarity > .35)]
onet_weights = onet_df.groupby("organization_name")["onet_title"].count().apply(lambda x: 1/x).reset_index().rename({"onet_title":"onet_weight"},axis=1)
onet_df = onet_df.merge(onet_weights, on="organization_name")
onet_titles = onet_df.groupby("onet_title")["onet_weight"].sum().round(2).sort_values(ascending=False).reset_index()

onet_titles


Unnamed: 0,onet_title,onet_weight
0,Data Scientists,16.01
1,Search Marketing Strategists,15.74
2,Financial and Investment Analysts,10.3
3,Customer Service Representatives,9.5
4,Market Research Analysts and Marketing Special...,8.15
5,Investment Fund Managers,8.01
6,Graphic Designers,5.62
7,Writers and Authors,5.48
8,Travel Agents,5.43
9,Video Game Designers,5.27


In [33]:
onet_df.example_job_title

0            Legal Secretary
1            Legal Secretary
2            Legal Secretary
3            Legal Secretary
4            Legal Secretary
               ...          
229     vacation rental host
230     vacation rental host
231    vacation rental guest
232    vacation rental guest
233    vacation rental guest
Name: example_job_title, Length: 234, dtype: object

In [82]:
onet_df.onet_task.value_counts().reset_index().head()

Unnamed: 0,onet_task,count
0,Collect and analyze data on customer demograph...,20
1,"Forecast and track marketing and sales trends,...",18
2,Inform investment decisions by analyzing finan...,17
3,Apply feature selection algorithms to models p...,14
4,"Interpret data on price, yield, stability, fut...",14


In [None]:
# sample_df = pd.read_csv("../output/df_with_onet.csv")
# sample_df = sample_df[~sample_df.Industry.isnull()]
# sample_df['Industry_embedding'] = sample_df['Industry'].apply(lambda text: x_chat.get_embedding(text))

In [75]:
industry_df = pd.read_csv("../output/df_with_industry_embeddings.csv")
embeddings = industry_df.Industry_embedding.apply(lambda x: [float(y) for y in x.strip("[]").split(", ")])
embeddings = np.array(embeddings)
embeddings = np.vstack(embeddings)


In [92]:
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.neighbors import NearestNeighbors
# from sklearn.cluster import SpectralClustering
# import matplotlib.pyplot as plt
# from sklearn.decomposition import PCA

# n_neighbors = 7



# # Create a KNN model for connectivity based on cosine distance
# knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
# knn.fit(embeddings)
# connectivity_matrix = knn.kneighbors_graph(embeddings, mode='connectivity')

# # Use Spectral Clustering with the connectivity matrix defined by KNN
# clustering = SpectralClustering(n_clusters=200, affinity='precomputed', assign_labels='kmeans')
# cluster_labels = clustering.fit_predict(connectivity_matrix)

# # Assign cluster labels back to the original DataFrame
# sample_df['cluster_label'] = cluster_labels
# if "industry_cluster_label" in sample_df.columns:
#     sample_df = sample_df.drop(columns=['industry_cluster_label'])
# industry_clusters = pd.DataFrame(sample_df.groupby("cluster_label")['Industry'].agg(lambda x: ', '.join(x)).reset_index())


# industry_clusters = industry_clusters.rename({"Industry":"industry_cluster_label"},axis=1)
# sample_df = sample_df.merge(industry_clusters, on="cluster_label")


  adjacency = check_symmetric(adjacency)


In [105]:
sample_df['cluster_label'] = cluster_labels

if "industry_cluster_label" in sample_df.columns:
    sample_df = sample_df.drop(columns=['industry_cluster_label'])
industry_clusters = pd.DataFrame(sample_df.groupby("cluster_label")['Industry'].agg(lambda x: ', '.join(x)).reset_index())

industry_clusters = industry_clusters.rename({"Industry":"industry_cluster_label"},axis=1)
sample_df = sample_df.merge(industry_clusters, on="cluster_label")

industry_cluster_df = sample_df[["organization name","Industry","Industry_embedding","cluster_label","industry_cluster_label"]]
industry_cluster_df.industry_cluster_label.unique()


array(['Legal, Digital marketing, Agriculture',
       'Home Design, Game development',
       'Customer relationship management (CRM), Workflow Automation, Artificial intelligence, Mental health, Data Analytics, Web Development, Industrial maintenance',
       'Marketing , Climate technology,  AI software solutions',
       'Venture Capital, Hospitality, News aggregation, AI-powered research platform, Content creation, Public safety, Software Development, Knowledge management , Animal health, Marketing automation, Business automation, Fundraising, Financial services, Education technology',
       'IT Services, Financial Services, Business automation, Cybersecurity, AI-powered content creation , Legal Tech, Finance, Relationship Management, Website building, Video intelligence, Software development, IT Consulting, Retail, Restaurant management, Healthcare, Smart transportation, Legal Technology, Human Resources, AI-powered APIs, Talent Management , Healthcare,  Robotics, AI-powered cre

In [101]:
industry_df[industry_df.Industry.str.contains("Legal")]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,organization name,num employees,founded date,description,industries,headquarters location,description.1,cb rank,postal code,headquarters region,website,actively hiring,linkedin,num articles,email,hub tag,phone num,num of sub org,industry groups,total funding amt,company type,estimated rev,description_all,industries_parsed,generated_description,Company,parsed_description,Tasks/Jobs,Industry,People Using Tool,examples,Job1_ONET,Job2_ONET,Job3_ONET,Example1,Job1,Example2,Job2,Example3,Job3,name,Job1_embedding,Job2_embedding,Job3_embedding,Industry_embedding,cluster_label
0,0,20738,Advoria,1-10,2023-08-29,Advoria's online appointment booking for law f...,"Artificial Intelligence (AI), Legal, Legal Tec...","Berlin, Berlin, Germany",Online appointment booking for the successful ...,239244,10967,"European Union (EU), Europe, Middle East, and ...",advoria.de,—,View on LinkedIn,—,mail@advoria.de,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,Advoria's online appointment booking for law f...,"['Legal', 'Legal Tech', 'SaaS', 'Software']","Advoria is a German-made, GDPR-compliant onlin...",Advoria,"\nTasks/Jobs: Appointment scheduling, Client i...","Appointment scheduling, Client intake, Secreta...",Legal,"Lawyers, Legal Secretaries",## Advoria Example Scenarios:\nExample 1: \nA ...,"[[""Legal Secretaries and Administrative Assist...","[[""Legal Secretaries and Administrative Assist...","[[""Legal Secretaries and Administrative Assist...",A lawyer uses Advoria to set up an automated b...,Legal Secretary schedules client appointments.,A law firm uses Advoria to collect necessary c...,Legal Secretary collects and organizes client ...,A paralegal uses Advoria to manage their own c...,Paralegal manages own calendar and appointment...,Advoria,"[-0.00684443861246109, -0.03683798760175705, -...","[-0.006861537229269743, -0.028195897117257118,...","[-0.011333281174302101, -0.03152986243367195, ...","[-0.002616604557260871, -0.00396003620699048, ...",136
70,70,22458,Fifty One Ai,1-10,2024-04-01,"At 51AI, a public benefit company, we're dedic...","Artificial Intelligence (AI), CivicTech, GovTe...","Arlington, Virginia, United States",AI with a Cause: Built to Scale Justice for Im...,—,22209,"Washington DC Metro Area, East Coast, Southern US",fiftyoneai.com,—,View on LinkedIn,1,wail@fiftyoneai.com,—,2028482282,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,"At 51AI, a public benefit company, we're dedic...","['CivicTech', 'GovTech', 'Legal Tech']",Fifty One AI is a public benefit company using...,Fifty One Ai,"\nTasks/Jobs: Report generation, Task automati...","Report generation, Task automation, Workflow o...",Legal,"Lawyers, Legal Aid Organizations",\nExample 1: A lawyer uses AI agents to automa...,,,,A lawyer uses AI agents to automatically colle...,A legal assistant collects and organizes evide...,A legal aid organization uses AI to generate c...,A legal aid researcher conducts research and w...,A human rights organization deploys AI agents ...,A human rights researcher tracks and monitors ...,Fifty One Ai,"[-0.0030687362886965275, -0.006015349645167589...","[0.005640426650643349, -0.008820916526019573, ...","[-0.004961515311151743, -0.009353178553283215,...","[-0.0026129940524697304, -0.003935463260859251...",136
103,103,20871,SPEED AI,1-10,2023-09-01,The Legal Industry's First and ONLY AI-Powered...,"Legal, Legal Tech, Software",—,The Legal Industry's First and ONLY AI-Powered...,—,—,—,speedintake.com/,—,—,—,ai@wedrivecases.com,—,—,—,"Professional Services, Software",—,For Profit,—,The Legal Industry's First and ONLY AI-Powered...,"['Legal', 'Legal Tech', 'Software']",SPEED AI is a revolutionary AI-powered platfor...,SPEED AI,"\nTasks/Jobs: Intake process automation, Prosp...","Intake process automation, Prospect data analy...",Legal,"Lawyers, Legal professionals",\nExample 1: A lawyer uses SPEED AI to automat...,,,,A lawyer uses SPEED AI to automate the initial...,A legal assistant collects client information ...,A legal professional uses SPEED AI to analyze ...,A paralegal reviews and analyzes legal documen...,A law firm uses SPEED AI to automatically scre...,A law firm marketing specialist performs lead ...,SPEED AI,"[-0.02201763540506363, 0.0075805396772921085, ...","[-0.03470991924405098, -0.01375794131308794, -...","[-0.0027657123282551765, -0.03262830898165703,...","[-0.002592011122033, -0.003965755458921194, -0...",136
178,178,21098,Lexlabs AI,1-10,2023-10-01,Lexlabs AI is revolutionizing the legal tech i...,"Information Technology, Legal Tech","San Francisco, California, United States",Lexlabs streamlines Contract Management with A...,—,—,"San Francisco Bay Area, West Coast, Western US",www.lexlabs.ai/,—,View on LinkedIn,—,support@lexlabs.ai,—,—,—,"Information Technology, Professional Services",—,For Profit,—,Lexlabs AI is revolutionizing the legal tech i...,"['Information Technology', 'Legal Tech']",Lexlabs AI is an AI-powered contract managemen...,Lexlabs AI,"\nTasks/Jobs: Contract drafting, Contract anal...","Contract drafting, Contract analysis, Contract...",Legal Technology,"Lawyers, Construction Businesses",\nExample 1: A lawyer uses Lexlabs AI to analy...,,,,A lawyer uses Lexlabs AI to analyze a complex ...,A lawyer performs contract analysis to identif...,A construction manager uses Lexlabs AI to draf...,A lawyer drafts a standard construction contra...,A construction company uses Lexlabs AI to revi...,A contract manager reviews and manages constru...,Lexlabs AI,"[-0.017063811421394348, -0.03392619639635086, ...","[-0.0052331057377159595, -0.04000703617930412,...","[-0.014200237579643726, 0.007051771506667137, ...","[0.007521213497966528, -0.02653316967189312, -...",71
281,281,18882,Lexlink AI,1-10,2023-01-24,Lexlink.ai revolutionizes legal due diligence ...,"Artificial Intelligence (AI), Legal, Legal Tech","Mountain View, California, United States",Lexlink.ai: Transforming legal due diligence &...,78653,—,"San Francisco Bay Area, Silicon Valley, West C...",www.lexlink.ai,—,View on LinkedIn,3,hello@lexlink.ai,—,—,—,"Artificial Intelligence (AI), Data and Analyti...","$187,005",For Profit,—,Lexlink.ai revolutionizes legal due diligence ...,"['Legal', 'Legal Tech']",Lexlink.ai is an AI-powered legal due diligenc...,Lexlink AI,"\nTasks/Jobs: Document review, Inconsistency ...","Document review, Inconsistency detection, Co...",Legal due diligence,"Lawyers, Law firms",\nExample 1: A lawyer uses Lexlink AI to revie...,,,,A lawyer uses Lexlink AI to review hundreds of...,A lawyer performs document review to ensure in...,A junior associate at a law firm uses Lexlink ...,A junior associate performs legal document rev...,A small law firm uses Lexlink AI to conduct du...,A lawyer conducts due diligence for a start up...,Lexlink AI,"[0.008794877678155899, -0.03216756507754326, -...","[-0.013765326701104641, -0.02444649487733841, ...","[-0.0011428106809034944, -0.023619988933205605...","[0.022129228338599205, -0.008283508010208607, ...",137
337,337,19244,Jurist AI,1-10,2023-03-01,AI CoPilot for Legal Professionals,Artificial Intelligence (AI),"Makati, Manila, Philippines",AI SaaS for Legal Professionals,289872,—,"Asia-Pacific (APAC), Association of Southeast ...",www.jurist.ph,—,View on LinkedIn,1,hello@jurist.ph,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,AI CoPilot for Legal Professionals AI SaaS for...,[],Jurist AI is an AI-powered software-as-a-servi...,Jurist AI,"\nTasks/Jobs: Legal research, Document draftin...","Legal research, Document drafting, Case manage...",Legal Technology,"Lawyers, Legal professionals",\nExample 1: A lawyer uses Jurist AI to automa...,,,,A lawyer uses Jurist AI to automatically draft...,A legal assistant would draft the contract.,A paralegal utilizes Jurist AI to conduct lega...,A paralegal would conduct legal research.,A litigation attorney leverages Jurist AI to m...,"A legal assistant would manage the case files,...",Jurist AI,"[-0.012824032455682755, -0.03496798500418663, ...","[-0.008540261536836624, -0.006222190335392952,...","[-0.012473296374082565, -0.03415875509381294, ...","[0.007503456901758909, -0.026531945914030075, ...",71
353,353,21700,LegalDex AI,1-10,2024-01-01,—,"Artificial Intelligence (AI), Legal, Legal Tech","Makati, Manila, Philippines",AI Legal-tech startup,383182,12,"Asia-Pacific (APAC), Association of Southeast ...",legaldex.com,—,View on LinkedIn,—,hi@legaldex.com,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,— AI Legal-tech startup,"['Legal', 'Legal Tech']",LegalDex AI is an AI-powered legal research pl...,LegalDex AI,"\nTasks/Jobs: Legal research, Case analysis, D...","Legal research, Case analysis, Document review...",Legal Research,"Lawyers, Legal professionals",\nExample 1: A lawyer uses LegalDex AI to quic...,,,,A lawyer uses LegalDex AI to quickly locate re...,A legal research assistant is tasked with find...,A corporate counsel uses LegalDex AI to analyz...,A legal professional reviews a new contract fo...,An intellectual property lawyer uses LegalDex ...,A legal professional is tasked with searching ...,LegalDex AI,"[-0.01787339523434639, -0.002488552127033472, ...","[-0.03228314220905304, -0.036461345851421356, ...","[-0.011127960868179798, -0.03744742274284363, ...","[0.010720687918365002, 0.004632656928151846, -...",137
389,390,18757,Smart Firms,1-10,2023-01-04,—,"Artificial Intelligence (AI), Generative AI, L...","Orlando, Florida, United States",Generative Ai assisting in the reduction of hu...,—,—,"East Coast, Southern US",smartfiorms.co,—,—,—,joel@smartfirms.co,—,—,—,"Artificial Intelligence (AI), Data and Analyti...","$750,000",For Profit,—,— Generative Ai assisting in the reduction of ...,"['Generative AI', 'Legal', 'Legal Tech']",Smart Firms is an AI-powered platform that lev...,Smart Firms,"\nTasks/Jobs: Document creation, Legal researc...","Document creation, Legal research, Contract an...",Legal services,"Attorneys, Law firms",\nExample 1: A lawyer uses Smart Firms to gene...,,,,A lawyer uses Smart Firms to generate a standa...,A paralegal generates standard contracts for n...,A law firm uses Smart Firms to analyze a large...,A legal researcher analyzes legal documents fo...,A corporate counsel uses Smart Firms to automa...,A legal assistant drafts and reviews employee ...,Smart Firms,"[-0.01399413961917162, -0.04067544639110565, -...","[-0.013073818758130074, -0.01987871713936329, ...","[-0.02785480208694935, -0.0012156603625044227,...","[0.003485316876322031, -0.012093094177544117, ...",28
405,406,17743,AI Lawyer,1-10,2023-01-01,—,"Artificial Intelligence (AI), Legal Tech, SaaS","New York, New York, United States","Co-pilot for lawyers, instant legal help for c...",171822,—,"Greater New York Area, East Coast, Northeaster...",ailawyer.pro/,—,—,—,hi@ailawyer.pro,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,"— Co-pilot for lawyers, instant legal help for...","['Legal Tech', 'SaaS']",AI Lawyer provides instant legal help for cons...,AI Lawyer,"\nTasks/Jobs: Document drafting, Legal researc...","Document drafting, Legal research, Case analys...",Legal technology,"Lawyers, Consumers",\nExample 1: A small business owner uses AI La...,,,,A small business owner uses AI Lawyer to draft...,A lawyer drafts standard lease agreements.,A lawyer uses AI Lawyer to quickly research re...,A lawyer researches relevant case law for a co...,A consumer uses AI Lawyer to review a contract...,NOT_ONET A lawyer analyzes and reviews contracts.,AI Lawyer,"[0.011000319384038448, -0.053822197020053864, ...","[0.002033372176811099, 0.0010979827493429184, ...","[-0.005813006777316332, -0.030635876581072807,...","[-0.007920012809336185, -0.02822677418589592, ...",71
500,501,21390,Lawgic Legal Track,1-10,2023-11-20,Lawgic Legal Track redefines legal processes b...,—,—,Lawgic Legal Track redefines legal processes b...,—,—,—,legaltrack.ai/,—,View on LinkedIn,—,ricardo.rodriguez@getlawgic.com,1 (415) 420-2179,—,—,—,—,For Profit,—,Lawgic Legal Track redefines legal processes b...,[],Lawgic Legal Track uses AI to transform legal ...,Lawgic Legal Track,"\nTasks/Jobs: Case management, Legal research,...","Case management, Legal research, Document revi...",Legal Services,"Lawyers, Paralegals",\nExample 1: A lawyer uses Lawgic Legal Track ...,,,,A lawyer uses Lawgic Legal Track to analyze a ...,A lawyer performs legal research to find patte...,A paralegal uses the platform to automatically...,A paralegal performs contract drafting for cli...,A legal team utilizes the tool to organize and...,A lawyer and paralegal team performs document ...,Lawgic Legal Track,"[-0.012424166314303875, -0.009911800734698772,...","[-0.003233750117942691, -0.018310317769646645,...","[-0.011593738570809364, -0.03614287078380585, ...","[0.012798679992556572, -0.013599172234535217, ...",28


In [108]:
e = sample_df.Industry_embedding.iloc[0]
e

[-0.002616604557260871,
 -0.00396003620699048,
 -0.015124782919883728,
 0.021699294447898865,
 -0.016802474856376648,
 0.02806941792368889,
 -0.013114958070218563,
 0.05171040818095207,
 0.00712806498631835,
 0.04309200495481491,
 0.014639358967542648,
 0.006872578989714384,
 -0.05038188025355339,
 -0.0049308836460113525,
 0.012263337150216103,
 0.017151640728116035,
 -0.017543386667966843,
 -0.02694527804851532,
 -0.01588272489607334,
 0.030045177787542343,
 0.015669820830225945,
 0.029057297855615616,
 0.051608212292194366,
 0.016223372891545296,
 -0.011249909177422523,
 0.0030892540235072374,
 -0.019468048587441444,
 0.004577461164444685,
 0.0059954095631837845,
 -0.0013498187763616443,
 3.925439159502275e-06,
 0.0230789203196764,
 -0.010960358195006847,
 0.0006823610747233033,
 -0.035086773335933685,
 0.02083064243197441,
 0.02791612595319748,
 -0.0023589893244206905,
 -0.00580379506573081,
 -0.010091705247759819,
 0.001970437355339527,
 -0.02038779854774475,
 -0.024986550211906433

In [62]:
sample_df.to_csv("../output/df_with_industry_embeddings.csv")



