In [39]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from os import getenv
from openai import OpenAI
import pathlib
import textwrap
import google.generativeai as genai
import time
import requests
import ast
import json
from sklearn.metrics.pairwise import cosine_similarity
import re

load_dotenv("../../.env",override=True)
GOOGLE_API_KEY = getenv("GEMINI_API_KEY")
OPENAI_API_KEY = getenv("OPENAI_API_KEY")



In [62]:

genai.configure(api_key=GOOGLE_API_KEY)
class gemini():
    def __init__(self):
        self.model = genai.GenerativeModel('gemini-1.5-flash')
    def request(self,prompt):
        url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent'
        headers = {
            'Content-Type': 'application/json',
        }
        data = {
            "contents": [
                {
                    "parts": [
                        {
                            "text": prompt
                        }
                    ]
                }
            ]
        }
        params = {
            'key': GOOGLE_API_KEY
        }
        
        response = requests.post(url, headers=headers, json=data, params=params)
        return json.loads(response.text)

    def ask(self,prompt):
        #response = self.model.generate_content(prompt)
        response = self.request(prompt)
        if response["candidates"][0]["finishReason"] == 'SAFETY': return "N/A"
        response = response["candidates"][0]["content"]["parts"][0]["text"]
        return response

x_gemini = gemini()
x_gemini.ask("hi")

'Hi! How can I help you today? \n'

In [41]:
class chatGPT():
    def __init__(self):
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        
    # def ask(self, q):
    #     stream = self.client.chat.completions.create(
    #         model="gpt-4",
    #         messages=[{"role": "user", "content": q}],
    #         stream=True,
    #         temperature=0
    #     )
    #     response = ""
    #     for chunk in stream:
    #         if chunk.choices[0].delta.content is not None:
    #             response += chunk.choices[0].delta.content

    #     self.response = response
    #     return response

    def get_embedding(self,text, model="text-embedding-3-large"):
       text = text.replace("\n", " ")
       return self.client.embeddings.create(input = [text], model=model).data[0].embedding

x_chat = chatGPT()


In [42]:
#Randomly generate 1000 startups for proof of concept
df = pd.read_csv("../output/data.csv")
df = df[df["num employees"]=="1-10"]

sample_df = df[pd.to_datetime(df['founded date']).dt.year.isin([2023, 2024])]
sample_df = sample_df.sample(n=1000, random_state=1)
sample_df = sample_df.reset_index(drop=True)


Code Below:
- Step 1: Generate more detailed description of startup and its product (generated_description)
- Step 2: Generate description of the task/job that startup is automating, the industry, and people the startup is targeting. (parsed_description)
- Step 3: Generate example situations of when a person/employee might use the product and ask what person is being automated in each example (use generated_description and parsed_description to help LLM generate realistic examples).
- Step 4: Map the job being automated back to ONET (computation performed in onet_mapping.ipynb)

In [61]:
generated_description_prompt = """
Your role is to describe $company ($website)'s product in two sentences.

EXAMPLES BELOW 
_________________________
Company: Genmo
Website: https://www.genmo.ai/
Description: Genmo.ai is a free online platform that uses artificial intelligence to turn your ideas into videos and images. Genmo.ai will generate animations or graphics, allowing even those without animation experience to create professional-looking content.

Company: Love Genius
Website: https://www.lovegenius.io/
Description: LoveGenius is an AI-powered tool aimed at improving user experiences on dating platforms by assisting with the creation of engaging and personalized dating profiles. This service targets individuals looking to enhance their profiles on various dating apps, catering to those seeking both serious and casual relationships.
________________________

Company: $company
Wesbite: $website
Current Description: $description
YOUR TURN:
**************************
Description:
**************************
"""

websites = list(sample_df['website'])
companies = list(sample_df['organization name'])
descriptions = list(sample_df['description_all'])
industries = list(sample_df['industries_parsed'])
results = []
n = len(sample_df)
# n = 10
for i in range(0,n):
    while True:
        try:
            print(f"Processing {str(i)}: {companies[i]}")
            p1 = generated_description_prompt.replace("$website", websites[i])\
                       .replace("$company", companies[i])\
                       .replace("$description", descriptions[i])\
                       .replace("$industries",industries[i])
            result = x_gemini.ask(p1)
    
            if "Unfortunately" in result:
                 result = False
            result = result.replace("*","").replace("Product: ","").replace("\n\n","\n")
    
            print(f"Result: {result}\n")
            results.append(result)
            break
    
        
        except Exception as e:
            print(e)
            print(f"Error processing {str(i)}, {companies[i]}: {e}")
            time.sleep(20)


sample_df['generated_description'] = results

sample_df.to_csv('../output/df_with_generated_description.csv', index=False)




Processing 0: Advoria
{'candidates': [{'content': {'parts': [{'text': 'Advoria is a German-made, GDPR-compliant online appointment booking system specifically designed for law firms. It allows clients to schedule appointments 24/7, reducing the workload of secretaries and streamlining the client intake process. \n'}], 'role': 'model'}, 'finishReason': 'STOP', 'index': 0, 'safetyRatings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE'}]}], 'usageMetadata': {'promptTokenCount': 249, 'candidatesTokenCount': 45, 'totalTokenCount': 294}}
Result: Advoria is a German-made, GDPR-compliant online appointment booking system specifically designed for law firms. It allows clients to schedule appointments 24/7, reducing the workload of secretaries and 

In [7]:
sample_df = sample_df[~sample_df.generated_description.isnull()]
# sample_df['openai_embedding'] = sample_df["generated_description"].apply(lambda text: x.get_embedding(text))

In [63]:
parsed_description_prompt = """
Your role is to describe what jobs/tasks and industries that $company is automating. Please refer to examples.

Be specific about people using tool. If it applies to many different people (over 20% of population), then say everyone.
DO NOT INCLUDE AN EXPLANATION

EXAMPLES BELOW 
_________________________
Company: Petville
Website: petville.co/pricing/biz
Description: Petville Global is a B2B CRM SaaS platform that utilizes advanced technologies like AI/ML and neural net to streamline and expand operations for pet businesses and veterinary clinics both locally and globally. The platform offers deep data analytics and marketing tools, helping businesses save an average of 22% on CRM and vet tech costs.
Tasks/Jobs: Data analysis, Marketing automation, Appointment scheduling, Inventory management
Industry: Customer management
People Using Tool: Vetinarians, Pet Businesses

Company: Thunder
Wesbite: thundercompute.com
Description: Thunder is a decentralized, peer-to-peer cloud computing platform designed to democratize access to GPUs and address the persistent GPU shortage. It provides a solution for individuals and businesses seeking high-performance computing power, enabling them to leverage unused GPU resources from around the world.
Tasks/Jobs: GPU resource allocation, Access to unused GPUs, Distributed computing tasks 
Industry: Cloud Computing
People Using Tool: Developers, GPU Owners


Company: InputAI
Wesbite: inputai.com/
Description: InputAI is an advanced platform that offers over 1000 AI templates powered by OpenAI GPT and ChatGPT technologies. It is designed to streamline and enhance the user's interaction with AI, providing a wide range of templates for various applications, from chatbots to content generation.
Tasks/Jobs: Chatbot development, Content creation
Industry: No-code AI platform
People Using Tool: Everyone
________________________


Company: $company
Wesbite: $website
Description: $description
YOUR TURN:
**************************
Tasks/Jobs (comma separated list of 4, short):
Industry (1 item):
People Using Tool (comma separated list, one/two words each): 
**************************
"""

In [65]:
sample_df.head()

Unnamed: 0.1,Unnamed: 0,organization name,num employees,founded date,description,industries,headquarters location,description.1,cb rank,postal code,...,hub tag,phone num,num of sub org,industry groups,total funding amt,company type,estimated rev,description_all,industries_parsed,generated_description
0,20738,Advoria,1-10,2023-08-29,Advoria's online appointment booking for law f...,"Artificial Intelligence (AI), Legal, Legal Tec...","Berlin, Berlin, Germany",Online appointment booking for the successful ...,239244,10967,...,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,Advoria's online appointment booking for law f...,"['Legal', 'Legal Tech', 'SaaS', 'Software']","Advoria is a German-made, GDPR-compliant onlin..."
1,19859,Petville Global,1-10,2023-05-01,Petville Global: a B2B CRM SaaS platform + vet...,"Information Technology, Pet, Veterinary","Kuala Lumpur, Kuala Lumpur, Malaysia",B2B CRM SaaS platform that helps pet businesse...,—,—,...,+60 11-5445 0117,—,—,"Community and Lifestyle, Health Care, Informat...","MYR15,000",For Profit,—,Petville Global: a B2B CRM SaaS platform + vet...,"['Information Technology', 'Pet', 'Veterinary']",Petville Global is a business-to-business (B2B...
2,18043,InputAI,1-10,2023-01-01,—,Artificial Intelligence (AI),—,1000+ AI templates with OpenAI GPT and ChatGPT,503337,—,...,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,— 1000+ AI templates with OpenAI GPT and ChatGPT,[],InputAI is an online platform that provides ov...
3,22241,Thunder,1-10,2024-02-02,Thunder is a Cloud computing to democratize ac...,"Artificial Intelligence (AI), Cloud Computing,...","Lewes, Delaware, United States",Thunder is a Cloud computing to democratize ac...,—,19958,...,—,908-386-2839,—,"Artificial Intelligence (AI), Blockchain and C...",—,For Profit,—,Thunder is a Cloud computing to democratize ac...,"['Cloud Computing', 'Data Center', 'Informatio...","Thunder offers decentralized, peer-to-peer GPU..."
4,19551,Tail and Skew,1-10,2023-04-01,Tail & Skew builds an AI agent to automate tasks.,"Artificial Intelligence (AI), Financial Servic...","San Francisco, California, United States",Tail & Skew builds an AI agent to automate tasks.,378106,—,...,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,Tail & Skew builds an AI agent to automate tas...,"['Financial Services', 'FinTech', 'Machine Lea...",Tail and Skew provides a platform where users ...


In [66]:
websites = list(sample_df['website'])
companies = list(sample_df['organization name'])
descriptions = list(sample_df['generated_description'])
industries = list(sample_df['industries_parsed'])
results = []
n = len(sample_df)
# n = 10
for i in range(n):
    while True:
        try:
            print(f"Processing {str(i), len(results)}: {companies[i]}")
            p1 = parsed_description_prompt.replace("$website", websites[i])\
                       .replace("$company", companies[i])\
                       .replace("$description", descriptions[i])\
                       .replace("$industries",industries[i])
            # print(p1)
            print(descriptions[i])
            result = x_gemini.ask(p1)
            
            if "Unfortunately" in result:
                 result = False
            result = result.replace("*","").replace("Product: ","").replace("\n\n","\n")
    
            print(f"Result: {result}\n")
            if (result != None):
                results.append([companies[i],result])
            break
    
        
        except Exception as e:
            print(f"Error processing {str(i)}, {companies[i]}: {e}")
            time.sleep(20)
            
results_df = pd.DataFrame(results,columns=["Company","Parsed Description"])

sample_df[["Company","parsed_description"]] = results

sample_df.to_csv('../output/df_with_generated_description.csv', index=False)

Processing ('0', 0): Advoria
Advoria is a German-made, GDPR-compliant online appointment booking system specifically designed for law firms. It allows clients to schedule appointments 24/7, reducing the workload of secretaries and streamlining the client intake process. 

Result: 
Tasks/Jobs: Appointment scheduling, Client intake, Secretary workload reduction, Time management
Industry: Legal
People Using Tool: Lawyers, Legal Secretaries 
 


Processing ('1', 1): Petville Global
Petville Global is a business-to-business (B2B) software solution that combines a Customer Relationship Management (CRM) platform with veterinary technology to help pet businesses and vet clinics streamline their operations, increase efficiency, and expand their reach. Petville Global leverages AI and data analytics to provide insights, marketing tools, and cost savings, empowering pet industry professionals to provide better care and grow their businesses. 

Result: 
Tasks/Jobs: Customer Relationship Management

In [67]:
sample_df[['Tasks/Jobs', 'Industry', 'People Using Tool']] = sample_df['parsed_description'].str.extract('Tasks/Jobs: (.*?)\nIndustry: (.*?)\nPeople Using Tool: (.*)')
sample_df["Industry"] = sample_df.Industry.astype("str")
sample_df["Industry"] = [x.replace("\n","") for x in list(sample_df.Industry)]
sample_df['People Using Tool'] = sample_df['People Using Tool'].str.replace('AI|Artificial Intelligence', '', regex=True)
sample_df["Tasks/Jobs"] = sample_df["Tasks/Jobs"].astype("str")
sample_df.head()

Unnamed: 0.1,Unnamed: 0,organization name,num employees,founded date,description,industries,headquarters location,description.1,cb rank,postal code,...,company type,estimated rev,description_all,industries_parsed,generated_description,Company,parsed_description,Tasks/Jobs,Industry,People Using Tool
0,20738,Advoria,1-10,2023-08-29,Advoria's online appointment booking for law f...,"Artificial Intelligence (AI), Legal, Legal Tec...","Berlin, Berlin, Germany",Online appointment booking for the successful ...,239244,10967,...,For Profit,—,Advoria's online appointment booking for law f...,"['Legal', 'Legal Tech', 'SaaS', 'Software']","Advoria is a German-made, GDPR-compliant onlin...",Advoria,"\nTasks/Jobs: Appointment scheduling, Client i...","Appointment scheduling, Client intake, Secreta...",Legal,"Lawyers, Legal Secretaries"
1,19859,Petville Global,1-10,2023-05-01,Petville Global: a B2B CRM SaaS platform + vet...,"Information Technology, Pet, Veterinary","Kuala Lumpur, Kuala Lumpur, Malaysia",B2B CRM SaaS platform that helps pet businesse...,—,—,...,For Profit,—,Petville Global: a B2B CRM SaaS platform + vet...,"['Information Technology', 'Pet', 'Veterinary']",Petville Global is a business-to-business (B2B...,Petville Global,\nTasks/Jobs: Customer Relationship Management...,"Customer Relationship Management, Marketing Au...",Veterinary Technology,"Veterinarians, Pet Businesses"
2,18043,InputAI,1-10,2023-01-01,—,Artificial Intelligence (AI),—,1000+ AI templates with OpenAI GPT and ChatGPT,503337,—,...,For Profit,—,— 1000+ AI templates with OpenAI GPT and ChatGPT,[],InputAI is an online platform that provides ov...,InputAI,"\nTasks/Jobs: Writing, Summarizing, Translatin...","Writing, Summarizing, Translating, Code Genera...",No-code AI platform,"Everyone, Professionals, Businesses"
3,22241,Thunder,1-10,2024-02-02,Thunder is a Cloud computing to democratize ac...,"Artificial Intelligence (AI), Cloud Computing,...","Lewes, Delaware, United States",Thunder is a Cloud computing to democratize ac...,—,19958,...,For Profit,—,Thunder is a Cloud computing to democratize ac...,"['Cloud Computing', 'Data Center', 'Informatio...","Thunder offers decentralized, peer-to-peer GPU...",Thunder,"\nTasks/Jobs: GPU resource allocation, Distrib...","GPU resource allocation, Distributed computing...",Cloud Computing,"Developers, Researchers, Businesses, Individuals"
4,19551,Tail and Skew,1-10,2023-04-01,Tail & Skew builds an AI agent to automate tasks.,"Artificial Intelligence (AI), Financial Servic...","San Francisco, California, United States",Tail & Skew builds an AI agent to automate tasks.,378106,—,...,For Profit,—,Tail & Skew builds an AI agent to automate tas...,"['Financial Services', 'FinTech', 'Machine Lea...",Tail and Skew provides a platform where users ...,Tail and Skew,"\nTasks/Jobs: Task automation, Workflow optimi...","Task automation, Workflow optimization, Data a...",Business process automation,"Businesses, Developers"


In [68]:
examples_prompt = """
Your role is to give me 3 two sentence example that would be using the product of $company. Do not use name of company in description. Keep it broad.
The goal is to give the a detailed description of the job that the tool automates and who performs that job and when.

IF THE PERSON AUTOMATED IS NOT DOING A JOB THAT IS NOT DONE AT WORK THEN WRITE "NOT_ONET" IN FRONT OF IT.

EXAMPLE: 
________________
Company: DreamGenerator.ai
Website: DreamGenerator.ai
DreamGenerator.ai is an innovative platform that utilizes generative AI to transform user ideas into stunning and diverse images. It offers a unique creative experience, encouraging users to experiment with prompts and share their AI-generated art, while also providing an opportunity to receive recognition through likes and shares.
Tasks/Jobs: Image generation, Artistic exploration, Prompt engineering, Social media sharing
Industry: Creative arts
People Using Tool: Artists, Designers, Everyone
**************************
Example 1: A writer uses generates images that will inspire ideas for their new book by typing in fun prompts into the website.
Job automated 1: Writer has to generate create ideas for book.
Example 2: A social media marketer creates eye-catching visuals for their summer fitness campaign, like "people doing yoga on a sunrise beach," then uses them for engaging social media posts.
Job automated 2: Graphic designer createsn images for summer fitness brand campaign.
Example 3: An architect generates images that spark ideas to for the new building. They type prompts like "skyscraper covered in living walls" and "underwater hotel with transparent pods," generating visuals to inspire their sustainable architecture concepts.
Job autoamted 3: Architect generates ideas for new building.
**************************

Company: Cozy Ventures
Wesbite: https://cozy.ventures/
Current Description: Cozy Ventures is a boutique software development company that provides startups with innovative digital solutions to accelerate their growth. Composed of a team of seasoned engineers and designers, they specialize in creating custom software tailored to meet the unique needs of each client.
Tasks/Jobs: Software development,  UI/UX design,  Project management,  Technical consulting
Industry:  Software development
People Using Tool:  Startups,  Entrepreneurs
**************************
Example 1: A food delivery startup develop a custom app that integrates real-time tracking, route optimization, and communication features. 
Person automated 1: A food deliver manager is in charge of real-time tracking, route optimization, and communication for food delivery employees.
Example 2: An e-commerce company build a custom recommendation engine, powered by AI, that analyzes user behavior and preferences to offer relevant product suggestions.
Person automated 2: A market research analyst for a ecommerce company analyzes user behavior and preferences to offer relevant product suggestions
Example 3: A social media platform develops an app a sophisticated content moderation system that leverages machine learning algorithms to identify and flag inappropriate content in real-time. 
Person automated 3: A social media content moderator identifies and flag inappropriate content in real-time.
**************************


Company: PnPAI
Website: www.pnpai.co
PnPAI is an AIaaS platform that standardizes the implementation of AI across various industries, providing pre-built solutions that can be easily integrated without the need for in-house AI expertise. Catering to businesses of all sizes, PnPAI offers industry-focused solutions for sectors like ecommerce, retail, healthcare, and finance, making AI an accessible and integral part of decision-making and growth strategies.
Tasks/Jobs:  Data analysis, Predictive modeling, Automation, Optimization
Industry: AIaaS (AI as a Service)
People Using Tool: Businesses, Professionals 
**************************
Example 1: A retail manager uses PnPAI to analyze customer purchasing patterns and predict future demand for specific products. They then adjust inventory levels and optimize product placement in stores based on the AI-generated insights.
Person automated 1: A data analyst for a retail manager performs inventory management assestment to predict future custumer demand.
Example 2: An insurance company uses PnPAI to automate the risk assessment process for new clients. The AI system analyzes various data points, including credit history, driving records, and health information, to quickly determine insurance premiums and coverage options. 
Person automated 2: A risk assesser for an insurance agency uses data to perform risk assement for new clients. 
Example 3: A healthcare provider leverages PnPAI to personalize treatment plans for patients with chronic diseases. The AI analyzes patient medical history, current symptoms, and lifestyle factors to recommend optimal medication dosages and treatment strategies. 
Person automated 3: A data analysts for a healthcare company uses patient data to form personalized treatment plan creation. 
**************************
________________
Now your turn:
Company:$company
Website: $website
Current Description: $generated_description
$parsed_description
YOUR TURN:
**************************
Example 1: 
Person automated 1:
Example 2: 
Person automated 2:
Example 3:
Person automated 3:
**************************




"""


In [69]:
websites = list(sample_df['website'])
companies = list(sample_df['organization name'])
descriptions = list(sample_df['generated_description'])
industries = list(sample_df['parsed_description'])
# labels = list(sample_df['generated_cluster_label'])
results = []
n = len(sample_df)
for i in range(0,n):
    while True:
        try:
            print(f"Processing {str(i)}: {companies[i]}")
            print(websites[i])
            print(descriptions[i])
            print(industries[i])
            # print(f"Label Cluster: {labels[i]}")
            p1 = examples_prompt.replace("$website", websites[i])\
                       .replace("$company", companies[i])\
                       .replace("$generated_description", descriptions[i])\
                       .replace("$parsed_description",industries[i])
            result = x_gemini.ask(p1)
            result = result.replace("*","").replace("Product: ","").replace("\n\n","\n")
    
            print(f"Result: {result}\n")
            results.append(result)
            break
    
        
        except Exception as e:
            print(f"Error processing {str(i)}, {companies[i]}: {e}")
            time.sleep(20)

sample_df = sample_df[:len(results)]
sample_df.loc[:,'examples'] = results

sample_df.to_csv('../output/df_with_examples.csv', index=False)

Processing 0: Advoria
advoria.de
Advoria is a German-made, GDPR-compliant online appointment booking system specifically designed for law firms. It allows clients to schedule appointments 24/7, reducing the workload of secretaries and streamlining the client intake process. 


Tasks/Jobs: Appointment scheduling, Client intake, Secretary workload reduction, Time management
Industry: Legal
People Using Tool: Lawyers, Legal Secretaries 
 

Result: ## Advoria Example Scenarios:
Example 1: 
A lawyer uses Advoria to set up an automated booking system, allowing clients to schedule consultations at any time, day or night. This frees up the lawyer's time to focus on client work. 
Person automated 1: Legal Secretary schedules client appointments.
Example 2:
A law firm uses Advoria to collect necessary client information during the online booking process. This streamlines the intake process and reduces the need for manual data entry.
Person automated 2: Legal Secretary collects and organizes clie

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df.loc[:,'examples'] = results


In [81]:
sample_df

Unnamed: 0.1,Unnamed: 0,organization name,num employees,founded date,description,industries,headquarters location,description.1,cb rank,postal code,...,generated_description,Company,parsed_description,Tasks/Jobs,Industry,People Using Tool,examples,Job1_ONET,Job2_ONET,Job3_ONET
0,20738,Advoria,1-10,2023-08-29,Advoria's online appointment booking for law f...,"Artificial Intelligence (AI), Legal, Legal Tec...","Berlin, Berlin, Germany",Online appointment booking for the successful ...,239244,10967,...,"Advoria is a German-made, GDPR-compliant onlin...",Advoria,"\nTasks/Jobs: Appointment scheduling, Client i...","Appointment scheduling, Client intake, Secreta...",Legal,"Lawyers, Legal Secretaries",## Advoria Example Scenarios:\nExample 1: \nA ...,,,
1,19859,Petville Global,1-10,2023-05-01,Petville Global: a B2B CRM SaaS platform + vet...,"Information Technology, Pet, Veterinary","Kuala Lumpur, Kuala Lumpur, Malaysia",B2B CRM SaaS platform that helps pet businesse...,—,—,...,Petville Global is a business-to-business (B2B...,Petville Global,\nTasks/Jobs: Customer Relationship Management...,"Customer Relationship Management, Marketing Au...",Veterinary Technology,"Veterinarians, Pet Businesses",\nExample 1: A veterinarian uses the software ...,,,
2,18043,InputAI,1-10,2023-01-01,—,Artificial Intelligence (AI),—,1000+ AI templates with OpenAI GPT and ChatGPT,503337,—,...,InputAI is an online platform that provides ov...,InputAI,"\nTasks/Jobs: Writing, Summarizing, Translatin...","Writing, Summarizing, Translating, Code Genera...",No-code AI platform,"Everyone, Professionals, Businesses",\nExample 1: A marketing team uses a template ...,,,
3,22241,Thunder,1-10,2024-02-02,Thunder is a Cloud computing to democratize ac...,"Artificial Intelligence (AI), Cloud Computing,...","Lewes, Delaware, United States",Thunder is a Cloud computing to democratize ac...,—,19958,...,"Thunder offers decentralized, peer-to-peer GPU...",Thunder,"\nTasks/Jobs: GPU resource allocation, Distrib...","GPU resource allocation, Distributed computing...",Cloud Computing,"Developers, Researchers, Businesses, Individuals",## Thunder Examples:\nExample 1: \nA developer...,,,
4,19551,Tail and Skew,1-10,2023-04-01,Tail & Skew builds an AI agent to automate tasks.,"Artificial Intelligence (AI), Financial Servic...","San Francisco, California, United States",Tail & Skew builds an AI agent to automate tasks.,378106,—,...,Tail and Skew provides a platform where users ...,Tail and Skew,"\nTasks/Jobs: Task automation, Workflow optimi...","Task automation, Workflow optimization, Data a...",Business process automation,"Businesses, Developers",\nExample 1: A customer service representativ...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,18750,GoAudience,1-10,2023-01-04,In the ever-evolving world of digital marketin...,"Ad Targeting, Generative AI, SaaS, Software","Miami, Florida, United States",GoAudience uses AI to help e-commerce brands f...,219045,—,...,GoAudience is a marketing platform that levera...,GoAudience,"\nTasks/Jobs: Targeted advertising, Customer a...","Targeted advertising, Customer acquisition, So...",E-commerce marketing,"E-commerce businesses, Marketers",\nExample 1: An e-commerce clothing store uses...,,,
996,19664,Bridgecare,1-10,2023-04-13,BridgeCare provides schools w/ an on-demand so...,"Artificial Intelligence (AI), Education, Menta...","Dallas, Texas, United States","SaaS, Mobile, On-demand coaching service",—,75024,...,BridgeCare offers a cutting-edge solution that...,Bridgecare,"\nTasks/Jobs: Mental health assessments, Perso...","Mental health assessments, Personalized coachi...",Mental health,"Students, Coaches",\nExample 1: A student struggling with anxiety...,,,
997,18724,BOOK HOTEL DIRECT,1-10,2023-01-02,BookHotel.direct is a commission-free booking ...,"Artificial Intelligence (AI), Direct Marketing...",—,BookHotel.direct is a commission-free booking ...,524911,—,...,BookHotel.direct is a commission-free booking ...,BOOK HOTEL DIRECT,"\nTasks/Jobs: Hotel booking, Direct booking, H...","Hotel booking, Direct booking, Hotel marketing...",Hospitality,"Travelers, Hoteliers",\nExample 1: A traveler books a unique boutiqu...,,,
998,19981,Commented,1-10,2023-05-26,Commented is your collaboration feature embedd...,"Artificial Intelligence (AI), B2B, SaaS","London, England, United Kingdom","Comment and chat on your projects, skyrocket c...",165343,TW77EW,...,Commented is a tool that allows teams to colla...,Commented,"\nTasks/Jobs: Collaboration, Feedback, Project...","Collaboration, Feedback, Project management, C...",Project management,"Designers, Developers, Product Managers",\nExample 1: A web designer uses Commented to ...,,,


In [74]:
sample_df = sample_df[sample_df.examples != "N/A"]
sample_df = sample_df[~sample_df["organization name"].str.contains("Mira Labs")]


In [82]:
def extract_data(row):
    row = row.replace("\n"," ").replace("  "," ")
    pattern = r"(Example \d+: .*?)(Person automated \d+: .*?)(?= Example \d+|$)"
    matches = re.findall(pattern, row, flags=re.DOTALL)
    matches = [[m.split(":")[1].strip() for m in match] for match in matches]
    results = []
    for m in matches: 
        for l in m: results.append(l)
    if len(results) != 6:
        print(row)
    return results

sample_df.loc[:, ["Job1_ONET", "Job2_ONET", "Job3_ONET"]] = 'N/A'
sample_df = sample_df.reset_index(drop=True)

sample_df[['Example1','Job1','Example2','Job2','Example3','Job3']] = list(sample_df['examples'].apply(lambda x: extract_data(x)))



In [83]:
results = []
for i, x in sample_df.iterrows():
    name = x["organization name"]
    print(f"Processing {str(i), len(results)}: {name}")
    job1_embedding = x_chat.get_embedding(x["Job1"])
    job2_embedding = x_chat.get_embedding(x["Job2"])
    job3_embedding = x_chat.get_embedding(x["Job3"])
    results.append([name,job1_embedding, job2_embedding, job3_embedding])

df = pd.DataFrame(results, columns=['name', "Job1_embedding","Job2_embedding","Job3_embedding"])
sample_df = pd.concat([sample_df, df], axis=1)

Processing ('0', 0): Advoria
Processing ('1', 1): Petville Global
Processing ('2', 2): InputAI
Processing ('3', 3): Thunder
Processing ('4', 4): Tail and Skew
Processing ('5', 5): AI Insider Tips
Processing ('6', 6): AI Budge
Processing ('7', 7): Semantic
Processing ('8', 8): Pariah AI
Processing ('9', 9): Doppler
Processing ('10', 10): healf.mx
Processing ('11', 11): Latimer.AI
Processing ('12', 12): Dune Security
Processing ('13', 13): RAIBS
Processing ('14', 14): Wave Predict
Processing ('15', 15): Apptify
Processing ('16', 16): Snipzo
Processing ('17', 17): Singularico
Processing ('18', 18): Botvisor
Processing ('19', 19): Intriq
Processing ('20', 20): StockCake
Processing ('21', 21): ProperPlan
Processing ('22', 22): Archive Intel
Processing ('23', 23): Nirah
Processing ('24', 24): Salesforge
Processing ('25', 25): keymate
Processing ('26', 26): Chopa AI
Processing ('27', 27): Xenos Labs
Processing ('28', 28): Trudy AI
Processing ('29', 29): LearyAI (Leary.Xyz)
Processing ('30', 3

In [84]:
sample_df.to_csv('../output/df_with_examples_embeddings.csv', index=False)

In [85]:
df_exp = pd.read_csv('../output/onet/gpt_exposure_embeddings.csv')
df_exp.Embeddings = df_exp.Embeddings.apply(lambda x: x.strip("[]").split(", "))

embeddings = df_exp["Embeddings"]
embeddings = np.array(embeddings)
embeddings = np.vstack(embeddings)
embeddings.shape

(11582, 3072)

In [86]:
sample_df

Unnamed: 0.1,Unnamed: 0,organization name,num employees,founded date,description,industries,headquarters location,description.1,cb rank,postal code,...,Example1,Job1,Example2,Job2,Example3,Job3,name,Job1_embedding,Job2_embedding,Job3_embedding
0,20738,Advoria,1-10,2023-08-29,Advoria's online appointment booking for law f...,"Artificial Intelligence (AI), Legal, Legal Tec...","Berlin, Berlin, Germany",Online appointment booking for the successful ...,239244,10967,...,A lawyer uses Advoria to set up an automated b...,Legal Secretary schedules client appointments.,A law firm uses Advoria to collect necessary c...,Legal Secretary collects and organizes client ...,A paralegal uses Advoria to manage their own c...,Paralegal manages own calendar and appointment...,Advoria,"[-0.00684443861246109, -0.03683798760175705, -...","[-0.006861537229269743, -0.028195897117257118,...","[-0.011333281174302101, -0.03152986243367195, ..."
1,19859,Petville Global,1-10,2023-05-01,Petville Global: a B2B CRM SaaS platform + vet...,"Information Technology, Pet, Veterinary","Kuala Lumpur, Kuala Lumpur, Malaysia",B2B CRM SaaS platform that helps pet businesse...,—,—,...,A veterinarian uses the software to automatica...,A veterinary receptionist manually sends out r...,A pet grooming business uses the platform to a...,A pet grooming business manager uses data to t...,A veterinary clinic leverages the software to ...,A veterinary receptionist schedules appointmen...,Petville Global,"[-0.02583499625325203, 0.006958781275898218, -...","[-0.02536037191748619, 0.01035560667514801, -0...","[-0.016943059861660004, -0.0001608182647032663..."
2,18043,InputAI,1-10,2023-01-01,—,Artificial Intelligence (AI),—,1000+ AI templates with OpenAI GPT and ChatGPT,503337,—,...,A marketing team uses a template to create eng...,A social media marketer creates social media p...,A student uses a template to summarize a compl...,A student reads and summarizes research papers.,A programmer uses a template to generate basic...,NOT_ONET,InputAI,"[-0.04794558137655258, 0.02471514232456684, -0...","[-0.0007236742530949414, 0.014056037180125713,...","[0.00404541939496994, -0.017835143953561783, -..."
3,22241,Thunder,1-10,2024-02-02,Thunder is a Cloud computing to democratize ac...,"Artificial Intelligence (AI), Cloud Computing,...","Lewes, Delaware, United States",Thunder is a Cloud computing to democratize ac...,—,19958,...,A developer uses Thunder to train a complex ma...,NOT_ONET A developer spends hours training a c...,A research team working on climate modeling us...,NOT_ONET A research team struggles to find eno...,A small business owner uses Thunder to render ...,NOT_ONET A small business owner cannot afford ...,Thunder,"[-0.010479268617928028, 0.013774021528661251, ...","[-0.023190032690763474, -0.016715245321393013,...","[-0.02602432854473591, 0.02815021015703678, -0..."
4,19551,Tail and Skew,1-10,2023-04-01,Tail & Skew builds an AI agent to automate tasks.,"Artificial Intelligence (AI), Financial Servic...","San Francisco, California, United States",Tail & Skew builds an AI agent to automate tasks.,378106,—,...,A customer service representative trains an AI...,Customer service representative handles routin...,A marketing manager uses an AI-powered agent t...,NOT_ONET,A software developer builds an AI agent that a...,A software developer manually tests code for b...,Tail and Skew,"[-0.02071544900536537, -0.02404162846505642, -...","[0.00404541939496994, -0.017835143953561783, -...","[-0.022204095497727394, 0.004857667721807957, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,18750,GoAudience,1-10,2023-01-04,In the ever-evolving world of digital marketin...,"Ad Targeting, Generative AI, SaaS, Software","Miami, Florida, United States",GoAudience uses AI to help e-commerce brands f...,219045,—,...,An e-commerce clothing store uses GoAudience t...,A marketing manager for a clothing store creat...,A company selling luxury home goods uses GoAud...,A marketing manager for a furniture and home g...,A small business owner selling handmade jewelr...,A marketing manager for a handmade jewelry com...,GoAudience,"[-0.015135165303945541, -0.01889854297041893, ...","[-0.020899586379528046, 0.00412752665579319, -...","[-0.015671193599700928, -0.01687435805797577, ..."
993,19664,Bridgecare,1-10,2023-04-13,BridgeCare provides schools w/ an on-demand so...,"Artificial Intelligence (AI), Education, Menta...","Dallas, Texas, United States","SaaS, Mobile, On-demand coaching service",—,75024,...,A student struggling with anxiety uses the pla...,A student needs to be screened and connected w...,A high school athlete facing pressure to perfo...,NOT_ONET A student needs to track their emotio...,A college student experiencing academic burnou...,A student needs to receive personalized recomm...,Bridgecare,"[-0.010149136185646057, 0.015644945204257965, ...","[-0.009472891688346863, 0.009795221500098705, ...","[0.013785602524876595, -0.008135512471199036, ..."
994,18724,BOOK HOTEL DIRECT,1-10,2023-01-02,BookHotel.direct is a commission-free booking ...,"Artificial Intelligence (AI), Direct Marketing...",—,BookHotel.direct is a commission-free booking ...,524911,—,...,A traveler books a unique boutique hotel in a ...,A travel agent books a hotel for a client.,A hotelier uses the platform to create a custo...,A hotel website developer designs and manages ...,A traveler reaches out to the platform's custo...,A customer service rep assists a customer with...,BOOK HOTEL DIRECT,"[-0.00695926696062088, 0.021801339462399483, -...","[-0.02478785254061222, 0.03585508465766907, -0...","[-0.002951432717964053, 0.015213046222925186, ..."
995,19981,Commented,1-10,2023-05-26,Commented is your collaboration feature embedd...,"Artificial Intelligence (AI), B2B, SaaS","London, England, United Kingdom","Comment and chat on your projects, skyrocket c...",165343,TW77EW,...,A web designer uses Commented to provide feedb...,NOT_ONET Designer provides feedback on a colle...,A product manager uses Commented to gather fee...,Product manager gathers feedback from a develo...,A software developer uses Commented to discuss...,Developers discuss implementation details on a...,Commented,"[0.00017314498836640269, -0.007336603477597237...","[-0.04769685119390488, 0.003237571334466338, -...","[-0.03409171104431152, -0.005779969040304422, ..."


In [95]:
sample_df['job_embeddings'] = sample_df.apply(lambda row: [np.vstack(np.array(row[f'Job{i}_embedding'])).reshape(1, -1) for i in range(1, 4)], axis=1)


In [3]:
# results = []
# for index, row in sample_df.iterrows():
#     print("***********************")
#     print(f"Processing {index}")
#     name = row["organization name"]
#     print(name)
#     print(row["generated_description"])
        
#     job_results = []
#     for i in range(3):  # 0, 1, 2 for Job1, Job2, Job3
#         job = row[f'Job{i+1}']
#         print(f"\nProcessing Job: {job}")
#         example = row[f"Example{i+1}"]
#         job_embedding = np.array(row[f'Job{i+1}_embedding']).reshape(1,-1)
#         cosine_sim = cosine_similarity(job_embedding, embeddings, 'cosine')
#         example_title = get_person(job)
#         example_title_embedding = np.array([x_chat.get_embedding(example_title)])
        
#         top3 = sorted(zip(cosine_sim[0], df_exp[['Title', 'Task']].values), reverse=True)[:3]
#         local_results = []
#         for cosine_sim_job, onet in top3:
#             onet_title_embedding = np.array([x_chat.get_embedding(onet[0])])
#             cosine_sim_title = cosine_similarity(job_embedding, onet_title_embedding, 'cosine')[0][0]
#             print(f"{onet[0]}\nCosine Similarity: {cosine_sim_job}, Person Cosine Similarity: {cosine_sim_title}")
#             results.append([name,example,job,onet[0],onet[1],example_title, cosine_sim_job,cosine_sim_title])
#             local_results.append(json.dumps([onet[0],onet[1],example_title, cosine_sim_job,cosine_sim_title]))

#     sample_df.loc[index, ["Job1_ONET", "Job2_ONET", "Job3_ONET"]] = local_results

#     if index % 5 == 0:
#         print("**Saving Results**\n\n")
#         sample_df.to_csv('../output/df_with_onet.csv', index=False)
#         onet_df = pd.DataFrame(results, columns=["organization_name","example","job","onet_title","onet_task","example_job_title","task_similarity", "job_title_similarity"])
#     onet_df.to_csv("../output/onet_df.csv")
#     if index % 30 == 0:
#         sample_df.to_csv('../output/df_with_onet.csv', index=False)
        
#     print("***********************\n\n")
#     break

In [127]:
embeddings.shape

(11582, 3072)

In [61]:
onet_df = pd.DataFrame(total_results, columns=["organization_name","example","job","onet_title","onet_task","example_job_title","task_similarity", "job_title_similarity"])
onet_df.to_csv("../output/onet_df.csv")

In [62]:
onet_weights = onet_df.groupby("organization_name")["onet_title"].count().apply(lambda x: 1/x).reset_index().rename({"onet_title":"onet_weight"},axis=1)
onet_df = onet_df.merge(onet_weights, on="organization_name")



In [2]:
sample_df.to_csv('../output/df_with_onet.csv', index=False)

In [64]:
onet_df[(onet_df.task_similarity > .47) & (onet_df.job_title_similarity > .27)]

Unnamed: 0,organization_name,example,job,onet_title,onet_task,task_similarity,job_title_similarity,onet_weight
0,Advoria,A lawyer uses Advoria to schedule a client mee...,A legal secretary would normally schedule appo...,Legal Secretaries and Administrative Assistants,Schedule and make appointments.,0.725143,0.605163,0.166667
1,Advoria,A lawyer uses Advoria to schedule a client mee...,A legal secretary would normally schedule appo...,Legal Secretaries and Administrative Assistants,"Mail, fax, or arrange for delivery of legal co...",0.606237,0.605163,0.166667
2,Advoria,A lawyer uses Advoria to schedule a client mee...,A legal secretary would normally schedule appo...,Medical Secretaries and Administrative Assistants,Schedule and confirm patient diagnostic appoin...,0.586487,0.474208,0.166667
3,Advoria,A potential client finds a law firm's website ...,A legal secretary would normally book appointm...,Legal Secretaries and Administrative Assistants,Schedule and make appointments.,0.688586,0.636179,0.166667
4,Advoria,A potential client finds a law firm's website ...,A legal secretary would normally book appointm...,Legal Secretaries and Administrative Assistants,"Mail, fax, or arrange for delivery of legal co...",0.624741,0.636179,0.166667
5,Advoria,A potential client finds a law firm's website ...,A legal secretary would normally book appointm...,Legal Secretaries and Administrative Assistants,Receive and place telephone calls.,0.60044,0.636188,0.166667


In [65]:
onet_df.groupby("onet_title")["onet_weight"].sum().round(2).sort_values(ascending=False).reset_index().values



array([['Legal Secretaries and Administrative Assistants', 0.83],
       ['Medical Secretaries and Administrative Assistants', 0.17]],
      dtype=object)

In [66]:
onet_df.onet_title.value_counts().reset_index().values

array([['Legal Secretaries and Administrative Assistants', 5],
       ['Medical Secretaries and Administrative Assistants', 1]],
      dtype=object)