In [41]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from os import getenv
from openai import OpenAI
import pathlib
import textwrap
import google.generativeai as genai
import time
import requests
import ast
import json
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load credentials from the .env file two directories up. override=True lets the
# file's values replace variables that are already set in the environment.
load_dotenv("../../.env",override=True)
GOOGLE_API_KEY = getenv("GEMINI_API_KEY")  # used by the gemini client
OPENAI_API_KEY = getenv("OPENAI_API_KEY")  # used by the chatGPT client



In [24]:

# Register the Gemini API key with the google.generativeai SDK.
genai.configure(api_key=GOOGLE_API_KEY)
class gemini():
    """Minimal client for the Gemini `generateContent` REST endpoint.

    `ask` is the entry point used by the prompt loops: it sends one prompt and
    returns the text of the first candidate.
    """
    def __init__(self):
        # SDK model handle; the REST path in `request` does not use it.
        self.model = genai.GenerativeModel('gemini-1.5-flash')
    def request(self,prompt, timeout=60):
        """POST `prompt` to the REST endpoint and return the decoded JSON body.

        Args:
            prompt: Text sent as the single content part.
            timeout: Seconds to wait for the server before `requests` raises.

        Returns:
            The response body parsed from JSON (a dict for normal replies).

        Raises:
            requests.RequestException: On connection problems or timeout.
            ValueError: If the body is not valid JSON.
        """
        url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent'
        headers = {
            'Content-Type': 'application/json',
            # The key is sent as a header rather than a `?key=` query
            # parameter: requests puts the full URL into its exception
            # messages, and the calling loops print those messages.
            'x-goog-api-key': GOOGLE_API_KEY,
        }
        data = {
            "contents": [
                {
                    "parts": [
                        {
                            "text": prompt
                        }
                    ]
                }
            ]
        }

        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        return response.json()

    def ask(self,prompt):
        """Return the text of the first candidate for `prompt`.

        Raises:
            RuntimeError: If the reply has no candidates (for example an API
                error payload) or the first candidate carries no text part.
        """
        response = self.request(prompt)
        candidates = response.get("candidates") if isinstance(response, dict) else None
        if not candidates:
            # Surface the API's own message instead of a bare KeyError.
            detail = response
            if isinstance(response, dict) and isinstance(response.get("error"), dict):
                detail = response["error"].get("message", response["error"])
            raise RuntimeError(f"Gemini returned no candidates: {detail}")
        try:
            return candidates[0]["content"]["parts"][0]["text"]
        except (KeyError, IndexError, TypeError) as exc:
            raise RuntimeError(f"Gemini candidate has no text part: {candidates[0]}") from exc

# Shared Gemini client used by the prompt loops in later cells.
x_gemini = gemini()
# Smoke test: sends a real request so a bad key or endpoint shows up here.
x_gemini.ask("hi")

'Hi! What can I do for you today? \n'

In [25]:
class chatGPT():
    """Small wrapper around the OpenAI client; exposes text embeddings only."""
    def __init__(self):
        self.client = OpenAI(api_key=OPENAI_API_KEY)

    def get_embedding(self,text, model="text-embedding-3-large"):
        """Return the embedding vector for `text`.

        Newlines are replaced by spaces before the text is sent. The text is
        submitted as a one-element batch and the embedding of that single
        item is returned.
        """
        single_line = " ".join(text.split("\n"))
        reply = self.client.embeddings.create(input = [single_line], model=model)
        first_item = reply.data[0]
        return first_item.embedding

# Shared OpenAI client used for the embedding calls in later cells.
x_chat = chatGPT()


In [26]:
# Companies with 1-10 employees only.
df = pd.read_csv("../output/data.csv").loc[lambda frame: frame["num employees"] == "1-10"]

# One reproducible sample (fixed seed) among companies founded in 2023 or 2024.
recently_founded = pd.to_datetime(df['founded date']).dt.year.isin([2023, 2024])
sample_df = df[recently_founded].sample(n=1, random_state=1)

In [27]:
# Prompt template asking the model for a two-sentence product description.
# Placeholders: $company, $website, $description. They are filled in with
# str.replace by the loop that follows. The template has no $industries
# placeholder, so that substitution in the loop is a no-op.
generated_description_prompt = """
Your role is to describe $company ($website)'s product in two sentences.

EXAMPLES BELOW 
_________________________
Company: Genmo
Website: https://www.genmo.ai/
Description: Genmo.ai is a free online platform that uses artificial intelligence to turn your ideas into videos and images. Genmo.ai will generate animations or graphics, allowing even those without animation experience to create professional-looking content.

Company: Love Genius
Website: https://www.lovegenius.io/
Description: LoveGenius is an AI-powered tool aimed at improving user experiences on dating platforms by assisting with the creation of engaging and personalized dating profiles. This service targets individuals looking to enhance their profiles on various dating apps, catering to those seeking both serious and casual relationships.
________________________

Company: $company
Wesbite: $website
Current Description: $description
YOUR TURN:
**************************
Description:
**************************
"""

# Inputs for the description pass, pulled out as plain lists so the loop can
# index them by position.
websites = list(sample_df['website'])
companies = list(sample_df['organization name'])
descriptions = list(sample_df['description_all'])
industries = list(sample_df['industries_parsed'])
results = []
n = len(sample_df)
# n = 10
MAX_ATTEMPTS = 5          # give up on a company after this many failed calls
RETRY_DELAY_SECONDS = 20  # pause between attempts (rate limits / transient errors)
for i in range(n):
    # Exactly one entry is appended per company so `results` always lines up
    # with sample_df; None marks "no usable description".
    result = None
    try:
        p1 = generated_description_prompt.replace("$website", websites[i])\
                   .replace("$company", companies[i])\
                   .replace("$description", descriptions[i])\
                   .replace("$industries",industries[i])
    except TypeError as e:
        # A non-string field (e.g. NaN read from the CSV) cannot be fixed by
        # retrying, so record the miss and move on.
        print(f"Error processing {str(i)}, {companies[i]}: {e}")
        results.append(None)
        continue

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            print(f"Processing {str(i)}: {companies[i]}")
            answer = x_gemini.ask(p1)

            if "Unfortunately" in answer:
                # The model declined to describe the company.
                result = None
            else:
                result = answer.replace("*","").replace("Product: ","").replace("\n\n","\n")

            print(f"Result: {result}\n")
            break

        except Exception as e:
            print(f"Error processing {str(i)}, {companies[i]}: {e}")
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)

    results.append(result)


sample_df['generated_description'] = results

sample_df.to_csv('../output/df_with_generated_description.csv', index=False)




Processing 0: Advoria
Result: Advoria is a German-made online appointment booking system specifically designed for law firms, offering 24/7 client access and reducing administrative burden on the secretariat.  The platform prioritizes GDPR compliance and quality to streamline appointment scheduling for successful law firms. 




In [28]:
# Keep only companies for which a description was generated. .copy() makes the
# result an independent frame, so the column assignments in later cells write
# to it directly rather than to a slice of the previous frame.
sample_df = sample_df[sample_df["generated_description"].notna()].copy()

In [29]:
# Prompt template asking the model which tasks, industry and users a company's
# product covers. Placeholders: $company, $website, $description. The reply
# format requested at the end ("Tasks/Jobs:", "Industry:", "People Using
# Tool:" lines) is what the later str.extract regex parses.
parsed_description_prompt = """
Your role is to describe what jobs/tasks and industries that $company is automating. Please refer to examples.

Be specific about people using tool. If it applies to many different people (over 20% of population), then say everyone.
DO NOT INCLUDE AN EXPLANATION

EXAMPLES BELOW 
_________________________
Company: Petville
Website: petville.co/pricing/biz
Description: Petville Global is a B2B CRM SaaS platform that utilizes advanced technologies like AI/ML and neural net to streamline and expand operations for pet businesses and veterinary clinics both locally and globally. The platform offers deep data analytics and marketing tools, helping businesses save an average of 22% on CRM and vet tech costs.
Tasks/Jobs: Data analysis, Marketing automation, Appointment scheduling, Inventory management
Industry: Customer management
People Using Tool: Vetinarians, Pet Businesses

Company: Thunder
Wesbite: thundercompute.com
Description: Thunder is a decentralized, peer-to-peer cloud computing platform designed to democratize access to GPUs and address the persistent GPU shortage. It provides a solution for individuals and businesses seeking high-performance computing power, enabling them to leverage unused GPU resources from around the world.
Tasks/Jobs: GPU resource allocation, Access to unused GPUs, Distributed computing tasks 
Industry: Cloud Computing
People Using Tool: Developers, GPU Owners


Company: InputAI
Wesbite: inputai.com/
Description: InputAI is an advanced platform that offers over 1000 AI templates powered by OpenAI GPT and ChatGPT technologies. It is designed to streamline and enhance the user's interaction with AI, providing a wide range of templates for various applications, from chatbots to content generation.
Tasks/Jobs: Chatbot development, Content creation
Industry: No-code AI platform
People Using Tool: Everyone
________________________


Company: $company
Wesbite: $website
Description: $description
YOUR TURN:
**************************
Tasks/Jobs (comma separated list of 4, short):
Industry (1 item):
People Using Tool (comma separated list, one/two words each): 
**************************
"""

In [30]:
# Preview only the first rows instead of rendering the whole frame.
sample_df.head()

Unnamed: 0.1,Unnamed: 0,organization name,num employees,founded date,description,industries,headquarters location,description.1,cb rank,postal code,...,hub tag,phone num,num of sub org,industry groups,total funding amt,company type,estimated rev,description_all,industries_parsed,generated_description
20738,20738,Advoria,1-10,2023-08-29,Advoria's online appointment booking for law f...,"Artificial Intelligence (AI), Legal, Legal Tec...","Berlin, Berlin, Germany",Online appointment booking for the successful ...,239244,10967,...,—,—,—,"Artificial Intelligence (AI), Data and Analyti...",—,For Profit,—,Advoria's online appointment booking for law f...,"['Legal', 'Legal Tech', 'SaaS', 'Software']",Advoria is a German-made online appointment bo...


In [31]:
# Inputs for the parsing pass; the generated descriptions from the previous
# step are what gets sent to the model here.
websites = list(sample_df['website'])
companies = list(sample_df['organization name'])
descriptions = list(sample_df['generated_description'])
industries = list(sample_df['industries_parsed'])
results = []
n = len(sample_df)
# n = 10
MAX_ATTEMPTS = 5          # give up on a company after this many failed calls
RETRY_DELAY_SECONDS = 20  # pause between attempts (rate limits / transient errors)
for i in range(n):
    # One [company, parsed text] row is appended per company, with None as the
    # text when nothing usable came back, so `results` stays aligned with
    # sample_df row for row.
    result = None
    try:
        p1 = parsed_description_prompt.replace("$website", websites[i])\
                   .replace("$company", companies[i])\
                   .replace("$description", descriptions[i])\
                   .replace("$industries",industries[i])
    except TypeError as e:
        # A non-string field cannot be fixed by retrying.
        print(f"Error processing {str(i)}, {companies[i]}: {e}")
        results.append([companies[i], None])
        continue

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            print(f"Processing {str(i), len(results)}: {companies[i]}")
            # print(p1)
            print(descriptions[i])
            answer = x_gemini.ask(p1)

            if "Unfortunately" in answer:
                # The model declined; keep the row but leave the text empty.
                result = None
            else:
                result = answer.replace("*","").replace("Product: ","").replace("\n\n","\n")

            print(f"Result: {result}\n")
            break

        except Exception as e:
            print(f"Error processing {str(i)}, {companies[i]}: {e}")
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)

    results.append([companies[i], result])

results_df = pd.DataFrame(results,columns=["Company","Parsed Description"])

# Assign by position: results_df has one row per sample_df row, in order.
sample_df["Company"] = results_df["Company"].to_numpy()
sample_df["parsed_description"] = results_df["Parsed Description"].to_numpy()

sample_df.to_csv('../output/df_with_generated_description.csv', index=False)

Processing ('0', 0): Advoria
Advoria is a German-made online appointment booking system specifically designed for law firms, offering 24/7 client access and reducing administrative burden on the secretariat.  The platform prioritizes GDPR compliance and quality to streamline appointment scheduling for successful law firms. 

Result: 
Tasks/Jobs: Appointment scheduling, Client communication, Administrative tasks, GDPR compliance
Industry: Legal Services
People Using Tool: Lawyers, Secretaries
 




In [32]:
# Split the model's three-line answer into one column per field.
field_columns = ['Tasks/Jobs', 'Industry', 'People Using Tool']
sample_df[field_columns] = sample_df['parsed_description'].str.extract('Tasks/Jobs: (.*?)\nIndustry: (.*?)\nPeople Using Tool: (.*)')

# Both text fields are forced to str (missing values become the string "nan").
sample_df["Tasks/Jobs"] = sample_df["Tasks/Jobs"].astype("str")
sample_df["Industry"] = [label.replace("\n","") for label in sample_df["Industry"].astype("str")]

# Remove the literal terms "AI" / "Artificial Intelligence" from the user list.
sample_df['People Using Tool'] = sample_df['People Using Tool'].str.replace('AI|Artificial Intelligence', '', regex=True)
sample_df.head()

Unnamed: 0.1,Unnamed: 0,organization name,num employees,founded date,description,industries,headquarters location,description.1,cb rank,postal code,...,company type,estimated rev,description_all,industries_parsed,generated_description,Company,parsed_description,Tasks/Jobs,Industry,People Using Tool
20738,20738,Advoria,1-10,2023-08-29,Advoria's online appointment booking for law f...,"Artificial Intelligence (AI), Legal, Legal Tec...","Berlin, Berlin, Germany",Online appointment booking for the successful ...,239244,10967,...,For Profit,—,Advoria's online appointment booking for law f...,"['Legal', 'Legal Tech', 'SaaS', 'Software']",Advoria is a German-made online appointment bo...,Advoria,"\nTasks/Jobs: Appointment scheduling, Client c...","Appointment scheduling, Client communication, ...",Legal Services,"Lawyers, Secretaries"


In [33]:
# Prompt template asking the model for three usage examples, each paired with
# the job/person it automates. Placeholders: $company, $website,
# $generated_description, $parsed_description.
# NOTE(review): the first worked example labels its second line
# "Job automated N:" (once misspelled "Job autoamted"), while the other
# examples and the reply template use "Person automated N:". extract_data only
# matches "Person automated N:", so confirm the model never echoes the first
# form.
examples_prompt = """
Your role is to give me 3 two sentence example that would be using the product of $company. Do not use name of company in description. Keep it broad.
The goal is to give the a detailed description of the job that the tool automates and who performs that job and when.

IF THE PERSON AUTOMATED IS NOT DOING A JOB THAT IS NOT DONE AT WORK THEN WRITE "NOT_ONET" IN FRONT OF IT.

EXAMPLE: 
________________
Company: DreamGenerator.ai
Website: DreamGenerator.ai
DreamGenerator.ai is an innovative platform that utilizes generative AI to transform user ideas into stunning and diverse images. It offers a unique creative experience, encouraging users to experiment with prompts and share their AI-generated art, while also providing an opportunity to receive recognition through likes and shares.
Tasks/Jobs: Image generation, Artistic exploration, Prompt engineering, Social media sharing
Industry: Creative arts
People Using Tool: Artists, Designers, Everyone
**************************
Example 1: A writer uses generates images that will inspire ideas for their new book by typing in fun prompts into the website.
Job automated 1: Writer has to generate create ideas for book.
Example 2: A social media marketer creates eye-catching visuals for their summer fitness campaign, like "people doing yoga on a sunrise beach," then uses them for engaging social media posts.
Job automated 2: Graphic designer createsn images for summer fitness brand campaign.
Example 3: An architect generates images that spark ideas to for the new building. They type prompts like "skyscraper covered in living walls" and "underwater hotel with transparent pods," generating visuals to inspire their sustainable architecture concepts.
Job autoamted 3: Architect generates ideas for new building.
**************************

Company: Cozy Ventures
Wesbite: https://cozy.ventures/
Current Description: Cozy Ventures is a boutique software development company that provides startups with innovative digital solutions to accelerate their growth. Composed of a team of seasoned engineers and designers, they specialize in creating custom software tailored to meet the unique needs of each client.
Tasks/Jobs: Software development,  UI/UX design,  Project management,  Technical consulting
Industry:  Software development
People Using Tool:  Startups,  Entrepreneurs
**************************
Example 1: A food delivery startup develop a custom app that integrates real-time tracking, route optimization, and communication features. 
Person automated 1: A food deliver manager is in charge of real-time tracking, route optimization, and communication for food delivery employees.
Example 2: An e-commerce company build a custom recommendation engine, powered by AI, that analyzes user behavior and preferences to offer relevant product suggestions.
Person automated 2: A market research analyst for a ecommerce company analyzes user behavior and preferences to offer relevant product suggestions
Example 3: A social media platform develops an app a sophisticated content moderation system that leverages machine learning algorithms to identify and flag inappropriate content in real-time. 
Person automated 3: A social media content moderator identifies and flag inappropriate content in real-time.
**************************


Company: PnPAI
Website: www.pnpai.co
PnPAI is an AIaaS platform that standardizes the implementation of AI across various industries, providing pre-built solutions that can be easily integrated without the need for in-house AI expertise. Catering to businesses of all sizes, PnPAI offers industry-focused solutions for sectors like ecommerce, retail, healthcare, and finance, making AI an accessible and integral part of decision-making and growth strategies.
Tasks/Jobs:  Data analysis, Predictive modeling, Automation, Optimization
Industry: AIaaS (AI as a Service)
People Using Tool: Businesses, Professionals 
**************************
Example 1: A retail manager uses PnPAI to analyze customer purchasing patterns and predict future demand for specific products. They then adjust inventory levels and optimize product placement in stores based on the AI-generated insights.
Person automated 1: A data analyst for a retail manager performs inventory management assestment to predict future custumer demand.
Example 2: An insurance company uses PnPAI to automate the risk assessment process for new clients. The AI system analyzes various data points, including credit history, driving records, and health information, to quickly determine insurance premiums and coverage options. 
Person automated 2: A risk assesser for an insurance agency uses data to perform risk assement for new clients. 
Example 3: A healthcare provider leverages PnPAI to personalize treatment plans for patients with chronic diseases. The AI analyzes patient medical history, current symptoms, and lifestyle factors to recommend optimal medication dosages and treatment strategies. 
Person automated 3: A data analysts for a healthcare company uses patient data to form personalized treatment plan creation. 
**************************
________________
Now your turn:
Company:$company
Website: $website
Current Description: $generated_description
$parsed_description
YOUR TURN:
**************************
Example 1: 
Person automated 1:
Example 2: 
Person automated 2:
Example 3:
Person automated 3:
**************************




"""


In [38]:
# Inputs for the examples pass. `industries` holds the parsed
# Tasks/Industry/People text, which is pasted into the prompt as-is.
websites = list(sample_df['website'])
companies = list(sample_df['organization name'])
descriptions = list(sample_df['generated_description'])
industries = list(sample_df['parsed_description'])
# labels = list(sample_df['generated_cluster_label'])
results = []
n = len(sample_df)
MAX_ATTEMPTS = 5          # give up on a company after this many failed calls
RETRY_DELAY_SECONDS = 20  # pause between attempts (rate limits / transient errors)
for i in range(0,n):
    # Exactly one entry per company (None when no answer was obtained) keeps
    # `results` the same length as sample_df.
    result = None
    try:
        # Uses examples_prompt, the template defined in this notebook; the
        # bare name `prompt` is not defined anywhere in it.
        p1 = examples_prompt.replace("$website", websites[i])\
                   .replace("$company", companies[i])\
                   .replace("$generated_description", descriptions[i])\
                   .replace("$parsed_description",industries[i])
    except TypeError as e:
        # A non-string field (e.g. a missing parsed description) cannot be
        # fixed by retrying.
        print(f"Error processing {str(i)}, {companies[i]}: {e}")
        results.append(None)
        continue

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            print(f"Processing {str(i)}: {companies[i]}")
            print(websites[i])
            print(descriptions[i])
            print(industries[i])
            # print(f"Label Cluster: {labels[i]}")
            answer = x_gemini.ask(p1)
            result = answer.replace("*","").replace("Product: ","").replace("\n\n","\n")

            print(f"Result: {result}\n")
            break

        except Exception as e:
            print(f"Error processing {str(i)}, {companies[i]}: {e}")
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)

    results.append(result)

# assign() returns a new frame, so this is safe even if sample_df is a slice.
sample_df = sample_df.assign(examples=results)

sample_df.to_csv('../output/df_with_examples.csv', index=False)

Processing 0: Advoria
advoria.de
Advoria is a German-made online appointment booking system specifically designed for law firms, offering 24/7 client access and reducing administrative burden on the secretariat.  The platform prioritizes GDPR compliance and quality to streamline appointment scheduling for successful law firms. 


Tasks/Jobs: Appointment scheduling, Client communication, Administrative tasks, GDPR compliance
Industry: Legal Services
People Using Tool: Lawyers, Secretaries
 

Result: 
Example 1: A lawyer schedules a client consultation using an online booking system, allowing the client to choose a convenient time slot that fits their schedule, eliminating the need for phone calls or emails. 
Person automated 1: A secretary schedules a client consultation using an online booking system.
Example 2: A legal secretary receives an automated notification when a client books an appointment, streamlining communication and ensuring timely updates for both the client and the lawy

In [47]:
def extract_data(row):
    """Pull the example / person-automated texts out of one model reply.

    Args:
        row: The reply text, expected to contain "Example N: ..." followed by
            "Person automated N: ..." pairs.

    Returns:
        A flat list [example1, person1, example2, person2, ...] with the
        labels removed and surrounding whitespace stripped. Empty when `row`
        is not a string or contains no pair.
    """
    if not isinstance(row, str):
        # Missing replies (None / NaN) simply yield no pairs.
        return []
    row = row.replace("\n"," ").replace("  "," ")
    pattern = r"(Example \d+: .*?)(Person automated \d+: .*?)(?= Example \d+|$)"
    matches = re.findall(pattern, row, flags=re.DOTALL)
    results = []
    for example_part, person_part in matches:
        for part in (example_part, person_part):
            # Split on the first colon only, so colons inside the sentence
            # itself do not truncate it.
            results.append(part.split(":", 1)[1].strip())
    return results

# Create the ONET result columns once; the check uses the same spelling as the
# columns it creates, so re-running the cell does not wipe earlier results.
if "Job1_ONET" not in sample_df.columns:
    sample_df[["Job1_ONET","Job2_ONET","Job3_ONET"]] = 'N/A'
# Idempotent: a clean 0..n-1 index for the positional work in later cells.
sample_df = sample_df.reset_index(drop=True)

EXAMPLE_COLUMNS = ['Example1','Job1','Example2','Job2','Example3','Job3']
parsed_rows = []
for examples_text in sample_df['examples']:
    fields = extract_data(examples_text) if isinstance(examples_text, str) else []
    # Pad or trim to exactly six fields so replies with fewer (or more) than
    # three pairs still fit the six columns; missing ones become None.
    fields = (list(fields) + [None] * len(EXAMPLE_COLUMNS))[:len(EXAMPLE_COLUMNS)]
    parsed_rows.append(fields)

sample_df[EXAMPLE_COLUMNS] = pd.DataFrame(parsed_rows, columns=EXAMPLE_COLUMNS, index=sample_df.index)



In [49]:
# Embed the three "person automated" sentences of every company.
results = []
for i, x in sample_df.iterrows():
    name = x["organization name"]
    print(f"Processing {str(i), len(results)}: {name}")
    # Missing job text (None / NaN) gets a None embedding instead of crashing
    # inside get_embedding.
    job_embeddings = [
        x_chat.get_embedding(x[job_column]) if isinstance(x[job_column], str) else None
        for job_column in ("Job1", "Job2", "Job3")
    ]
    results.append([name, *job_embeddings])

embedding_columns = ['name', "Job1_embedding","Job2_embedding","Job3_embedding"]
# Reuse sample_df's index so the concat below lines rows up explicitly.
# Note: this rebinds `df`, which earlier held the filtered company table.
df = pd.DataFrame(results, columns=embedding_columns, index=sample_df.index)
# Drop any copies left by a previous run of this cell before attaching the new
# columns, so re-running does not create duplicate column names.
sample_df = pd.concat([sample_df.drop(columns=embedding_columns, errors="ignore"), df], axis=1)

Processing ('0', 0): Advoria


In [81]:
# Task embeddings are stored in the CSV as text.
df_exp = pd.read_csv('../output/onet/gpt_exposure_embeddings.csv')
# json.loads parses a bracketed list of numbers much faster than
# ast.literal_eval, which was slow enough on this file to be interrupted.
# Assumes each cell is a JSON-compatible list such as "[0.1, -0.2, ...]" —
# TODO confirm against the file.
df_exp['Embeddings'] = df_exp['Embeddings'].apply(json.loads)



KeyboardInterrupt



In [93]:
# Turn the text form "[v1, v2, ...]" into a list of floats. Values that are
# already parsed (not strings) pass through, so re-running the cell is safe.
df_exp["Embeddings"] = df_exp["Embeddings"].apply(
    lambda x: [float(v) for v in x.strip("[]").split(",") if v.strip()] if isinstance(x, str) else x
)

In [94]:

# Stack the per-task vectors into one (n_tasks, dim) matrix. astype forces a
# float dtype, so the matrix is numeric even when the vectors were parsed as
# lists of numeric strings.
embeddings = np.vstack(df_exp["Embeddings"].to_list()).astype(np.float64)
embeddings.shape

(11582, 3072)

In [73]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
# Fetch the NLTK data used by this cell: the POS tagger model and WordNet.
# NOTE(review): get_person also calls nltk.word_tokenize, whose tokenizer data
# is not downloaded here — confirm it is installed on a fresh machine.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag gives None.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return None


def get_person(text):
    """Return the subject phrase of a job sentence.

    The sentence is POS-tagged and everything before the first verb is
    returned, with a leading article ("A " / "An ") removed. When no verb can
    be located, the second token is returned (the first is assumed to be the
    article); shorter inputs return their only token, or "" when empty.

    Args:
        text: A sentence such as "A data analyst performs ...".
    """
    tokens = nltk.word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    for word, tag in tagged_tokens:
        if get_wordnet_pos(tag) == wordnet.VERB:
            # Look for the verb as a whole word, so a short verb is not found
            # inside an earlier, longer word.
            match = re.search(r'(?<!\w)' + re.escape(word) + r'(?!\w)', text)
            verb_index = match.start() if match else text.find(word)
            if verb_index < 0:
                # The tokenizer rewrote this token (e.g. quotes); try the next verb.
                continue
            person = text[:verb_index].strip()
            # Strip only a leading article; removing "A " everywhere would
            # also eat the end of words such as "NASA ".
            return re.sub(r'^An?\s+', '', person)
    if len(tokens) > 1:
        return tokens[1]
    return tokens[0] if tokens else ""
# Ad-hoc check of get_person; the saved output for this input is 'julia'.
get_person("A julia run")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/juliasusser/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juliasusser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'julia'

In [None]:
# Preview only the first rows; the frame now carries full embedding vectors.
sample_df.head()

In [None]:
# For every company and each of its three jobs, find the ONET tasks whose
# embeddings are closest to the job sentence, and compare the job's subject
# ("person") with the ONET job title.
total_results = []
JOB_NUMBERS = (1, 2, 3)   # Job1..Job3, matching the three *_ONET columns
TOP_K = 3                 # ONET tasks kept per job
onet_titles_tasks = df_exp[['Title', 'Task']].values
onet_matches = {i: [] for i in JOB_NUMBERS}   # per job number: one entry per row

for index, row in sample_df.iterrows():
    print(index)
    print("***********************")
    print(row["organization name"])
    print(row["generated_description"])
    for i in JOB_NUMBERS:
        job = row[f"Job{i}"]
        example = row[f"Example{i}"]
        job_embedding = row[f"Job{i}_embedding"]
        job_results = []
        # Skip jobs without text or without an embedding vector (None / NaN
        # are 0-dimensional), but still record an empty result for the row.
        if not isinstance(job, str) or np.ndim(job_embedding) == 0:
            onet_matches[i].append(job_results)
            continue
        job_embedding = np.array(job_embedding).reshape(1,-1)
        print(f"\nProcessing Job: {job}")
        task_similarities = cosine_similarity(job_embedding, embeddings)[0]
        person = get_person(job)
        person_embedding = np.array([x_chat.get_embedding(person)])
        # argsort ranks on the similarity values only; sorting (score, row)
        # tuples would compare the arrays themselves whenever scores tie.
        top_indices = np.argsort(task_similarities)[::-1][:TOP_K]
        for match_index in top_indices:
            x = task_similarities[match_index]
            y = onet_titles_tasks[match_index]   # [Title, Task]

            onet_person = y[0]
            onet_person_embedding = np.array([x_chat.get_embedding(onet_person)])
            cosine_sim = cosine_similarity(person_embedding, onet_person_embedding)[0][0]
            print(f"{y}\n Cosine Similarity: {x}, Person Cosine Similarity: {cosine_sim}")
            # Take the company name from the current row; `name` is a
            # leftover variable from an earlier cell's loop.
            total_results.append([row["organization name"],example,y[0],y[1],x,cosine_sim])
            job_results.append([y[0],y[1],x,cosine_sim])
        onet_matches[i].append(job_results)

    print("***********************\n\n")

# Store each job's matches as one list-valued cell per row. Built as
# object-dtype Series so pandas keeps every nested list as a single value.
for i in JOB_NUMBERS:
    sample_df[f"Job{i}_ONET"] = pd.Series(onet_matches[i], index=sample_df.index, dtype=object)

In [None]:
# Long-format table of ONET matches: one row per (company, example, ONET task).
# It is built from total_results (the rows collected by the matching loop),
# not `results`, which holds the embedding rows from an earlier cell. Each
# collected row has six fields, so there are six column names (there is no
# separate "job" field in those rows).
onet_df = pd.DataFrame(total_results, columns=["organization_name","example","onet_title","onet_task","task_similarity", "job_title_similarity"])
# NOTE(review): every other cell reads and writes under '../output/'; confirm
# that './output/' is the intended directory here.
onet_df.to_csv("./output/onet_df.csv")

In [None]:
# Weight each match by 1 / (number of matches recorded for its company).
matches_per_company = onet_df.groupby("organization_name")["onet_title"].count()
onet_weights = (
    matches_per_company
    .apply(lambda count: 1/count)
    .reset_index()
    .rename(columns={"onet_title":"onet_weight"})
)

onet_df = onet_df.merge(onet_weights, on="organization_name")

In [None]:
# Persist the enriched sample (without the index column).
# NOTE(review): earlier cells write under '../output/'; confirm that
# './output/' is the intended directory here.
sample_df.to_csv('./output/df_with_onet.csv', index=False)

In [102]:
# Scratch check: shape of `job_embedding`, the variable left over from the
# matching loop, after reshaping it to a single row.
np.array(job_embedding).reshape(1,-1).shape

(1, 3072)

In [None]:
# Scratch cell: display the ONET embedding matrix.
embeddings