In [3]:
import nest_asyncio
import aiohttp
import asyncio
import nest_asyncio
import re
import time
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from os import getenv
from openai import OpenAI
import pathlib
import textwrap
# import google.generativeai as genai
import time
import requests
import ast
import json
from sklearn.metrics.pairwise import cosine_similarity
import re
from llms import chatGPT
from llms import gemini


# Shared LLM client objects used by later cells (classes come from the local `llms` module).
x_chat, x_gemini = chatGPT(), gemini()
# Allow nested use of the event loop, which the notebook kernel is already running.
nest_asyncio.apply()

df_exp = pd.read_csv('../input/gpts_labels/gpt_exposure_embeddings.csv')

# The embedding column is read as text; strip the surrounding brackets, split on ", "
# and convert every component to float so each cell holds a list of floats.
parsed_embeddings = [
    [float(component) for component in raw.strip("[]").split(", ")]
    for raw in df_exp["task_embedding"]
]
df_exp["task_embedding"] = parsed_embeddings

# 2-D array (one row per reference task) used for the cosine-similarity lookups below.
task_embeddings = np.vstack(np.array(parsed_embeddings))

In [4]:
class prompting():
    """Build one prompt per DataFrame row, send them in concurrent batches and collect the replies.

    Replies are accumulated in ``self.results_df`` (columns ``"organization name"`` and the
    requested value column), indexed by the row label of the input frame.
    """

    def __init__(self):
        self.results_df = pd.DataFrame(columns=["organization name", "value"])

    def set_current_results_df(self, results_df):
        """Replace the accumulated results frame with ``results_df``."""
        self.results_df = results_df

    async def iterate(self, df, prompt_template, args, value, batch_size=10, start=0, end=False,
                      checkpoint_path="../output/current_results_df_prompting.csv"):
        """Prompt for rows ``start:end`` of ``df`` and return ``df`` with a ``value`` column merged in.

        Parameters
        ----------
        df : DataFrame with at least an ``"organization name"`` column and every column named in ``args``.
        prompt_template : str containing ``$<placeholder>`` markers.
        args : iterable of indexable pairs ``(placeholder, column)``; each ``$placeholder`` in the
            template is replaced by the row's value in ``column`` (converted with ``str``).
        value : name of the result column.
        batch_size : number of prompts sent concurrently per batch.
        start, end : positional row range to process; a falsy ``end`` means "through the last row".
        checkpoint_path : where the accumulated results are written as CSV once all batches are
            done; pass ``None`` to skip writing.

        Returns
        -------
        ``df`` (without any pre-existing ``value`` column) left-merged with the results on
        ``"organization name"``.
        """
        if not end:
            end = len(df)
        self.results_df = pd.DataFrame(columns=["organization name", value])
        if value in list(df.columns):
            if start != 0:
                # Resuming: carry over the values already present for the rows before `start`.
                self.results_df = pd.concat([df[["organization name", value]].iloc[:start], self.results_df], axis=0)
            df = df.drop(columns=[value])

        batch_prompts = []

        # Slice before iterating so only the requested rows are materialised.
        for i, row in df.iloc[start:end].iterrows():
            name = row['organization name']
            prompt = prompt_template

            for arg in args:
                # str() keeps numeric cells from raising TypeError inside str.replace.
                prompt = prompt.replace(f"${arg[0]}", str(row[arg[1]]))

            batch_prompts.append((i, name, prompt))

            if len(batch_prompts) >= batch_size:
                await self.process_batch(batch_prompts)
                batch_prompts = []

        # Flush the final, partially filled batch.
        if batch_prompts:
            await self.process_batch(batch_prompts)

        # Always persist the results; previously this only happened when a partial batch
        # was left over, so runs whose row count divided evenly by batch_size saved nothing.
        if checkpoint_path is not None:
            self.results_df.to_csv(checkpoint_path)

        df = df.merge(self.results_df, on='organization name', how='left')
        return df

    async def process_batch(self, batch_prompts):
        """Run ``fetch_result`` concurrently for every ``(i, name, prompt)`` in ``batch_prompts``.

        One ``aiohttp.ClientSession`` is opened per batch and shared by its requests.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [
                self.fetch_result(session, i, name, prompt)
                for i, name, prompt in batch_prompts
            ]
            await asyncio.gather(*tasks)

    async def fetch_result(self, session, i, name, prompt):
        """Ask the module-level ``x_gemini`` client for ``prompt`` and store the cleaned reply at row ``i``.

        A reply equal to ``"N/A"`` is skipped (no row is written). Any exception triggers a
        retry after a 20 second pause; the row is abandoned once more than 10 failures occurred.
        """
        failure_count = 0
        while True:
            try:
                print(f"******************************\nProcessing {i}: {name}")
                result = await x_gemini.ask(session, prompt)
                if result == "N/A":
                    # Nothing usable came back: leave this row out of the results.
                    break

                # Drop '#', '_' and '*' characters from the reply.
                # NOTE(review): the `#\s+` alternative can never match because the bare `#`
                # before it always wins; whitespace after a '#' is therefore kept.
                text = re.sub(r"#|#\s+|_|\*", "", result).strip()

                self.results_df.loc[i] = [name, text]
                break
            except Exception as e:
                print(failure_count)
                failure_count += 1
                if failure_count > 10:
                    break
                print(f"Error processing {i}, {name}: {e}")
                await asyncio.sleep(20)

prompting_class = prompting()


In [5]:
# Load the parsed descriptions, compose one "Task" sentence per company of the form
# "<Customers> that do <Tasks/Jobs> for <Industry>", then keep a reproducible
# 800-row sample with a fresh 0..799 index.
df = (
    pd.read_csv('../output/results/df_with_parsed_description.csv')
    .assign(
        Task=lambda frame: frame["Customers"]
        + " that do "
        + frame["Tasks/Jobs"]
        + " for "
        + frame["Industry"]
    )
    .sample(800, random_state=2)
    .reset_index(drop=True)
)

In [6]:
# Preview the first rows of the sampled frame, including the composed "Task" column.
df.head()

Unnamed: 0,organization name,website,description_all,founded date,generated_description_llm,generated_description,generated_description_conf_interval,generated_description_conf_interval_reasoning,parsed_description_llm,parsed_description,Tasks/Jobs,Industry,Customers,parsed_description_conf_interval,parsed_description_conf_interval_reasoning,Task
0,Divergence Academy,divergence.one/,Divergence Academy empowers the next generatio...,2014-01-01,Description (in two sentences): Divergence Aca...,Divergence Academy offers online courses and b...,7,The website provides limited information about...,Divergence Academy Analysis\n\nTasks/Jobs: Eth...,"Tasks/Jobs: Ethical hacking, data analysis, ma...","Ethical hacking, data analysis, machine learni...","Education, Cybersecurity, Data Science","Students, aspiring professionals, cybersecurit...",8,The website and description clearly indicate t...,"Students, aspiring professionals, cybersecurit..."
1,Descartes Labs,www.descarteslabs.com,Enabling a better understanding of the physica...,2014-01-01,Description (in two sentences): Descartes Labs...,Descartes Labs offers a SaaS platform that ana...,9,The provided description is clear and accurate...,Descartes Labs\n\nTasks/Jobs: Geospatial data ...,"Tasks/Jobs: Geospatial data analysis, Earth ob...","Geospatial data analysis, Earth observation in...","Environmental science, Agriculture, Energy, Fi...","Researchers, Farmers, Energy companies, Financ...",9,The company's description explicitly mentions ...,"Researchers, Farmers, Energy companies, Financ..."
2,Qelzal,www.qelzal.com/,Next Generation Obstacle Avoidance for Commerc...,2014-01-01,Here's a description and confidence rating for...,Qelzal develops advanced obstacle avoidance sy...,8,The provided information on their website give...,Qelzal Analysis\n\nTasks/Jobs: Obstacle avoida...,"Tasks/Jobs: Obstacle avoidance, Object detecti...","Obstacle avoidance, Object detection, Navigati...","Drone technology, Autonomous systems","Drone operators, Drone delivery services, Aeri...",9,The description clearly focuses on drone safet...,"Drone operators, Drone delivery services, Aeri..."
3,Beyond Limits,www.beyond.ai/,Beyond Limits is an industrial and enterprise-...,2014-01-01,Description (in two sentences):\n\nBeyond Limi...,Beyond Limits is an AI technology company focu...,7,While the provided information is quite detail...,Beyond Limits Analysis\n\nTasks/Jobs: Operatio...,"Tasks/Jobs: Operations optimization, Decision ...","Operations optimization, Decision support, Pre...","Energy, Utilities, Healthcare, Finance","Operations managers, C-suite executives, Data ...",9,"The company's focus on ""Cognitive AI solutions...","Operations managers, C-suite executives, Data ..."
4,RAI,www.rai-inc.com,"RAI is a mixed reality, generative AI, metaver...",2023-04-04,Description (in two sentences):\n\nRAI special...,"RAI specializes in developing mixed reality, g...",7,The website provides a high-level overview of ...,RAI Analysis\n\nTasks/Jobs: XR/MR development...,"Tasks/Jobs: XR/MR development, AI model develo...","XR/MR development, AI model development, Metav...","XR/MR, AI, Metaverse, Media and Streaming","Software developers, Media companies, Healthca...",9,The provided description explicitly details RA...,"Software developers, Media companies, Healthca..."


In [23]:
# Show the Industry column of the `pre` subset.
# NOTE(review): `pre` is first assigned in a later cell (the founded-date split), so this
# cell raises NameError on a fresh top-to-bottom run — move it below that cell.
pre.Industry

3                 Energy, Utilities, Healthcare, Finance
12                                    Financial services
13                Market Research, Business Intelligence
24                      Intellectual Property Management
29                              Data Science & Analytics
                             ...                        
787              Sports analytics, Sports betting, Media
791    Marketing technology, Customer relationship ma...
796               Financial services, Energy, Industrial
797                                            Insurance
799              Collaboration tools, Project management
Name: Industry, Length: 122, dtype: object

In [19]:
# Keywords to look for in the generated description (matched case-insensitively).
words = ["decision", "automate", "workflow","task", "generate"]

# Alternation pattern that matches any one of the keywords.
pattern = '|'.join(words)

# Keep only rows whose description contains a keyword; missing descriptions count as no match.
keyword_mask = df["generated_description"].str.contains(pattern, case=False, na=False)
df = df.loc[keyword_mask]


In [21]:
# Split companies by founding date around the 11/30/2022 cutoff:
# `pre` holds those founded strictly before it, `post` those founded on or after it.
pre = df.loc[lambda frame: pd.to_datetime(frame['founded date']) < '11/30/2022']
post = df.loc[lambda frame: pd.to_datetime(frame['founded date']) >= '11/30/2022']
len(pre)

122

In [259]:
# Print up to the first ten composed task strings as a sanity check.
# Slicing (rather than indexing positions 0..9) avoids an IndexError when fewer
# than ten rows remain after the keyword filter.
for task_text in df["Task"].iloc[:10]:
    print(task_text)

Students, aspiring professionals, cybersecurity professionals, data scientists that do Ethical hacking, data analysis, machine learning, software development for Education, Cybersecurity, Data Science
Researchers, Farmers, Energy companies, Financial institutions, Government agencies that do Geospatial data analysis, Earth observation interpretation, Predictive modeling, Risk assessment for Environmental science, Agriculture, Energy, Finance, Government
Drone operators, Drone delivery services, Aerial mapping companies that do Obstacle avoidance, Object detection, Navigation, Flight control for Drone technology, Autonomous systems
Operations managers, C-suite executives, Data analysts that do Operations optimization, Decision support, Predictive analytics, Risk assessment for Energy, Utilities, Healthcare, Finance
Software developers, Media companies, Healthcare providers, Game developers, Enterprise businesses that do XR/MR development, AI model development, Metaverse platform creatio

In [229]:
# Remove every double-quote character from the 'new Task' column (regex-based replace).
# NOTE(review): no visible cell creates a 'new Task' column; on a fresh kernel this raises
# KeyError unless the column is produced elsewhere — confirm, or drop this cell.
df['new Task'] = df['new Task'].replace('"', '', regex=True)

In [264]:
# Embed the "Task" text through the chatGPT helper from the local `llms` module.
# NOTE(review): the matching loop further down reads df["Task_embedding"], so this call
# presumably adds that column — confirm against llms.chatGPT.run_batch_embeddings.
df = x_chat.run_batch_embeddings(df,'Task')


Processing batch 0 to 199
Processing batch 200 to 399
Processing batch 400 to 599
Processing batch 600 to 799


In [261]:
# Inspect the composed task sentence of the first row.
df["Task"].iloc[0]

'Students, aspiring professionals, cybersecurity professionals, data scientists that do Ethical hacking, data analysis, machine learning, software development for Education, Cybersecurity, Data Science'

In [284]:
# For every company, find the single reference task whose embedding is most similar
# (cosine similarity) to the company's task embedding, and collect the matches in `onet_df`.

# Loop-invariant: 2-D array with one single-element row per reference description.
reference_descriptions = df_exp[['description']].values

results = []
for index, row in df.iterrows():
    print("***********************")
    print(f"Processing {index}")
    name = row["organization name"]
    date = row["founded date"]
    customers = row["Customers"]
    print(name, date, customers)
    print(row["generated_description"])

    # cosine_similarity expects 2-D input, hence the reshape to a single-row matrix.
    job_embedding = np.array(row["Task_embedding"]).reshape(1,-1)
    cosine_sim = cosine_similarity(job_embedding, task_embeddings)

    # argmax gives the best match directly instead of sorting every score; on exact
    # ties the first reference task wins (the sort-based version fell back to comparing
    # the description arrays).
    best_idx = int(np.argmax(cosine_sim[0]))
    best_match = (cosine_sim[0][best_idx], reference_descriptions[best_idx])
    print(best_match)
    results.append([name, date, best_match[0], best_match[1][0]])

onet_df = pd.DataFrame(results,columns = ["organization name", "founded date", "cosine similarity", "description"])


***********************
Processing 0
Divergence Academy 2014-01-01 Students, aspiring professionals, cybersecurity professionals, data scientists
Divergence Academy offers online courses and bootcamps in ethical hacking, data science, and machine learning. Their curriculum is designed to equip students with the skills and knowledge needed to succeed in these rapidly growing fields.
(np.float64(0.5099267402052063), array(['Penetration testers that develop presentations on threat intelligence.'],
      dtype=object))
***********************
Processing 1
Descartes Labs 2014-01-01 Researchers, Farmers, Energy companies, Financial institutions, Government agencies
Descartes Labs offers a SaaS platform that analyzes geospatial data at scale, providing insights for various industries like commodity sourcing, resource exploration, and climate resilience. Their technology helps organizations understand the physical world better by leveraging earth observation data.
(np.float64(0.596854224093756

In [279]:
# Rebind `pre` / `post` to the matched-task results, split around the same 11/30/2022 cutoff
# (strictly before vs. on or after). The parsed dates are computed once and reused.
onet_founded = pd.to_datetime(onet_df['founded date'])
pre = onet_df.loc[onet_founded < '11/30/2022']
post = onet_df.loc[onet_founded >= '11/30/2022']

In [281]:
# Pre-cutoff companies whose best match has a cosine similarity above 0.5.
pre[pre['cosine similarity'] > .5]

Unnamed: 0,organization name,founded date,cosine similarity,description
0,Divergence Academy,2014-01-01,0.509927,Penetration testers that develop presentations...
1,Descartes Labs,2014-01-01,0.596854,Remote sensing scientists and technologists th...
3,Beyond Limits,2014-01-01,0.552420,Financial managers that prepare operational or...
6,Ceres Imaging,2014-01-01,0.612879,"Farmworkers and laborers, crop, nursery, and g..."
11,Swipecart,2014-01-01,0.569445,Online merchants that design customer interfac...
...,...,...,...,...
793,Wallaroo,2014-01-01,0.568339,Data scientists that recommend data-driven sol...
795,Graph Story,2014-01-01,0.640153,Data scientists that identify business problem...
796,LawIQ,2014-01-01,0.616910,Financial managers that prepare operational or...
797,Intellect SEEC,2014-01-01,0.533913,Insurance claims and policy processing clerks ...


In [282]:
# Post-cutoff companies whose best match has a cosine similarity above 0.5.
post[post['cosine similarity'] > .5]

Unnamed: 0,organization name,founded date,cosine similarity,description
7,Bryckel AI,2023-03-03,0.533407,"Title examiners, abstractors, and searchers th..."
8,Bewort,2023-05-01,0.565480,Data scientists that identify business problem...
9,Vectari,2023-01-01,0.538487,Financial risk specialists that devise systems...
10,DropGenius,2023-09-01,0.555109,Online merchants that prepare or organize onli...
15,Echo Labs,2023-01-01,0.535711,Word processors and typists that transcribe st...
...,...,...,...,...
782,Mantis Analytics,2023-03-01,0.626842,Security management specialists that perform r...
784,Kwibal,2024-01-15,0.578855,Online merchants that purchase new or used ite...
785,Sapucay,2023-01-30,0.606801,"Farmers, ranchers, and other agricultural mana..."
788,MathGPTPro,2023-01-01,0.514259,Tutors that review class material with student...
