In [30]:
import pandas as pd
import numpy as np
from llms import gemini
from llms import chatGPT
import re
import time
from lemmatize import lemmatize
# Instantiate the LLM client wrappers imported from the local `llms` module.
# x_gemini is the client that prompting.iterate calls (`x_gemini.ask(prompt)`);
# x_chat is not referenced by any other cell visible in this notebook.
x_chat = chatGPT()
x_gemini = gemini()

In [31]:
# Load the company table, then keep rows whose headquarters location mentions
# "United States" and whose founded date is on or after 2022-11-30.
df = pd.read_csv("../output/data/data_post_chat_gpt.csv")
# na=False: a missing location counts as "no match" instead of producing NaN in
# the mask (which boolean indexing rejects); regex=False: plain substring test.
df = df[df["headquarters location"].str.contains("United States", na=False, regex=False)]
# Compare against an explicit ISO-format Timestamp rather than the
# locale-ambiguous string '11/30/2022'.
df = df[pd.to_datetime(df['founded date']) >= pd.Timestamp("2022-11-30")]

In [32]:
# Sanity check: number of rows in df at this point of the notebook.
len(df)

2210

In [33]:
# Prompt template asking the LLM for a two-sentence product description plus a
# confidence score and reasoning. The placeholders $company, $website and
# $description are filled per row by prompting.iterate (plain str.replace).
# The "Description (two sentences):", "Confidence Interval:" and "Reasoning:"
# labels are what the description parser matches on, so keep them unchanged.
generated_description_prompt = """
Your role is to describe $company ($website)'s product. 
Then, provide a confidence interval on a scale of 1-10 on how sure you are about the response. Please be reasonable.

PLEASE FOLLOW THE FORMAT EXACTLY FROM THE EXAMPLES!!!!

EXAMPLES BELOW
************************
Company: Genmo
Website: https://www.genmo.ai/
Description: Genmo is a free tool that creates videos and images using artificial intelligence. Genmo to generate creative text formats of text content, like poems, code, scripts, musical pieces, email, letters, etc.

Company: OnePane
Website: https://www.onepane.ai/
Description: Onepane is a company that offers an AI companion for enhanced DevOps & SRE efficiency. Onepane offers a GenAI solution  providing unparalleled unified insights and control over your Cloud resources. Onepane helps with root cause analysis, cloud governance, and optimization strategies.
************************

YOUR TURN:
Company: $company
Website: $website
Current Description: $description
________________________
Description (two sentences):
Confidence Interval:
Reasoning:
________________________

"""



In [34]:
# Prompt template asking the LLM for the tasks/jobs, industry and customers a
# company targets, plus a confidence score and reasoning. The placeholders
# $company, $website and $description are filled per row by prompting.iterate
# (plain str.replace). The "Tasks/Jobs", "Industry", "Customers",
# "Confidence Interval" and "Reasoning" labels are what the parser matches on,
# so keep them unchanged.
parsed_description_prompt = """
Your role is to describe what jobs/tasks, industries, and customers that $company is targeting. Then, provide a confidence interval (1-10) on how sure you are about the response. Please be reasonable.

A response should include:
- tasks/jobs being automated
- the industry that the startup applies to
- specific customers using the tool. DO NOT INCLUDE AN EXPLANATION

PLEASE FOLLOW THE FORMAT EXACTLY FROM THE EXAMPLES!!!!

EXAMPLES BELOW
************************
Company: Petville
Website: petville.co/pricing/biz
Description: Petville Global is a B2B CRM SaaS platform that utilizes advanced technologies like AI/ML and neural net to streamline and expand operations for pet businesses and veterinary clinics both locally and globally. The platform offers deep data analytics and marketing tools, helping businesses save an average of 22% on CRM and vet tech costs.
Tasks/Jobs: Data analysis, Marketing automation, Appointment scheduling, Inventory management
Industry: Customer management
Customers: Veterinarians, Pet Businesses

Company: Thunder
Website: thundercompute.com
Description: Thunder is a decentralized, peer-to-peer cloud computing platform designed to democratize access to GPUs and address the persistent GPU shortage. It provides a solution for individuals and businesses seeking high-performance computing power, enabling them to leverage unused GPU resources from around the world.
Tasks/Jobs: GPU resource allocation, Access to unused GPUs, Distributed computing tasks 
Industry: Cloud Computing
Customers: Developers, GPU Owners

Company: NonprofitsHQ
Website: www.nonprofitshq.com
Description: NonprofitsHQ is a software suite designed for nonprofits that utilizes AI to automate tasks, manage operations, and improve efficiency, ultimately saving organizations time and resources.
Tasks/Jobs: Fundraising management, Donor relationship management, Grant writing, Event planning
Industry: Non-profit management
Customers: Non-profit organizations
************************

YOUR TURN:
________________________
Company: $company
Website: $website
Description: $description
________________________
Tasks/Jobs (comma separated list of 4, short):
Industry (1 item):
Customers (comma separated list): 
________________________
Confidence Interval:
Reasoning:
________________________
"""


In [35]:
# Prompt template asking the LLM for three usage examples mapped to ONET jobs,
# each with a confidence score and reasoning. The placeholders $company,
# $website, $description and $parsed_description are filled per row by
# prompting.iterate (plain str.replace). The "Example N:", "ONET JOB automated N:",
# "ONET JOB N:", "Confidence Interval N:" and "Reasoning N:" labels are what the
# examples parser matches on, so keep them unchanged.
examples_prompt = """
Your role is to provide 3 two-sentence examples of how the product from $company might be used. Do not mention the name of the company in the examples, and keep the descriptions broad.

Each example should include:
- A detailed description of the job that the tool automates and who performs that job and when.
- The ONET job being automated (preferably one from the database https://www.onetonline.org/) and the ONET task that the tool replaces (preferably one from https://www.onetcenter.org/dictionary/20.1/excel/task_statements.html, include task id)
- A confidence interval (1-10) indicating how sure you are about the accuracy of your response.

PLEASE FOLLOW THE FORMAT EXACTLY FROM THE EXAMPLES!!!!

The goal is to map each example back to ONET jobs. If the job being automated is recognized by ONET, please use the ONET job title. 
If it is not typically found in ONET, use best judgement!

EXAMPLES BELOW: 
************************
Company: Blanc
Website: tryblanc.ai
Blanc is a compliance automation platform designed for fintech companies. It helps streamline regulatory compliance processes by providing a centralized hub for managing policies, monitoring activities, and generating reports.
_________________________
Example 1: A compliance officer at a fintech company uses Blanc to automate the process of creating and updating compliance policies, ensuring all documents are current and accessible to relevant team members.
ONET JOB automated 1: Compliance Officers that verify that all firm and regulatory policies and procedures have been documented, implemented, and communicated.
ONET JOB 1: Compliance Officers
_________________________
Example 2: A fintech company uses Blanc to generate automated compliance reports for regulatory audits, ensuring all necessary documentation is readily available and organized.
ONET JOB automated 2: Compliance Officers Prepare reports of activities, evaluations, recommendations, or decisions.
ONET JOB 2: Compliance Officers
_________________________
Example 3: A fintech company uses Blanc to monitor real-time transactions for potential compliance violations, triggering alerts and generating reports for further investigation.
ONET JOB automated 3: Compliance Officers that identify compliance issues that require follow-up or investigation.
ONET JOB 3: Compliance Officers



Company: Aether
Website: aetherenergie.com/
Aether Energy is an AI-driven platform designed to simplify the process of rooftop solar installation for businesses, providing comprehensive support from project planning and financing to installation and ongoing maintenance. This platform aims to streamline and optimize the entire solar energy journey for installers. 
________________
Example 1: A solar installer uses Aether to quickly create detailed project plans for rooftop solar installations, including system size, panel placement, and wiring diagrams. 
ONET JOB automated 1: Solar Photovoltaic Installers that diagram layouts and locations for photovoltaic (PV) arrays and equipment, including existing building or site features.
ONET JOB 1: Solar Photovoltaic Installers
________________
Example 2: A business owner leverages Aether to secure financing for their rooftop solar project, providing them with customized loan options and streamlined application processes. 
ONET JOB automated 2: Solar Photovoltaic Installers that prepare solar installation project proposals, quotes, budgets, or schedules.
ONET JOB 2: Solar Photovoltaic Installers
________________
Example 3: A solar installer uses Aether to manage the installation process, tracking materials, scheduling technicians, and coordinating with subcontractors, ensuring smooth project execution. 
ONET JOB automated 3: Solar Energy Installation Managers that monitor work of contractors and subcontractors to ensure projects conform to plans, specifications, schedules, or budgets.
ONET JOB 3: Solar Energy Installation Managers
________________
************************

YOUR TURN:
Company: $company
Website: $website
Current Description: $description
$parsed_description
________________
Example 1: 
ONET JOB automated 1:
ONET JOB 1: 
Confidence Interval 1:
Reasoning 1:
________________
Example 2: 
ONET JOB automated 2:
ONET JOB 2: 
Confidence Interval 2:
Reasoning 2:
________________
Example 3:
ONET JOB automated 3:
ONET JOB 3: 
Confidence Interval 3:
Reasoning 3:
________________
"""


In [91]:
class prompting():
    """Run one LLM prompt per DataFrame row and collect the raw responses.

    Responses accumulate in ``self.results_df`` (columns "organization name"
    and the caller-chosen ``value`` column), indexed by the source row's index
    label, so a partially finished run can be inspected or resumed.
    """

    def __init__(self):
        # Created by iterate(); None means no run has started on this instance.
        self.results_df = None

    @staticmethod
    def _fill_template(prompt_template, args, row):
        """Return prompt_template with every "$<placeholder>" replaced by the row's value."""
        prompt = prompt_template
        for arg in args:
            cell = row[arg[1]]
            # str.replace requires a string: a missing cell becomes "" and any
            # other non-string cell is converted with str().
            if pd.api.types.is_scalar(cell) and pd.isna(cell):
                replacement = ""
            else:
                replacement = str(cell)
            prompt = prompt.replace(f"${arg[0]}", replacement)
        return prompt

    @staticmethod
    def _ask_with_retry(llm, prompt, label, max_failures, retry_delay):
        """Call llm.ask(prompt), retrying on errors; return None once retries are exhausted."""
        failure_count = 0
        while True:
            try:
                result = llm.ask(prompt)
                if not isinstance(result, str):
                    raise TypeError(f"expected a string response, got {type(result).__name__}")
                return result
            except Exception as e:
                failure_count += 1
                print(f"Error processing {label}: {e} (failure {failure_count})")
                if failure_count > max_failures:
                    return None
                time.sleep(retry_delay)

    def iterate(self, df, prompt_template, args, value, start=0, end=False,
                llm=None, checkpoint_path="../output/current_results_df_prompting.csv",
                max_failures=10, retry_delay=20):
        """Prompt the LLM for rows ``start:end`` of ``df`` and merge the responses back.

        Parameters
        ----------
        df : DataFrame with at least "organization name" and "website" columns,
            plus every column named in ``args``.
        prompt_template : str containing "$<placeholder>" markers.
        args : sequence of [placeholder, column] pairs; "$<placeholder>" is
            replaced with the row's value from ``column``.
        value : name of the column that receives the raw LLM response.
        start, end : positional row range to process; ``end=False`` (or None)
            means "through the last row". A non-zero ``start`` resumes a run:
            rows before ``start`` are taken from ``df[value]`` when present.
        llm : object with an ``ask(prompt)`` method; defaults to the notebook's
            global ``x_gemini`` client.
        checkpoint_path : CSV path for periodic snapshots of the collected
            responses; pass None to disable checkpointing.
        max_failures : a row is skipped once its failure count exceeds this.
        retry_delay : seconds to sleep between retries.

        Returns
        -------
        ``df`` (without any pre-existing ``value`` column) left-merged with the
        collected responses on "organization name".
        """
        if end is False or end is None:
            end = len(df)
        if llm is None:
            llm = x_gemini

        # Start a fresh result table for a new run, and also when the table left
        # on the instance is missing or belongs to a different `value` column
        # (e.g. a previous prompting stage).
        existing = getattr(self, "results_df", None)
        if start == 0 or existing is None or value not in existing.columns:
            self.results_df = pd.DataFrame(columns=["organization name", value])

        if value in list(df.columns):
            if start != 0:
                # Resuming: seed the results with what df already holds for the skipped rows.
                self.results_df = pd.concat(
                    [df[["organization name", value]].iloc[:start], self.results_df], axis=0
                )
            df = df.drop(columns=[value])

        for position, (i, row) in enumerate(df.iloc[start:end].iterrows(), start=start):
            name = row['organization name']
            website = row['website']
            print(f"******************************\nProcessing {i}: {name}, {website}")
            if "generated_description" in df.columns:
                print(row["generated_description"])

            # Template filling is deterministic, so it runs outside the retry loop:
            # only the LLM call itself is retried.
            prompt = self._fill_template(prompt_template, args, row)
            result = self._ask_with_retry(llm, prompt, f"{i}, {name}", max_failures, retry_delay)

            # "N/A" is skipped (the original code notes it marks explicit material);
            # None means every retry failed.
            if result is not None and result != "N/A":
                text = re.sub(r"#|#\s+|_|\*", "", result).strip()
                print(f"Result:\n{text}\n")
                # The raw response is stored; clean-up happens in the parsing cells.
                self.results_df.loc[i] = [name, result]

            # Snapshot every fifth processed row, whether or not it succeeded.
            if checkpoint_path and position % 5 == 0:
                self.results_df.to_csv(checkpoint_path)

        if checkpoint_path:
            self.results_df.to_csv(checkpoint_path)

        df = df.merge(self.results_df, on='organization name', how='left')
        return df

prompting_class = prompting()


In [37]:
# Names of the columns that the parsing step derives from the raw LLM response;
# the parsing cell drops them from df before re-creating them.
cols = ["generated_description", "generated_description_conf_interval", "generated_description_conf_interval_reasoning"]

# [placeholder, DataFrame column] pairs: prompting.iterate replaces "$<placeholder>"
# in the prompt with the row's value from that column.
args = [["company","organization name"],["website","website"], ["description","description_all"]]

# One LLM call per row; the raw response is merged back as "generated_description_llm".
df = prompting_class.iterate(df, generated_description_prompt, args, "generated_description_llm")

# Persist the frame including the raw responses.
df.to_csv('../output/df_with_generated_description.csv', index=False)

******************************
Processing 0: Pika, pika.art
Result:
Description (two sentences):
Pika is an AI-powered platform that transforms captions and still images into professional-looking videos. With its intuitive interface, Pika simplifies video creation and editing, making it accessible to users of all skill levels.

Confidence Interval: 8
Reasoning:  While Pika's website doesn't go into deep detail about its features,  the information provided clearly indicates its focus on AI-powered video generation and editing. The emphasis on accessibility and user-friendliness aligns with the current trends in AI-driven creative tools.

******************************
Processing 1: Contextual AI, contextual.ai
Result:
Contextual AI

Description (two sentences): Contextual AI provides a cutting-edge generative AI platform designed specifically for business applications. Their technology empowers employees to generate creative content, automate workflows, and gain insights from data, all 

In [41]:
def extract_data(text):
    """Split a raw description response into description, confidence and reasoning.

    Returns a dict with the keys "generated_description",
    "generated_description_conf_interval" and
    "generated_description_conf_interval_reasoning" (all stripped strings).
    Returns an empty dict when ``text`` is null or does not match the expected
    "Description ... Confidence Interval: <digits> Reasoning: ..." layout; the
    latter case also prints "FAILURE TO MATCH".
    """
    if pd.isnull(text):
        return {}

    # Flatten newlines and remove markup, applied in this exact order.
    substitutions = (
        ("\n", " "),
        ("  ", " "),
        ("*", ""),
        (" (two sentences)", ""),
        ("/10", ""),
        ("_", ""),
        ("#", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    found = re.search(
        r"^.*?\s*Description:?\s*(.*)Confidence Interval:\s*(\d+)\s*Reasoning:\s*(.*)$",
        text,
        re.DOTALL,
    )
    if found is None:
        print("FAILURE TO MATCH")
        return {}

    field_names = (
        "generated_description",
        "generated_description_conf_interval",
        "generated_description_conf_interval_reasoning",
    )
    return {field: captured.strip() for field, captured in zip(field_names, found.groups())}
        
# Drop columns left over from a previous run of this cell so the concat below
# does not create duplicate column names.
df = df.drop(columns=[col for col in cols if col in df.columns])

# Parse every raw response into a dict of fields. Iterating the column directly
# (rather than a row-wise DataFrame.apply) always yields exactly one dict per row,
# including for an empty frame, and reusing df.index keeps each parsed row
# aligned with its source row even when df does not carry a 0..n-1 index.
results_df = pd.DataFrame(
    [extract_data(raw) for raw in df["generated_description_llm"]],
    index=df.index,
)
df = pd.concat([df, results_df], axis=1)

In [46]:
# Row-count check. NOTE(review): this cell's execution counter (46) is later than
# that of the clean-up cell further down (45), so the displayed value presumably
# reflects df after that clean-up — confirm by re-running the notebook in order.
len(df)

2193

In [15]:
# Recovery step: if the LLM run was interrupted partway, uncomment the merge below to attach the partial results to df.
# df = df.merge(prompting_class.results_df, on='organization name', how='left')

In [45]:
# Keep only rows with a parsed description, one row per organization (first
# occurrence wins), and renumber the index from zero.
df = (
    df.dropna(subset=["generated_description"], axis=0)
      .drop_duplicates(subset=['organization name'], keep='first')
      .reset_index(drop=True)
)

In [72]:
# Names of the columns that the parsing step derives from the raw LLM response;
# the parsing cell drops them from df before re-creating them.
cols = ["parsed_description", "parsed_description_conf_interval", "parsed_description_conf_interval_reasoning", "Tasks/Jobs","Industry","Customers"]
    
# [placeholder, DataFrame column] pairs; here "$description" is filled from the
# parsed "generated_description" column.
args = [["company","organization name"],["website","website"], ["description","generated_description"]]

# NOTE(review): 1778 is passed as `start`, so only rows from position 1778 onward
# are sent to the LLM — presumably a resume offset left over from an interrupted
# run. A fresh Restart & Run All would need start=0; confirm before re-running.
df = prompting_class.iterate(df, parsed_description_prompt, args, "parsed_description_llm",1778)

# Persist the frame including the raw responses.
df.to_csv('../output/df_with_parsed_description.csv', index=False)

******************************
Processing 1778: Abide AI, abideai.com/
Abide AI is a platform that simplifies the development, deployment, and evaluation of machine learning (ML) systems. They focus on ensuring that ML models can be smoothly integrated into technology ecosystems while prioritizing robustness, reliability, and scalability.
Result:
Tasks/Jobs: Model training, Model deployment, Model monitoring, Model evaluation
Industry: Machine Learning
Customers: Data Scientists, Machine Learning Engineers

Confidence Interval: 8
Reasoning: The website focuses on ML development and deployment, which suggests the target audience is directly involved in these tasks. The language used in their description implies targeting individuals within the ML field.

******************************
Processing 1779: Plexicus, www.plexicus.com/
Plexicus is an AI-powered vulnerability management platform that helps organizations classify their assets, prioritize vulnerabilities, and automate remediation

In [78]:
def extract_data(text):
    """Split a raw tasks/industry/customers response into its labelled fields.

    Returns a dict with "parsed_description" (the three fields re-joined on
    newlines), "Tasks/Jobs", "Industry", "Customers",
    "parsed_description_conf_interval" and
    "parsed_description_conf_interval_reasoning". Returns an empty dict when
    ``text`` is null or the labels are not found in order; in the latter case
    the cleaned text and "FAILURE TO MATCH" are printed.
    """
    if pd.isnull(text):
        return {}

    # Collapse whitespace, remove markup characters, then remove every
    # parenthesised span (this also strips the "(comma separated ...)" hints).
    cleaned = re.sub(r'\s+', ' ', text)
    for junk in ("_", "*", "#"):
        cleaned = cleaned.replace(junk, "")
    cleaned = re.sub(r'\s?\([^)]*\)', '', cleaned)

    found = re.search(
        r".*?Tasks/Jobs:\s*(.*?)\s*Industry:\s*(.*?)\s*Customers:\s*(.*?)\s*Confidence Interval:\s*(.*?)\s*Reasoning:\s*(.*)",
        cleaned,
        re.DOTALL,
    )
    if not found:
        print(cleaned)
        print("FAILURE TO MATCH")
        return {}

    tasks_jobs, industry, customers, confidence_interval, reasoning = (
        captured.strip() for captured in found.groups()
    )
    return {
        "parsed_description": f"Tasks/Jobs: {tasks_jobs}\nIndustry: {industry}\nCustomers: {customers}",
        "Tasks/Jobs": tasks_jobs,
        "Industry": industry,
        "Customers": customers,
        "parsed_description_conf_interval": confidence_interval,
        "parsed_description_conf_interval_reasoning": reasoning,
    }
        
# Drop columns left over from a previous run of this cell so the concat below
# does not create duplicate column names.
df = df.drop(columns=[col for col in cols if col in df.columns])

# Parse every raw response into a dict of fields. Iterating the column directly
# (rather than a row-wise DataFrame.apply) always yields exactly one dict per row,
# including for an empty frame, and reusing df.index keeps each parsed row
# aligned with its source row even when df does not carry a 0..n-1 index.
results_df = pd.DataFrame(
    [extract_data(raw) for raw in df["parsed_description_llm"]],
    index=df.index,
)
df = pd.concat([df, results_df], axis=1)


In [87]:
# Keep only rows with a parsed tasks/industry/customers summary, one row per
# organization (first occurrence wins), and renumber the index from zero.
df = (
    df.dropna(subset=["parsed_description"], axis=0)
      .drop_duplicates(subset=['organization name'], keep='first')
      .reset_index(drop=True)
)

In [89]:
# Re-save after parsing and clean-up; this is the same path the LLM-run cell
# wrote to, so that earlier file is overwritten.
df.to_csv('../output/df_with_parsed_description.csv', index=False)

In [92]:
# Names of columns to drop from df before the examples parser re-creates its
# output. NOTE(review): the parser visible in this notebook only produces the
# ExampleN / JobN / JobN_title / situationN_conf_interval(_reasoning) keys; the
# bare "situationN" names are presumably left over from an earlier version.
cols = ["situation1", "situation1_conf_interval", "situation1_conf_interval_reasoning", "situation2", "situation2_conf_interval", "situation2_conf_interval_reasoning","situation3", "situation3_conf_interval", "situation3_conf_interval_reasoning",'Example1','Job1','Job1_title','Example2','Job2','Job2_title','Example3','Job3','Job3_title']

# [placeholder, DataFrame column] pairs; "$description" and "$parsed_description"
# are filled from the columns produced by the two earlier parsing steps.
args = [["company","organization name"],["website","website"], ["description","generated_description"], ["parsed_description","parsed_description"]]
# One LLM call per row; the raw response is merged back as "examples_llm".
df = prompting_class.iterate(df, examples_prompt, args, "examples_llm")

# Persist the frame including the raw responses.
df.to_csv('../output/df_with_examples.csv', index=False)

******************************
Processing 0: Pika, pika.art
Pika is an AI-powered platform that transforms captions and still images into professional-looking videos. With its intuitive interface, Pika simplifies video creation and editing, making it accessible to users of all skill levels.
Result:
Pika Examples:

Example 1: A social media manager uses Pika to quickly generate short, engaging video ads from existing product images and marketing copy, saving time and resources in their content creation workflow. 
ONET JOB automated 1:  Video Editors that create or edit video productions and commercials for television, Web, or mobile devices.
ONET JOB 1: Video Editors
Confidence Interval 1: 8
Reasoning 1: This task specifically involves editing and creating video content, which is directly aligned with the work of a Video Editor.

Example 2:  A small business owner leverages Pika to create high-quality animated explainer videos from their existing marketing materials, enhancing their onl


KeyboardInterrupt



In [None]:
# Save the responses collected so far (e.g. after an interrupted run). The
# attribute is `results_df`, and the call needs a target path; the checkpoint
# path used by prompting.iterate is reused here.
prompting_class.results_df.to_csv("../output/current_results_df_prompting.csv")

In [20]:

def extract_data(text):
    """Split a raw three-example response into per-example fields.

    For each example N in 1..3 that matches, the returned dict gains the keys
    "ExampleN", "JobN", "JobN_title", "situationN_conf_interval" and
    "situationN_conf_interval_reasoning" (stripped strings, in that order).
    Parsing stops at the first example that does not match (the cleaned text and
    a message are printed), so the dict may hold fewer than three examples.
    Returns an empty dict when ``text`` is null.
    """
    if pd.isnull(text):
        return {}

    # Collapse all whitespace runs to single spaces, then remove markup and the "/10" suffix.
    cleaned = re.sub(r'\s+', ' ', text).strip()
    for token in ("*", "/10", "_", "#"):
        cleaned = cleaned.replace(token, "")

    vals = {}
    for n in range(1, 4):
        # The reasoning runs until the next "Example <n+1>:" label or the end of the text.
        pattern = rf"Example\s+{n}:\s+(.*?)\s+ONET JOB automated\s+{n}:\s+(.*?)\s+ONET JOB\s+{n}:\s+(.*?)\s+Confidence Interval\s+{n}:\s+(\d+(?:/\d+)?)\s+Reasoning\s+{n}:\s+(.*?)(?=\s*Example\s+{n + 1}:|$)"
        found = re.search(pattern, cleaned, re.DOTALL)
        if found is None:
            print(cleaned)
            print(f"No matches found for Example {n}")  # Debug if no examples are found
            break

        example_text, job_task, job_title, confidence, reasoning = (
            captured.strip() for captured in found.groups()
        )
        vals.update({
            f"Example{n}": example_text,
            f"Job{n}": job_task,
            f"Job{n}_title": job_title,
            f"situation{n}_conf_interval": confidence,
            f"situation{n}_conf_interval_reasoning": reasoning,
        })
    return vals

# Drop columns left over from a previous run of this cell so the concat below
# does not create duplicate column names.
df = df.drop(columns=[col for col in cols if col in df.columns])

# Parse every raw response into a dict of fields. Iterating the column directly
# (rather than a row-wise DataFrame.apply) always yields exactly one dict per row,
# including for an empty frame, and reusing df.index keeps each parsed row
# aligned with its source row even when df does not carry a 0..n-1 index.
results_df = pd.DataFrame(
    [extract_data(raw) for raw in df["examples_llm"]],
    index=df.index,
)
df = pd.concat([df, results_df], axis=1)


In [21]:
# df[pd.isnull(df.examples_llm)].values

In [22]:
# Keep only rows where at least the first example was parsed, one row per
# organization (first occurrence wins), and renumber the index from zero.
df = (
    df.dropna(subset=["Example1"], axis=0)
      .drop_duplicates(subset=['organization name'], keep='first')
      .reset_index(drop=True)
)

In [23]:
# Re-save after parsing and clean-up; this is the same path the LLM-run cell
# wrote to, so that earlier file is overwritten.
df.to_csv('../output/df_with_examples.csv', index=False)