In [2]:
import pandas as pd
import numpy as np


In [172]:
class output():
    def __init__(self):
        self.codes = pd.read_csv("../output/soc_codes.csv",index_col=0)

    
    def getONET(self,task_sim, title_sim, conf_interval=True):
        self.task_sim = task_sim
        self.title_sim = title_sim
        self.conf_interval = conf_interval
        self.onet_df = pd.read_csv("../output/onet_df.csv")
        self.onet_df = self.onet_df[(self.onet_df.task_similarity > task_sim) & (self.onet_df.job_title_similarity > title_sim)].reset_index(drop=True)
        self.onet_df = self.onet_df[self.onet_df.situation_conf_interval > conf_interval]
        self.computeONETWeights()
        self.getStartupData()
        
        print(f"{len(self.onet_df)} tasks with task similarity > {task_sim} and title similarity > {title_sim}")
        self.num_onet_startups = len(self.df["organization name"].unique())
        self.num_startups = len(self.startup_df["organization name"].unique())
        print(f"{self.num_onet_startups} startups out of {self.num_startups} decomposed")
        
    def computeONETWeights(self):
        self.onet_weights = self.onet_df.groupby("organization_name")["onet_title"].count().apply(lambda x: 1/x).reset_index().rename({"onet_title":"onet_weight"},axis=1)
        self.onet_df = self.onet_df.merge(self.onet_weights, on="organization_name")

    def getONETTitles(self):
        self.onet_titles = self.onet_df.groupby("onet_title")["onet_weight"].sum().round(2).sort_values(ascending=False).reset_index()
        return self.onet_titles

    def getStartupGeneratedLLMTitles(self):
        self.example_job_titles_df = onet_df.example_job_title.value_counts().apply(lambda x: x/3).reset_index()
        return self.example_job_titles_df

    def printTop100ONETTasks(self):
        self.top_100_ONETtasks = onet_df['onet_task'].value_counts().reset_index().head(100)
        self.top_100_tasks.columns = ['Task', 'Frequency']
        for index, row in self.top_100_tasks.iterrows():
            print(f"Task = {row['Task']}, Frequency = {row['Frequency']}\n")
    
    def getStartupData(self):
        self.startup_df = pd.read_csv("../output/df_with_examples.csv")
        cols = ['organization name', 'num employees', 'founded date','website','description_all', 'industries_parsed', 'generated_description', 'parsed_description', 'Tasks/Jobs', 'Industry', 'Customers','generated_description_conf_interval','parsed_description_conf_interval']
        self.df = self.startup_df[cols]
        self.df = self.df.merge(self.onet_df,left_on="organization name",right_on="organization_name")
        self.df = self.df.merge(self.codes,left_on="onet_title",right_on="Title")

    
    def printExamples(self):
        grouped = self.df.groupby(['organization name', 'website'])
        for (org_name, website), group_data in grouped:
            print("***************************")
            print(f"Organization: {org_name}, Website: {website}")
            for col in ['generated_description', 'Tasks/Jobs', 'Industry', 'Customers', 'generated_description_conf_interval', 'parsed_description_conf_interval']:
                print(f"{col}: {group_data.iloc[0][col]}")
            for example, example_data in group_data.groupby('example'):
                print("_________________________")
                print(f"Example: {example}")
                for col in ['situation_conf_interval', 'situation_conf_interval_reasoning']:
                    print(f"{col}: {example_data.iloc[0][col]}")
                for idx, row in example_data.iterrows():
                    print("##########################")
                    for col in ['job', 'onet_title', 'onet_task', 'example_job_title', 'task_similarity', 'job_title_similarity', 'onet_weight']:
                        print(f"{col}: {row[col]}")
            print("***************************\n")

    def writeExamples(self):
        output_file_path = "../output/examples.txt"
        with open(output_file_path, 'w') as f:
            grouped = self.df.groupby(['organization name', 'website'])
            for (org_name, website), group_data in grouped:
                f.write("***************************\n")
                f.write(f"Organization: {org_name}, Website: {website}\n")
                for col in ['generated_description', 'Tasks/Jobs', 'Industry', 'Customers', 'generated_description_conf_interval', 'parsed_description_conf_interval']:
                    f.write(f"{col}: {group_data.iloc[0][col]}\n")
                for example, example_data in group_data.groupby('example'):
                    f.write("_________________________\n")
                    f.write(f"Example: {example}\n")
                    for col in ['situation_conf_interval', 'situation_conf_interval_reasoning']:
                        f.write(f"{col}: {example_data.iloc[0][col]}\n")
                    for idx, row in example_data.iterrows():
                        f.write("##########################\n")
                        for col in ['job', 'onet_title', 'onet_task', 'example_job_title', 'task_similarity', 'job_title_similarity','onet_weight']:
                            f.write(f"{col}: {row[col]}\n")
                f.write("***************************\n\n")
                        
    def generateOutput(self):
        self.conf_interval_text = self.conf_interval if self.conf_interval != True else 'all'
        output_file_path = f"../output/output_{self.task_sim}_{self.title_sim}_{self.conf_interval_text}.txt"
        
        with open(output_file_path, 'w') as f:
            
            group_sums = self.df.groupby('Minor Group Name')['onet_weight'].sum().sort_values(ascending=False)
            f.write(f"{self.num_onet_startups} Startups Founded post launch of chatGPT with 1-10 employees\n")
            
            f.write(f"Task Similarity: {self.task_sim}, Title Similarity {self.title_sim}, Confidence Interval: {self.conf_interval_text}\n")
            f.write(f"{self.num_onet_startups} startups out of {self.num_startups} decomposed\n")
            f.write("Decomposition of startup effect on labor market\n")
            
            for group_name in group_sums.index:
                group_data = self.df[self.df['Minor Group Name'] == group_name]
                f.write("*****************\n")
                f.write(f"Group: {group_name}\n")
                onet_titles = group_data.groupby('onet_title')['onet_weight'].sum().round(2).sort_values(ascending=False).reset_index()
                onet_tasks = group_data.groupby('onet_task')['onet_weight'].sum().round(2).sort_values(ascending=False).reset_index()
                
                f.write("________________\n")
                f.write("10 Most Highly Weighted Titles:\n")
                for index, row in onet_titles.iterrows():
                    f.write(f"Title: {row['onet_title']}, Weight: {row['onet_weight']}\n")
                f.write("\n")
            
                f.write("10 Most Highly Weighted Tasks:\n")
                for index, row in list(onet_tasks.iterrows())[:10]:
                    f.write(f"Task: {row['onet_task']}, Weight: {row['onet_weight']}\n")
                f.write("\n")
            
                f.write("10 Example Startups:\n")
                startups = group_data.groupby(['organization_name','website'])['onet_weight'].sum().round(2).sort_values(ascending=False).reset_index()
                for index, row in list(startups.iterrows())[:10]:
                    f.write(f"Startup: {row['organization_name']}, Website: {row['website']}\n")
                f.write("\n")
    


In [173]:
out = output()
out.getONET(task_sim=.67,title_sim=.7)
out.printExamples()
out.generateOutput()
out.writeExamples()

2023 tasks with task similarity > 0.67 and title similarity > 0.7
518 startups out of 990 decomposed
***************************
Organization: (defun ai ()), Website: defun.ai
generated_description: (defun ai ()) is a company that uses generative AI to create complex software from natural language prompts. Their Conjure technology enables users to describe their desired software functionality in plain English, and (defun ai ()) translates those instructions into working code.
Tasks/Jobs: Software development, Code generation, Natural language processing, Prompt engineering
Industry: Software development
Customers: Developers, Software engineers, Business owners
generated_description_conf_interval: 7
parsed_description_conf_interval: 8
_________________________
Example: A business owner uses (defun ai ()) to create a basic prototype of a new software application by describing the desired features and user interface elements in plain English.
situation_conf_interval: 7.0
situation_conf_i

In [158]:
out.onet_df.columns

Index(['organization_name', 'example', 'situation_conf_interval',
       'situation_conf_interval_reasoning', 'job', 'onet_title', 'onet_task',
       'example_job_title', 'task_similarity', 'job_title_similarity',
       'onet_weight'],
      dtype='object')

Unnamed: 0,organization_name,example,situation_conf_interval,situation_conf_interval_reasoning,job,onet_title,onet_task,example_job_title,task_similarity,job_title_similarity,onet_weight
0,Catio,A cloud architect uses Catio to evaluate diffe...,9.0,The provided description aligns with the respo...,Computer and Information Systems Managers that...,Computer and Information Systems Managers,Evaluate the organization's technology use and...,Computer and Information Systems Managers,0.730842,0.999999,0.250000
1,Catio,A cloud architect uses Catio to evaluate diffe...,9.0,The provided description aligns with the respo...,Computer and Information Systems Managers that...,Computer and Information Systems Managers,"Consult with users, management, vendors, and t...",Computer and Information Systems Managers,0.704641,0.999999,0.250000
2,Catio,A cloud architect uses Catio to evaluate diffe...,9.0,The provided description aligns with the respo...,Computer and Information Systems Managers that...,Computer and Information Systems Managers,"Develop computer information resources, provid...",Computer and Information Systems Managers,0.704586,0.999999,0.250000
3,Catio,An IT professional uses Catio to continuously ...,7.0,"While the task description is broad, it aligns...",Systems Administrators that monitor and analyz...,Network and Computer Systems Administrators,Monitor network performance to determine wheth...,Systems Administrators,0.650572,0.741931,0.250000
4,Helios Artificial Intelligence,A supply chain manager uses the AI platform to...,9.0,The ONET Job description for Supply Chain Mana...,Supply Chain Managers that analyze global trad...,Supply Chain Managers,Monitor forecasts and quotas to identify chang...,Supply Chain Managers,0.708903,1.000000,0.111111
...,...,...,...,...,...,...,...,...,...,...,...
3179,Redacted Holdings,A venture capitalist uses the platform to auto...,8.0,This is a common task for financial analysts w...,Financial Analysts that analyze market trends ...,Financial and Investment Analysts,Recommend investments and investment timing to...,Financial Analysts,0.738179,0.843050,0.142857
3180,Redacted Holdings,A venture capitalist uses the platform to cond...,7.0,"This is a core activity in venture capital, an...",Financial Analysts that conduct due diligence ...,Financial and Investment Analysts,Assess companies as investments for clients by...,Financial Analysts,0.654443,0.842992,0.142857
3181,Redacted Holdings,A venture capitalist uses the platform to moni...,8.0,This is a crucial task for venture capitalists...,Portfolio Managers that monitor the performanc...,Investment Fund Managers,Monitor financial or operational performance o...,Financial Analysts,0.773301,0.450171,0.142857
3182,Redacted Holdings,A venture capitalist uses the platform to moni...,8.0,This is a crucial task for venture capitalists...,Portfolio Managers that monitor the performanc...,Investment Fund Managers,"Perform or evaluate research, such as detailed...",Financial Analysts,0.680770,0.450058,0.142857


In [36]:
len(df["organization name"].unique())

419

In [44]:
len(startup_df)

990

In [68]:
out.onet_weights

Unnamed: 0,organization_name,onet_weight
0,AI Fashion Week,0.500000
1,AI Mavericks,1.000000
2,AI Square,0.166667
3,AIcon,1.000000
4,AIgeeked,0.333333
...,...,...
235,inPlace,0.500000
236,mimilabs.ai,0.333333
237,reprai,1.000000
238,stylio.ai,1.000000
