In [45]:
import pandas as pd
import numpy as np
import json
import re
from llms import gemini
from llms import chatGPT
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE

   
def printExamples():
    """Print a nested, human-readable dump of the module-level ``df``.

    Rows are grouped by (organization name, website). For every organization
    the organization-level columns are printed once (taken from the group's
    first row), then for every distinct ``example`` the example-level columns
    are printed once, followed by one section per matched row.
    """
    org_cols = ('generated_description', 'Tasks/Jobs', 'Industry', 'Customers',
                'generated_description_conf_interval', 'parsed_description_conf_interval')
    example_cols = ('situation_conf_interval', 'situation_conf_interval_reasoning')
    match_cols = ('job', 'onet_title', 'onet_task', 'example_job_title',
                  'task_similarity', 'job_title_similarity', 'onet_weight')
    star_rule = "***************************"

    for (org_name, website), org_rows in df.groupby(['organization name', 'website']):
        # Organization-level fields are read from the first row of the group.
        org_head = org_rows.iloc[0]
        print(star_rule)
        print(f"Organization: {org_name}, Website: {website}")
        for name in org_cols:
            print(f"{name}: {org_head[name]}")

        for example, example_rows in org_rows.groupby('example'):
            # Example-level fields are likewise read from the first row.
            example_head = example_rows.iloc[0]
            print("_________________________")
            print(f"Example: {example}")
            for name in example_cols:
                print(f"{name}: {example_head[name]}")

            for _, match in example_rows.iterrows():
                print("##########################")
                for name in match_cols:
                    print(f"{name}: {match[name]}")
        print(star_rule + "\n")
        
def writeExamples():
    """Write a nested, human-readable dump of the module-level ``df`` to disk.

    The report goes to ``../output/examples.txt`` (overwritten on each call).
    Rows are grouped by (organization name, website); organization-level and
    example-level columns are taken from the first row of their group, then
    one section is written per matched row.

    The file is opened as UTF-8 explicitly: the dumped columns hold free text,
    and relying on the platform default encoding can raise
    ``UnicodeEncodeError`` on systems whose locale encoding is not UTF-8.
    """
    org_cols = ('generated_description', 'Tasks/Jobs', 'Industry', 'Customers',
                'generated_description_conf_interval', 'parsed_description_conf_interval')
    example_cols = ('situation_conf_interval', 'situation_conf_interval_reasoning')
    match_cols = ('job', 'onet_title', 'onet_task', 'example_job_title',
                  'task_similarity', 'job_title_similarity', 'onet_weight')

    output_file_path = "../output/examples.txt"
    with open(output_file_path, 'w', encoding='utf-8') as f:
        for (org_name, website), org_rows in df.groupby(['organization name', 'website']):
            org_head = org_rows.iloc[0]
            f.write("***************************\n")
            f.write(f"Organization: {org_name}, Website: {website}\n")
            for col in org_cols:
                f.write(f"{col}: {org_head[col]}\n")

            for example, example_rows in org_rows.groupby('example'):
                example_head = example_rows.iloc[0]
                f.write("_________________________\n")
                f.write(f"Example: {example}\n")
                for col in example_cols:
                    f.write(f"{col}: {example_head[col]}\n")

                for _, row in example_rows.iterrows():
                    f.write("##########################\n")
                    for col in match_cols:
                        f.write(f"{col}: {row[col]}\n")
            f.write("***************************\n\n")
                
def generateOutput(df,type):
    """Write a per-occupation-group summary of ``df`` to a text report.

    The report path is
    ``../output/output_{type}_{task_sim}_{title_sim}_all.txt``; ``task_sim``
    and ``title_sim`` are read from module-level globals. Groups
    ('Minor Group Name') are listed in descending order of their summed
    ``onet_weight``. For each group the report lists the 10 highest-weighted
    O*NET titles, the 10 highest-weighted O*NET tasks and 10 example startups.

    Args:
        df: Frame with at least the columns 'organization name',
            'organization_name', 'website', 'Minor Group Name', 'onet_title',
            'onet_task' and 'onet_weight'.
        type: Label inserted into the file name and header line (the script
            passes "pre" / "post"). The name shadows the builtin but is kept
            for compatibility with existing callers.
    """
    # Only the unfiltered confidence setting is reported; the label is fixed.
    conf_interval_text = 'all'
    output_file_path = f"../output/output_{type}_{task_sim}_{title_sim}_{conf_interval_text}.txt"

    # NOTE(review): both counts are taken from the same frame, so the
    # "X startups out of Y decomposed" line only differs if the two name
    # columns differ within ``df`` - confirm whether Y should instead come
    # from the full startup table.
    num_startups = len(df["organization name"].unique())
    num_onet_startups = len(df["organization_name"].unique())

    group_sums = df.groupby('Minor Group Name')['onet_weight'].sum().sort_values(ascending=False)

    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(f"{num_onet_startups} Startups Founded {type} launch of chatGPT with 1-10 employees\n")

        f.write(f"Task Similarity: {task_sim}, Title Similarity {title_sim}, Confidence Interval: {conf_interval_text}\n")
        f.write(f"{num_onet_startups} startups out of {num_startups} decomposed\n")
        f.write("Decomposition of startup effect on labor market\n")

        for group_name in group_sums.index:
            group_data = df[df['Minor Group Name'] == group_name]

            def top_weights(keys):
                # Summed weight per key, rounded, ten largest first.
                totals = group_data.groupby(keys)['onet_weight'].sum().round(2)
                return totals.sort_values(ascending=False).head(10)

            f.write("*****************\n")
            f.write(f"Group: {group_name}\n")

            f.write("________________\n")
            f.write("10 Most Highly Weighted Titles:\n")
            # Capped at 10 to match the heading and the other two sections
            # (previously every title in the group was written).
            for title, weight in top_weights('onet_title').items():
                f.write(f"Title: {title}, Weight: {weight}\n")
            f.write("\n")

            f.write("10 Most Highly Weighted Tasks:\n")
            for task, weight in top_weights('onet_task').items():
                f.write(f"Task: {task}, Weight: {weight}\n")
            f.write("\n")

            f.write("10 Example Startups:\n")
            for (org_name, website), _ in top_weights(['organization_name', 'website']).items():
                f.write(f"Startup: {org_name}, Website: {website}\n")
            f.write("\n")
        


def get_ONET(task_sim, title_sim, conf_interval=True):
    """Load the O*NET match table, filter it and attach per-startup weights.

    Reads ``../output/onet_df.csv`` and keeps rows whose ``task_similarity``
    and ``job_title_similarity`` are strictly above the given thresholds and
    whose ``situation_conf_interval`` is strictly above ``conf_interval``.
    The surviving rows are passed through ``compute_ONET_weights`` and the
    row count is printed.

    NOTE(review): with the default ``conf_interval=True`` the comparison is
    ``situation_conf_interval > True``, i.e. ``> 1`` - presumably meant as
    "no confidence filter"; confirm that the column's scale makes this so.
    """
    matches = pd.read_csv("../output/onet_df.csv")

    similar_enough = (matches["task_similarity"] > task_sim) & (matches["job_title_similarity"] > title_sim)
    matches = matches[similar_enough]

    confident = matches["situation_conf_interval"] > conf_interval
    matches = matches[confident].reset_index(drop=True)

    matches = compute_ONET_weights(matches)
    print(f"{len(matches)} tasks with task similarity > {task_sim} and title similarity > {title_sim}")
    return matches

def compute_ONET_weights(onet_df):
    """Return ``onet_df`` with an ``onet_weight`` column added.

    Each organization's weight is ``1 / n``, where ``n`` is the number of
    non-null ``onet_title`` values among its rows. The weight is joined back
    onto every row of that organization (inner merge on
    ``organization_name``).
    """
    def reciprocal(n_rows):
        return 1 / n_rows

    rows_per_org = onet_df.groupby("organization_name")["onet_title"].count()
    weight_table = rows_per_org.apply(reciprocal).rename("onet_weight").reset_index()
    return onet_df.merge(weight_table, on="organization_name")


def get_startup_generated_LLM_titles(onet_df):
    """Return how often each ``example_job_title`` occurs, divided by 3.

    The result is the ``value_counts`` of the column (most frequent first)
    with every count divided by 3, turned into a two-column DataFrame via
    ``reset_index``.

    NOTE(review): the divisor 3 is hard-coded - presumably each example
    contributes three rows; confirm against how onet_df is built.
    """
    title_counts = onet_df["example_job_title"].value_counts()
    scaled_counts = title_counts.apply(lambda occurrences: occurrences / 3)
    return scaled_counts.reset_index()

def print_top_100_ONET_tasks(onet_df):
    """Print the (up to) 100 most frequent ``onet_task`` values with counts.

    One line per task, most frequent first, each followed by a blank line.

    The previous version stored the table in ``top_100_ONETtasks`` but then
    read ``top_100_tasks``, so every call raised ``NameError``. Iterating the
    ``value_counts`` Series directly also avoids depending on the column
    names ``reset_index`` produces.
    """
    top_tasks = onet_df['onet_task'].value_counts().head(100)
    for task, frequency in top_tasks.items():
        print(f"Task = {task}, Frequency = {frequency}\n")

def get_startup_data():
    """Join the startup table with O*NET matches and SOC group labels.

    Steps:
      1. Read ``../output/df_with_examples.csv`` and keep the descriptive
         startup columns.
      2. Inner-join with the module-level ``onet_df`` on the organization
         name.
      3. Inner-join with the O*NET occupation table on ``onet_title`` to get
         each title's O*NET-SOC code, then drop the code's last three
         characters (presumably the ``.NN`` detail suffix; the recorded
         output shows values such as ``11-3031``).
      4. Left-join with ``../input/soc_codes/soc_codes.csv`` on
         'Detailed Occupation'.

    Prints how many distinct startups survived the joins out of the total in
    the startup file, and returns the joined frame.
    """
    startup_columns = [
        'organization name', 'founded date', 'website', 'description_all',
        'industries_parsed', 'generated_description', 'parsed_description',
        'Tasks/Jobs', 'Industry', 'Customers',
        'generated_description_conf_interval', 'parsed_description_conf_interval',
    ]
    startup_df = pd.read_csv("../output/df_with_examples.csv")
    merged = startup_df[startup_columns].merge(
        onet_df, left_on="organization name", right_on="organization_name"
    )

    occupations = pd.read_csv("../input/onet_2023/Occupation Data.csv")[["O*NET-SOC Code", "Title"]]
    occupations = occupations.rename(
        columns={"O*NET-SOC Code": "Detailed Occupation", "Title": "onet_title"}
    )
    merged = merged.merge(occupations, on="onet_title")

    def drop_last_three(code):
        return code[:-3]

    merged["Detailed Occupation"] = merged["Detailed Occupation"].apply(drop_last_three)

    soc_codes = pd.read_csv("../input/soc_codes/soc_codes.csv", index_col=0)
    merged = merged.merge(soc_codes, on="Detailed Occupation", how="left")

    num_startups = len(startup_df["organization name"].unique())
    num_onet_startups = len(merged["organization_name"].unique())
    print(f"{num_onet_startups} startups out of {num_startups} decomposed")
    return merged



def get_example_task_embeddings(df):
    """Attach each example's task embedding to the rows of ``df``.

    ``../output/df_with_examples_embeddings.csv`` stores the examples and
    their embeddings in wide form (``Example1..3`` and
    ``Job1_embedding..Job3_embedding``). Both sets are melted to long form,
    paired up by organization and the digit in the column name, and the
    result is inner-joined to ``df`` on organization name and example text.
    """
    wide = pd.read_csv('../output/df_with_examples_embeddings.csv')

    examples_long = wide.melt(
        id_vars=['organization name'],
        value_vars=['Example1', 'Example2', 'Example3'],
        var_name='number',
        value_name='Example',
    )
    embeddings_long = wide.melt(
        id_vars=['organization name'],
        value_vars=['Job1_embedding', 'Job2_embedding', 'Job3_embedding'],
        var_name='number',
        value_name='example_task_embedding',
    )

    # Reduce both column labels to their digit so the two tables line up:
    # "Example2" -> "2" (last char), "Job2_embedding" -> "2" (4th char).
    examples_long['number'] = examples_long['number'].str[-1]
    embeddings_long['number'] = embeddings_long['number'].str[3]

    paired = examples_long.merge(embeddings_long, on=['organization name', 'number'])
    return paired.merge(
        df,
        left_on=["organization name", "Example"],
        right_on=["organization name", "example"],
    )


def get_onet_task_embeddings(df):
    """Left-join O*NET task and title embeddings onto ``df``.

    Reads ``../input/gpts_labels/gpt_exposure_embeddings.csv``, keeps the
    task text, task ID and the two embedding columns (renamed with an
    ``onet_`` prefix), and merges them onto ``df`` by matching
    ``df['onet_task']`` against the file's ``Task`` column. Rows of ``df``
    without a matching task are kept.
    """
    wanted = ["Task", "task_embedding", "title_embedding", "Task ID"]
    new_names = {
        "task_embedding": "onet_task_embedding",
        "title_embedding": "onet_title_embedding",
    }

    # Selecting the four named columns also discards any "Unnamed" index
    # columns left over from an earlier to_csv.
    exposure = pd.read_csv('../input/gpts_labels/gpt_exposure_embeddings.csv')
    exposure = exposure[wanted].rename(columns=new_names)

    return df.merge(exposure, left_on="onet_task", right_on="Task", how="left")
    
def tsne():
    """Project the three embedding columns of the global ``df`` with t-SNE.

    The embedding columns hold stringified lists ("[0.1, 0.2, ...]"); they
    are parsed in place into lists of floats. Each column is then embedded
    with a separate 3-component t-SNE, of which only the first two
    components are stored back on ``df`` as ``<column>_tsne1`` /
    ``<column>_tsne2``. The six coordinate columns are written to
    ``../output/results/tsne.csv``.

    Returns:
        (coords, df): the six-column coordinate frame and the mutated
        global ``df``.
    """
    # Source column -> the two coordinate columns it produces.
    targets = {
        'example_task_embedding': ["example_task_embedding_tsne1", "example_task_embedding_tsne2"],
        'onet_task_embedding': ["onet_task_embedding_tsne1", "onet_task_embedding_tsne2"],
        'onet_title_embedding': ["onet_title_embedding_tsne1", "onet_title_embedding_tsne2"],
    }

    def parse_vector(text):
        return [float(part) for part in text.strip("[]").split(", ")]

    # Parse every column before fitting anything, so malformed text fails
    # before any (slow) t-SNE run.
    for source in targets:
        df[source] = df[source].apply(parse_vector)

    coord_columns = []
    for source, pair in targets.items():
        stacked = np.vstack(df[source].values)
        reducer = TSNE(n_components=3, perplexity=100, n_iter=1000)
        projected = reducer.fit_transform(stacked)
        # Three components are fitted but only the first two are kept.
        df[pair] = projected[:, [0, 1]]
        coord_columns.extend(pair)

    coords = df[coord_columns]
    coords.to_csv("../output/results/tsne.csv")
    return coords, df





# Similarity thresholds; also read as globals by generateOutput for the
# report file name and header.
task_sim=0
title_sim=0
onet_df = get_ONET(task_sim=task_sim, title_sim=title_sim)
df = get_startup_data()
writeExamples()

# Split startups around the cutoff used for "launch of chatGPT" in the
# reports. The date column is parsed once, and the cutoff day itself is
# assigned to "pre" only: the previous `<=` / `>=` pair put startups founded
# exactly on 2022-11-30 into both reports.
founded_dates = pd.to_datetime(df['founded date'])
chatgpt_launch = pd.Timestamp('2022-11-30')
pre = df[founded_dates <= chatgpt_launch]
post = df[founded_dates > chatgpt_launch]
generateOutput(pre,"pre")
generateOutput(post,"post")
# df = get_example_task_embeddings(df)
# df = get_onet_task_embeddings(df)
# df.head()
# df.to_csv("../output/bls_df.csv")

# tsne, df = tsne()

16659 tasks with task similarity > 0 and title similarity > 0
1851 startups out of 2927 decomposed


In [38]:
# Row counts of the merged frame on each side of the 2022-11-30 cutoff.
# The cutoff day belongs to "pre" only, so a row can no longer be counted
# on both sides (previously `<=` and `>=` overlapped on that day).
founded_dates = pd.to_datetime(df['founded date'])
pre = df[founded_dates <= '11/30/2022']
post = df[founded_dates > '11/30/2022']
print(f"pre {len(pre)}, post {len(post)}")

pre 3207, post 1087


In [22]:
# Number of distinct organization names in the "pre" subset.
pre["organization name"].unique().size

933

In [23]:
# Number of distinct organization names in the "post" subset.
post["organization name"].unique().size

324

In [33]:
# Same split applied to the raw startup table (before any O*NET join).
# The cutoff day belongs to "pre" only, so the two subsets never overlap
# (previously `<=` and `>=` both included 2022-11-30).
startup_df = pd.read_csv("../output/df_with_examples.csv")
founded_dates = pd.to_datetime(startup_df['founded date'])
pre = startup_df[founded_dates <= '11/30/2022']
post = startup_df[founded_dates > '11/30/2022']
print(f"pre {len(pre)}, post {len(post)}")

pre 2232, post 695


In [25]:
# Share for the "pre" subset. 2232 equals the "pre" row count printed for
# df_with_examples.csv; NOTE(review): the origin of 1358 is not shown in
# this notebook - presumably the number of pre-cutoff startups that were
# decomposed; confirm and compute it from the data instead of hard-coding.
1358/2232

0.6084229390681004

In [26]:
# Share for the "post" subset. 695 equals the "post" row count printed for
# df_with_examples.csv; NOTE(review): the origin of 453 is not shown in
# this notebook - presumably the number of post-cutoff startups that were
# decomposed; confirm and compute it from the data instead of hard-coding.
453/695

0.6517985611510791

In [28]:
# Group the merged frame by founding date. The stray trailing "[" made the
# cell a syntax error; the recorded output (a DataFrameGroupBy object)
# corresponds to the plain groupby call.
df.groupby("founded date")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x17464cad0>

In [39]:
# Show the first (up to) ten website values of the "pre" subset; head(10)
# avoids the IndexError the positional loop raised on shorter frames.
# NOTE(review): the recorded output shows location strings (e.g. "San
# Francisco Bay Area, ...") in this column for several rows - the website
# field may be misaligned in the source CSV; verify.
for site in pre["website"].head(10):
    print(site)

Greater New York Area, East Coast, Northeastern US
San Francisco Bay Area, Silicon Valley, West Coast
www.ultipa.com
www.ultipa.com
www.OLY.AI
fuzz.land
Greater Miami Area, East Coast, Southern US
San Francisco Bay Area, Silicon Valley, West Coast
San Francisco Bay Area, Silicon Valley, West Coast
San Francisco Bay Area, Silicon Valley, West Coast


In [44]:
# Display the row at position 5 of "pre"; the one-element slice keeps the
# result a DataFrame (one row, all columns) rather than a Series.
pre.iloc[5:6]

Unnamed: 0,organization name,founded date,website,description_all,industries_parsed,generated_description,parsed_description,Tasks/Jobs,Industry,Customers,...,job_title_similarity,onet_weight,Detailed Occupation,Major Group,Minor Group,Broad Group,Title,Major Group Name,Minor Group Name,Broad Group Name
9,FuzzLand,2022-01-01,fuzz.land,FuzzLand is a leveraging AI tool that allows b...,['Web Development'],FuzzLand is an AI-powered platform that simpli...,"Tasks/Jobs: Smart contract analysis, Vulnerabi...","Smart contract analysis, Vulnerability detecti...",Blockchain security,"Smart contract developers, Auditors, Blockchai...",...,0.694171,0.25,11-3031,11-0000,11-3000,11-3030,Financial Managers,Management Occupations,Operations Specialties Managers,Financial Managers
