In [2]:
import pandas as pd
import numpy as np
import json
import re
from llms import gemini
from llms import chatGPT
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE


def get_ONET(task_sim, title_sim, conf_interval=True):
    """Load the O*NET match table, filter it by thresholds, and attach weights.

    Reads ``../output/onet_df.csv``, keeps rows whose ``task_similarity`` and
    ``job_title_similarity`` are strictly greater than the given thresholds,
    then keeps rows whose ``situation_conf_interval`` is strictly greater than
    ``conf_interval``. The surviving rows are passed through
    ``compute_ONET_weights`` and the resulting row count is printed.

    Args:
        task_sim: Exclusive lower bound for ``task_similarity``.
        title_sim: Exclusive lower bound for ``job_title_similarity``.
        conf_interval: Exclusive lower bound for ``situation_conf_interval``.
            NOTE(review): the default is ``True``, so the comparison is
            ``> True``; confirm the column's dtype and the intended threshold.

    Returns:
        The filtered DataFrame as returned by ``compute_ONET_weights``.
    """
    matches = pd.read_csv("../output/onet_df.csv")

    # First pass: both similarity scores must exceed their thresholds.
    passes_similarity = (matches["task_similarity"] > task_sim) & (
        matches["job_title_similarity"] > title_sim
    )
    matches = matches.loc[passes_similarity]

    # Second pass: confidence filter, then renumber the rows from zero.
    passes_confidence = matches["situation_conf_interval"] > conf_interval
    matches = matches.loc[passes_confidence].reset_index(drop=True)

    weighted = compute_ONET_weights(matches)
    print(f"{len(weighted)} tasks with task similarity > {task_sim} and title similarity > {title_sim}")
    return weighted

def compute_ONET_weights(onet_df):
    """Add an ``onet_weight`` column equal to 1 / (rows per organization).

    For every ``organization_name`` the non-null ``onet_title`` entries are
    counted, the reciprocal of that count is taken, and the result is merged
    back onto ``onet_df`` (inner join on ``organization_name``).

    Args:
        onet_df: DataFrame with ``organization_name`` and ``onet_title`` columns.

    Returns:
        A new DataFrame: the input rows plus the ``onet_weight`` column.
    """
    titles_per_org = onet_df.groupby("organization_name")["onet_title"].count()
    weight_table = (
        titles_per_org
        .apply(lambda n_titles: 1 / n_titles)
        .rename("onet_weight")
        .reset_index()
    )
    return onet_df.merge(weight_table, on="organization_name")


def get_startup_generated_LLM_titles(onet_df):
    """Count each ``example_job_title`` and divide every count by three.

    Args:
        onet_df: DataFrame with an ``example_job_title`` column.

    Returns:
        A two-column DataFrame (title, count / 3) ordered by descending count,
        produced by ``value_counts().reset_index()``.
    """
    def _one_third(count):
        # NOTE(review): the divisor 3 is hard-coded; its meaning is not stated here.
        return count / 3

    title_counts = onet_df["example_job_title"].value_counts()
    scaled_counts = title_counts.apply(_one_third)
    return scaled_counts.reset_index()

def print_top_100_ONET_tasks(onet_df):
    """Print the 100 most frequent ``onet_task`` values with their counts.

    Each task is printed as ``Task = <task>, Frequency = <count>`` followed by
    a blank line, in descending order of frequency.

    Args:
        onet_df: DataFrame with an ``onet_task`` column.

    Returns:
        None. The function only prints.
    """
    # Use one name for the frame throughout: the previous version stored it
    # under a different name than the one it read back, so every call raised
    # NameError before printing anything.
    top_100_tasks = onet_df['onet_task'].value_counts().reset_index().head(100)
    # Rename positionally so this works whatever labels reset_index() produced.
    top_100_tasks.columns = ['Task', 'Frequency']
    for task, frequency in zip(top_100_tasks['Task'], top_100_tasks['Frequency']):
        print(f"Task = {task}, Frequency = {frequency}\n")

def get_startup_data():
    """Join startup metadata with the O*NET matches and SOC code lookups.

    Reads ``../output/df_with_examples.csv``, keeps a fixed set of descriptive
    columns, and inner-joins them with ``onet_df`` on the organization name.
    The result is then inner-joined with the O*NET occupation table on
    ``onet_title`` and left-joined with the SOC code table on
    ``Detailed Occupation``. Finally prints how many distinct organizations
    survived the joins.

    NOTE(review): ``onet_df`` is not a parameter; it is read from module scope,
    so it must be assigned before this function is called.

    Returns:
        The merged DataFrame.
    """
    startups = pd.read_csv("../output/df_with_examples.csv")
    kept_columns = [
        'organization name', 'founded date', 'website', 'description_all',
        'industries_parsed', 'generated_description', 'parsed_description',
        'Tasks/Jobs', 'Industry', 'Customers',
        'generated_description_conf_interval', 'parsed_description_conf_interval',
    ]
    merged = startups[kept_columns].merge(
        onet_df, left_on="organization name", right_on="organization_name"
    )

    # Attach the O*NET-SOC code for each matched occupation title.
    occupations = pd.read_csv("../input/onet/Occupation Data.csv")[["O*NET-SOC Code", "Title"]]
    occupations = occupations.rename(
        columns={"O*NET-SOC Code": "Detailed Occupation", "Title": "onet_title"}
    )
    merged = merged.merge(occupations, on="onet_title")

    # Drop the last three characters of every code before the SOC lookup
    # (presumably a ".00"-style suffix -- TODO confirm against the input file).
    merged["Detailed Occupation"] = merged["Detailed Occupation"].apply(lambda code: code[:-3])

    soc_codes = pd.read_csv("../input/soc_codes/soc_codes.csv", index_col=0)
    merged = merged.merge(soc_codes, on="Detailed Occupation", how="left")

    total_startups = len(startups["organization name"].unique())
    matched_startups = len(merged["organization_name"].unique())
    print(f"{matched_startups} startups out of {total_startups} decomposed")
    return merged



def get_example_task_embeddings(df):
    """Attach each example's embedding to the matching rows of ``df``.

    Reads ``../output/df_with_examples_embeddings.csv``, reshapes the
    ``Example1..3`` and ``Job1..3_embedding`` columns into long form, pairs them
    by organization and position digit, and inner-joins the pairs onto ``df``
    by organization name and example text.

    Args:
        df: DataFrame with ``organization name`` and ``example`` columns.

    Returns:
        The joined DataFrame, with the long-form columns (``number``,
        ``Example``, ``example_task_embedding``) placed before those of ``df``.
    """
    org_key = 'organization name'
    wide = pd.read_csv('../output/df_with_examples_embeddings.csv')

    examples_long = wide.melt(
        id_vars=[org_key],
        value_vars=['Example1', 'Example2', 'Example3'],
        var_name='number',
        value_name='Example',
    )
    embeddings_long = wide.melt(
        id_vars=[org_key],
        value_vars=['Job1_embedding', 'Job2_embedding', 'Job3_embedding'],
        var_name='number',
        value_name='example_task_embedding',
    )

    # Reduce both label columns to the position digit so they can be paired:
    # "Example2" -> "2" (last char), "Job2_embedding" -> "2" (char at index 3).
    examples_long['number'] = examples_long['number'].str[-1]
    embeddings_long['number'] = embeddings_long['number'].str[3]

    paired = examples_long.merge(embeddings_long, on=[org_key, 'number'])
    return paired.merge(df, left_on=[org_key, "Example"], right_on=[org_key, "example"])


def get_onet_task_embeddings(df):
    """Left-join O*NET task and title embeddings onto ``df`` by task text.

    Reads ``../input/gpts_labels/gpt_exposure_embeddings.csv``, keeps the
    ``Task``, ``task_embedding``, ``title_embedding`` and ``Task ID`` columns,
    renames the two embedding columns with an ``onet_`` prefix, and merges them
    onto ``df`` where ``df.onet_task`` equals ``Task``. Rows of ``df`` without
    a match are kept with missing values in the added columns.

    Args:
        df: DataFrame with an ``onet_task`` column.

    Returns:
        ``df`` extended with ``Task``, ``onet_task_embedding``,
        ``onet_title_embedding`` and ``Task ID``.
    """
    exposure = pd.read_csv('../input/gpts_labels/gpt_exposure_embeddings.csv')

    # Discard columns whose name contains "Unnamed" before selecting fields.
    named_columns = [name for name in exposure.columns if "Unnamed" not in name]
    exposure = exposure[named_columns]

    exposure = exposure[["Task", "task_embedding", "title_embedding", "Task ID"]]
    exposure = exposure.rename(
        columns={
            "task_embedding": "onet_task_embedding",
            "title_embedding": "onet_title_embedding",
        }
    )
    return df.merge(exposure, left_on="onet_task", right_on="Task", how="left")
    
def tsne():
    """Run t-SNE on the three embedding columns of the module-level ``df``.

    The string-encoded embeddings in ``example_task_embedding``,
    ``onet_task_embedding`` and ``onet_title_embedding`` are parsed into lists
    of floats (overwriting those columns in ``df``). Each column is then
    embedded with a separate ``TSNE`` fit, and the first two output axes are
    stored in ``<column>_tsne1`` / ``<column>_tsne2``. The six projection
    columns are written to ``../output/tsne.csv``.

    NOTE(review): ``df`` is not a parameter; it is read from, and mutated in,
    module scope. Because the embedding columns are overwritten with lists, a
    second call on the same ``df`` would fail at the string parsing step.
    NOTE(review): TSNE is fitted with ``n_components=3`` but only axes 0 and 1
    are kept -- confirm whether a 2-component fit was intended.
    NOTE(review): no ``random_state`` is passed, so results may vary per run.

    Returns:
        A tuple ``(projections, df)`` where ``projections`` is the six-column
        DataFrame that was written to disk.
    """
    embedding_columns = (
        "example_task_embedding",
        "onet_task_embedding",
        "onet_title_embedding",
    )

    def _parse_vector(text):
        # "[0.1, 0.2]" -> [0.1, 0.2]
        return [float(piece) for piece in text.strip("[]").split(", ")]

    # Parse every embedding column before any projection is computed.
    for column in embedding_columns:
        df[column] = df[column].apply(_parse_vector)

    # One independent t-SNE fit per embedding column, in the order listed.
    for column in embedding_columns:
        stacked = np.vstack(df[column].values)
        reducer = TSNE(n_components=3, perplexity=100, n_iter=1000)
        projected = reducer.fit_transform(stacked)
        df[[f"{column}_tsne1", f"{column}_tsne2"]] = projected[:, [0, 1]]

    projection_columns = [
        f"{column}_tsne{axis}" for column in embedding_columns for axis in (1, 2)
    ]
    projections = df[projection_columns]
    projections.to_csv("../output/tsne.csv")
    return projections, df






# Build the analysis frame step by step: filter the O*NET matches, join the
# startup metadata and SOC codes, then attach example and O*NET embeddings.
onet_df = get_ONET(task_sim=0.68, title_sim=0.4)
# get_startup_data() takes no arguments; it reads the `onet_df` assigned above.
df = get_startup_data()
df = get_example_task_embeddings(df)
df = get_onet_task_embeddings(df)
# NOTE(review): this is not the last expression of the cell and its result is
# unused, so nothing is displayed here.
df.head()
df.to_csv("../output/bls_df.csv")

# tsne() operates on the module-level `df` assigned above.
# NOTE(review): this rebinds the name `tsne` from the function to the returned
# DataFrame, so `tsne()` cannot be called again until the def is re-executed.
tsne, df = tsne()

6597 tasks with task similarity > 0.68 and title similarity > 0.4
1770 startups out of 2191 decomposed


In [12]:
# Show the current contents of `df` as this cell's output.
df

Unnamed: 0,organization name,number,Example,example_task_embedding,founded date,website,description_all,industries_parsed,generated_description,parsed_description,...,job_title_similarity,onet_weight,Detailed Occupation,Major Group,Minor Group,Broad Group,Title,Major Group Name,Minor Group Name,Broad Group Name
0,Pika,1,A content creator uses Pika to quickly create ...,"[-0.02308758907020092, -0.0024262103252112865,...",2023-01-01,pika.art,Pika is a startup that develops an AI-powered ...,"['Generative AI', 'Graphic Design', 'Video']",Pika is an AI-powered platform that simplifies...,"Tasks/Jobs: Video creation, Video editing, Scr...",...,0.760252,0.333333,27-4032,27-0000,27-4000,27-4030,Film and Video Editors,"Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a..."
1,Pika,1,A content creator uses Pika to quickly create ...,"[-0.02308758907020092, -0.0024262103252112865,...",2023-01-01,pika.art,Pika is a startup that develops an AI-powered ...,"['Generative AI', 'Graphic Design', 'Video']",Pika is an AI-powered platform that simplifies...,"Tasks/Jobs: Video creation, Video editing, Scr...",...,0.760252,0.333333,27-4032,27-0000,27-4000,27-4030,Film and Video Editors,"Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a..."
2,Pika,1,A content creator uses Pika to quickly create ...,"[-0.02308758907020092, -0.0024262103252112865,...",2023-01-01,pika.art,Pika is a startup that develops an AI-powered ...,"['Generative AI', 'Graphic Design', 'Video']",Pika is an AI-powered platform that simplifies...,"Tasks/Jobs: Video creation, Video editing, Scr...",...,0.760103,0.333333,27-4032,27-0000,27-4000,27-4030,Film and Video Editors,"Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a..."
3,Sierra,1,A customer service representative uses the con...,"[-0.026539774611592293, -0.02338397316634655, ...",2023-01-01,sierra.ai,Sierra is an AI startup that tackles essential...,"['Enterprise Software', 'SaaS']",Sierra is an AI company that develops conversa...,"Tasks/Jobs: Customer service, Lead generation,...",...,0.999999,0.333333,43-4051,43-0000,43-4000,43-4050,Customer Service Representatives,Office and Administrative Support Occupations,Information and Record Clerks,Customer Service Representatives
4,Sierra,1,A customer service representative uses the con...,"[-0.026539774611592293, -0.02338397316634655, ...",2023-01-01,sierra.ai,Sierra is an AI startup that tackles essential...,"['Enterprise Software', 'SaaS']",Sierra is an AI company that develops conversa...,"Tasks/Jobs: Customer service, Lead generation,...",...,0.999999,0.333333,43-4051,43-0000,43-4000,43-4050,Customer Service Representatives,Office and Administrative Support Occupations,Information and Record Clerks,Customer Service Representatives
5,Sierra,1,A customer service representative uses the con...,"[-0.026539774611592293, -0.02338397316634655, ...",2023-01-01,sierra.ai,Sierra is an AI startup that tackles essential...,"['Enterprise Software', 'SaaS']",Sierra is an AI company that develops conversa...,"Tasks/Jobs: Customer service, Lead generation,...",...,0.594764,0.333333,41-3091,41-0000,41-3000,41-3090,"Sales Representatives of Services, Except Adve...",Sales and Related Occupations,"Sales Representatives, Services","Miscellaneous Sales Representatives, Services"
6,Essential AI,1,A marketing team uses Essential AI's tool to a...,"[-0.00786684826016426, -0.036635298281908035, ...",2023-01-01,www.essential.ai,Essential AI creates AI solutions that enhance...,"['Information Technology', 'Software']",Essential AI develops and delivers AI solution...,"Tasks/Jobs: Text generation, Data analysis, Cu...",...,0.629328,0.5,13-1161,13-0000,13-1000,13-1160,Market Research Analysts and Marketing Special...,Business and Financial Operations Occupations,Business Operations Specialists,Market Research Analysts and Marketing Special...
7,Essential AI,1,A marketing team uses Essential AI's tool to a...,"[-0.00786684826016426, -0.036635298281908035, ...",2023-01-01,www.essential.ai,Essential AI creates AI solutions that enhance...,"['Information Technology', 'Software']",Essential AI develops and delivers AI solution...,"Tasks/Jobs: Text generation, Data analysis, Cu...",...,0.713154,0.5,13-1161,13-0000,13-1000,13-1160,Market Research Analysts and Marketing Special...,Business and Financial Operations Occupations,Business Operations Specialists,Market Research Analysts and Marketing Special...
8,Liquid AI,1,A content creator uses Liquid AI to generate m...,"[-0.008149910718202591, -0.009260959923267365,...",2023-01-01,liquid.ai,Liquid AI is a developer of AI applications th...,"['Generative AI', 'Information Technology', 'M...",Liquid AI is an AI-powered platform that offer...,"Tasks/Jobs: Content generation, Idea brainstor...",...,0.611519,1.0,27-3041,27-0000,27-3000,27-3040,Editors,"Arts, Design, Entertainment, Sports, and Media...",Media and Communication Workers,Writers and Editors
9,Lindy,1,A busy CEO uses Lindy to automatically schedul...,"[-0.008831040933728218, -0.0019290806958451867...",2023-01-01,www.lindy.ai,Your AI executive assistant. Lindy is an AI as...,"['Internet', 'Software', 'Virtual Assistant', ...",Lindy is an AI-powered executive assistant tha...,"Tasks/Jobs: Appointment scheduling, Email mana...",...,0.689654,0.333333,43-6014,43-0000,43-6000,43-6010,"Secretaries and Administrative Assistants, Exc...",Office and Administrative Support Occupations,Secretaries and Administrative Assistants,Secretaries and Administrative Assistants
