In [101]:
import pandas as pd
import numpy as np
import json

def load_codes():
    """Read the SOC code table from ``../input/soc_codes/soc_codes.csv``.

    Returns:
        pandas.DataFrame: the CSV contents, with the first CSV column used as
        the index.
    """
    soc_codes_path = "../input/soc_codes/soc_codes.csv"
    return pd.read_csv(soc_codes_path, index_col=0)

def get_ONET(task_sim, title_sim, conf_interval=True):
    """Load the O*NET match table and keep only rows above the given thresholds.

    Args:
        task_sim: rows are kept only if ``task_similarity`` is strictly greater.
        title_sim: rows are kept only if ``job_title_similarity`` is strictly greater.
        conf_interval: rows are kept only if ``situation_conf_interval`` is
            strictly greater than this value.
            NOTE(review): the default is the boolean ``True``, which compares
            as 1 against numeric data -- confirm this is the intended cutoff.

    Returns:
        pandas.DataFrame: the filtered rows, re-indexed from 0 before being
        passed through ``compute_ONET_weights`` (which adds ``onet_weight``).

    Side effects:
        Prints the number of rows that remain after filtering and weighting.
    """
    matches = pd.read_csv("../output/results v3/onet_df.csv")

    # Both similarity thresholds must be exceeded.
    similar_enough = (matches.task_similarity > task_sim) & (matches.job_title_similarity > title_sim)
    matches = matches[similar_enough]

    confident = matches.situation_conf_interval > conf_interval
    matches = compute_ONET_weights(matches[confident].reset_index(drop=True))

    print(f"{len(matches)} tasks with task similarity > {task_sim} and title similarity > {title_sim}")
    return matches

def compute_ONET_weights(onet_df):
    """Attach a per-row ``onet_weight`` equal to 1 / (rows for that organization).

    The count is the number of non-null ``onet_title`` values per
    ``organization_name``, so the weights of one organization's rows sum to 1
    when none of its titles are missing.

    Args:
        onet_df: DataFrame with ``organization_name`` and ``onet_title`` columns.

    Returns:
        pandas.DataFrame: ``onet_df`` merged (default inner join on
        ``organization_name``) with the computed ``onet_weight`` column.
    """
    titles_per_org = onet_df.groupby("organization_name")["onet_title"].count()
    inverse_counts = titles_per_org.apply(lambda n_titles: 1 / n_titles)

    # After reset_index the value column still carries the name "onet_title".
    weight_table = inverse_counts.reset_index().rename(columns={"onet_title": "onet_weight"})
    return onet_df.merge(weight_table, on="organization_name")

def get_ONET_titles(df):
    """Summarise total weight per O*NET title, heaviest first.

    Args:
        df: DataFrame with ``onet_title``, ``onet_weight`` and
            ``Detailed Occupation`` columns.

    Returns:
        pandas.DataFrame: one row per ``onet_title`` holding the summed
        ``onet_weight`` (rounded to 2 decimals) and the first
        ``Detailed Occupation`` seen for that title, sorted by weight in
        descending order.
    """
    per_title = df.groupby("onet_title").agg(
        {"onet_weight": "sum", "Detailed Occupation": "first"}
    )
    per_title["onet_weight"] = per_title["onet_weight"].round(2)

    # Turn the title back into a regular column before ordering by weight.
    per_title = per_title.reset_index()
    return per_title.sort_values(by="onet_weight", ascending=False)

def get_startup_generated_LLM_titles(onet_df):
    """Count each ``example_job_title`` and scale the counts down by 3.

    NOTE(review): the divisor 3 is hard-coded; presumably each title is
    repeated three times upstream -- confirm against the data producer.

    Args:
        onet_df: DataFrame with an ``example_job_title`` column.

    Returns:
        pandas.DataFrame: the titles and their scaled counts as two columns
        (the result of ``reset_index`` on the scaled value counts), most
        frequent first.
    """
    title_counts = onet_df.example_job_title.value_counts()
    scaled_counts = title_counts.apply(lambda occurrences: occurrences / 3)
    return scaled_counts.reset_index()

def print_top_100_ONET_tasks(onet_df):
    """Print the 100 most frequent ``onet_task`` values with their counts.

    Each task is printed as ``Task = <task>, Frequency = <count>`` followed by
    a blank line, in descending order of frequency.

    Args:
        onet_df: DataFrame with an ``onet_task`` column.

    Returns:
        None. Output goes to stdout.
    """
    # The original stored the result under one name and read it under another,
    # which raised NameError on every call; a single name is used here.
    top_100_tasks = onet_df['onet_task'].value_counts().head(100)
    for task, frequency in top_100_tasks.items():
        print(f"Task = {task}, Frequency = {frequency}\n")

def get_startup_data():
    """Join startup descriptions with their O*NET matches and SOC codes.

    Reads ``../output/results v3/df_with_examples.csv`` and relies on two
    module-level names that must already exist when this is called:
    ``onet_df`` (joined on organization name) and ``codes`` (joined on
    ``onet_title`` == ``Title``). Both joins use the default inner join.

    Returns:
        pandas.DataFrame: the selected startup columns combined with the
        matching rows of ``onet_df`` and ``codes``.

    Side effects:
        Prints how many distinct startups survive the joins out of the
        distinct startups in the CSV.
    """
    startup_df = pd.read_csv("../output/results v3/df_with_examples.csv")
    cols = ['organization name', 'founded date', 'website', 'description_all', 'industries_parsed', 'generated_description', 'parsed_description', 'Tasks/Jobs', 'Industry', 'Customers', 'generated_description_conf_interval', 'parsed_description_conf_interval']

    joined = (
        startup_df[cols]
        .merge(onet_df, left_on="organization name", right_on="organization_name")
        .merge(codes, left_on="onet_title", right_on="Title")
    )

    num_startups = len(startup_df["organization name"].unique())
    num_onet_startups = len(joined["organization_name"].unique())
    print(f"{num_onet_startups} startups out of {num_startups} decomposed")
    return joined

def get_wage():
    """Load the BLS wage/employment table for detailed occupations.

    Steps, in order:
      1. keep rows whose ``O_GROUP`` is ``"detailed"``;
      2. rename ``OCC_CODE`` to ``Detailed Occupation``;
      3. turn the placeholder cells ``#`` and ``*`` into NaN and strip commas
         from every string cell;
      4. cast ``H_MEDIAN`` and ``A_MEDIAN`` to float and round ``H_MEDIAN``
         to 0 decimals.

    Returns:
        pandas.DataFrame: the cleaned wage table.
    """
    raw_wages = pd.read_csv("../input/wage_employment/wage_employment_bls.csv")

    detailed_only = raw_wages[raw_wages.O_GROUP == "detailed"]
    detailed_only = detailed_only.rename(columns={"OCC_CODE": "Detailed Occupation"})

    # Whole-cell placeholders first, then the comma removal as a regex pass.
    cleaned = detailed_only.replace("#", np.nan)
    cleaned = cleaned.replace("*", np.nan)
    cleaned = cleaned.replace({",": ""}, regex=True)

    median_cols = ["H_MEDIAN", "A_MEDIAN"]
    cleaned[median_cols] = cleaned[median_cols].astype("float")
    cleaned["H_MEDIAN"] = cleaned["H_MEDIAN"].round(0)
    return cleaned

def get_skills():
    """Build an occupation-by-skill table from ``../input/skills/Skills.csv``.

    Only rows whose ``Scale Name`` is ``"Importance"`` are used. The
    ``O*NET-SOC Code`` column is renamed to ``Detailed Occupation`` and any
    ``.NN`` (dot plus two digits) fragment is removed from it, so codes that
    differ only by that fragment are pooled together.

    Returns:
        pandas.DataFrame: pivot table indexed by ``Detailed Occupation`` with
        one column per ``Element Name``; cells hold the ``Data Value``
        aggregated with pivot_table's default (mean), and missing
        combinations are filled with 0.
    """
    raw_skills = pd.read_csv("../input/skills/Skills.csv")

    importance = raw_skills[raw_skills["Scale Name"] == "Importance"]
    importance = importance.rename(columns={"O*NET-SOC Code": "Detailed Occupation"})

    occupation_codes = importance["Detailed Occupation"].replace(r'\.\d{2}', '', regex=True)
    importance["Detailed Occupation"] = occupation_codes

    return importance.pivot_table(
        index=importance["Detailed Occupation"],
        columns='Element Name',
        values='Data Value',
        fill_value=0,
    )

def get_skills_SOC():
    """Read the precomputed skills table from ``../input/skills_df_soc_codes.csv``.

    Returns:
        pandas.DataFrame: the CSV contents, with the first CSV column used as
        the index.
    """
    return pd.read_csv("../input/skills_df_soc_codes.csv", index_col=0)

def get_wage_skills():
    """Combine the skills table with the cleaned wage table.

    Returns:
        pandas.DataFrame: inner join of ``get_skills_SOC()`` (left) and
        ``get_wage()`` (right) on ``Detailed Occupation``; occupations missing
        from either side are dropped.
    """
    wages = get_wage()
    skills = get_skills_SOC()
    return pd.merge(skills, wages, on="Detailed Occupation", how="inner")

def BLS(onet_titles, bls_df):
    """Attach the O*NET title summary to every row of the BLS table.

    Args:
        onet_titles: DataFrame with a ``Detailed Occupation`` column (left side).
        bls_df: DataFrame with a ``Detailed Occupation`` column (right side).

    Returns:
        pandas.DataFrame: right join on ``Detailed Occupation``, so every row
        of ``bls_df`` is kept and occupations absent from ``onet_titles`` get
        NaN in the title columns.
    """
    return pd.merge(onet_titles, bls_df, on="Detailed Occupation", how="right")

# Example usage
# Builds the module-level frames used by the cells below. The order matters:
# get_startup_data() reads the globals `codes` and `onet_df`, so both must be
# assigned before it is called.
codes = load_codes()
onet_df = get_ONET(task_sim=0.67, title_sim=0.1)
df = get_startup_data()
bls_df = get_wage_skills()
onet_titles = get_ONET_titles(df)
# `bls_df` is rebound here: the wage/skills table is replaced by its right join
# with the per-title weights, so re-running only this line changes the result.
bls_df = BLS(onet_titles, bls_df)


4592 tasks with task similarity > 0.67 and title similarity > 0.1
1617 startups out of 2188 decomposed


In [94]:
def _similar_title_minor_groups(raw_titles):
    """Decode one ``Job_similarity_titles`` cell into a list of group codes.

    Null cells are returned unchanged. Otherwise the cell is parsed as JSON
    and, for every entry, the first five characters of ``entry[1][1]`` are
    kept and ``"00"`` is appended.
    NOTE(review): ``entry[1][1]`` appears to be an SOC code such as
    ``27-4031`` (giving ``27-4000``) -- confirm against the producer.
    """
    if pd.isnull(raw_titles):
        return raw_titles
    return [entry[1][1][:5] + "00" for entry in json.loads(raw_titles)]


df["similar_title_groups"] = df.Job_similarity_titles.apply(_similar_title_minor_groups)

In [95]:
def _minor_group_in_similar_titles(row):
    """Return whether the row's ``Minor Group`` is among its similar-title groups.

    Rows whose ``similar_title_groups`` is not a list count as no overlap.
    """
    candidate_groups = row.similar_title_groups
    if isinstance(candidate_groups, list):
        return row["Minor Group"] in candidate_groups
    return False


df['title_overlap'] = df.apply(_minor_group_in_similar_titles, axis=1)

# Keep only the rows where the overlap holds.
df = df[df["title_overlap"]]

In [96]:
# Display the frame after the title_overlap filter (only rows where it is True).
df

Unnamed: 0,organization name,founded date,website,description_all,industries_parsed,generated_description,parsed_description,Tasks/Jobs,Industry,Customers,...,Major Group,Minor Group,Broad Group,Detailed Occupation,Title,Major Group Name,Minor Group Name,Broad Group Name,similar_title_groups,title_overlap
0,Pika,2023-01-01,pika.art,Pika is a startup that develops an AI-powered ...,"['Generative AI', 'Graphic Design', 'Video']",Pika is an AI-powered platform that transforms...,"Tasks/Jobs: Video creation, Video editing, Ima...","Video creation, Video editing, Image animation...",Video Production,"Marketers, Content creators, Social media mana...",...,27-0000,27-4000,27-4030,27-4031,"Camera Operators, Television, Video, and Film","Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a...","[27-4000, 27-4000, 27-4000, 27-4000, 27-4000, ...",True
3,Ssemble,2023-02-13,ssemble.com,Ssemble is a collaborative online video editor...,"['SaaS', 'Video Editing']",Ssemble is a collaborative online video editor...,"Tasks/Jobs: Video editing, Script writing, Noi...","Video editing, Script writing, Noise removal, ...",Video production,"Video creators, Teams, Businesses",...,27-0000,27-4000,27-4030,27-4031,"Camera Operators, Television, Video, and Film","Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a...","[27-4000, 27-4000, 27-4000, 27-4000, 27-4000, ...",True
4,Brask AI,2023-04-20,www.brask.ai,Brask is a global AI content company that reim...,"['Advertising', 'Content Creators', 'Video']",Brask AI offers a suite of AI-powered tools fo...,"Tasks/Jobs: Video localization, Dubbing, Digit...","Video localization, Dubbing, Digital double cr...",Content creation,"Content entrepreneurs, Brands, Agencies, Celeb...",...,27-0000,27-4000,27-4030,27-4031,"Camera Operators, Television, Video, and Film","Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a...","[27-1000, 27-1000, 27-1000, 27-1000, 27-1000, ...",True
5,Pika,2023-01-01,pika.art,Pika is a startup that develops an AI-powered ...,"['Generative AI', 'Graphic Design', 'Video']",Pika is an AI-powered platform that transforms...,"Tasks/Jobs: Video creation, Video editing, Ima...","Video creation, Video editing, Image animation...",Video Production,"Marketers, Content creators, Social media mana...",...,27-0000,27-4000,27-4030,27-4032,Film and Video Editors,"Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a...","[27-1000, 27-1000, 27-1000, 27-1000, 27-1000, ...",True
6,Pika,2023-01-01,pika.art,Pika is a startup that develops an AI-powered ...,"['Generative AI', 'Graphic Design', 'Video']",Pika is an AI-powered platform that transforms...,"Tasks/Jobs: Video creation, Video editing, Ima...","Video creation, Video editing, Image animation...",Video Production,"Marketers, Content creators, Social media mana...",...,27-0000,27-4000,27-4030,27-4032,Film and Video Editors,"Arts, Design, Entertainment, Sports, and Media...",Media and Communication Equipment Workers,"Television, Video, and Film Camera Operators a...","[27-1000, 27-1000, 27-1000, 27-1000, 27-1000, ...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4149,Fixify,2023-07-01,fixify.com/,Fixify is an AI powered managed service provid...,['Information Technology'],Fixify offers an AI-powered platform that hand...,"Tasks/Jobs: Device Management, Application Man...","Device Management, Application Management, Cyb...",IT Support,"Small and Medium Businesses, Startups, Enterpr...",...,15-0000,15-1200,15-1230,15-1232,Computer User Support Specialists,Computer and Mathematical Occupations,Computer Occupations,Computer Support Specialists,"[11-3000, 11-3000, 11-3000, 11-3000, 11-3000, ...",True
4150,Fixify,2023-07-01,fixify.com/,Fixify is an AI powered managed service provid...,['Information Technology'],Fixify offers an AI-powered platform that hand...,"Tasks/Jobs: Device Management, Application Man...","Device Management, Application Management, Cyb...",IT Support,"Small and Medium Businesses, Startups, Enterpr...",...,15-0000,15-1200,15-1230,15-1232,Computer User Support Specialists,Computer and Mathematical Occupations,Computer Occupations,Computer Support Specialists,"[11-3000, 11-3000, 11-3000, 11-3000, 11-3000, ...",True
4151,Ultraview AI,2023-07-05,www.ultraview.io/,Robotics & AI UltraView mission is to enhance ...,"['Industrial', 'Software']",Ultraview AI provides drone-based inspection s...,"Tasks/Jobs: Aircraft inspection, Defect identi...","Aircraft inspection, Defect identification, Da...",Aviation Maintenance,"Airlines, Maintenance organizations",...,49-0000,49-3000,49-3010,49-3011,Aircraft Mechanics and Service Technicians,"Installation, Maintenance, and Repair Occupations","Vehicle and Mobile Equipment Mechanics, Instal...",Aircraft Mechanics and Service Technicians,"[49-3000, 49-3000, 49-3000, 49-3000, 49-3000, ...",True
4153,Ultraview AI,2023-07-05,www.ultraview.io/,Robotics & AI UltraView mission is to enhance ...,"['Industrial', 'Software']",Ultraview AI provides drone-based inspection s...,"Tasks/Jobs: Aircraft inspection, Defect identi...","Aircraft inspection, Defect identification, Da...",Aviation Maintenance,"Airlines, Maintenance organizations",...,49-0000,49-3000,49-3010,49-3011,Aircraft Mechanics and Service Technicians,"Installation, Maintenance, and Repair Occupations","Vehicle and Mobile Equipment Mechanics, Instal...",Aircraft Mechanics and Service Technicians,"[49-3000, 49-3000, 49-3000, 49-3000, 49-3000, ...",True


In [98]:
# Number of distinct startups remaining in the filtered frame.
len(pd.unique(df["organization name"]))

1132

In [106]:
# Occupations whose summed startup weight exceeds 3.
bls_df.loc[bls_df["onet_weight"] > 3]

Unnamed: 0,onet_title,onet_weight,Detailed Occupation,broad_occ,Major Group,Minor Group,Broad Group,Title,Major Group Name,Minor Group Name,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
3,Project Management Specialists,11.33,13-1082,13-108,13-0000,13-1000,13-1080,Project Management Specialists,Business and Financial Operations Occupations,Business Operations Specialists,...,47.0,62.35,78.39,57500,74100,98580.0,129690,163040,,
4,Financial and Investment Analysts,61.50,13-2051,13-205,13-0000,13-2000,13-2050,Financial and Investment Analysts,Business and Financial Operations Occupations,Financial Specialists,...,48.0,62.49,84.54,60830,76880,99010.0,129970,175840,,
6,Software Developers,100.50,15-1252,15-125,15-0000,15-1200,15-1250,Software Developers,Computer and Mathematical Occupations,Computer Occupations,...,64.0,80.55,100.30,77020,101200,132270.0,167540,208620,,
51,General and Operations Managers,5.17,11-1021,,11-0000,11-1000,11-1020,General and Operations Managers,Management Occupations,Top Executives,...,49.0,77.06,111.59,46340,65180,101280.0,160290,232110,,
52,Advertising and Promotions Managers,10.50,11-2011,,11-0000,11-2000,11-2010,Advertising and Promotions Managers,Management Occupations,"Advertising, Marketing, Promotions, Public Rel...",...,63.0,90.64,,63580,88810,131870.0,188530,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,Medical Secretaries and Administrative Assistants,4.00,43-6013,,43-0000,43-6000,43-6010,Medical Secretaries and Administrative Assistants,Office and Administrative Support Occupations,Secretaries and Administrative Assistants,...,20.0,23.05,28.05,31900,36540,40640.0,47930,58340,,
523,"Secretaries and Administrative Assistants, Exc...",7.33,43-6014,,43-0000,43-6000,43-6010,"Secretaries and Administrative Assistants, Exc...",Office and Administrative Support Occupations,Secretaries and Administrative Assistants,...,21.0,25.10,29.97,30280,36330,44280.0,52200,62340,,
524,Data Entry Keyers,10.00,43-9021,,43-0000,43-9000,43-9020,Data Entry Keyers,Office and Administrative Support Occupations,Other Office and Administrative Support Workers,...,18.0,22.13,26.60,28250,32660,37790.0,46020,55330,,
538,"Farmworkers and Laborers, Crop, Nursery, and G...",3.67,45-2092,,45-0000,45-2000,45-2090,"Farmworkers and Laborers, Crop, Nursery, and G...","Farming, Fishing, and Forestry Occupations",Agricultural Workers,...,17.0,17.79,21.16,30590,32980,34470.0,37010,44010,,
