In [2]:
import pandas as pd
import numpy as np
import json
import re
from llms import gemini
from llms import chatGPT
import requests
import pandas as pd
import aiohttp
import asyncio
import nest_asyncio
import os
from sklearn.manifold import TSNE
from os import getenv
from dotenv import load_dotenv
import time
from sklearn.manifold import TSNE

load_dotenv("../.env",override=True)
GEO_KEY = getenv("GOOGLE_API_KEY")

x_chat = chatGPT()
x_gemini = gemini()


In [3]:



# Apply nest_asyncio to allow nested event loops in Jupyter
nest_asyncio.apply()

# Load area_code dataframe
area_code = pd.read_csv("../input/wage_employment_2022/area_definitions_m2022.csv")[[ 'State abbreviation','May 2022 MSA name','May 2022 MSA code ','County name (or Township name for the New England states)']]
area_code.columns = ["state", "place", "AREA", "name"]
area_code["Latitude"] = "N/A"
area_code["Longitude"] = "N/A"
area_code["Boundary"] = "N/A"

async def get_lat_long(session, msa_name):
    while True:
        try:
            url = f"https://maps.googleapis.com/maps/api/geocode/json?address={msa_name}&key={GEO_KEY}"
            async with session.get(url) as response:
                while True:
                    data = await response.json()
                    if data['status'] == 'OK':
                        bounds = data['results'][0]['geometry']
                        location = data['results'][0]['geometry']['location']
                        return location['lat'], location['lng'], bounds
                    else:
                        if data['status'] != "OVER_QUERY_LIMIT":
                            return None, None, None
                        print(f"Geocoding error for {msa_name}: {data['status']}")
                        await asyncio.sleep(2)
        except Exception as e:
            print(f"Error geocoding {msa_name}: {e}")
            return None, None, None

# Define the main asynchronous function
async def process_area_code_batch(area_code_batch):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for index, row in area_code_batch.iterrows():
            msa_name = row['name'] + " " + row["state"]
            tasks.append(get_lat_long(session, msa_name))
        
        results = await asyncio.gather(*tasks)

        for (index, (lat, lng, bounds)) in zip(area_code_batch.index, results):
            area_code_batch.at[index, 'Latitude'] = lat
            area_code_batch.at[index, 'Longitude'] = lng
            area_code_batch.at[index, 'Boundary'] = bounds

    return area_code_batch

# Function to process the entire DataFrame in batches
async def process_area_code_in_batches(area_code, batch_size=100):
    batches = [area_code.iloc[i:i + batch_size] for i in range(0, len(area_code), batch_size)]
    results = []
    for i, batch in enumerate(batches):
        print(f"Processing batch {i + 1}/{len(batches)}")
        result_batch = await process_area_code_batch(batch)
        results.append(result_batch)
        await asyncio.sleep(10)
    return pd.concat(results)


area_code = await process_area_code_in_batches(area_code, batch_size=100)


area_code.to_csv("../output/geography.csv")



In [None]:
skill_activity = pd.read_csv("../input/onet/Skills to Work Activities.csv")[["Work Activities Element Name", "Skills Element Name"]]
skill_activity.columns = ["activity", "skill"]
skill_activity["skill_activity"] = skill_activity["skill"]+" "+skill_activity["activity"]
skill_activity = x_chat.run_batch_embeddings(skill_activity, "activity")
skill_activity = x_chat.run_batch_embeddings(skill_activity, "skill")
skill_activity = x_chat.run_batch_embeddings(skill_activity, "skill_activity")
skill_activity.to_csv("../input/onet/activity_skill_embeddings.csv", index=False)



In [4]:
skills_df = pd.read_csv("../input/onet/Skills.csv")
skills_df = skills_df[skills_df["Scale Name"] == "Importance"]
skills_df = skills_df.rename(columns={"O*NET-SOC Code": "Detailed Occupation"})
skills_df["Detailed Occupation"] = skills_df["Detailed Occupation"].replace(r'\.\d{2}', '', regex=True)
skills_df = skills_df.pivot_table(index=skills_df["Detailed Occupation"], columns='Element Name', values='Data Value', fill_value=0)
skills_df

skills_df.to_csv("../input/onet/skills_by_occupation.csv", index=False)

In [9]:
skill_activity = pd.read_csv("../output/parsed_BLS_data/activity_skill_embeddings.csv")
skill_activity.activity_embedding = skill_activity.activity_embedding.apply(lambda x: [float(y) for y in x.strip("[]").split(", ")])
skill_activity.skill_embedding = skill_activity.skill_embedding.apply(lambda x: [float(y) for y in x.strip("[]").split(", ")])

embeddings = np.vstack(skill_activity['activity_embedding'].values)
tsne = TSNE(n_components=3, perplexity=100, n_iter=1000)
tsne_results = tsne.fit_transform(embeddings)
skill_activity[["activity_embedding_tsne1","activity_embedding_tsne2"]] = tsne_results[:, [0,1]]

embeddings = np.vstack(skill_activity['activity_embedding'].values)
tsne = TSNE(n_components=3, perplexity=100, n_iter=1000)
tsne_results = tsne.fit_transform(embeddings)
skill_activity[["skill_embedding_tsne1","skill_embedding_tsne2"]] = tsne_results[:, [0,1]]

embeddings = np.vstack((skill_activity['activity_embedding'] + skill_activity['activity_embedding']).values)
tsne = TSNE(n_components=3, perplexity=100, n_iter=1000)
tsne_results = tsne.fit_transform(embeddings)
skill_activity[["activity__skill_embedding_tsne1","activity_skill_embedding_tsne2"]] = tsne_results[:, [0,1]]

skill_activity.head()
skill_activity.to_csv("../output/parsed_BLS_data/activity_skill_embeddings.csv")

Unnamed: 0,activity,skill,skill_activity,activity_embedding,skill_embedding,skill_activity_embedding,activity_embedding_tsne1,activity_embedding_tsne2,skill_embedding_tsne1,skill_embedding_tsne2,activity__skill_embedding_tsne1,activity_skill_embedding_tsne2
0,Getting Information,Reading Comprehension,Reading Comprehension Getting Information,"[-0.013470755890011787, 0.041330937296152115, ...","[0.0015323176048696041, -0.0014972264179959893...","[-0.009293608367443085, 0.018798066303133965, ...",90.934624,-24.540916,-84.633789,43.747856,39.891155,76.399742
1,"Monitoring Processes, Materials, or Surroundings",Reading Comprehension,"Reading Comprehension Monitoring Processes, Ma...","[0.012661822140216827, 0.013294070027768612, -...","[0.001248258980922401, -0.0028923852369189262,...","[0.015105859376490116, 0.004223677795380354, -...",58.246986,42.169872,-6.843826,50.400127,79.543182,-14.914734
2,"Identifying Objects, Actions, and Events",Reading Comprehension,"Reading Comprehension Identifying Objects, Act...","[-0.015479780733585358, 0.015454080887138844, ...","[0.001248258980922401, -0.0028923852369189262,...","[-0.004918965045362711, 0.008949629962444305, ...",23.2073,54.966888,17.308615,6.84676,86.459229,-35.111057
3,"Judging the Qualities of Objects, Services, or...",Reading Comprehension,Reading Comprehension Judging the Qualities of...,"[-0.014297530055046082, -0.010395382530987263,...","[0.001248258980922401, -0.0028923852369189262,...","[-0.009553848765790462, -0.012345805764198303,...",23.84016,41.664207,49.824501,46.819092,65.258965,20.578312
4,Processing Information,Reading Comprehension,Reading Comprehension Processing Information,"[-0.012010408565402031, 0.021545829251408577, ...","[0.001248258980922401, -0.0028923852369189262,...","[-0.008724566549062729, 0.015466276556253433, ...",78.850693,-35.512379,-57.170654,53.241447,66.58168,54.153648


In [3]:
dwa = pd.read_csv("../input/onet/Tasks to DWAs.csv")[3:].reset_index(drop=True)
dwa = dwa[["DWA ID","DWA Title","Task"]]
# dwa = dwa.rename({"ID":"Task ID"},axis=1)
grouped = dwa.groupby("Task").aggregate({"DWA ID":"count"}).apply(lambda x: 1/x).reset_index()
grouped = grouped.rename({"DWA ID":"dwa_count"},axis=1)
dwa = dwa.merge(grouped,on="Task")
dwa_ref = pd.read_csv("../input/onet/DWA Reference.csv")[["Element Name","DWA ID"]]
dwa_ref = dwa_ref.rename({"Element Name":"activity"},axis=1)
dwa = dwa.merge(dwa_ref,on="DWA ID",how="left")
dwa.to_csv("../output/parsed_BLS_data/dwa.csv")

In [None]:
task_statements = pd.read_csv("../input/onet/Task Statements.csv")[["Task","Title"]]
task_statements = x_chat.run_batch_embeddings(task_statements,"Task")

task_statements.to_csv('../output/all_task_occ_embeddings/task_statement_embeddings.csv', index=False)

embeddings = np.vstack(task_statements['Task_embedding'].values)
tsne = TSNE(n_components=3, perplexity=100, n_iter=1000)
tsne_results = tsne.fit_transform(embeddings)
task_statements[["all_onet_tasks_embedding_tsne1","all_onet_tasks_embedding_tsne2"]] = tsne_results[:, [0,1]]

task_statements.to_csv('../output/all_task_occ_embeddings/task_statement_embeddings.csv', index=False)



In [None]:
onet_occ = pd.read_csv("../input/onet/Occupation Data.csv")[["O*NET-SOC Code","Title"]]
onet_occ.columns = ["Detailed Occupation","Title"]
onet_occ["Detailed Occupation"] = onet_occ["Detailed Occupation"].apply(lambda x: x[:-3])

onet_occ = x_chat.run_batch_embeddings(onet_occ,"Title")
onet_occ.to_csv('../output/all_task_occ_embeddings/occupation_embeddings.csv', index=False)

embeddings = np.vstack(onet_occ['Title_embedding'].values)
tsne = TSNE(n_components=3, perplexity=100, n_iter=1000)
tsne_results = tsne.fit_transform(embeddings)
onet_occ[["all_onet_occupations_embedding_tsne1","all_onet_occupations_embedding_tsne2"]] = tsne_results[:, [0,1]]

onet_occ.to_csv('../output/all_task_occ_embeddings/occupation_embeddings.csv', index=False)