In [1]:
import pandas as pd
import numpy as np
from decimal import Decimal
from llms import gemini
from llms import chatGPT
from sklearn.metrics.pairwise import cosine_similarity
from lemmatize import lemmatize
# Instantiate the LLM client wrappers imported from the project-local `llms` module.
# x_chat supplies get_embedding() for the cells below; x_gemini is not referenced
# in the visible cells.
x_chat = chatGPT()
x_gemini = gemini()




In [4]:
# Load the example table and add a lemmatized copy ("<col>_lemmatize") of every
# job description and job title column (Job1..Job3, Job1_title..Job3_title).
sample_df = pd.read_csv('../output/df_with_examples.csv')
job_cols = [[f"Job{n}", f"Job{n}_title"] for n in range(1, 4)]
for pair in job_cols:
    for source_col in pair:
        sample_df[source_col + "_lemmatize"] = sample_df[source_col].apply(lemmatize)

lemmatize_cols = [[f"Job{n}_lemmatize", f"Job{n}_title_lemmatize"] for n in range(1, 4)]

# Output column names, in the same order the embeddings are appended below.
embedding_cols = [
    col_name
    for n in range(1, 4)
    for col_name in (f"Job{n}_embedding", f"Job{n}_title_embedding")
]

# Request one embedding per lemmatized text; only the first row is processed.
results = []
for row_idx, record in sample_df.head(1).iterrows():
    name = record["organization name"]
    print(f"Processing {str(row_idx), len(results)}: {name}")
    job_embeddings = [name]
    for text_col in (c for pair in lemmatize_cols for c in pair):
        text = record[text_col]
        # Missing text gets a NaN placeholder so the row stays aligned with embedding_cols.
        job_embeddings.append(np.nan if pd.isnull(text) else x_chat.get_embedding(text))
    results.append(job_embeddings)

# Attach the embeddings to the example table; merge() defaults to an inner join,
# so only organizations present in results_df are kept.
results_df = pd.DataFrame(results, columns=['organization name'] + embedding_cols)
sample_df = sample_df.merge(results_df, on="organization name")

sample_df.to_csv('../output/df_with_examples_embeddings.csv', index=False)

Processing ('0', 0): Pika


In [5]:
# Load the O*NET exposure table and split each stringified task embedding
# ("[a, b, ...]") into a list of number strings; conversion to float happens later.
df_exp = pd.read_csv('../output/onet/gpt_exposure_embeddings_lemmatize.csv')
df_exp["task_embedding"] = df_exp["task_embedding"].apply(
    lambda raw: raw.strip("[]").split(", ")
)



In [6]:
# Convert the split task-embedding tokens to floats and store them back on df_exp.
parsed_task_vectors = [
    [float(token) for token in tokens]
    for tokens in df_exp["task_embedding"]
]
df_exp["task_embedding"] = parsed_task_vectors

# Stack the vectors into one 2-D array (one row per task) for cosine_similarity.
task_embeddings = np.vstack(np.array(parsed_task_vectors))
task_embeddings.shape

(11582, 3072)

In [7]:
# Parse the stringified title embeddings ("[a, b, ...]") into lists of floats.
# The values are read from `title_embedding` itself; iterating `task_embedding`
# here would silently overwrite the title vectors with the task vectors.
title_embeddings = [
    [float(value) for value in raw.strip("[]").split(", ")]
    for raw in df_exp["title_embedding"]
]
df_exp["title_embedding"] = title_embeddings


In [8]:
def _parse_embedding(raw):
    """Convert a stringified embedding such as "[0.1, 0.2]" into a list of floats.

    Non-string values are returned unchanged. This covers the NaN placeholders
    written for missing job text, which come back from the CSV as floats and
    have no ``strip`` method.
    """
    if isinstance(raw, str):
        return [float(value) for value in raw.strip("[]").split(", ")]
    return raw


# Reload the examples with their saved embeddings and add placeholder O*NET columns.
sample_df = pd.read_csv('../output/df_with_examples_embeddings.csv')
sample_df.loc[:, ["Job1_ONET", "Job2_ONET", "Job3_ONET"]] = 'N/A'

# The CSV round-trip turned every embedding list into text; parse them back.
for i in range(3):
    for column in (f"Job{i+1}_embedding", f"Job{i+1}_title_embedding"):
        sample_df[column] = sample_df[column].apply(_parse_embedding)




In [9]:

# Match each example job against the O*NET task embeddings, keep the best-scoring
# tasks, and compare the example job title with each matched O*NET title embedding.
ONET_RESULT_COLUMNS = ["organization_name","example","situation_conf_interval", "situation_conf_interval_reasoning","job","onet_title","onet_task","example_job_title","task_similarity", "job_title_similarity"]
ONET_OUTPUT_PATH = "../output/onet_df.csv"
TOP_K_TASKS = 3  # number of best-matching O*NET tasks kept per job


def _is_missing(value):
    """Return True for None or a float NaN (the placeholder used for absent text/embeddings)."""
    return value is None or (isinstance(value, float) and np.isnan(value))


def _save_onet_results(rows):
    """Write the accumulated match rows to ONET_OUTPUT_PATH and return them as a DataFrame."""
    frame = pd.DataFrame(rows, columns=ONET_RESULT_COLUMNS)
    frame.situation_conf_interval = frame.situation_conf_interval.astype("float")
    frame.to_csv(ONET_OUTPUT_PATH, index=False)
    return frame


results = []
# To resume an interrupted run, seed `results` from the previous checkpoint instead:
# results = list(pd.read_csv("../output/onet_df.csv").values)

# Loop-invariant candidate rows: [Title, Task, description, title_embedding].
onet_rows = df_exp[['Title', 'Task', 'description','title_embedding']].values

# Only the first organization is processed (head(1)).
for index, row in sample_df.head(1).iterrows():
    print("***********************")
    print(f"Processing {index}")
    name = row["organization name"]
    print(name)
    print(row["generated_description"])

    for i in range(1):  # only Job1 / Example1 is matched
        job_text = row[f'Job{i+1}_lemmatize']
        print(job_text)
        example = row[f"Example{i+1}"]
        job = row[f'Job{i+1}']
        job_title = row[f'Job{i+1}_title']
        print(f"\nProcessing Example: {example} \nJob: {job}\n")

        title_vector = row[f'Job{i+1}_title_embedding']
        if _is_missing(job_text) or _is_missing(title_vector):
            # Nothing to embed or compare for this job; skip instead of failing downstream.
            print(f"Skipping Job{i+1}: missing job text or title embedding")
            continue

        # NOTE(review): the job embedding is re-requested from the API here although
        # sample_df also carries a `Job{i}_embedding` column; confirm this is intended.
        job_embedding = np.array(x_chat.get_embedding(job_text)).reshape(1,-1)
        job_title_embedding = np.array(title_vector).reshape(1,-1)
        # cosine_similarity's third positional parameter is `dense_output`, not a
        # metric name, so no extra argument is passed.
        cosine_sim = cosine_similarity(job_embedding, task_embeddings)

        # Rank by score only. Sorting (score, ndarray) tuples would compare the
        # arrays whenever two scores tie and raise ValueError.
        scores = cosine_sim[0]
        best_first = np.argsort(scores)[::-1][:TOP_K_TASKS]
        top3 = [(scores[k], onet_rows[k]) for k in best_first]
        for cosine_sim_job, onet in top3:
            # title_embedding is the fourth selected column, i.e. index 3.
            onet_title_embedding = np.array([onet[3]])
            cosine_sim_title = cosine_similarity(job_title_embedding, onet_title_embedding)[0][0]
            print(f"{onet[0]}: {onet[1]} {onet[2]} \nCosine Similarity: {cosine_sim_job}, Person Cosine Similarity: {cosine_sim_title}")
            results.append([name,example,row[f"situation{i+1}_conf_interval"], row[f"situation{i+1}_conf_interval_reasoning"],job,onet[0],onet[1],job_title, cosine_sim_job,cosine_sim_title])

    # Periodic checkpoint every fifth index.
    if index % 5 == 0:
        print("**Saving Results**\n\n")
        onet_df = _save_onet_results(results)

    print("***********************\n\n")

# Final checkpoint so rows gathered after the last multiple-of-5 index are not lost.
if results:
    onet_df = _save_onet_results(results)


***********************
Processing 0
Pika
Pika is an AI-powered platform that allows users to create and edit films using captions and still images. The platform simplifies the video creation process by leveraging AI to generate dynamic visuals and sequences, making film production accessible to a wider audience.
Animators Multimedia Artists create storyboards illustrate scene action sequence film television program medium

Processing Example: A filmmaker uses Pika to create a quick storyboard for their short film, turning their initial script into a visual representation of the scenes, complete with dynamic camera angles and transitions. 
Job: Animators and Multimedia Artists that create storyboards to illustrate scenes or action sequences for films, television programs, or other media.

Special effects artists and animators that apply story development, directing, cinematography, and editing to animation to create storyboards that show the flow of the animation and map out key scenes

In [25]:
# Manual re-save of the match table: cast the confidence-interval column to float
# and overwrite the CSV. Relies on `onet_df` already existing in the kernel.
onet_df["situation_conf_interval"] = onet_df["situation_conf_interval"].astype("float")
onet_df.to_csv("../output/onet_df.csv", index=False)

In [10]:
# Two job descriptions to compare by hand. s1 matches the "Job:" text printed by
# the matching cell; s2 appears to be the candidate description printed after it.
s1 = "Animators and Multimedia Artists that create storyboards to illustrate scenes or action sequences for films, television programs, or other media."
s2 = "Special effects artists and animators that apply story development, directing, cinematography, and editing to animation to create storyboards that show the flow of the animation and map out key scenes and characters."



In [89]:
# Debug: show a lemmatized job title; relies on `row` and `i` still being bound
# from an earlier loop in the kernel.
row[f'Job{i+1}_title_lemmatize']

'Computer Information Systems Managers'

In [12]:
# Lemmatize both descriptions in place. Re-running this cell lemmatizes the
# already-lemmatized text again.
s1, s2 = (lemmatize(text) for text in (s1, s2))

In [80]:
# Debug: display the current value of s2.
s2

"Computer Information Systems Managers evaluate cloud service offering different provider select suitable option company 's need"

In [13]:
# Embed the two descriptions and compute their cosine similarity directly.
e1 = x_chat.get_embedding(s1)
# Alternative: reuse a stored task embedding instead of calling the API.
#e1 = df_exp[df_exp.description.str.contains("managers that evaluate the organization")].task_embedding.iloc[0]
e2 = x_chat.get_embedding(s2)

#e2 = x_chat.get_embedding("Computer Information Systems Managers evaluate cloud service offering different provider select suitable option company 's need")
# cosine_similarity expects 2-D inputs, so each vector becomes a single-row matrix.
e1 = np.array(e1).reshape(1,-1)
e2 = np.array(e2).reshape(1,-1)
# The third positional parameter of cosine_similarity is `dense_output`, not a
# metric name, so no extra argument is passed.
cosine_similarity(e1, e2)[0][0]

0.6625274673336069

In [31]:
# Debug: peek at the first rows of df_exp.
# NOTE(review): the recorded NameError shows this ran in a kernel where df_exp
# had not been loaded; run the df_exp loading cell first.
df_exp.head()

NameError: name 'df_exp' is not defined

In [87]:
# Debug: inspect element 4 of the last `onet` row bound by the matching loop.
# NOTE(review): that loop selects only four columns, so index 4 is out of range
# there; confirm the intended index.
onet[4]

'Computer Information Systems Managers'

In [52]:
# Debug: display the job embedding left bound by the matching loop.
job_embedding

array([[-0.0173959 , -0.00603199, -0.00627242, ..., -0.00116768,
         0.01875363, -0.00108724]])

In [53]:
# Debug: display the embedding array currently bound to e2.
e2

array([[-2.50668246e-02,  2.44815517e-02, -2.90852250e-03, ...,
        -6.56648027e-03,  1.38823958e-02, -7.29361054e-05]])