In [40]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from os import getenv
from openai import OpenAI
import pathlib
import textwrap
import google.generativeai as genai
import time
import requests
import ast
import json
from sklearn.metrics.pairwise import cosine_similarity
import re

load_dotenv("../../.env",override=True)
GOOGLE_API_KEY = getenv("GEMINI_API_KEY")
OPENAI_API_KEY = getenv("OPENAI_API_KEY")

sample_df = pd.read_csv('../output/df_with_examples_embeddings.csv')




ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:

genai.configure(api_key=GOOGLE_API_KEY)
class gemini():
    def __init__(self):
        self.model = genai.GenerativeModel('gemini-1.5-flash')
    def request(self,prompt):
        url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent'
        headers = {
            'Content-Type': 'application/json',
        }
        data = {
            "contents": [
                {
                    "parts": [
                        {
                            "text": prompt
                        }
                    ]
                }
            ]
        }
        params = {
            'key': GOOGLE_API_KEY
        }
        
        response = requests.post(url, headers=headers, json=data, params=params)
        return json.loads(response.text)

    def ask(self,prompt):
        #response = self.model.generate_content(prompt)
        response = self.request(prompt)
        if response["candidates"][0]["finishReason"] == 'SAFETY': return "N/A"
        response = response["candidates"][0]["content"]["parts"][0]["text"]
        return response

x_gemini = gemini()
x_gemini.ask("hi")



'Hi! How can I help you today? \n'

In [None]:
class chatGPT():
    def __init__(self):
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        
    # def ask(self, q):
    #     stream = self.client.chat.completions.create(
    #         model="gpt-4",
    #         messages=[{"role": "user", "content": q}],
    #         stream=True,
    #         temperature=0
    #     )
    #     response = ""
    #     for chunk in stream:
    #         if chunk.choices[0].delta.content is not None:
    #             response += chunk.choices[0].delta.content

    #     self.response = response
    #     return response

    def get_embedding(self,text, model="text-embedding-3-large"):
       text = text.replace("\n", " ")
       return self.client.embeddings.create(input = [text], model=model).data[0].embedding

x_chat = chatGPT()


In [None]:
df_exp = pd.read_csv('../output/onet/gpt_exposure_embeddings.csv')
df_exp.Embeddings = df_exp.Embeddings.apply(lambda x: x.strip("[]").split(", "))
embeddings = []
for x in df_exp.Embeddings:
    embeddings.append([float(y) for y in x])


embeddings = np.array(embeddings)
embeddings = np.vstack(embeddings)
embeddings.shape


In [None]:
# df_exp["Embeddings_Object"] = embeddings
# embeddings = df_exp["Embeddings_Object"]
# df_exp.to_csv("../output/onet/gpt_exposure_parsable_embeddings.csv")
# df_exp = pd.read_csv("../output/onet/gpt_exposure_parsable_embeddings.csv")
# df_exp.head()

In [None]:
for i in range(3):
    sample_df[f"Job{i+1}_embedding"] = sample_df[f"Job{i+1}_embedding"].apply(lambda x: [float(y) for y in x.strip("[]").split(", ")])





In [53]:
prompt = """What is the first person in this sentence? Just return 1/2 words.
Example: 
************
Sentence:  A legal secretary would normally schedule appointments and send reminders.
Noun: legal secretary
************
Your turn:
Sentence: $sentence
Noun (1/2 words):
"""
def get_person(text):
    while True:
        try:
            result = x_gemini.ask(prompt.replace("$sentence",text)).replace(" \n","")
            return result
        except Exception as e:
            print(e)
            print(f"Error processing: {e}")
            time.sleep(20)



In [None]:
sample_df[["Job1_ONET", "Job2_ONET", "Job3_ONET"]].dtypes

Job1_ONET    object
Job2_ONET    object
Job3_ONET    object
dtype: object

In [55]:
# results = []
for index, row in list(sample_df.iterrows())[801:]:
    print("***********************")
    print(f"Processing {index}")
    name = row["organization name"]
    print(name)
    print(row["generated_description"])
        
    job_results = []
    for i in range(3):  # 0, 1, 2 for Job1, Job2, Job3
        job = row[f'Job{i+1}']
        print(f"\nProcessing Job: {job}")
        example = row[f"Example{i+1}"]
        job_embedding = np.array(row[f'Job{i+1}_embedding']).reshape(1,-1)
        cosine_sim = cosine_similarity(job_embedding, embeddings, 'cosine')
        example_title = get_person(job)
        example_title_embedding = np.array([x_chat.get_embedding(example_title)])
        
        top3 = sorted(zip(cosine_sim[0], df_exp[['Title', 'Task']].values), reverse=True)[:3]
        local_results = []
        for cosine_sim_job, onet in top3:
            onet_title_embedding = np.array([x_chat.get_embedding(onet[0])])
            cosine_sim_title = cosine_similarity(job_embedding, onet_title_embedding, 'cosine')[0][0]
            print(f"{onet[0]}\nCosine Similarity: {cosine_sim_job}, Person Cosine Similarity: {cosine_sim_title}")
            results.append([name,example,job,onet[0],onet[1],example_title, cosine_sim_job,cosine_sim_title])
            local_results.append([onet[0],onet[1],example_title, cosine_sim_job,cosine_sim_title])
        job_results.append(json.dumps(local_results))
    sample_df.loc[index, ["Job1_ONET", "Job2_ONET", "Job3_ONET"]] = job_results

    if index % 5 == 0:
        print("**Saving Results**\n\n")
        onet_df = pd.DataFrame(results, columns=["organization_name","example","job","onet_title","onet_task","example_job_title","task_similarity", "job_title_similarity"])
        onet_df.to_csv("../output/onet_df.csv", index=False)
    if index % 30 == 0:
        sample_df.to_csv('../output/df_with_onet.csv', index=False)
        
    print("***********************\n\n")


***********************
Processing 801
The Startup Valley
The Startup Valley offers a range of AI-powered IT solutions and services designed to help businesses adapt to the rapidly changing digital landscape. Their services are tailored to meet the specific needs of modern businesses, helping them stay ahead of the curve and achieve their goals. 


Processing Job: A system administrator at a business performs routine IT tasks like backing up data and installing software updates.
Bioinformatics Technicians
Cosine Similarity: 0.5598774035993084, Person Cosine Similarity: 0.2440712238928286
Network and Computer Systems Administrators
Cosine Similarity: 0.5559693931818895, Person Cosine Similarity: 0.4580006424912139
Network and Computer Systems Administrators
Cosine Similarity: 0.5437653463707393, Person Cosine Similarity: 0.4580115427783257

Processing Job: A QA engineer is responsible for testing and debugging software.
Software Quality Assurance Analysts and Testers
Cosine Similarity: 

In [47]:
sample_df.loc[index, ["Job1_ONET", "Job2_ONET", "Job3_ONET"]] = job_results

In [41]:
onet_df.head()

Unnamed: 0,organization_name,example,job,onet_title,onet_task,example_job_title,task_similarity,job_title_similarity
0,Advoria,A lawyer uses Advoria to set up an automated b...,Legal Secretary schedules client appointments.,Legal Secretaries and Administrative Assistants,Schedule and make appointments.,Legal Secretary \n,0.728097,0.529373
1,Advoria,A lawyer uses Advoria to set up an automated b...,Legal Secretary schedules client appointments.,Medical Secretaries and Administrative Assistants,Schedule and confirm patient diagnostic appoin...,Legal Secretary \n,0.591421,0.440896
2,Advoria,A lawyer uses Advoria to set up an automated b...,Legal Secretary schedules client appointments.,Legal Secretaries and Administrative Assistants,"Attend legal meetings, such as client intervie...",Legal Secretary \n,0.587042,0.529373
3,Advoria,A law firm uses Advoria to collect necessary c...,Legal Secretary collects and organizes client ...,Legal Secretaries and Administrative Assistants,Assist attorneys in collecting information suc...,Legal Secretary \n,0.663853,0.546011
4,Advoria,A law firm uses Advoria to collect necessary c...,Legal Secretary collects and organizes client ...,Legal Secretaries and Administrative Assistants,"Organize and maintain law libraries, documents...",Legal Secretary \n,0.613759,0.546078


In [None]:
job_embedding.shape

(3072, 1)