# Example - job embedding, resume embedding, job match generator

Here's what we're doing here:
 - Pulling in Job Descriptions (from a file here, this should be with a daily DB Pull) and using an LLM to structure them into several relevant text sections
 - Pulling in a desired Resume (again, from a file here, this should come from S3 on demand) and getting it into text format (from .PDF or .DOCX format)
 - Using an LLM to structure the resumes into several relevant text sections
 - Using python libraries we wrote to provide embedding for both resumes and job descriptions (here stored locally, these should go into our DB to be used later)
 - Using similarity metrics to generate best jobs for a person (to be done on demand or daily)

Then we have a few examples to look at.  This code is designed to be an example of how to use all these libraries.

# Tunable parameters and inputs
Here we get to decide:
 - Where the scraped jobs file is for today, and our credential file
 - The "truncation list" of vector embeddings (for example, [20,75] would mean "first compare the first 20 dimensions of an embedding vector for similarity, then compare a 75-length vector for the ones that pass, then finally compare all dimensions")
 - relevant fields to be compared for both jobs and applicants
 - A corresponding threshold list (cosine similarity threshold that each comparison must pass)

 ## NOTE:

At present, the files below have to exist.  Soon we will be reading from an S3 location instead of a static json file.

In [None]:
from datetime import datetime


# TODO:  Put the BOW/OpenAI embedders on a switch here
# TODO:  Something needs fixing - we apply a threshold to the comparison between 'jobsWanted_concat'
#        and the job "title", yet we still recommend jobs that have nothing in common in those fields
#        (so their dot product should be 0).  Figure it out!

# --- Where today's scraped jobs live ---------------------------------------
# True  -> read the local JSON file named below.
# False -> read today's file from the S3 bucket named below.
use_local_scraped_file = False
scraped_jobs_json_filename = './example_data_fromdb/new-Patagonia-jobs_05-28-2023.json'

s3_scraped_bucket_name = 'jobs-scraped-uploads'
s3_scraped_bucket_prefix = ''
# Today's date as MM-DD-YYYY; it is embedded in both S3 object names below.
datestring = datetime.today().strftime('%m-%d-%Y')
s3_scraped_bucket_filename = '_final_{}.json'.format(datestring)

# Object name for the recommendations output (when writing to S3).
s3_scraped_bucket_outfilename = '_final_recommendations_{}.json'.format(datestring)


# --- Credentials ------------------------------------------------------------
credentials_filename = "./credentials.yml"

# --- Embedding / comparison tunables ---------------------------------------
# Truncation list for the OpenAI embedder:
# vector_trunc_list = [20]
# Truncation list for the Bag of Words embedder:
vector_trunc_list = []

vector_comparison_threshold_list = [0.4]
full_vector_comparison_threshold = 0.05  # 0.4
num_top_jobs = 3

# --- Fields to compare -------------------------------------------------------
# Note that the first comparison field carries the "full weight" final comparator.
# Earlier combinations, kept for reference:
#
# jobs_relevant_fields = ['fullJobDescription','title']
# applicant_relevant_fields = ['fullText','title']
# applicant_relevant_fields = ['resume_rawtext']
# jobs_relevant_fields = ['fullJobDescription']
#
# applicant_relevant_fields = ['resume_rawtext','jobsWanted_concat']
# jobs_relevant_fields = ['fullJobDescription','fullJobDescription']
#
# applicant_relevant_fields = ['resume_rawtext','jobsWanted_concat']
# jobs_relevant_fields = ['fullJobDescription','title']
#
# applicant_relevant_fields = ['locations_concat', 'jobsWanted_concat','resume_rawtext']
# jobs_relevant_fields = ['locations','title','fullJobDescription']
#
# applicant_relevant_fields = ['resume_rawtext','locations_concat', 'jobsWanted_concat']
# jobs_relevant_fields = ['fullJobDescription','locations','title']
#
# last known good:
# applicant_relevant_fields = ['resume_rawtext', 'jobsWanted_concat']
# jobs_relevant_fields = ['fullJobDescription','title']
#
# applicant_relevant_fields = ['jobsWanted_concat']
# jobs_relevant_fields = ['title']

# Currently active (experimental) combination:
applicant_relevant_fields = ['locations_concat', 'jobsWanted_concat', 'resume_rawtext']
jobs_relevant_fields = ['locations_concat', 'title', 'fullJobDescription']

# Names of the unique-id keys in job records and applicant records.
uid_name_jobs = 'uuid'
uid_name_applicant = 'customer_uuid'

# --- AWS resources -----------------------------------------------------------
# DynamoDB tables, and the S3 bucket/prefix where the resumes can be found:
dynamo_people_table_name = 'cheeki-job-automation-user-table'
dynamo_appliedjobs_table_name = 'cheeki-jobs-applied-to'

s3_bucket_name = 'job-automation-uploads'
s3_bucket_prefix = 'public'

# desired_job_file = './example_data_fromdb/example_resume_files/Vickers Financial_Ryan Tang_Compliance Officer.pdf'


# API Credentials and Includes

In [11]:
from __future__ import print_function
#!pip install sklearn
#!pip install exceptions
#!pip install boto3
#!nltk.download('stopwords')
from jd_tools import ContentEmbedder, JobScorer, CheekiFileHandler


import yaml
#!pip install boto3
#!pip install python-docx
import boto3
import numpy as np
import json
#!pip install nltk


# Load the secrets from the credentials YAML; the file is only needed while parsing.
with open(credentials_filename, 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

creds = cfg['creds']
our_chatgpt_key = creds['chatgpt_key']
# AWS credentials
aws_access_key_id = creds['aws_access_key']
aws_secret_access_key = creds['aws_secret_key']
aws_region_name = 'us-west-2'  # Replace with your desired AWS region


# Set up the various handlers we'll need.
filehandler = CheekiFileHandler()

# One embedder for jobs and one for applicants, both given the same API key.
job_embedder = ContentEmbedder()
job_embedder.add_api_key(our_chatgpt_key)

applicant_embedder = ContentEmbedder()
applicant_embedder.add_api_key(our_chatgpt_key)


# Both AWS handles are built from the same credentials and region.
aws_session_kwargs = dict(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region_name,
)
# DynamoDB resource (applicant and applied-jobs tables)...
dynamodb = boto3.resource('dynamodb', **aws_session_kwargs)
# ...and an S3 client for file IO.
s3 = boto3.client('s3', **aws_session_kwargs)


# TODO: Use the new S3 URL to get resumes instead of searching for them(?).
# TODO: deal with openai resume length truncation, consider pre-processing

# Get and Encode the Job Descriptions

In [None]:
import os

# Load today's scraped jobs, either from the local example file or from S3.
if use_local_scraped_file:
    jobs_json_path = scraped_jobs_json_filename
else:
    # The download is written under ./temp_files/; make sure that directory
    # exists first, otherwise the download fails on a fresh checkout.
    os.makedirs('./temp_files', exist_ok=True)
    jobs_json_path = './temp_files/' + s3_scraped_bucket_filename
    s3.download_file(s3_scraped_bucket_name, s3_scraped_bucket_filename, jobs_json_path)

with open(jobs_json_path) as f:
    jobs_data_raw = json.load(f)

# Show the first record as a quick sanity check.
print(jobs_data_raw[0:1])

# Make 'locations_concat' (a single space-joined string built from 'locations')
# for use later on.  A job with no 'locations' gets ' ', the same placeholder the
# applicant records use when an applicant has no locations.
for job in jobs_data_raw:
    job_locations = job.get('locations')
    if job_locations is None:
        job['locations_concat'] = ' '
    else:
        job['locations_concat'] = ' '.join(job_locations)


# Set up our BOW embedder using the job descriptions plus the location strings.
# NOTE(review): 'title' is one of jobs_relevant_fields but job titles are not part
# of this corpus - confirm whether setup_bow_embedder needs them too.
# job_embedder.setup_bow_embedder([i['fullJobDescription'] for i in jobs_data_raw])
job_embedder.setup_bow_embedder(
    [job['fullJobDescription'] for job in jobs_data_raw]
    + [job['locations_concat'] for job in jobs_data_raw]
)

In [None]:
# jobs_data_raw2 = jobs_data_raw.copy()
# for row_ind in range(len(jobs_data_raw2)):
#     jobs_data_raw2[row_ind]['locations_concat'] = ' '.join(jobs_data_raw2[row_ind]['locations'])

# jobs_data_raw2

In [None]:
# Run the job embedder over the raw job records, passing the job fields to
# compare, the truncation list and the name of the jobs' unique-id key.
jobs_embedded_json_list = job_embedder.embed_content(
    jobs_data_raw,
    jobs_relevant_fields,
    vector_trunc_list,
    uid_name_jobs,
)
# for debug:
# jobs_embedded_json_list[0:1]

# Get and Encode Applicant's Resumes

Remember: in our product, we want to basically do this on-demand as people put their info into the system.

Here we're going to: 
 - Get a pdf or .doc file
 - grind it up to get the text out of it
 - (optionally - TODO) use an LLM to subdivide the resume into text segments
 - embed it in the way that we did the jobs

In [None]:
# # Connect to DynamoDB
# dynamodb = boto3.resource('dynamodb', aws_access_key_id=aws_access_key_id,
#                           aws_secret_access_key=aws_secret_access_key,
#                           region_name=aws_region_name)
# # and to S3 for file IO
# s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, 
#                   aws_secret_access_key=aws_secret_access_key, 
#                   region_name=aws_region_name)

import os

# Retrieve all applicant records from the DynamoDB table.
# A single scan() call returns only one page of results, so keep following
# LastEvaluatedKey until the whole table has been read.
table = dynamodb.Table(dynamo_people_table_name)
table_response = table.scan()
applicant_items = list(table_response['Items'])
while 'LastEvaluatedKey' in table_response:
    table_response = table.scan(ExclusiveStartKey=table_response['LastEvaluatedKey'])
    applicant_items.extend(table_response['Items'])

people_uids = []
resume_fulltexts = []
applicant_records = []

# Resumes are downloaded into ./temp_files/; make sure it exists.
os.makedirs('./temp_files', exist_ok=True)

# For every applicant with a completed profile and a resume file name, find the
# resume in S3, extract its text and build a structured applicant record.
counter = 0
for item in applicant_items:
    file_name = item.get('resumeFileName')
    is_complete = item.get('isInitialProfileFormCompleted')
    applicant_id = item.get('id')  # not named `id`, to avoid shadowing the builtin
    if not (file_name and is_complete and applicant_id):
        continue

    # List everything in this applicant's folder and take the first key that
    # contains the resume file name.  'Contents' is absent from the response
    # when nothing matches the prefix, hence the .get() with a default.
    s3_response = s3.list_objects_v2(Bucket=s3_bucket_name, Prefix=s3_bucket_prefix + '/' + applicant_id)
    target_file = next(
        (obj['Key'] for obj in s3_response.get('Contents', []) if file_name in obj['Key']),
        None,
    )
    if target_file is None:
        continue

    # Progress indicator: print every tenth resume found.
    if counter % 10 == 0:
        print(target_file)
    counter = counter + 1

    local_path = './temp_files/' + file_name
    s3.download_file(s3_bucket_name, target_file, local_path)
    content = filehandler.text_from_file(local_path)
    if not content or len(content) <= 1:
        # No usable text came out of this file; skip the applicant.
        continue

    # We have a resume file, and one we managed to read.  Hang onto these:
    people_uids.append(applicant_id)
    resume_fulltexts.append(content)

    # Missing lists become ' ' so the concatenated fields are always strings.
    locs_wanted = item.get('locations')
    locs_wanted = ' ' if locs_wanted is None else ' '.join(locs_wanted)
    jobs_wanted = item.get('jobsWanted')
    jobs_wanted = ' ' if jobs_wanted is None else ' '.join(jobs_wanted)

    applicant_records.append({
        'customer_uuid': applicant_id,
        'resume_rawtext': content,
        'payRequest': item.get('desiredSalaryRange'),
        'locations': item.get('locations'),
        'membershipLevel': item.get('membershipLevel'),
        'jobsWanted_concat': jobs_wanted,
        'locations_concat': locs_wanted,
    })
                # print(item)
            
        #s3://job-automation-uploads/public/591c63c0-3933-4a74-87a8-b460dccb6126/resume--1682308070874-Robert_Shaw-bk_sans_009106 (3).pdf

And make all the structured resume/data sets:

In [None]:
#people_uids
#resume_fulltexts
# Peek at the first two structured applicant records...
applicant_records[:2]
# jobs_embedded_json_list

# ...and at the raw resume text of the very first applicant.
applicant_records[0]['resume_rawtext']

#test_embedder = LLMEmbedder()
#test_embedder.

(the version that uses a local pdf file instead - deprecated)

In [None]:
# pdf_fulltext = pdfminer.high_level.extract_text(desired_job_file)

# #structure this text into our desired format
# # NOTE: Replace this with the applicant UUID, and a database query to get their record:

# applicant_record = {'title': 'Senior Accommodation Specialist',
#   'payRequest': {'lowestPay': 103450,
#   'targetHiringPay': 130000},
#   'yearsExpParsed': {'Years': 7},
#   'workType': [],
#   'fullText': pdf_fulltext,
#   'customer_uuid': 'e5d175a0-88eb-45e7-acc2-27f0ae5e4257'}

#[len(i[applicant_relevant_fields[0]]) for i in applicant_records]


In [None]:
#applicant_records

In [None]:
#debug - just do a few people
#applicant_embedded_json_list = [job_embedder.embed_content([i],applicant_relevant_fields, vector_trunc_list,uid_name_applicant) for i in applicant_records[0:3]]

#score all the people:
#applicant_embedded_json_list = [job_embedder.embed_content([i],applicant_relevant_fields, vector_trunc_list,uid_name_applicant) for i in applicant_records]

# Embed the applicants one record at a time, so each entry of
# applicant_embedded_json_list is the result of one single-record embed_content call.
# job_embedder is used on purpose here: in the visible cells it is the only
# embedder on which setup_bow_embedder was called.
applicant_embedded_json_list = []
for applicant in applicant_records:
    # Report the length of the first relevant field before embedding.
    text_length = len(applicant[applicant_relevant_fields[0]])
    print('trying a string with ' , text_length, 'elements:')
    applicant_embedded_json_list.append(
        job_embedder.embed_content(
            [applicant],
            applicant_relevant_fields,
            vector_trunc_list,
            uid_name_applicant,
        )
    )


In [None]:
#applicant_records

In [None]:
#applicant_embedded_json_list

# SAVE POINT - just to save compute/API costs

In [None]:
import pickle


# Dumb global pickle handlers.
def psave(filename, *args):
    """Pickle the named module-level globals into ``filename``.

    Each entry of ``args`` is the *name* (a string) of a global variable.  The
    values are gathered into one ``{name: value}`` dict and written as a single
    pickle.  A name that is not a global raises ``KeyError`` before the file is
    opened.
    """
    module_globals = globals()
    snapshot = {name: module_globals[name] for name in args}
    with open(filename, 'wb') as out:
        pickle.dump(snapshot, out)


def pload(filename):
    """Restore globals previously written by ``psave`` from ``filename``.

    Every ``name -> value`` pair in the pickled dict is assigned into this
    module's globals, overwriting any existing variable of the same name.
    Unpickling runs arbitrary code, so only load files you created yourself.
    """
    module_globals = globals()
    with open(filename, 'rb') as src:
        restored = pickle.load(src)
        for name, value in restored.items():
            module_globals[name] = value

In [None]:
# Choose, but choose wisely: psave writes the save point, pload restores it.

# Names of the globals that make up the save point.
saved_names = (
    'applicant_records',
    'jobs_data_raw',
    'jobs_embedded_json_list',
    'applicant_embedded_json_list',
    'jobs_relevant_fields',
    'applicant_relevant_fields',
    'vector_trunc_list',
    'uid_name_jobs',
    'uid_name_applicant',
    'vector_comparison_threshold_list',
    'full_vector_comparison_threshold',
)
psave('./cheeki_jobfinderdigest_bow.pkl', *saved_names)

pload('./cheeki_jobfinderdigest_bow.pkl')

In [None]:
# Sanity check: an array built by np.zeros is an np.ndarray instance.
isinstance(np.zeros((10, 10)), np.ndarray)

# Time to find the best jobs for our applicants

So here's a library that takes in job data and an applicant's data, and compares the relevant data to find the best matches!

In [None]:
from decimal import Decimal


class DecimalEncoder(json.JSONEncoder):
    """JSON encoder for the recommendation records.

    Adds support for the non-JSON types that show up in the records:

    - ``Decimal`` values are written as strings.
    - ``np.ndarray`` values are written as (nested) lists.
    - NumPy scalars (``np.int64``, ``np.float32``, ``np.bool_``, ...) are written
      as the equivalent plain Python scalar instead of raising ``TypeError``.

    Anything else falls through to ``json.JSONEncoder.default``, which raises
    ``TypeError``.
    """

    def default(self, obj):
        if isinstance(obj, Decimal):
            return str(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # .item() converts a NumPy scalar to the matching Python scalar.
            return obj.item()
        return json.JSONEncoder.default(self, obj)

In [None]:
# NOTE: this rebinds applicant_embedder (a ContentEmbedder in the setup cell) to a JobScorer.
applicant_embedder = JobScorer()
job_recommendataions_final = []

table = dynamodb.Table(dynamo_appliedjobs_table_name)

# Keys copied from each recommended job into the applicant's output record.
# top_n_jobs_keepcols = ['uuid','title','locations','timeKind','payParsed']
top_n_jobs_keepcols = ['uuid', 'title', 'cheeki_score', 'locations']

print(str(len(applicant_embedded_json_list)), ' Total Applicants...')
for applicant_ind, applicant_embedded in enumerate(applicant_embedded_json_list):
    print('looking at applicant #' , str(int(applicant_ind)))
    print('Jobs wanted: ', applicant_records[applicant_ind]['jobsWanted_concat'])

    # Find which jobs this applicant has already been applied to.
    # A filtered scan still returns one page at a time (the filter is applied
    # after a page is read), so follow LastEvaluatedKey until the table is
    # exhausted; otherwise some previously-applied jobs would be missed.
    # NOTE(review): this scans the whole table once per applicant; a single scan
    # grouped by userID (or a query on an index) would be cheaper.
    applicant_uuid = applicant_embedded[0]['customer_uuid']
    scan_kwargs = {
        'FilterExpression': 'userID = :uid',
        'ExpressionAttributeValues': {':uid': applicant_uuid},
    }
    previously_applied_uuids = []
    while True:
        response = table.scan(**scan_kwargs)
        previously_applied_uuids.extend(item['id'] for item in response['Items'])
        last_key = response.get('LastEvaluatedKey')
        if last_key is None:
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    best_jobs_json, best_jobs_scores = applicant_embedder.best_jobs(
        previously_applied_uuids,
        jobs_embedded_json_list,
        applicant_embedded,
        jobs_relevant_fields,
        applicant_relevant_fields,
        vector_trunc_list,
        uid_name_jobs,
        uid_name_applicant,
        vector_comparison_threshold_list,
        full_vector_comparison_threshold,
    )

    # Get the top few jobs for this applicant: argsort is ascending, so take the
    # last num_top_jobs indices and flip them to get best-first order.
    top_n_idx = np.flip(np.argsort(best_jobs_scores)[-num_top_jobs:])
    top_n_scores = [best_jobs_scores[i] for i in top_n_idx]
    top_n_jobs_json = [best_jobs_json[i] for i in top_n_idx]

    # Attach the trimmed-down recommendations to the applicant's own record
    # (this mutates the entry in applicant_records) and collect it for output.
    applicant_record = applicant_records[applicant_ind]
    applicant_record['recommended_daily_jobs'] = [
        {key: listelem.get(key) for key in top_n_jobs_keepcols}
        for listelem in top_n_jobs_json
    ]
    job_recommendataions_final.append(applicant_record)

# Also dump this to a recommended-jobs file.  Both settings of
# use_local_scraped_file wrote the same local file here (the S3 upload lives in a
# later cell), so the write is unconditional.
json_object = json.dumps(job_recommendataions_final, indent=4, cls=DecimalEncoder)
with open("daily_job_recommendations.json", "w") as outfile:
    outfile.write(json_object)


In [None]:
#job_recommendataions_final

#jobs_data_raw[0]

In [None]:
#applicant_records[0]

In [None]:
# #also dump this to a recommended-jobs file:
# if use_local_scraped_file:
#     json_object = json.dumps(job_recommendataions_final, indent=4, cls=DecimalEncoder)
#     with open("daily_job_recommendations.json", "w") as outfile:
#         outfile.write(json_object)
# else:
#     # json_string = json.dumps(job_recommendataions_final, cls=DecimalEncoder)
#     # response = s3.put_object(
#     #     Body=json_string,
#     #     Bucket=s3_scraped_bucket_name,
#     #     Key=s3_scraped_bucket_outfilename
#     # )
#     #write the local file too, just for examination:
#     json_object = json.dumps(job_recommendataions_final, indent=4, cls=DecimalEncoder)
#     with open("daily_job_recommendations.json", "w") as outfile:
#         outfile.write(json_object)

In [None]:
#applicant_embedded_json_list


# Extra output set if we want it:

In [None]:
# NOTE(review): `brek` is not defined in any visible cell, so evaluating it
# presumably raises NameError - apparently a deliberate stop so that running all
# cells halts before the S3 upload cell below.  Confirm before removing.
brek

In [None]:
# Publish the recommendations: when working from S3, upload the compact JSON to
# the scraped-jobs bucket under the recommendations output key.
if not use_local_scraped_file:
    json_string = json.dumps(job_recommendataions_final, cls=DecimalEncoder)
    response = s3.put_object(
        Body=json_string,
        Bucket=s3_scraped_bucket_name,
        Key=s3_scraped_bucket_outfilename,
    )

# Either way, write a pretty-printed local copy too, just for examination.
json_object = json.dumps(job_recommendataions_final, indent=4, cls=DecimalEncoder)
with open("daily_job_recommendations.json", "w") as outfile:
    outfile.write(json_object)

In [None]:
# Display the final list of applicant records with their recommended jobs attached.
job_recommendataions_final

In [None]:

# # and we just want the top few:
# top_n_idx = np.flip(np.argsort(best_jobs_scores)[-num_top_jobs:])
# top_n_scores = [best_jobs_scores[i] for i in top_n_idx]
# top_n_jobs = [jobs_data_raw[i] for i in top_n_idx]

# print(applicant_records[0])

# [i.items() for i in top_n_jobs]


# Some quick Visual Confirmations - Does this work as well as we want?
 - Show the applicant's resume
 - Show the top 3 jobs we picked for them and their text

In [None]:
# top_few_jobs_json = [x for _, x in sorted(zip(best_jobs_scores, best_jobs_json),reverse=True)][0:num_top_jobs]
# top_few_uuids = [i[uid_name_jobs] for i in top_few_jobs_json]
# top_few_job_descriptions = [[i['fullJobDescription'] for i in jobs_data_raw if i[uid_name_jobs]==top_uuid][0] for top_uuid in top_few_uuids]


# print('APPLICANT RESUME FULLTEXT:')
# print(' ')
# print(applicant_record['fullText'])
# print(' ')
# for i in range(num_top_jobs):
#     print('SELECTED JOB NUMBER ', str(int(i)), ':')
#     print(' ')
#     print(top_few_job_descriptions[i])


In [None]:
#top_few_job_descriptions
#num_top_jobs