# CREAM Exploration

Looking at the new and improved CREAM toolkit.

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import json
import re
from operator import itemgetter
import swifter

In [2]:
# load sample keywords, rules, and data
keywords = [
    "bachelor",
    "master",
    "degree",
    "high school",
    "education",
    "diploma",
    "ged",
    "certification"
]
rules = pd.read_csv("data/demo/CREAM/education_rules.csv")
with open("data/demo/TaskMatch/sample_job_ads_in_line.csv", 'r') as f:
    data = [x.strip() for x in f.readlines()]
data = pd.DataFrame(data, columns=["Job Description"])

keywords, rules, data

(['bachelor',
  'master',
  'degree',
  'high school',
  'education',
  'diploma',
  'ged',
  'certification'],
                                    rule  education
 0                              bachelor          0
 1                                master          0
 2                                degree          0
 3                           high school          0
 4                             education          0
 ..                                  ...        ...
 89           at least a bachelor degree          1
 90  must be a graduate of an accredited          1
 91                  needed ba bs degree          1
 92          bachelors degree experience          1
 93             experience or a bachelor          1
 
 [94 rows x 2 columns],
                                          Job Description
 0         ﻿Wash and fold the clothes and do the laundry.
 1      Pack luggage for travel and move bags to the p...
 2                                       Wash the dishes.
 3    

In [3]:
# sbert model
#model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = SentenceTransformer("thenlper/gte-large")

In [4]:
# vectorize rules
encoded_rules = model.encode(rules['rule'].tolist())
rule_map = dict(zip(rules['rule'].tolist(), rules['education'].tolist()))
len(encoded_rules)

94

In [5]:
# similarity function
def get_sim(model, rule_map, encoded_rules, q):
    sim_scores = util.cos_sim(model.encode([q]), encoded_rules)
    return dict(zip(rule_map.keys(), sim_scores[0].tolist()))

In [6]:
scores = get_sim(model, rule_map, encoded_rules, "bachelor degree required")
scores

{'bachelor': 0.887619137763977,
 'master': 0.8026555776596069,
 'degree': 0.8911867737770081,
 'high school': 0.7840946912765503,
 'education': 0.8150884509086609,
 'diploma': 0.8584140539169312,
 'ged': 0.8136746883392334,
 'certification': 0.8032225370407104,
 'qualifications associate degree ': 0.8551264405250549,
 'accredited college or university ': 0.8332047462463379,
 'at least a bachelor degree': 0.8332047462463379,
 'ba bs degree in a related': 0.9294393658638,
 'bachelor degree and years work experience': 0.8935132026672363,
 'bachelor degree ba bs': 0.921425461769104,
 'bachelor degree in a': 0.9164381623268127,
 'bachelor degree or equivalent': 0.9259461760520935,
 'bachelor degree skills': 0.9242619872093201,
 'bs ba degree in related field': 0.9198769927024841,
 'bs degree or equivalent': 0.8903144598007202,
 'bs or ms degree': 0.903844952583313,
 'college degree programs': 0.8594380021095276,
 'degree required bachelor degree': 0.8599889874458313,
 'education and experie

In [7]:
# label via max score
def label_from_max(scores, rule_map):
    max_rule = max(scores, key=scores.get)
    label = rule_map[max_rule]
    return max_rule, label, scores[max_rule]

In [8]:
label_from_max(scores, rule_map)

('education and experience bachelor degree', 1, 0.9810521602630615)

In [11]:
# if keyword found, get context window for similarity scoring
def get_context(text, keywords):
    n = 4
    text = text.lower()
    text = re.sub(r'[^a-z0-9]+', ' ', text)

    words = text.split()
    found_index = [i for i, w in enumerate(words) if any(k.strip() in w for k in keywords)]
    context = [" ".join(words[max(0, idx-n):min(idx+n+1, len(words))]) for idx in found_index]

    return '|'.join(context)

In [12]:
results = data["Job Description"].apply(lambda x: get_context(x, keywords))
[x for x in results if x != ""]

['a team of higher education professionals',
 'and risk taking are encouraged and every employee has',
 'bachelor s degree preferred or|bachelor s degree preferred or equivalent combination|or equivalent combination of education training and experience',
 'hand tools high school diploma or ged|high school diploma or ged',
 'other state and local certifications',
 'relationships through project interaction bachelor s degree in civil|project interaction bachelor s degree in civil or environmental',
 'master s in environmental engineering',
 'bachelor s degree in construction|bachelor s degree in construction management engineering',
 'hospitality primary and higher education justice and solar energy',
 'oversee the certification and final funding of',
 'of a high school degree required',
 'bachelor s degree preferred in|bachelor s degree preferred in business finance',
 'data query programs are managed appropriately',
 'high school diploma required',
 'relationships comprehensive trainin

In [13]:
## helper function to run CREAM on all data points
def __helper__(row):
    global keywords
    global model
    global rule_map
    global encoded_rules
    
    THRESHOLD = 0.9
    
    text = row["Job Description"]
    context = get_context(text, keywords).split('|')
    
    if len(context) > 0 and context[0] != "":
        all_scores = []
        for c in context:
            scores = get_sim(model, rule_map, encoded_rules, c)
            all_scores.append(label_from_max(scores, rule_map))
        max_score = max(all_scores, key=itemgetter(2))
        if max_score[2] >= THRESHOLD:
            return max_score[0], max_score[1], max_score[2]
        else:
            return None, 0, None
    else:
        return None, None, None

In [14]:
data[['inferred_rule', 'inferred_label', 'inferred_confidence']] = data.apply(__helper__, axis=1, result_type="expand")

In [15]:
# all matches exceeding our set threshold
data[data['inferred_rule'].isna() == False]

Unnamed: 0,Job Description,inferred_rule,inferred_label,inferred_confidence
69,"Bachelor’s degree preferred, or equivalent com...",nursing degree,1.0,0.922145
163,Own a basic set of hand tools High school dipl...,degree required high school,1.0,0.969878
303,"Bachelor's degree in construction management, ...",bachelor degree or equivalent,1.0,0.907762
328,You’ll be a great fit if in addition to the co...,general education degree ged,1.0,0.966497
329,"Bachelor’s degree, preferred in Business, Fina...",nursing degree,1.0,0.927838
...,...,...,...,...
27571,Must have a high school diploma or GED and be ...,degree required high school,1.0,0.938976
27607,Education and Experience: Completion of a home...,education bachelor degree,1.0,0.926923
27624,"Licensure, certification, and/or registration:...",experience or a bachelor,1.0,0.906362
27660,License/Certifications,certification,0.0,0.921547


In [16]:
# all matches exceeding threshold and receiving a label of 1 (true)
data[data['inferred_label'] == 1]

Unnamed: 0,Job Description,inferred_rule,inferred_label,inferred_confidence
69,"Bachelor’s degree preferred, or equivalent com...",nursing degree,1.0,0.922145
163,Own a basic set of hand tools High school dipl...,degree required high school,1.0,0.969878
303,"Bachelor's degree in construction management, ...",bachelor degree or equivalent,1.0,0.907762
328,You’ll be a great fit if in addition to the co...,general education degree ged,1.0,0.966497
329,"Bachelor’s degree, preferred in Business, Fina...",nursing degree,1.0,0.927838
...,...,...,...,...
27533,High School Diploma or equivalent required.,a high school degree,1.0,0.923840
27571,Must have a high school diploma or GED and be ...,degree required high school,1.0,0.938976
27607,Education and Experience: Completion of a home...,education bachelor degree,1.0,0.926923
27624,"Licensure, certification, and/or registration:...",experience or a bachelor,1.0,0.906362
