In [3]:
"""Evaluation analysis - 

1) We compare lightcast's algorithm with our lightcast mapped skills at the skill level. We guarantee to map to skills based on setting the minimum cosine similarity threshold to 0;
2) We also compare top extracted skills per occupation with ESCO's essential skills.
"""

"Evaluation analysis - \n\n1) We compare lightcast's algorithm with our lightcast mapped skills at the skill level. We guarantee to map to skills based on setting the minimum cosine similarity threshold to 0;\n2) We also compare top extracted skills per occupation with ESCO's essential skills.\n"

In [4]:
from ojd_daps_skills import config, bucket_name, logger
from ojd_daps_skills.getters.data_getters import (
    get_s3_resource,
    load_s3_data,
    save_to_s3,
)
import pandas as pd
import random
import ast

#to check job titles at the occupation level
from ojd_daps_skills.utils.sql_conn import est_conn
import os
import itertools

  from .autonotebook import tqdm as notebook_tqdm


### 0. Relevant functions for analysis.

In [5]:
def percent_overlap(ojo_skills, lightcast_skills):
    """Calculate the percent overlap between two lists"""
    
    if len(ojo_skills) and len(lightcast_skills) > 0:

        setA = set(ojo_skills)
        setB = set(lightcast_skills)

        overlap = setA & setB
        universe = setA | setB


        result1 = float(len(overlap)) / len(setA) * 100
        result2 = float(len(overlap)) / len(setB) * 100
        result3 = float(len(overlap)) / len(universe) * 100

        return result1, result2, result3
    
    else:
        return 100, 100, 100

### 1. Relevant parameters and datasets for analysis.

In [6]:
s3 = get_s3_resource()
ojo_lightcast_skills = load_s3_data(s3, bucket_name, 'escoe_extension/outputs/evaluation/ojo_esmi_skills/ojo_lightcast_skills_20221115.json')
esco_occupations = load_s3_data(s3, bucket_name, 'escoe_extension/outputs/evaluation/aggregate_ojo_esco/ojo_esco_occupation_skills_results_v2.json')
esco_occupations_df = pd.DataFrame(esco_occupations).T.sort_values('skills_in_ojo_esco_percent', ascending=False).reset_index().rename(columns={'index':'occupation'})
esco_skills = load_s3_data(s3, bucket_name, 'escoe_extension/outputs/data/skill_ner_mapping/esco_data_formatted.csv')
esco_hier_mapper = load_s3_data(s3, bucket_name, 'escoe_extension/outputs/data/skill_ner_mapping/esco_hier_mapper.json')

### 2. Lightcast comparison analysis

In [7]:
for job_id, skill_info in ojo_lightcast_skills.items():
    comps = percent_overlap(skill_info['ojo_skills'], skill_info['lightcast_skills'])
    for comp_type, comp in zip(('ojo_skills_overlap', 'lightcast_skills_overlap', 'universal_overlap'), comps):
        skill_info[comp_type] = comp    

In [8]:
ojo_lightcast_skills_df = pd.DataFrame(ojo_lightcast_skills).T
ojo_lightcast_skills_df = ojo_lightcast_skills_df.sort_values('lightcast_skills_overlap', ascending=False)
ojo_lightcast_skills_df = ojo_lightcast_skills_df[~(ojo_lightcast_skills_df['ojo_skills'].str.len() == 0) & (ojo_lightcast_skills_df['lightcast_skills'].str.len() != 0)]

In [9]:
print('percent overlap analysis')

print(f"the % of job adverts with no skills overlap is: {len(ojo_lightcast_skills_df[ojo_lightcast_skills_df['ojo_skills_overlap'] == 0.0])/len(ojo_lightcast_skills_df)}")
print(f"the average # of lightcast skills we extract is: {ojo_lightcast_skills_df.ojo_skills.apply(lambda x: len(x)).mean()}")
print(f"the median # of lightcast skills we extract is: {ojo_lightcast_skills_df.ojo_skills.apply(lambda x: len(x)).median()}")

print(f"the average # of lightcast skills lightcast extracts is: {ojo_lightcast_skills_df.lightcast_skills.apply(lambda x: len(x)).mean()}")
print(f"the median # of lightcast skills lightcast extracts is: {ojo_lightcast_skills_df.lightcast_skills.apply(lambda x: len(x)).median()}")

print(f"of the job adverts with overlap, on average, {ojo_lightcast_skills_df[ojo_lightcast_skills_df['lightcast_skills_overlap'] != 0.0].lightcast_skills_overlap.mean()} of lightcast skills are present in our current approach.")
print(f"of the job adverts with overlap, the median is {ojo_lightcast_skills_df[ojo_lightcast_skills_df['lightcast_skills_overlap'] != 0.0].lightcast_skills_overlap.median()} of lightcast skills are present in our current approach.")

print(f"of the job adverts with overlap, on average, {ojo_lightcast_skills_df[ojo_lightcast_skills_df['ojo_skills_overlap'] != 0.0].ojo_skills_overlap.mean()} of our skills are present in lighcast skills.")
print(f"of the job adverts with overlap, the median is {ojo_lightcast_skills_df[ojo_lightcast_skills_df['ojo_skills_overlap'] != 0.0].ojo_skills_overlap.median()} of our skills are present in lightcast skills.")

percent overlap analysis
the % of job adverts with no skills overlap is: 0.425531914893617
the average # of lightcast skills we extract is: 10.872340425531915
the median # of lightcast skills we extract is: 10.0
the average # of lightcast skills lightcast extracts is: 6.74468085106383
the median # of lightcast skills lightcast extracts is: 5.0
of the job adverts with overlap, on average, 39.318289194020196 of lightcast skills are present in our current approach.
of the job adverts with overlap, the median is 33.33333333333333 of lightcast skills are present in our current approach.
of the job adverts with overlap, on average, 25.087275154612616 of our skills are present in lighcast skills.
of the job adverts with overlap, the median is 21.428571428571427 of our skills are present in lightcast skills.


### 3. ESCO occupations comparison analysis

In [10]:
print(f"{len(esco_occupations_df)} occupations (with 100 or more job adverts) in ESCO were also found in OJO.")
print(f"the average # of adverts per occupation (with 100 or more job adverts) is {esco_occupations_df.no_of_job_adverts.mean()}")
print(f"on average, {esco_occupations_df['skills_in_ojo_esco_percent'].mean()} percent of essential ESCO skills per occupation are were extracted from our algorithm.")
print(f"the median percent of essential ESCO skills per occupation are were extracted from our algorithm is {esco_occupations_df['skills_in_ojo_esco_percent'].median()}.")

print(f"the maximum % of skills mentioned in essential ESCO skills in OJO job adverts is {esco_occupations_df.iloc[0].skills_in_ojo_esco_percent}, for the occupation {esco_occupations_df.iloc[0].occupation}.")
print(f"there are {len(esco_occupations_df[esco_occupations_df['skills_in_ojo_esco_percent'] == 0.0])} occupations with no overlap.")

58 occupations (with 100 or more job adverts) in ESCO were also found in OJO.
the average # of adverts per occupation (with 100 or more job adverts) is 345.5344827586207
on average, 19.58816405821023 percent of essential ESCO skills per occupation are were extracted from our algorithm.
the median percent of essential ESCO skills per occupation are were extracted from our algorithm is 19.05241935483871.
the maximum % of skills mentioned in essential ESCO skills in OJO job adverts is 54.54545454545454, for the occupation project manager.
there are 2 occupations with no overlap.


In [11]:
random.seed(42)
occupation_name = random.choice(esco_occupations_df.occupation)

print(f'--top OJO skills for "{occupation_name}" that are not essential ESCO skills--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_ojo_not_esco))
print(f'--essential ESCO skills for "{occupation_name}" that were not extracted--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_esco_not_ojo))

--top OJO skills for "legal assistant" that are not essential ESCO skills--
[['CRM', 'commend a marketing strategy', 'apply case-load management', 'liaise with customers on behalf of the company', 'cloud technologies', 'improve quality of care', 'establish brand identity', 'demonstrate the motivation for sales', 'ETL', 'perform projects management', 'use CAD software', 'personnel management', 'application process', 'mathematics', 'cultivating land and crops', 'managing sub-contract labourers', 'delivering sales pitches', 'develop new recipes', 'arts and humanities', 'perform troubleshooting', 'developing a project schedule', 'interact with customers', 'principles of leadership', 'carrying out a sales analysis', 'operating scientific and laboratory equipment', 'manage business risks', 'applying general knowledge', 'advise customers', 'attending events', 'estimate project costs', 'show determination', 'following reporting procedures', 'respond to customer requests', 'create accounts', 'p

In [12]:
random.seed(54)
occupation_name = random.choice(esco_occupations_df.occupation)

print(f'--top OJO skills for "{occupation_name}" that are not essential ESCO skills--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_ojo_not_esco))
print(f'--essential ESCO skills for "{occupation_name}" that were not extracted--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_esco_not_ojo))

--top OJO skills for "financial analyst" that are not essential ESCO skills--
[['CRM', 'commend a marketing strategy', 'liaise with customers on behalf of the company', 'cloud technologies', 'improve quality of care', 'establish brand identity', 'ETL', 'GPS', 'use CAD software', 'carry out cleaning duties', 'application process', 'mathematics', 'cultivating land and crops', 'PR', 'managing sub-contract labourers', 'maintain professional documentation', 'delivering sales pitches', 'perform troubleshooting', 'handle difficult clients', 'developing a project schedule', 'principles of leadership', 'operating scientific and laboratory equipment', 'manage business risks', 'applying general knowledge', 'attending events', 'show determination', 'respond to customer requests', 'create accounts', 'prioritise in accordance with changing circumstances', 'principles of project control', 'support staff', 'ensure shipments are unloaded in a safe and effective manner', 'engineering, manufacturing and 

In [13]:
random.seed(72)
occupation_name = random.choice(esco_occupations_df.occupation)

print(f'--top OJO skills for "{occupation_name}" that are not essential ESCO skills--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_ojo_not_esco))
print(f'--essential ESCO skills for "{occupation_name}" that were not extracted--')
print(list(esco_occupations_df[esco_occupations_df.occupation == occupation_name].in_esco_not_ojo))

--top OJO skills for "finance assistant" that are not essential ESCO skills--
[['CRM', 'commend a marketing strategy', 'liaise with customers on behalf of the company', 'cloud technologies', 'improve quality of care', 'establish brand identity', 'ETL', 'use CAD software', 'carry out cleaning duties', 'application process', 'mathematics', 'cultivating land and crops', 'PR', 'managing sub-contract labourers', 'delivering sales pitches', 'perform troubleshooting', 'handle difficult clients', 'developing a project schedule', 'principles of leadership', 'operating scientific and laboratory equipment', 'manage business risks', 'applying general knowledge', 'attending events', 'show determination', 'respond to customer requests', 'create accounts', 'prioritise in accordance with changing circumstances', 'principles of project control', 'ensure shipments are unloaded in a safe and effective manner', 'engineering, manufacturing and construction not elsewhere classified', 'perform inspections', 

#### 3.1 ESCO occupations comparison analysis - skill group level

In [14]:
esco_skills = esco_skills[~esco_skills['hierarchy_levels'].isna()]
esco_skills['hierarchy_levels'] = esco_skills['hierarchy_levels'].apply(ast.literal_eval)
esco_skills_dict = esco_skills.set_index('description')['hierarchy_levels'].to_dict()

In [15]:
def get_skill_level(skill_list):
    
    skill_group_list = [esco_skills_dict.get(i) for i in skill_list]
    
    skill_group_list_flat = []
    for skill_group in skill_group_list:
        if skill_group is not None:
            skill_group_flat = list(itertools.chain(*skill_group))
            for flat_group in skill_group_flat:
                if flat_group is not None and flat_group.count('.') == 2:
                    skill_group_list_flat.append(flat_group)
                
    return list(set(skill_group_list_flat))

In [16]:
#add skill groups to esco ojo comparisons 
skill_cols = ['in_both_ojo_esco', 'in_ojo_not_esco', 'in_esco_not_ojo']
for skill_col in skill_cols:
    esco_occupations_df[f'{skill_col}_skill_group'] = esco_occupations_df[skill_col].apply(get_skill_level)

In [17]:
#compare at the skill level group!
def compare_skill_groups(esco_skill_groups, ojo_skill_groups):
    
    esco_skill_groups = set(esco_skill_groups)
    ojo_skill_groups = set(ojo_skill_groups)
    
    in_both_ojo_esco = set.intersection(esco_skill_groups, ojo_skill_groups)
    skill_group_percent = (len(in_both_ojo_esco)/len(esco_skill_groups))* 100

    return skill_group_percent

In [18]:
esco_occupations_df['esco_skill_groups'] = esco_occupations_df['in_esco_not_ojo_skill_group'] + esco_occupations_df['in_both_ojo_esco_skill_group']
esco_occupations_df['esco_skill_groups'] = esco_occupations_df['esco_skill_groups'].apply(lambda x: list(set(x)))

esco_occupations_df['ojo_skill_groups'] = esco_occupations_df['in_ojo_not_esco_skill_group'] + esco_occupations_df['in_both_ojo_esco_skill_group']
esco_occupations_df['ojo_skill_groups'] = esco_occupations_df['ojo_skill_groups'].apply(lambda x: list(set(x)))

esco_occupations_df['skill_groups_in_ojo_esco_percent'] = esco_occupations_df.apply(lambda skill: compare_skill_groups(skill['esco_skill_groups'], skill['ojo_skill_groups']), axis=1)

In [19]:
print(f"on average, {esco_occupations_df.skill_groups_in_ojo_esco_percent.mean()} percent of essential ESCO skill groups per occupation are were extracted from our algorithm.")
print(f"the median percent of essential ESCO skills per occupation that were extracted from our algorithm is {esco_occupations_df['skill_groups_in_ojo_esco_percent'].median()}.")
print(f"{(len(esco_occupations_df[esco_occupations_df['skill_groups_in_ojo_esco_percent'] == 100])/len(esco_occupations_df))*100} % of occupations have 100% of ESCO essential skills at the skill group level.")
print(f"the occupation with the least amount of skill level group overlap is {esco_occupations_df.sort_values('skill_groups_in_ojo_esco_percent').iloc[0].occupation}, with {esco_occupations_df.sort_values('skill_groups_in_ojo_esco_percent').iloc[0].skill_groups_in_ojo_esco_percent}% overlap at the skill group level.")

on average, 94.53219206056886 percent of essential ESCO skill groups per occupation are were extracted from our algorithm.
the median percent of essential ESCO skills per occupation that were extracted from our algorithm is 97.83720930232558.
46.55172413793103 % of occupations have 100% of ESCO essential skills at the skill group level.
the occupation with the least amount of skill level group overlap is personal trainer, with 27.77777777777778% overlap at the skill group level.


#### 3.2 eyeball that job titles are representative of occupation

In [22]:
conn = est_conn()
file_name = "escoe_extension/outputs/data/model_application_data/dedupe_analysis_skills_sample.json"
job_ads = load_s3_data(s3, bucket_name, file_name)
job_ads_df = pd.DataFrame(job_ads)

In [206]:
occs_compared = list(esco_occupations_df.occupation)
job_ads_df['occupation'] = job_ads_df.occupation.str.lower()
job_ads_df = job_ads_df[job_ads_df['occupation'].isin(occs_compared)]

In [207]:
job_ids_formatted = ", ".join([f'"{id_}"' for id_ in list(job_ads_df.job_id)])
job_title_q = f"SELECT id, job_title_raw FROM raw_job_adverts WHERE id in ({job_ids_formatted})"
job_titles_df = pd.read_sql(job_title_q, conn).rename(columns={'id': 'job_id'})
job_ads_with_titles = pd.merge(job_ads_df, job_titles_df)

In [208]:
job_titles_count_df = pd.DataFrame(job_ads_with_titles.groupby(['occupation', 'job_title_raw'])['job_title_raw'].count()).rename(columns={'job_title_raw': 'count'}).reset_index()

In [213]:
random.seed(155)
occ = random.choice(occs_compared)
job_titles_count_df[job_titles_count_df['occupation'] == occ].sort_values('count', ascending=False)

Unnamed: 0,occupation,job_title_raw,count
2212,information manager,IT Project Manager,26
2201,information manager,IT Manager,17
2217,information manager,IT Service Desk Manager,2
2245,information manager,Senior IT Project Manager,2
2181,information manager,Head of IT,2
...,...,...,...
2198,information manager,IT Infrastructure Lead,1
2197,information manager,IT Implementation Manager - ERP - Remote - &#1...,1
2196,information manager,IT Contract Manager - Vendor Manager - Amsterdam,1
2195,information manager,IT Contract Manager - Parking,1


In [214]:
random.seed(77)
occ = random.choice(occs_compared)
job_titles_count_df[job_titles_count_df['occupation'] == occ].sort_values('count', ascending=False)

Unnamed: 0,occupation,job_title_raw,count
4748,sales assistant,Sales Assistant,109
4711,sales assistant,Branch Sales Assistant,7
4728,sales assistant,Immediate Start - Sales Assistant,6
4744,sales assistant,Part Time Sales Assistant,6
4720,sales assistant,Customer Service and Sales Assistant,5
...,...,...,...
4746,sales assistant,Part Time Sales Assistant - Bridgwater,1
4747,sales assistant,Part Time Sales Assistant 12m FTC,1
4750,sales assistant,Sales Assistant (Fixed Term contract - 6 Months),1
4751,sales assistant,Sales Assistant - Sports Minded Individuals N...,1


In [215]:
random.seed(159)
occ = random.choice(occs_compared)
job_titles_count_df[job_titles_count_df['occupation'] == occ].sort_values('count', ascending=False)

Unnamed: 0,occupation,job_title_raw,count
3244,office administrator,Office Administrator,67
3194,office administrator,Admin Officer,10
3245,office administrator,Office Administrator,9
3199,office administrator,Admin Officer AO - Band E,4
3203,office administrator,Administration Officer,4
...,...,...,...
3225,office administrator,Fines Officer/Administrator,1
3224,office administrator,Finance admin officer 5197795,1
3223,office administrator,Finance Admin Officer,1
3222,office administrator,Dutch speaking Office Administrator - Graduate...,1
