Author: Jeff Magouirk
<br>Date : 09/10/2020
<br>Confluence page - https://confluence.dhigroupinc.com/display/MATCH/MATCH-500-spike-what-data-is-available-t
<br>Jira Page:https://jira.dhigroupinc.com/browse/MATCH-500
<br>BitBucket:

In [None]:
#!pip install --upgrade pip
#!pip install --upgrade jsonpath-ng
#!pip install --upgrade pandas
#!pip install sagemaker
#!pip install boto3
#!pip install s3fs

In [None]:
import os
import re
import json
import boto3
from itertools import chain
from concurrent.futures import ProcessPoolExecutor
import multiprocessing as mp

import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

s3 = boto3.resource('s3')

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from jsonpath_ng import jsonpath, parse

from dsmatch import local_bucket,s3_ds_bucket
from dsmatch.util.parallel import get_n_splits

<h3> Bringing in the parsed Burning Glass files of job descriptions </h3>

In [None]:
# Directory holding the Burning Glass-parsed job descriptions.
input_dir_jd = os.path.join(local_bucket, 'data', 'efc', 'jobseeker-applies', 'job_descriptions')

# Sub-folders of the job-descriptions directory. Names ending in '.csv' are
# skipped as before; other plain files are skipped as well, because
# os.listdir() on a non-directory below would raise NotADirectoryError.
folders = [
    x for x in os.listdir(input_dir_jd)
    if not x.endswith('.csv') and os.path.isdir(os.path.join(input_dir_jd, x))
]

# Output location for the extracted skill names (not written to in this cell).
output_dir_jd = 'bg-jd-skill-names'
output_path_jd = os.path.join(local_bucket, 'data', 'efc', 'jobseeker-applies', output_dir_jd)

# Full paths of every entry found inside each sub-folder.
# NOTE(review): entries are collected regardless of extension; a previously
# disabled filter kept only '.json' files that had not been processed yet.
total_jd = []
for folder in folders:
    folder_path = os.path.join(input_dir_jd, folder)
    total_jd.extend(os.path.join(folder_path, x) for x in os.listdir(folder_path))

print("Number of files need to process",len(total_jd))

<h3> Defined function to read in the filelist </h3>

In [None]:
def process_files(filelist):
    """Extract skill variants from parsed job-description JSON files.

    Args:
        filelist: Iterable of paths to JSON files.

    Returns:
        A list of ``(date_name, job_application_id, matches)`` tuples, one
        for each file with at least one match. ``date_name`` is the name of
        the file's parent directory, ``job_application_id`` is the file name
        without its extension, and ``matches`` is the list of values found at
        ``$..skillrollup[*].canonskill[*].variant[*]``.
    """
    # The expression is identical for every file, so compile it only once.
    variant_expr = parse('$..skillrollup[*].canonskill[*].variant[*]')

    records = []
    for filename in filelist:
        with open(filename, 'r', encoding='utf-8') as f:
            d = json.load(f)

        matches = [match.value for match in variant_expr.find(d)]
        if not matches:
            continue

        # Derive the identifiers from the path with os.path rather than
        # splitting on '/' and slicing off a fixed 5-character suffix.
        folder_path, base_name = os.path.split(filename)
        date_name = os.path.basename(folder_path)
        job_application_id = os.path.splitext(base_name)[0]
        records.append((date_name, job_application_id, matches))
    return records
            
# Leave one CPU free, but never go below one worker: ProcessPoolExecutor
# rejects max_workers=0, which is what cpu_count() - 1 yields on a 1-CPU host.
max_workers = max(1, mp.cpu_count() - 1)

Splitting the records for multiple processors

In [None]:
# Number of chunks the file list is divided into for the worker processes.
n_splits = get_n_splits(total_jd, chunksize=.05, min_chunksize=10)

records = []

with ProcessPoolExecutor(max_workers=max_workers) as executor:
    # One chunk of file paths per task.
    args_list = list(np.array_split(total_jd, n_splits))
    # Collect every chunk's result (with a progress bar) before flattening.
    chunk_results = list(tqdm(executor.map(process_files, args_list), total=len(args_list)))
    records.extend(chain.from_iterable(chunk_results))

<h4> Looking at the output of date_name, job_application_id, parsed data(variant) </h4>

In [None]:
# Load the extracted records into a DataFrame (default integer column labels),
# then show its summary and first rows.
df_0 = pd.DataFrame(records)
df_0.info()
df_0.head()

Looking at the output of the variant field, where the languages are.

In [None]:
# Print the value in the third row, third column of df_0.
print(df_0.iloc[2,2])

<h4> Finding the languages in the parsed job description field variant from Burning Glass </h4>

In [None]:
df_jd = pd.DataFrame(records, columns=['date', 'job_application_id', 'variant'])

# One row per (job application, variant).
df_jd = df_jd.explode('variant')

# Lower-case before de-duplicating so that variants differing only in case
# are counted once per job application.
df_jd['variant'] = df_jd['variant'].str.lower()
df_jd.drop_duplicates(inplace=True)

# Top 20 languages spoken in the world, by number of estimated speakers.
L = ['arabic', 'english', 'chinese', 'japanese', 'mandarin', 'hindi', 'spanish', 'french', 'bengali', 'russian',
     'portuguese', 'urdu', 'german', 'indonesian', 'swahili', 'marathi', 'telugu', 'turkish', 'cantonese', 'tamil',
     'western punjabi', 'punjabi']

# Match whole words only, so a language name embedded in a longer word
# (e.g. "urdu" inside "purdue") is not flagged. Missing variants count as False.
language_pattern = r'\b(?:' + '|'.join(re.escape(name) for name in L) + r')\b'
df_jd["Required_language"] = df_jd['variant'].str.contains(language_pattern, flags=re.I, na=False)

print('Required Language = \n',df_jd['Required_language'].value_counts())

# Rows whose variant mentions one of the listed languages.
df_language = df_jd[df_jd['Required_language']].copy()
print('Variant = \n',df_language['variant'].value_counts().sort_values(ascending =False))

# Number of language rows per job application. The column is named explicitly
# because the default name produced by value_counts() differs between pandas
# versions.
df_cnts = (
    df_language['job_application_id']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame(name='job_application_id')
)
# Distribution of "languages per job application".
print('Counts of Languages = \n',df_cnts['job_application_id'].value_counts().sort_values(ascending=False))





In [None]:
# Summarise the language rows, rename the id column to
# 'job_application.data.id' (the key the later merge joins on), and show the
# first 36 rows.
df_language.info()
df_language = df_language.rename(columns={'job_application_id':'job_application.data.id'})
df_language.head(n=36)

<h3> Looking at the language each resume is written in </h3>

In [None]:
# Read the raw live-feed CSV from S3 and print the full column listing.
df_aug = pd.read_csv('s3://dev-dhi-match-datascience/data/efc/live-feed/raw-20200826.csv')
df_aug.info(verbose=True)

<h4>Exploding out the languages and language competencies of the job seekers</h4>

In [None]:
import ast

# Keep only the id and language columns and drop rows with any missing value.
# The explicit copy makes the column assignment below act on an independent frame.
t2 = df_aug[['jobseeker.data.id','jobseeker.data.languages','job_application.data.job_id','job_application.data.id']]
t2 = t2.dropna().copy()

# The languages column is read from CSV as text. Parse it with
# ast.literal_eval rather than eval so file content can never execute code.
# NOTE(review): presumably each cell is the repr of a list of dicts - confirm.
t2['jobseeker.data.languages'] = t2['jobseeker.data.languages'].apply(ast.literal_eval)

# One row per (job application, language entry); empty lists explode to NaN
# and are dropped.
t3 = t2.explode('jobseeker.data.languages')
t3 = t3.dropna().copy()

# Each entry is indexed by 'language' and 'language_competency'.
t3['language'] = t3['jobseeker.data.languages'].apply(lambda x: x['language'])
t3['language_competency'] = t3['jobseeker.data.languages'].apply(
    lambda x: (x['language'], x['language_competency'])
)
print(t3.shape)
t3.head()

<h4> Merging the language of the resume with the language requirement on the job description</h4>

In [None]:
# Attach the detected job-description and resume languages to every language
# row, keeping all rows of df_language (left join on the application id).
df_aug1 = df_aug[['job_application.data.id','Language_JD','Language_Resume']]
df_merge = df_language.merge(df_aug1,on='job_application.data.id', how='left')

# Language codes found in Language_Resume mapped to lower-case language names.
language_code_names = {"en": "english", "fr": "french", "de": "german",
                       'zh-cn': 'chinese-prc', 'cy': 'welsh',
                       'it': 'italian', 'ca': 'catalan', 'ro': 'romanian',
                       'ar': 'arabic', 'es': 'spanish', 'nl': 'dutch'}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# a column selection is not guaranteed to update df_merge (it does not under
# pandas Copy-on-Write).
df_merge["Language_Resume"] = df_merge["Language_Resume"].replace(language_code_names)
df_merge.head()


In [None]:
# Compare the number of merged rows with the number of distinct application ids.
counts = df_merge.loc[:, ['job_application.data.id']]
print('Id counts =',counts.shape)
counts = counts.drop_duplicates()
print('Unique Id counts =',counts.shape)

# Keep the rows where the job-description variant equals the resume language.
same_language = df_merge['variant'].eq(df_merge['Language_Resume'])
df_merge1 = df_merge[same_language]
print(df_merge1.shape)

# Most frequent matching variants.
df_cnts = df_merge1['variant'].value_counts().sort_values(ascending=False)
print(df_cnts.head())

# Shape of the matches once exact duplicate rows are removed.
df_merge2 = df_merge1.drop_duplicates()
df_merge2.shape