In [5]:
# Configure
print_debug = True
csv_file = './examples/cv.csv' # the csv text data include the following fields: id (int), text (string)

In [6]:
import pandas as pd

# Load and clean data
print('Load and clean data')
df = pd.read_csv(csv_file)
df = df.dropna()
df['text'] = df['text'].str.lower()
if print_debug:
    print(df.head(4))

Load and clean data
    id                                               text
0  1.0  "job title: software engineer\ncompany: xyz te...
1  2.0  "job title: data scientist\ncompany: abc data\...
4  3.0  "resume\nname: john doe\ncontact: john.doe@exa...
5  4.0  "resume\nname: jane smith\ncontact: jane.smith...


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# Load pre-trained models 
# soft skills extraction: https://huggingface.co/jjzha/jobbert_skill_extraction
# hard skills extraction: https://huggingface.co/jjzha/jobbert_knowledge_extraction
token_soft_skill_classifier = pipeline(model='jjzha/jobbert_skill_extraction', aggregation_strategy='first')
token_hard_skill_classifier = pipeline(model='jjzha/jobbert_knowledge_extraction', aggregation_strategy='first')

def aggregate_skill_span(results):
    """Aggregate consecutive classified ntities into one.
    """    
    new_results = []
    current_result = results[0]

    for result in results[1:]:
        if result["start"] <= current_result["end"] + 1:
            current_result["word"] += " " + result["word"]
            current_result["end"] = result["end"]
        else:
            new_results.append(current_result)
            current_result = result

    new_results.append(current_result)
    
    # remove invalid skills that are 1 character and none-alphabet (e.g. punctuation or other symbols that are wrongly classifierd)
    new_results = [x for x in new_results if (len(x["word"]) > 1 or x["word"].isalpha())]

    # remove invalid skills that are all numeric
    new_results = [x for x in new_results if (not x["word"].isnumeric())]

    return [x for x in new_results if (len(x["word"]) > 1 or x["word"].isalpha())]

def extract_skill_entities(text):
    """Extract both soft and hard skills
    """   
    # soft skills     
    output_soft_skills = token_soft_skill_classifier(text)
    for result in output_soft_skills:
        if result.get("entity_group"):
            result["entity"] = "Soft Skill"
            del result["entity_group"]
    # hard skills
    output_hard_skills = token_hard_skill_classifier(text)
    for result in output_hard_skills:
        if result.get("entity_group"):
            result["entity"] = "Hard Skill"
            del result["entity_group"]
    # aggregates
    if len(output_soft_skills) > 0:
        output_soft_skills = aggregate_skill_span(output_soft_skills)
    if len(output_hard_skills) > 0:
        output_hard_skills = aggregate_skill_span(output_hard_skills)


    return output_soft_skills, output_hard_skills

# Copy dataframe and create placeholder for soft and hard skills
out_df = df.copy(deep=True)
out_df['soft_skills'] = pd.Series(dtype='string')
out_df['hard_skills'] = pd.Series(dtype='string')

# Loop through each row
score_thres = 0.5
for i, row in out_df.iterrows():
    output_soft_skills, output_hard_skills = extract_skill_entities(row['text'])

    # Extract soft skills
    soft_skills = set()
    for soft_skill in output_soft_skills:
        if soft_skill['score'] > score_thres:
            soft_skills.add(soft_skill['word'])
    if soft_skills:
        out_df.at[i,'soft_skills'] = str(soft_skills)
    
    # Extract hard skills
    hard_skills = set()
    for hard_skill in output_hard_skills:
        if hard_skill['score'] > score_thres:
            hard_skills.add(hard_skill['word'])
    if hard_skills:
        out_df.at[i,'hard_skills'] = str(hard_skills)

    if print_debug:
        print(f'CV ID: {i}')
        print('\tSoft Skills')
        print(soft_skills)
        print('\tHard Skills')
        print(hard_skills)

out_df.to_csv('./output/cv_classified.csv', index=False)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


CV ID: 0
	Soft Skills
['passionate', 'design and develop software solutions', 'gathering user requirements', 'defining system functionality', 'writing code', 'analytical mind', 'problem - solving aptitude', 'work independently', 'organizational and leadership skills']
	Hard Skills
['software', 'java', 'ruby on rails', '. net programming languages', 'jscript. net', 'software development', 'scripting', 'project management', 'system monitoring tools', 'new relic', 'automated testing frameworks', 'selected programming languages', 'c + +', 'java / j2ee platform', 'relational databases', 'mysql', 'nosql databases']
CV ID: 1
	Soft Skills
['analyze large amounts of raw information', 'find patterns', 'conducting full lifecycle analysis', ', activities and design', 'develop analysis and reporting capabilities', 'monitor performance and quality control plans', 'identify improvements', 'analytical mind', 'business acumen', 'math skills', 'problem - solving aptitude', 'communication and presentatio