In [1]:
# Relative paths of the per-role JD CSV files under LinkedIn/output.
list_data_fields = [
    'LinkedIn/output/JD_Business Intelligence.csv',
    'LinkedIn/output/JD_Data Analyst.csv',
    'LinkedIn/output/JD_Data Architect.csv',
    'LinkedIn/output/JD_Data Engineer.csv',
    'LinkedIn/output/JD_Data Scientist.csv',
    'LinkedIn/output/JD_Business Analyst.csv',
    'LinkedIn/output/JD_Database Administrator.csv',
]


In [2]:
import pandas as pd
def read_and_clean_single_role(input_path=''):
    """Read one pipe-delimited CSV and return its usable, de-duplicated rows.

    Rows without a ``description`` are discarded first. Of the rows that share
    the same jobTitle, location, company, type and description, only the first
    one is kept. The original row labels are left untouched.
    """
    identity_columns = ['jobTitle','location','company','type','description']
    raw = pd.read_csv(input_path, sep='|')
    return (
        raw
        .dropna(axis=0, subset=['description'])
        .drop_duplicates(subset=identity_columns, keep='first')
    )
    
def create_field_DataFrame(input_list=None, base_dir='../'):
    """Load, clean and stack the CSVs of several roles into one DataFrame.

    Parameters
    ----------
    input_list : iterable of str, optional
        CSV paths relative to ``base_dir``. ``None`` is treated as "no files"
        (the previous default was the ``list`` type itself, which cannot be
        iterated and made a no-argument call fail).
    base_dir : str, default '../'
        Prefix prepended verbatim to every path before it is read.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows of every file in input order. ``reset_index()`` is
        called without ``drop``, so the row labels each file had after
        cleaning are kept in an ``index`` column. An empty DataFrame is
        returned when there is nothing to read.
    """
    if input_list is None:
        input_list = []

    list_dfs = []
    for item in input_list:
        # Echo the path so the cell output shows which file is being read.
        print(item)
        list_dfs.append(read_and_clean_single_role(base_dir + item))

    # pd.concat raises on an empty list; hand back an empty frame instead.
    if not list_dfs:
        return pd.DataFrame()

    return pd.concat(list_dfs).reset_index()

In [3]:
# Build one combined frame from every role CSV listed in list_data_fields
# (create_field_DataFrame prints each path as it is read).
df_DS = create_field_DataFrame(list_data_fields)
# Language table; its 'language' column is attached to df_DS in the next cell.
# NOTE(review): presumably one row per df_DS row, in the same order — confirm.
df_lang = pd.read_csv('ds_job_lang.csv')

LinkedIn/output/JD_Business Intelligence.csv
LinkedIn/output/JD_Data Analyst.csv
LinkedIn/output/JD_Data Architect.csv
LinkedIn/output/JD_Data Engineer.csv
LinkedIn/output/JD_Data Scientist.csv
LinkedIn/output/JD_Business Analyst.csv
LinkedIn/output/JD_Database Administrator.csv


In [4]:
# Attach the language code from df_lang (aligned on the row index), keep only
# the rows marked 'en', and renumber the surviving rows from 0.
df_DS = (
    df_DS
    .assign(lang=df_lang['language'])
    .loc[lambda frame: frame['lang'] == 'en']
    .reset_index(drop=True)
)

In [5]:
# Show the frame after the 'en' filter applied in the previous cell.
df_DS

Unnamed: 0,index,id,jobTitle,location,company,type,description,lang
0,0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,About the job\nAbout Agoda\n\nAgoda is an onli...,en
1,1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,About the job\nThis job is sourced from a job ...,en
2,2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,About the job\nAbout Agoda\n\nAgoda is an onli...,en
3,3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",About the job\nThis job is sourced from a job ...,en
4,4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,About the job\nJob Purpose\nThe Job Holder Res...,en
...,...,...,...,...,...,...,...,...
427,16,2990265063,"Database Administrator (DBA), based in Da Nang","Đà Nang, Da Nang City, Vietnam","KMS Technology, Inc.",Full-time · Associate,About the job\nAs a senior DBA you will have t...,en
428,17,3008519916,Junior/Senior Database Administrator (DBA),"Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Full-time · Mid-Senior level,About the job\nAs a senior DBA you will have t...,en
429,18,2698740162,[Local Product] Database Administrator,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Mid-Senior level,About the job\nAbout The Team\n\nThe Product D...,en
430,19,2973389208,"[Local Product] Database Administrator (MySQL,...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Entry level,About the job\nWhat Will You Do\nResponsible f...,en


In [7]:
#imports
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.layers import Dense,SpatialDropout1D
import contractions
import re 
import nltk
# Tokenizer data; fetched up front for the nltk.word_tokenize call in tokenize_and_tag.
nltk.download('punkt')
# POS-tagger data; fetched up front for the nltk.pos_tag call in tokenize_and_tag.
nltk.download('averaged_perceptron_tagger')
# initializing Stop words libraries
nltk.download('stopwords')
from nltk.corpus import stopwords
# Set built from NLTK's English stop-word list; tokenize_and_tag tests tokens against it.
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /Users/vankhaido/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vankhaido/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vankhaido/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def clean(desc):
    """Return `desc` with contractions expanded and selected punctuation removed.

    After `contractions.fix`, every occurrence of ! @ . $ ' : ( ) is deleted
    (not replaced by a space).
    """
    expanded = contractions.fix(desc)
    unwanted = re.compile("[!@.$\'\'':()]")
    return unwanted.sub("", expanded)

def tokenize_and_tag(desc):
    """Lower-case and tokenize `desc`, drop stop words, and POS-tag the rest.

    Returns whatever `nltk.pos_tag` produces for the remaining tokens.
    """
    kept = []
    for token in nltk.word_tokenize(desc.lower()):
        if token in stop_words:
            continue
        kept.append(token)
    return nltk.pos_tag(kept)

# Replace each '!', ';', ',' and whitespace character (newlines included)
# in the descriptions with a single space, then show the frame.
_separator_re = re.compile(r'[!;,\s]')
df_DS['description'] = df_DS['description'].map(lambda text: _separator_re.sub(' ', text))
df_DS

Unnamed: 0,index,id,jobTitle,location,company,type,description,lang
0,0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,About the job About Agoda Agoda is an online ...,en
1,1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,About the job This job is sourced from a job b...,en
2,2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,About the job About Agoda Agoda is an online ...,en
3,3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",About the job This job is sourced from a job b...,en
4,4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,About the job Job Purpose The Job Holder Respo...,en
...,...,...,...,...,...,...,...,...
427,16,2990265063,"Database Administrator (DBA), based in Da Nang","Đà Nang, Da Nang City, Vietnam","KMS Technology, Inc.",Full-time · Associate,About the job As a senior DBA you will have to...,en
428,17,3008519916,Junior/Senior Database Administrator (DBA),"Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Full-time · Mid-Senior level,About the job As a senior DBA you will have to...,en
429,18,2698740162,[Local Product] Database Administrator,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Mid-Senior level,About the job About The Team The Product Deve...,en
430,19,2973389208,"[Local Product] Database Administrator (MySQL,...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Entry level,About the job What Will You Do Responsible for...,en


In [10]:
import re
import pandas as pd
import spacy
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.matcher import Matcher
from spacy import displacy
from IPython.display import HTML, display
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline once; the cells below reuse this `nlp` object.
nlp = en_core_web_sm.load()

In [11]:
def highlight_terms(terms, texts):
    """Display each sentence that contains one of `terms`, with the terms in bold.

    Parameters
    ----------
    terms : iterable of str
        Single-token terms to look for. Matching is case-insensitive; empty
        strings are ignored.
    texts : iterable of str
        Documents to run through the spaCy pipeline.

    Each matching sentence is shown once per document, in document order, as
    HTML followed by a '-----' separator line. Nothing is returned.
    """
    import html  # function-scope import: the notebook's import cells do not load it

    wanted = {term.lower() for term in terms if term}
    if not wanted:
        return

    # re.escape keeps metacharacters in a term literal ("c++" used to raise
    # re.error). Longest alternative first so "c++" wins over "c". Lookarounds
    # replace \b so a term that starts or ends with a non-word character can
    # still match; for ordinary words they behave exactly like \b.
    alternatives = "|".join(re.escape(term) for term in sorted(wanted, key=len, reverse=True))
    pattern = re.compile(fr'(?<!\w)({alternatives})(?!\w)', re.IGNORECASE)

    for doc in nlp.pipe(texts):
        # dict.fromkeys de-duplicates like a set but keeps document order.
        sentences = dict.fromkeys(tok.sent for tok in doc if tok.lower_ in wanted)
        for sentence in sentences:
            text = sentence.text.strip()
            # With one capturing group, split() alternates plain text (even
            # positions) and matched terms (odd positions). Each piece is
            # HTML-escaped so markup inside the source text is shown, not run.
            pieces = pattern.split(text)
            markup = "".join(
                f"<strong>{html.escape(piece, quote=False)}</strong>" if position % 2
                else html.escape(piece, quote=False)
                for position, piece in enumerate(pieces)
            )
            display(HTML(markup))
            print('-----')



In [22]:
# Token patterns for the spaCy Matcher, registered under their rule names.
matcher = Matcher(nlp.vocab)
for rule_name, pattern in {
    # one or more nouns directly followed by the token "skill"
    'skill_noun': [{'POS': 'NOUN', 'OP': '+'}, {'LOWER': 'skill'}],
    # "knowledge" + an adposition + a run of determiners / nouns / proper nouns
    'knowledge_adp': [{'LOWER': 'knowledge'}, {'POS': 'ADP'},
                      {'POS': {'IN': ('DET', 'NOUN', 'PROPN')}, 'OP': '+'}],
}.items():
    matcher.add(rule_name, [pattern])

In [26]:
def show_extraction(examples, *extractors):
    """Render, with displaCy, each sentence that mentions "skill" or "knowledge".

    Parameters
    ----------
    examples : iterable of str
        Texts to run through the spaCy pipeline.
    *extractors : callables
        Each is called with a Doc and must yield ``(label, start, end)``
        triples; the resulting spans (after ``filter_spans``) replace the
        Doc's entities.

    A sentence is rendered at most once across all examples (keyed by its
    text). If a trigger sentence contains no extracted span, the trigger token
    itself is marked with a pink 'MISSING' entity. Extracted spans are coloured
    light green whatever their label is; the previous hard-coded 'SKILL' /
    'KNOWLEDGE' colour keys only applied to spans carrying exactly those
    labels.
    """
    trigger_words = ('skill', 'knowledge')
    seen = set()
    for doc in nlp.pipe(examples):
        doc.ents = filter_spans([Span(doc, start, end, label)
                                 for extractor in extractors
                                 for label, start, end in extractor(doc)])
        for tok in doc:
            if tok.lower_ not in trigger_words:
                continue
            sentence = tok.sent
            if sentence.text in seen:
                continue
            seen.add(sentence.text)
            if not sentence.ents:
                # Nothing was extracted here: flag the trigger word itself.
                doc.ents = list(doc.ents) + [Span(doc, tok.i, tok.i + 1, 'MISSING')]
            # displaCy looks colours up by upper-cased label, so derive the keys
            # from the labels actually present; 'MISSING' is set last so it
            # always stays pink.
            colors = {ent.label_.upper(): 'lightgreen' for ent in sentence.ents}
            colors['MISSING'] = 'pink'
            displacy.render(sentence, style='ent', options={'colors': colors})

def get_extractions(examples, *extractors):
    """Yield one tuple per extracted span across all `examples`.

    Each tuple is ``(span text, example position, span start, span end,
    span label, sentence start, sentence end)``. The pipeline runs with the
    'ner' component disabled and a batch size of 100; overlapping candidate
    spans are resolved by ``filter_spans``.
    """
    # The example position comes from enumerate; passing it through the
    # pipeline as context would be an alternative.
    docs = nlp.pipe(examples, batch_size=100, disable=['ner'])
    for idx, doc in enumerate(docs):
        candidates = []
        for extractor in extractors:
            for label, start, end in extractor(doc):
                candidates.append(Span(doc, start, end, label))
        for ent in filter_spans(candidates):
            sent = ent.root.sent
            yield ent.text, idx, ent.start, ent.end, ent.label_, sent.start, sent.end

In [27]:
# Render the matcher's hits for the first 100 descriptions only.
ads = df_DS['description'].tolist()
show_extraction(ads[:100], matcher)

In [28]:
def extract_df(*extractors, n_max=None, **kwargs):
    """Run the extractors over df_DS descriptions and join each hit to its row.

    Parameters
    ----------
    *extractors : callables
        Forwarded unchanged to ``get_extractions``.
    n_max : int, optional
        Only the first ``n_max`` rows of ``df_DS`` are processed; ``None``
        means all rows.
    **kwargs
        Accepted for backward compatibility; not used.

    Returns
    -------
    pandas.DataFrame
        One row per extracted span (text, docidx, start, end, label,
        sent_start, sent_end) followed by the columns of the matching
        ``df_DS`` row.
    """
    # get_extractions reports docidx as a position, so join against a frame
    # whose index is that same position. Merging on df_DS's own index labels
    # only worked while df_DS happened to have a 0..n-1 index.
    docs_df = df_DS.iloc[:n_max].reset_index(drop=True)
    ent_df = pd.DataFrame(list(get_extractions(docs_df['description'], *extractors)),
                          columns=['text', 'docidx', 'start', 'end', 'label', 'sent_start', 'sent_end'])
    return ent_df.merge(docs_df, how='left', left_on='docidx', right_index=True)

# Run the matcher over every description in df_DS and keep the joined result.
ent_df = extract_df(matcher)

In [29]:
def aggregate_df(df, col=['text']):
    """Count, per group of `col`, the number of distinct `id` values.

    Returns a frame with the grouping column(s) plus ``n_JDs``, sorted by
    ``n_JDs`` from largest to smallest.
    """
    per_group = df.groupby(col).agg(n_JDs=('id', 'nunique'))
    flat = per_group.reset_index()
    return flat.sort_values(['n_JDs'], ascending=False)

In [30]:
# Rank each extracted phrase by the number of distinct job ids it appears in.
aggregate_df(ent_df)


Unnamed: 0,text,n_JDs
191,opportunities Skill,81
87,communication skill,10
60,Knowledge of programming languages,10
138,knowledge of SQL,9
70,Knowledge of the Business Intelligence,9
...,...,...
88,computer knowledge,1
90,data engineers Knowledge,1
91,data structures Knowledge,1
92,date knowledge,1
