<h2>Create Training Dataset</h2>

In [3]:
#imports
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.layers import Dense,SpatialDropout1D
import contractions
import re 
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# initializing Stop words libraries
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: unable to get local issuer certificate
[nltk_data]     (_ssl.c:1129)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


In [4]:
def clean(desc):
    desc = contractions.fix(desc)
    desc = re.sub("[+/!@.$\'\'':()]", "", desc)
    return desc

In [5]:
def tokenize_and_tag(desc):
    tokens = nltk.word_tokenize(desc.lower())
    filtered_tokens = [w for w in tokens if not w in stop_words]
    tagged = nltk.pos_tag(filtered_tokens)
    return tagged

In [6]:
def extract_POS(tagged):
    #pattern 1
    grammar1 = ('''Noun Phrases: {<DT>?<JJ>*<NN|NNS|NNP>+}''')
    chunkParser = nltk.RegexpParser(grammar1)
    tree1 = chunkParser.parse(tagged)

    # typical noun phrase pattern appending to be concatted later
    g1_chunks = []
    for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
        g1_chunks.append(subtree)
    
    #pattern 2
    grammar2 = ('''NP2: {<IN>?<JJ|NN>*<NNS|NN>} ''')
    chunkParser = nltk.RegexpParser(grammar2)
    tree2 = chunkParser.parse(tagged)

    # variation of a noun phrase pattern to be pickled for later analyses
    g2_chunks = []
    for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
        g2_chunks.append(subtree)
        
    #pattern 3
    grammar3 = (''' VS: {<VBG|VBZ|VBP|VBD|VB|VBN><NNS|NN>*}''')
    chunkParser = nltk.RegexpParser(grammar3)
    tree3 = chunkParser.parse(tagged)

    # verb-noun pattern appending to be concatted later
    g3_chunks = []
    for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
        g3_chunks.append(subtree)
        
        
    # pattern 4
    # any number of a singular or plural noun followed by a comma followed by the same noun, noun, noun pattern
    grammar4 = ('''Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*} ''')
    chunkParser = nltk.RegexpParser(grammar4)
    tree4 = chunkParser.parse(tagged)

    # common pattern of listing skills appending to be concatted later
    g4_chunks = []
    for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
        g4_chunks.append(subtree)
        
    return g1_chunks, g2_chunks, g3_chunks, g4_chunks

In [7]:
def training_set(chunks):
    '''creates a dataframe that easily parsed with the chunks data '''
    df = pd.DataFrame(chunks)    
    df.fillna('X', inplace = True)
    
    train = []
    for row in df.values:
        phrase = ''
        for tup in row:
            # needs a space at the end for seperation
            phrase += tup[0] + ' '
        phrase = ''.join(phrase)
        # could use padding tages but encoder method will provide during 
        # tokenizing/embeddings; X can replace paddding for now
        train.append( phrase.replace('X', '').strip())

    df['phrase'] = train

    #returns 50% of each dataframe to be used if you want to improve execution time
    # return df.phrase.sample(frac = 0.5)
    # Update: only do 50% if running on excel
    return df.phrase

def strip_commas(df):
    '''create new series of individual n-grams'''
    grams = []
    for sen in df:
        sent = sen.split(',')
        for word in sent:
            word = word.strip()
            grams.append(word)
    return pd.Series(grams)

In [8]:
def generate_phrases(desc):
    tagged = tokenize_and_tag(desc)
    g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged)
    c = training_set(g4_chunks)       
    separated_chunks4 = strip_commas(c)
    phrases = pd.concat([training_set(g1_chunks),
                          training_set(g2_chunks), 
                          training_set(g3_chunks),
                          separated_chunks4], 
                            ignore_index = True )
    return phrases

In [9]:
"""Creates corpus from feature column, which is a pandas series"""
def create_corpus(df):
    corpus=[]
    for phrase in tqdm(df):
        words=[word.lower() for word in word_tokenize(phrase) if(word.isalpha()==1)]
        corpus.append(words)
    return corpus

"""Create padded sequences of equal lenght as input to LSTM"""
def create_padded_inputs(corpus):
    MAX_LEN=20
    tokenizer_obj=Tokenizer()
    tokenizer_obj.fit_on_texts(corpus)
    sequences=tokenizer_obj.texts_to_sequences(corpus)

    phrase_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')
    return phrase_pad

In [11]:
def get_predictions(desc):
    #clean
    desc = clean(desc)
    #load model
    model = tf.keras.models.load_model('models/lstm_skill_extractor.h5')
    #tokenize and convert to phrases
    phrases = generate_phrases(desc)
    #preprocess unseen data
    corpus=create_corpus(phrases)
    corpus_pad = create_padded_inputs(corpus)
    #get predicted classes
    predictions = (model.predict(corpus_pad) > 0.65).astype('int32')
    #return predicted skills as list
    out = pd.DataFrame({'Phrase':phrases, 'Class':predictions.ravel()})
    skills = out.loc[out['Class'] == 1]
    return skills['Phrase'].tolist()

<h3>Read Data Science JD in English</h3>

In [12]:
list_data_fields = ['LinkedIn/output/JD_Business Intelligence.csv','LinkedIn/output/JD_Data Analyst.csv',
                    'LinkedIn/output/JD_Data Architect.csv','LinkedIn/output/JD_Data Engineer.csv',
                    'LinkedIn/output/JD_Data Scientist.csv','LinkedIn/output/JD_Business Analyst.csv','LinkedIn/output/JD_Database Administrator.csv']

def read_and_clean_single_role(input_path=''):
    df = pd.read_csv(input_path,sep='|')

    # Drop Null Values for description column
    df = df.dropna(axis=0,subset=['description'])

    # Remove the same description & company & jobTitle & location & type
    df = df.drop_duplicates(subset=['jobTitle','location','company','type','description'],keep='first')

    return df
    
def create_field_DataFrame(input_list = list):
    list_dfs=[]
    for item in input_list:
        print(item)
        df_tmp = read_and_clean_single_role('../'+item)
        list_dfs.append(df_tmp)

    df = pd.concat(list_dfs)
    df = df.reset_index()
    return df

def read_DS_JD_en():
    df_DS = create_field_DataFrame(list_data_fields)
    df_lang = pd.read_csv('ds_job_lang.csv')
    df_DS['lang'] = df_lang['language']
    df_DS = df_DS[df_DS['lang']=='en']
    df_DS
    # df_DS = df_DS.reset_index('id')
    df_DS.reset_index(inplace=True,drop=True)
    return df_DS

In [13]:
df_DS = read_DS_JD_en()
df_DS

LinkedIn/output/JD_Business Intelligence.csv
LinkedIn/output/JD_Data Analyst.csv
LinkedIn/output/JD_Data Architect.csv
LinkedIn/output/JD_Data Engineer.csv
LinkedIn/output/JD_Data Scientist.csv
LinkedIn/output/JD_Business Analyst.csv
LinkedIn/output/JD_Database Administrator.csv


Unnamed: 0,index,id,jobTitle,location,company,type,description,lang
0,0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,About the job\nAbout Agoda\n\nAgoda is an onli...,en
1,1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,About the job\nThis job is sourced from a job ...,en
2,2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,About the job\nAbout Agoda\n\nAgoda is an onli...,en
3,3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",About the job\nThis job is sourced from a job ...,en
4,4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,About the job\nJob Purpose\nThe Job Holder Res...,en
...,...,...,...,...,...,...,...,...
427,16,2990265063,"Database Administrator (DBA), based in Da Nang","Đà Nang, Da Nang City, Vietnam","KMS Technology, Inc.",Full-time · Associate,About the job\nAs a senior DBA you will have t...,en
428,17,3008519916,Junior/Senior Database Administrator (DBA),"Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Full-time · Mid-Senior level,About the job\nAs a senior DBA you will have t...,en
429,18,2698740162,[Local Product] Database Administrator,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Mid-Senior level,About the job\nAbout The Team\n\nThe Product D...,en
430,19,2973389208,"[Local Product] Database Administrator (MySQL,...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Entry level,About the job\nWhat Will You Do\nResponsible f...,en


<h3>Run clean and tokenize</h3>

In [14]:
def clean_tokenize_tag(df: pd.DataFrame):
    df['description'] = df['description'].apply(lambda x: [contractions.fix(word) for word in x.split()])
    df['description'] = [' '.join(map(str, l)) for l in df['description']]
    df['description'] = df['description'].str.lower()
    # df['tokenized_desc'] = df['description'].apply(lambda x: tokenize_and_tag(x))
    return df 


df = clean_tokenize_tag(df_DS)
# df
    



In [15]:
df

Unnamed: 0,index,id,jobTitle,location,company,type,description,lang
0,0,2975673294,Data Engineer in Business Intelligence Team (B...,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Entry level,about the job about agoda agoda is an online t...,en
1,1,2993666500,"Business Intelligence Intern, Kobiton","Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Internship · Internship,about the job this job is sourced from a job b...,en
2,2,2975674182,"Business Intelligence Developer, Product Team ...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Agoda,Full-time · Associate,about the job about agoda agoda is an online t...,en
3,3,2993740820,"Business Intelligence (Edtech Company, Upto 10...","Hanoi, Hanoi, Vietnam",X3english Limited Company,"₫15,000,000/month - ₫23,000,000/month · Full-t...",about the job this job is sourced from a job b...,en
4,4,2945707463,Senior Business Intelligence Analyst (0824),"Hanoi, Hanoi, Vietnam",Techcombank (TCB),Full-time · Associate,about the job job purpose the job holder respo...,en
...,...,...,...,...,...,...,...,...
427,16,2990265063,"Database Administrator (DBA), based in Da Nang","Đà Nang, Da Nang City, Vietnam","KMS Technology, Inc.",Full-time · Associate,about the job as a senior dba you will have to...,en
428,17,3008519916,Junior/Senior Database Administrator (DBA),"Ho Chi Minh City, Ho Chi Minh City, Vietnam","KMS Technology, Inc.",Full-time · Mid-Senior level,about the job as a senior dba you will have to...,en
429,18,2698740162,[Local Product] Database Administrator,"Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Mid-Senior level,about the job about the team the product devel...,en
430,19,2973389208,"[Local Product] Database Administrator (MySQL,...","Ho Chi Minh City, Ho Chi Minh City, Vietnam",Shopee,Full-time · Entry level,about the job what will you do responsible for...,en


<h3>Create chunks_1 and save to Pickle file</h3>

In [18]:
print(df.loc[0]['description'])

about the job about agoda agoda is an online travel booking platform for accommodations, flights, and more. we build and deploy cutting-edge technology that connects travelers with more than 2.5 million accommodations globally. based in asia and part of booking holdings, our 4,000+ employees representing 90+ nationalities foster a work environment rich in diversity, creativity, and collaboration. we innovate through a culture of experimentation and ownership, enhancing the ability for our customers to experience the world. get to know our team the data department oversees all of agoda’s data-related requirements. our ultimate goal is to enable and increase the use of data in the company through creative approaches and the implementation of powerful resources such as operational and analytical databases, queue systems, bi tools, and data science technology. we hire the brightest minds from around the world to take on this challenge and equip them with the knowledge and tools that contri

In [110]:
# text = df.loc[0]['description']
g1_chunks = []
g2_chunks = []
g3_chunks = []
g4_chunks = []
for index,row in df.iterrows():
    g1_chunks = g1_chunks + extract_POS(row['tokenized_desc'])[0]
    g2_chunks = g2_chunks + extract_POS(row['tokenized_desc'])[1]
    g3_chunks = g3_chunks + extract_POS(row['tokenized_desc'])[2]
    g4_chunks = g4_chunks + extract_POS(row['tokenized_desc'])[3]


In [112]:
# print(len(df.loc[0]['tokenized_desc']))
print(len(g1_chunks))
print(len(g2_chunks))
print(len(g3_chunks))
print(len(g4_chunks))


41897
44612
18716
2469


In [116]:
import pickle
with open('../pickles/chunks_1.pickle', 'wb') as fp1:
    pickle.dump(g1_chunks, fp1)
with open('../pickles/chunks_2.pickle', 'wb') as fp2:
    pickle.dump(g2_chunks, fp2)
with open('../pickles/chunks_3.pickle', 'wb') as fp3:
    pickle.dump(g3_chunks, fp3)
with open('../pickles/chunks_4.pickle', 'wb') as fp4:
    pickle.dump(g4_chunks, fp4)


In [121]:
import pickle
infile = open('../pickles/chunks_4.pickle','rb')
new_dict = pickle.load(infile)
infile.close()
print(len(new_dict))

2469


<h3>Get 20% of each chunks to Label</h3>

In [170]:
import random
c1 = g1_chunks
c2 = g2_chunks
c3 = g3_chunks
c4 = g4_chunks

In [171]:
print(len(c1))
print(len(c2))
print(len(c3))
print(len(c4))

41897
44612
18716
2469


<h3>generate phrases from 4 chunks above</h3>

In [172]:
c1_phrases = training_set(c1)
c2_phrases = training_set(c2)
c3_phrases = training_set(c3)
c4_phrases = training_set(c4)
c4_phrases = strip_commas(c4_phrases)

In [139]:
print('1\n',c1_phrases)


1
 0       large disconnected datasets
1                  azure databricks
2                initiative get job
3                              epam
4                          projects
                   ...             
8374                        ability
8375                          enjoy
8376                        ability
8377                 southeast asia
8378                     passionate
Name: phrase, Length: 8379, dtype: object


In [140]:
print('2\n',c2_phrases)


2
 0                                       practices
1                          quarterly company trip
2                                     bring world
3                 career path fun team activities
4       bachelor software development mathematics
                          ...                    
8917                                       inmail
8918                        document requirements
8919                                       hà nội
8920                                  performance
8921                                    able read
Name: phrase, Length: 8922, dtype: object


In [141]:
print('3\n',c3_phrases)


3
 0                                                  access
1                                           gcs data lake
2                                   big data technologies
3                                               positions
4                                             methodology
                              ...                        
3738                                   job accountability
3739                                         technologies
3740                                               injury
3741    talent acquisition specialist job poster location
3742                             quality contribute sales
Name: phrase, Length: 3743, dtype: object


In [142]:
print('4\n',c4_phrases)

4
 0                           digital platform engineering
1                                                    etc
2                                                   work
3                                requirement elicitation
4      define product roadmap vision prioritize features
                             ...                        
488                              actions consumers/users
489                                  sql aws/gcp/azure ‘
490                                          user access
491    possess great knowledge requirement elicitatio...
492                                                  cpd
Length: 493, dtype: object


In [173]:
df_total = pd.concat([c1_phrases,c2_phrases,c3_phrases,c4_phrases],ignore_index=True)

In [174]:
df_total = pd.DataFrame({'Phrase':df_total})

In [175]:
target_list = [0 for i in range(len(df_total))]
df_total['Target'] = target_list

In [177]:
df_total.drop_duplicates(subset=['Phrase'],keep='first',inplace=True)

In [178]:
df_total

Unnamed: 0,Phrase,Target
0,job agoda,0
1,online travel,0
2,platform accommodations,0
3,flights,0
4,deploy cutting-edge technology,0
...,...,...
112227,elk,0
112251,ability understand,0
112398,chi minh,0
112410,levels performance,0


In [158]:
df_total.to_csv('../output/total_phrases.csv',index=False)