In [14]:
import pandas as pd 
import numpy as np
import nltk
import re
import string
import matplotlib.pyplot as plt
import math

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist


nltk.download('stopwords')
nltk.download('tagsets')


[nltk_data] Downloading package stopwords to /Users/boluo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package tagsets to /Users/boluo/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [15]:
job_df = pd.read_csv('TalentConnectWithIndustries.csv')
job_df = job_df.rename(columns={"Job Title": "job-title","Unnamed: 0": "index","#": "source","Employer":"company","Internship/Project Description":"Job Description"})
job_df.loc[:,'source'] = 'TalentConnect'
job_df.columns

Index(['index', 'source', 'job-title', 'ID', 'company', 'Vacancies',
       'Estimated Start Date', 'Estimated End Date', 'Job Description',
       'Learning Outcomes', 'Finance', 'Healthcare', 'Supply, Logistics',
       'Retail, Marketing', 'Research', 'Public Sector'],
      dtype='object')

In [16]:
stop = stopwords.words('english')
other_stop_words = ['skill','join','center','increase','imporve','turning','work','tool','postion','within','big','main','description','part','people','make','nation','come','Singapore','insight','strong','customer','role','develop','requirement','quality','working','support','solution','provide','knowledge','reporting','problem','platform','job','key','performance','external','bi','build','year','opportunity','excellent','good','issue','technical','improvement','internal','eg','etl','information','required','preferred','including','help','also','perform','understand','set','understanding','identify','solving','using','report','new','ability','source','growth','various','industry','etc','well','looking','da','d2c','field','use','trend','sources','able','us']
stop = stop + other_stop_words
def clean_text(text,stop):
	text = text.lower()
	text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
	text = re.sub(r'[0-9]+', '',text)
	text = ' '.join(text.split())

	cleaned = word_tokenize(text)
	tokens = [x for x in cleaned if x not in stop]
	return tokens

In [17]:
def lemming(tokens):
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(x) for x in tokens]
    return lem

In [18]:
def stemming(tokens):
    stemmer = SnowballStemmer(language='english')
    stems = [stemmer.stem(x) for x in tokens]
    return stems

In [19]:
job_titles = ['data engineer','business analyst','data analyst','data scientist','machine learning engineer']
job_titles_split = [x.split(' ')  for x in job_titles]
job_titles_split[3].append('science')
print(job_titles)

['data engineer', 'business analyst', 'data analyst', 'data scientist', 'machine learning engineer']


In [20]:
def find_category(title,keywords):
    category = -1
    score = 0
    for i in range(len(job_titles_split)):
        score_1 = 0
        for word in title:
            if (word in job_titles_split[i]):
                score_1 = score_1+1
        score_1 = score_1/len(job_titles_split[i])
        score_2 = 0
        for word in job_titles_split[i]:
            if(word in keywords.keys()):
                score_2+=keywords[word]
        if(score_1+score_2>score):
            score = score_1 +score_2
            category =i
        score_1=0
        score_2=0
    return category

In [21]:
def normalize_title(text,keywords):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    text = re.sub(r'[0-9]+', '',text)
    text = " ".join(text.split())
    tokens = word_tokenize(text)
    tagged = nltk.pos_tag(lemming(tokens))
    new_title = []
    for lem in tagged:
        if(lem[1] in ['NN','NNS','JJ']):
            new_title.append(lem[0])
    category = find_category(new_title,keywords)
    return job_titles[category]

In [22]:
cleaned_text = []
tokens = []
lems = []
stems = []
tagged = []
all_tags = []

description = 'Job Description'
title = 'job-title'

for index, row in job_df.iterrows():
    cur = clean_text(job_df[description][index],stop)
    lem_cur = lemming(cur)
    stem_cur = stemming(cur)
    with_tag = nltk.pos_tag(lem_cur)

    tokens.append(cur)
    lems.append(lem_cur)
    stems.append(stem_cur)
    tagged.append(with_tag)
    all_tags.extend(with_tag)
    temp = []

    grammar = ['NN','NNS','VB','VBG','VBD','VBN','VBP','VBZ']
    for x in with_tag:
        if(x[1] in grammar):
            temp.append(x[0])
    temp = ' '.join(temp)
    cleaned_text.append(temp)

job_df['tokens'] = tokens
job_df['lems'] = lems
job_df['tagged'] = tagged
job_df['stems'] = stems
job_df['cleaned_text'] = cleaned_text

In [23]:
vocabulary = job_df['cleaned_text']
def vectorize(vocabulary,max_df,stop,low,high):
    cv = CountVectorizer(max_df=max_df,stop_words=stop,analyzer='word',ngram_range=(low,high))
    count_vector = cv.fit_transform(vocabulary)
    feature_names = cv.get_feature_names()
    tfidf_transformer = TfidfTransformer(smooth_idf = True,use_idf = True)
    tfidf_vector = tfidf_transformer.fit(count_vector)

    sum_words = count_vector.sum(axis = 0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return cv, tfidf_transformer, feature_names, tfidf_vector, words_freq

cv, tfidf_transformer, feature_names, tfidf_vector, words_freq = vectorize(vocabulary,0.6,stop,1,2)

#cos_sim = cosine_similarity(tfidf_vector, tfidf_vector)
#words_freq[:20]

In [24]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col,coo_matrix.data)
    return sorted(tuples,key = lambda x:(x[1],x[0]),reverse = True)

def extract_topn(feature_names,sorted_words,topn):
    n = topn
    if (n<1): n = math.floor(len(sorted_words)*topn)
    sorted_words = sorted_words[:n]
    score_vals = []
    feature_vals = []
    for index, score in sorted_words:
        score_vals.append(round(score,3))
        feature_vals.append(feature_names[index])
    results = {}
    for index in range(len(feature_vals)):
        results[feature_vals[index]] = score_vals[index]
    return results


key_words = []
title_norm = []
for index, row in job_df.iterrows():
    doc = job_df['cleaned_text'][index]
    tfidf_vector = tfidf_transformer.transform(cv.transform([doc]))
    sorted_words = sort_coo(tfidf_vector.tocoo())
    keywords = extract_topn(feature_names,sorted_words,0.25)
    key_words.append(keywords)
    title_norm.append(normalize_title(job_df[title][index],keywords))
job_df['normalized_title'] = title_norm
job_df['key_words'] = key_words
job_df.to_csv('TalentConnect.csv')


In [25]:
job_df

Unnamed: 0,index,source,job-title,ID,company,Vacancies,Estimated Start Date,Estimated End Date,Job Description,Learning Outcomes,...,"Retail, Marketing",Research,Public Sector,tokens,lems,tagged,stems,cleaned_text,normalized_title,key_words
0,1,TalentConnect,College Intern - Data Science (Remote),67596,HP SINGAPORE PTE. LTD,3,2021-06-01,2021-12-27,Team Description: This position is with the D...,Team Description: This position is with the D...,...,False,False,False,"[team, position, data, analytics, team, smart,...","[team, position, data, analytics, team, smart,...","[(team, NN), (position, NN), (data, NNS), (ana...","[team, posit, data, analyt, team, smart, manuf...",team position data analytics team manufacturin...,data scientist,"{'smarc': 0.208, 'manufacturing': 0.186, 'anal..."
1,2,TalentConnect,Junior Data Scientist,66426,Rio Tinto,2,2021-05-03,2021-12-31,This role is a great opportunity to be involve...,This role is a great opportunity to be involve...,...,True,False,False,"[great, involved, delivery, analytics, proof, ...","[great, involved, delivery, analytics, proof, ...","[(great, JJ), (involved, JJ), (delivery, NN), ...","[great, involv, deliveri, analyt, proof, conce...",delivery analytics proof concept play executio...,data scientist,"{'science': 0.162, 'data science': 0.16, 'code..."
2,3,TalentConnect,Data Science_AMTNP,68454,GlaxoSmithKline / Glaxo Wellcome Manufacturing...,1,2021-07-05,2021-12-31,Modern manufacturing is no longer just about f...,Modern manufacturing is no longer just about f...,...,False,False,False,"[modern, manufacturing, longer, finding, ways,...","[modern, manufacturing, longer, finding, way, ...","[(modern, JJ), (manufacturing, NN), (longer, R...","[modern, manufactur, longer, find, way, oper, ...",manufacturing finding way operate reducing exp...,data scientist,"{'site': 0.26, 'manufacturing': 0.2, 'way': 0...."
3,4,TalentConnect,Digital Performance Management_NPAMT,68455,GlaxoSmithKline / Glaxo Wellcome Manufacturing...,1,2021-07-05,2021-12-31,"In alignment with the digitalisation, data and...","In alignment with the digitalisation, data and...",...,False,False,False,"[alignment, digitalisation, data, analytics, s...","[alignment, digitalisation, data, analytics, s...","[(alignment, JJ), (digitalisation, NN), (data,...","[align, digitalis, data, analyt, strategi, exi...",digitalisation data analytics strategy exists ...,machine learning engineer,"{'phase': 0.297, 'dashboard': 0.21, 'student':..."
4,5,TalentConnect,Digital Twin_NPAMT,68465,GlaxoSmithKline / Glaxo Wellcome Manufacturing...,1,2021-07-05,2021-12-31,Opportunity Statement Dolutegravir (or “DTG”)...,Opportunity Statement Dolutegravir (or “DTG”)...,...,False,False,False,"[statement, dolutegravir, “, dtg, ”, first, ap...","[statement, dolutegravir, “, dtg, ”, first, ap...","[(statement, NN), (dolutegravir, NN), (“, NNP)...","[statement, dolutegravir, “, dtg, ”, first, ap...",statement dolutegravir dtg approved state food...,machine learning engineer,"{'twin': 0.368, 'plant': 0.24, 'campaign': 0.2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,147,TalentConnect,Intern - Data Science (ref: 342612),172146,Glaxo Wellcome Manufacturing Pte Ltd,1,2023-01-09,2023-05-26,Office Location: Jurong Hiring Manager: AMT S...,Office Location: Jurong Hiring Manager: AMT S...,...,False,False,False,"[office, location, jurong, hiring, manager, am...","[office, location, jurong, hiring, manager, am...","[(office, NN), (location, NN), (jurong, IN), (...","[offic, locat, jurong, hire, manag, amt, singa...",office location hiring manager amt singapore p...,data scientist,"{'gsk': 0.213, 'disease': 0.199, 'place': 0.16..."
147,148,TalentConnect,Digital Platform & Integration (Data Science I...,176054,ENGIE SERVICES SINGAPORE PTE. LTD.,1,2022-12-19,2023-05-31,Job Description: ·Provide detailed documentat...,Job Description: ·Provide detailed documentat...,...,False,False,False,"[·provide, detailed, documentation, capture, p...","[·provide, detailed, documentation, capture, p...","[(·provide, RB), (detailed, JJ), (documentatio...","[·provid, detail, document, captur, project, r...",documentation capture project requirement stan...,data scientist,"{'format': 0.15, 'data format': 0.15, 'configu..."
148,149,TalentConnect,System Analyst Associate,176797,Maha Chemicals (Asia) Pte Ltd,1,2022-12-01,2023-05-01,Responsibilities: - Draw flow charts to visual...,Responsibilities: - Draw flow charts to visual...,...,False,False,False,"[responsibilities, draw, flow, charts, visuali...","[responsibility, draw, flow, chart, visualize,...","[(responsibility, NN), (draw, NN), (flow, JJ),...","[respons, draw, flow, chart, visual, workflow,...",responsibility draw chart visualize process ga...,business analyst,"{'visualize process': 0.158, 'user test': 0.15..."
149,150,TalentConnect,"Intern – Sales Operations, APJ",176941,Thermo Fisher Scientific Ltd,1,2023-01-03,2023-06-05,How Will You Make an Impact? The Sales Operat...,How Will You Make an Impact? The Sales Operat...,...,True,False,False,"[impact, sales, operations, intern, apj, suppo...","[impact, sale, operation, intern, apj, support...","[(impact, JJ), (sale, NN), (operation, NN), (i...","[impact, sale, oper, intern, apj, support, ove...",sale operation apj supporting life science pro...,business analyst,"{'sale': 0.465, 'sale operation': 0.302, 'anal..."


filter by job_title by finding all jobs containing the tokens of the preferred job_title