In [1]:
%run development/libraries/placements-proxy

In [2]:
def remove_html(anystring):
  """Return `anystring` with HTML entities decoded and tags replaced by spaces.

  Steps, in order:
    1. decode HTML entities (e.g. ``&amp;`` -> ``&``, ``&nbsp;`` -> U+00A0);
    2. replace every ``<...>`` tag (non-greedy, single line) with one space;
    3. replace non-breaking spaces (U+00A0) with a regular space.

  Falsy input (None or the empty string) is returned unchanged.
  """
  if not anystring:
    return anystring

  # HTMLParser.unescape was deprecated in Python 3.4 and removed in 3.9;
  # html.unescape is the supported replacement. The fallback keeps the
  # function working on interpreters that do not ship html.unescape.
  try:
    from html import unescape
  except ImportError:
    unescape = HTMLParser().unescape

  anystring = unescape(anystring)
  # Entities are decoded first, so escaped tags such as "&lt;b&gt;" are
  # stripped as well.
  anystring = re.sub(r"<.*?>", " ", anystring)
  anystring = re.sub("\xa0", " ", anystring)
  return anystring

# Rebind the name to a UDF with a StringType return type so the cleaner can be
# applied to DataFrame columns; the plain Python function is no longer
# reachable under this name after this line.
remove_html = udf(remove_html , StringType())

In [3]:
# Country code that selects which dataset under the bucket is read.
country = 'hk'

# Location of the parquet training set for that country.
train_path = "s3a://exploratory/" + country + "/data/train_v4"
train = spark.read.parquet(train_path)

In [4]:
def process_profile_text(JsGlobal_JobSeekerWorkExperience_Position, JsGlobal_JobSeekerWorkExperience_JobDuties, JsGlobal_JobSeekerSkill_Skill):
  """Flatten a job seeker's work experience and skills into one text string.

  Args:
    JsGlobal_JobSeekerWorkExperience_Position: sequence of position titles, or None.
    JsGlobal_JobSeekerWorkExperience_JobDuties: sequence of duty descriptions,
      expected to be index-aligned with the positions, or None.
    JsGlobal_JobSeekerSkill_Skill: sequence of skill names, or None.

  Returns:
    A string made of "<position> <duties> " for every experience entry,
    followed by the skills joined with ",". When positions and duties are both
    present but differ in length they cannot be paired, so the experience part
    is a single space. Returns '' when nothing is available.
  """
  positions = JsGlobal_JobSeekerWorkExperience_Position
  duties = JsGlobal_JobSeekerWorkExperience_JobDuties
  skills = JsGlobal_JobSeekerSkill_Skill

  text = ''
  if positions and duties:
    if len(positions) == len(duties):
      for position, duty in zip(positions, duties):
        # Separate position and duties with a space so the last word of one
        # and the first word of the other are not fused into a single token.
        # `or ''` tolerates null elements inside the arrays.
        text += (position or '') + ' ' + (duty or '') + ' '
    else:
      text = ' '
  if skills:
    # Skip null skills; str.join raises TypeError on None.
    text += ",".join(skill for skill in skills if skill)
  return text

# Rebind the profile flattener to a UDF with a StringType return type.
process_profile_text = udf(process_profile_text, StringType())

# UDF that joins a list of strings with ", "; a null or empty list yields ''.
concat_list = udf(lambda items: ", ".join(items) if items else '')

In [5]:
# `lit` supplies the literal space used as a separator between concatenated
# fields below.
from pyspark.sql.functions import lit

# Candidate profile: one text per distinct (job seeker, positions, duties, skills).
profile_text = train.select(
  'JobSeekerId',
  'JsGlobal_JobSeekerWorkExperience_Position',
  'JsGlobal_JobSeekerWorkExperience_JobDuties',
  'JsGlobal_JobSeekerSkill_Skill'
).dropDuplicates().select(
  process_profile_text(
    'JsGlobal_JobSeekerWorkExperience_Position',
    'JsGlobal_JobSeekerWorkExperience_JobDuties',
    'JsGlobal_JobSeekerSkill_Skill'
  ).alias('text')
)

# Candidate online resume: positions followed by the HTML-stripped skills content.
# A space separates the two parts so adjacent words are not fused into one token.
# concat() yields null when any input is null; those rows are dropped.
resume_text = train.select(
  'ResumeId',
  'JsResume_WorkExperience_Position',
  'JsResume_SkillsContent',
).dropDuplicates().select(
  concat(
    concat_list('JsResume_WorkExperience_Position'),
    lit(' '),
    remove_html('JsResume_SkillsContent')
  ).alias('text')
).dropna(subset=['text'])

# Job ads: title followed by the HTML-stripped description/requirements.
# Null texts are dropped here too, matching resume_text, so that no null
# document is handed to the tokenizer when the pipeline is fitted.
jobads_text = train.select(
  'JobAdId',
  'JobAd_JobTitle',
  'JobAd_JobDescriptionRequirement'
).dropDuplicates().select(
  concat(
    'JobAd_JobTitle',
    lit(' '),
    remove_html('JobAd_JobDescriptionRequirement')
  ).alias('text')
).dropna(subset=['text'])

# Corpus: the three single-column ('text') frames stacked together.
text_df = profile_text.union(resume_text).union(jobads_text)

In [6]:
# Row counts, printed in order: profiles, resumes, job ads, combined corpus.
for corpus_part in (profile_text, resume_text, jobads_text, text_df):
  print(corpus_part.count())

In [7]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StopWordsRemover
# Classes used below; imported here so the cell does not rely on names
# leaking in from elsewhere.
from pyspark.ml.feature import CountVectorizer, RegexTokenizer
from pyspark.ml.clustering import LDA
from pyspark.ml import Pipeline

# Tunables for the topic model, gathered in one place.
VOCAB_SIZE = 50000      # maximum vocabulary size kept by CountVectorizer
MIN_DOC_FREQ = 3        # a term must appear in at least this many documents
NUM_TOPICS = 500        # number of LDA topics (k)
LDA_MAX_ITER = 600      # LDA iterations
LDA_SEED = 42           # fixed seed so repeated fits are reproducible

# Raw string: "\s" and "\W" are regex escapes, not Python string escapes.
# The pattern is a character class, so the tokenizer splits on any single
# whitespace or non-word character; tokens shorter than 2 characters
# (including the empty strings between adjacent delimiters) are discarded.
regex_tokenizer = RegexTokenizer(
  inputCol='text',
  outputCol="tokenized_text",
  pattern=r"[\s,(),\W+]",
  minTokenLength=2
)
removal = StopWordsRemover(inputCol="tokenized_text", outputCol="text_filtered")
cv = CountVectorizer(
  inputCol='text_filtered',
  outputCol='desc_count',
  vocabSize=VOCAB_SIZE,
  minDF=MIN_DOC_FREQ
)
idf = IDF(inputCol="desc_count", outputCol="desc_features")
lda = LDA(
  k=NUM_TOPICS,
  featuresCol='desc_features',
  maxIter=LDA_MAX_ITER,
  seed=LDA_SEED
)

# text -> tokens -> stop-word filtering -> term counts -> TF-IDF -> LDA
stages = [regex_tokenizer, removal, cv, idf, lda]

pipeline = Pipeline(stages=stages)
ldaModel = pipeline.fit(text_df)

In [8]:
# Apply the fitted pipeline to the training frame.
# NOTE(review): the pipeline's first stage reads a column named 'text', but the
# columns selected from `train` elsewhere in this notebook do not include one —
# confirm `train` really has a 'text' column, or build it before transforming.
# NOTE(review): the following cell reads 'job_topics' and 'cand_topics'; no
# stage visible in this notebook is configured to produce columns with those
# names — TODO confirm where they come from.
ldf_df = ldaModel.transform(train)

In [9]:
def cosine_similarity(vec1, vec2):
  """Cosine similarity between two vectors.

  Args:
    vec1, vec2: vector objects exposing ``.dot`` (numpy arrays work; Spark ML
      vectors are presumably what the caller passes), or None.

  Returns:
    ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a Python float;
    0.0 when either vector has zero norm;
    None when either input is None, so a missing vector yields a null result
    instead of raising inside the UDF.
  """
  if vec1 is None or vec2 is None:
    return None

  dotprod = vec1.dot(vec2)
  norm1 = float(numpy.sqrt(vec1.dot(vec1)))
  norm2 = float(numpy.sqrt(vec2.dot(vec2)))
  if norm1 == 0 or norm2 == 0:
    # The cosine is undefined for a zero vector; report "no similarity".
    return 0.0
  # float() accepts both numpy scalars and plain Python numbers.
  return float(dotprod / (norm1 * norm2))
# Rebind the name to a UDF with a FloatType return type so it can be applied to DataFrame columns.
cosine_similarity = udf(cosine_similarity, FloatType())

def hellinger_distance(vec1, vec2):
  """Hellinger-based similarity between two non-negative vectors.

  Despite the name, the value returned is ``1 - H`` where
  ``H = sqrt(sum((sqrt(vec1) - sqrt(vec2)) ** 2)) / sqrt(2)``,
  so larger values mean the vectors are closer.
  NOTE(review): the result lies in [0, 1] only if both inputs are probability
  distributions (non-negative, summing to 1) — confirm with the caller.

  Args:
    vec1, vec2: array-like vectors of equal length, or None.

  Returns:
    ``1 - H`` as a Python float;
    0.0 when either vector has no non-zero entry;
    None when either input is None, so a missing vector yields a null result
    instead of raising inside the UDF.
  """
  if vec1 is None or vec2 is None:
    return None
  if (numpy.count_nonzero(vec1) == 0) or (numpy.count_nonzero(vec2) == 0):
    return 0.0

  root_diff = numpy.sqrt(vec1) - numpy.sqrt(vec2)
  distance = numpy.sqrt(numpy.sum(root_diff ** 2)) / numpy.sqrt(2)
  # Convert the numpy scalar to an ordinary Python float.
  return float(1 - distance)
# Rebind the name to a UDF with a FloatType return type so it can be applied to DataFrame columns.
hellinger_distance = udf(hellinger_distance, FloatType())

In [10]:
# Column expressions for the two vectors compared on every row.
job_vec = col('job_topics')
cand_vec = col('cand_topics')

# One row per 'Id' with both scores computed from the same pair of vectors.
temps_ds = ldf_df.select(
  'Id',
  cosine_similarity(job_vec, cand_vec).alias('lda_cosine_similarity'),
  hellinger_distance(job_vec, cand_vec).alias('lda_hellinger_distance'),
)