In [1]:
%run development/libraries/placements-proxy


In [2]:
def array_deduplicates(arr):
  if arr:
    return list(set(arr))
  else:
    return []
array_deduplicates = udf(array_deduplicates, ArrayType(StringType()))

In [3]:
directory = 's3a://sa-matching-dev/placements_proxy_jobsdb/exploratory/hk/data'
country = 'hk'

In [4]:
train = spark.read.parquet("s3a://exploratory/"+country+"/data/train_v4")

In [5]:
display(train.select('Job_Industry').limit(5))

JsGlobal_JobSeekerWorkExperience_Industry
"List(49, 17)"
"List(65, 65)"
"List(66, 39, 66, 41, 66)"
""
"List(41, 2)"


In [6]:
display(train.select(array_deduplicates('Job_Industry')).limit(5))

array_deduplicates(JsGlobal_JobSeekerWorkExperience_Industry)
"List(17, 49)"
List(65)
"List(39, 66, 41)"
List()
"List(2, 41)"


In [7]:
industry_dataset = train.select(
  'JobAd_IndustryId',
  array_deduplicates('Job_Industry').alias('Job_Industry'),
  'label'
).select(
  explode('Candidate_Industry').alias('Candidate_Industry'),
  'JobAd_IndustryId',
  'label'
)

## Breakdown the ratio function

In [9]:
def generate_ratio_lookup(dataset, left_col, right_col, label_col='label', ratio_col='ratio', min_sample=20):
    base_positive = dataset.filter(
      col(label_col) == 1
    ).groupBy(
      left_col,
      right_col
    ).agg(count("*").alias('total_lr')).join(
      dataset.filter(col(label_col) == 1).select(col(left_col).alias('l_lookup')).groupBy('l_lookup').agg(count('*').alias('total_l')),
      [col(left_col) == col('l_lookup')]
    ).withColumn('ratio_p', col('total_lr') / col('total_l')).filter(col('total_l') > min_sample)
    
    base_negative = dataset.filter(
    col(label_col) == 0
  ).groupBy(
    left_col,
    right_col
  ).agg(count("*").alias('total_lr')).join(
    dataset.filter(col(label_col) == 0).select(col(left_col).alias('l_lookup')).groupBy('l_lookup').agg(count('*').alias('total_l')),
    [col(left_col) == col('l_lookup')]
  ).withColumn('ratio_n', col('total_lr') / col('total_l')).filter(col('total_l') > min_sample)
    
    base_lookup = base_positive.select(
      left_col,
      right_col,
      'ratio_p'
    ).join(
      base_negative.select(
        left_col,
        right_col,
        'ratio_n'
      ), 
      on=[left_col,right_col],
      how="full_outer"
    ).fillna(
      {'ratio_p' : 0, 'ratio_n' : 0}
    ).withColumn(
      ratio_col, 
      col('ratio_p') - col('ratio_n')
    ).select(left_col,right_col,ratio_col)
    
    return base_lookup

In [10]:
#only positive
display(generate_ratio_lookup(industry_dataset , 'Candidate_Industry' , 'JobAd_IndustryId' , label_col='label', ratio_col='ratio', min_sample=20))

JsGlobal_JobSeekerWorkExperience_Industry,JobAd_IndustryId,total_lr,l_lookup,total_l,ratio_p
20,64,6,20,392,0.0153061224489795
49,17,1,49,767,0.0013037809647979
56,33,3,56,526,0.0057034220532319
2,22,2,2,1861,0.0010746910263299
1,41,5,1,1196,0.0041806020066889
41,31,13,41,530,0.0245283018867924
2,40,1,2,1861,0.0005373455131649651
58,62,1,58,181,0.005524861878453
37,24,95,37,797,0.1191969887076537
33,66,62,33,906,0.0684326710816777


In [11]:
def generate_ratio_lookup_negative(dataset, left_col, right_col, label_col='label', ratio_col='ratio', min_sample=20):

    base_negative = dataset.filter(
      col(label_col) == 0
    ).groupBy(
      left_col,
      right_col
    ).agg(count("*").alias('total_lr')).join(
      dataset.filter(col(label_col) == 0).select(col(left_col).alias('l_lookup')).groupBy('l_lookup').agg(count('*').alias('total_l')),
      [col(left_col) == col('l_lookup')]
    ).withColumn('ratio_n', col('total_lr') / col('total_l')).filter(col('total_l') > min_sample)

   
    return base_negative

In [12]:
#only negative
display(generate_ratio_lookup_negative(industry_dataset , 'JsGlobal_JobSeekerWorkExperience_Industry' , 'JobAd_IndustryId' , label_col='label', ratio_col='ratio', min_sample=20))

JsGlobal_JobSeekerWorkExperience_Industry,JobAd_IndustryId,total_lr,l_lookup,total_l,ratio_n
2,40,45,2,18076,0.0024894888249612
41,31,149,41,4093,0.0364036159296359
1,41,108,1,11937,0.0090474993717014
20,64,40,20,3906,0.0102406554019457
56,33,59,56,4656,0.0126718213058419
49,17,19,49,6470,0.002936630602782
58,62,20,58,1139,0.0175592625109745
3,49,2,3,126,0.0158730158730158
2,22,1,2,18076,5.532197388802833e-05
55,41,2,55,502,0.0039840637450199
