In [1]:
%run development/libraries/placements-proxy

In [2]:
from pyspark.ml.linalg import VectorUDT

In [3]:
country = 'hk'

directory = 's3a://exploratory/'+country+'/data'
training = spark.read.parquet(directory + '/model_prepared_data_train_v6')
testing = spark.read.parquet(directory + '/model_prepared_data_test_v6')

In [4]:
print(training.count())
print("positive: ",training.filter(col('label')==1).count()*100/training.count())
print("negative: ", training.filter(col('label')==0).count()*100/training.count())

In [5]:
# Select the require features for the model
columns = [
  'salary_deviation_ratio', 
  'education_ratio', 
  'work_location_ratio' , 
  'JsResume_YearOfWorkExperience', 
  'JsResume_AuthorizedToWork',
  'lda_cosine_similarity',
  'lda_hellinger_distance',
  'job_function_1_ratio',
  'job_function_2_ratio',
  'industry_ratio',
  
  'JobAd_EmploymentTermContract', 
  'JobAd_EmploymentTermContractToPerm', 
  'JobAd_EmploymentTermFreelance', 
  'JobAd_EmploymentTermFullTime', 
  'JobAd_EmploymentTermInternship', 
  'JobAd_EmploymentTermNonImmigrantVisa', 
  'JobAd_EmploymentTermPartTime', 
  'JobAd_EmploymentTermPermanent', 
  'JobAd_EmploymentTermTempToPerm', 
  'JobAd_EmploymentTermTemporary'
]

In [6]:
# Balance the dataset with specific ratio for positive and negative signals
training = dataset_balancer(training, ratio=1.)
balanced_testing = dataset_balancer(testing, ratio=1.)

In [7]:
training.groupBy('label').count().show()

In [8]:
# Vectorize features into one single featue 
va = VectorAssembler(inputCols=columns, outputCol="features")

In [9]:
gbt = GBTClassifier(
  maxDepth=2, 
  maxBins=32, 
  minInstancesPerNode=1, 
  minInfoGain=0.0, 
  maxMemoryInMB=256, 
  cacheNodeIds=False, 
  checkpointInterval=10, #cache will get checkpointed (Writing to disk) every 10 iterations
  lossType="logistic", #logloss
  maxIter=200, # number fo trees
  stepSize=0.01, #learning rate 
  seed=123456789, 
  subsamplingRate=1.0
)

gbt_pipeline = Pipeline(stages=[va, gbt])
gbt_model = gbt_pipeline.fit(training)

transformed_training = gbt_model.transform(training)
print('training_score:' ,  BinaryClassificationEvaluator().evaluate(transformed_training))

transformed_balanced_testing = gbt_model.transform(balanced_testing)
print('balanced_testing_score:' ,  BinaryClassificationEvaluator(balancedToggle=True).evaluate(transformed_balanced_testing))

transformed_testing = gbt_model.transform(testing)
print('testing_score:' ,  BinaryClassificationEvaluator().evaluate(transformed_testing))

In [10]:
# check for overfitting : 0.5876366019318441-0.5639991429157845

In [11]:
secondelement=udf(lambda v:float(v[1]),FloatType())

display(
  transformed_training.select(
    'label', 
    secondelement('probability').alias('model_score')
  ).withColumn(
    "probability_rounded",
    round(col('model_score')*100)
  ).groupBy(
    'probability_rounded', 'label'
  ).agg(
    count('*').alias('frequency')
  ).join(
    transformed_training.groupBy('label').count(),
    'label'
  ).withColumn(
    'density', col('frequency')/col('count')
  ).sort('probability_rounded', 'label')
)

label,probability_rounded,frequency,count,density
0,6.0,9,21904,0.0004108838568298028
0,7.0,269,21904,0.0122808619430241
1,7.0,11,21985,0.0005003411416875142
0,8.0,316,21904,0.014426588750913
1,8.0,16,21985,0.000727768933363657
0,9.0,280,21904,0.0127830533235938
1,9.0,14,21985,0.0006367978166931999
0,10.0,257,21904,0.0117330168005843
1,10.0,9,21985,0.0004093700250170571
0,11.0,504,21904,0.0230094959824689


In [12]:
display(
  transformed_testing.select(
    'label', 
    secondelement('probability').alias('model_score')
  ).withColumn(
    "probability_rounded",
    round(col('model_score')*100)
  ).groupBy(
    'probability_rounded', 'label'
  ).agg(
    count('*').alias('frequency')
  ).join(
    transformed_testing.groupBy('label').count(),
    'label'
  ).withColumn(
    'density', col('frequency')/col('count')
  ).sort('probability_rounded', 'label')
)

label,probability_rounded,frequency,count,density
0,6.0,93,247340,0.00037600064688283336
0,7.0,2210,247340,0.0089350691356028
1,7.0,23,28184,0.000816065852966222
0,8.0,2780,247340,0.0112395892294008
1,8.0,30,28184,0.0010644337212602
0,9.0,2587,247340,0.0104592868116762
1,9.0,33,28184,0.0011708770933863
0,10.0,3005,247340,0.0121492682137947
1,10.0,22,28184,0.0007805847289242123
0,11.0,5376,247340,0.0217352632004528


In [13]:
display(
  transformed_training.select(
    'label', 
    secondelement('probability').alias('model_score')
  ).withColumn(
    "probability_rounded",
    round(col('model_score')*100)
  ).groupBy('probability_rounded', 'label').count().sort('probability_rounded', 'label')
)

probability_rounded,label,count
6.0,0,9
7.0,0,269
7.0,1,11
8.0,0,316
8.0,1,16
9.0,0,280
9.0,1,14
10.0,0,257
10.0,1,9
11.0,0,504


In [14]:
display(
  transformed_balanced_testing.select(
    'label', 
    secondelement('probability').alias('model_score')
  ).withColumn(
    "probability_rounded",
    round(col('model_score')*100)
  ).groupBy('probability_rounded', 'label').count().sort('probability_rounded', 'label')
)

probability_rounded,label,count
6.0,0,9
7.0,0,286
7.0,1,23
8.0,0,334
8.0,1,30
9.0,0,279
9.0,1,33
10.0,0,360
10.0,1,22
11.0,0,594


In [15]:
display(
  transformed_testing.select(
    'label', 
    secondelement('probability').alias('model_score')
  ).withColumn(
    "probability_rounded",
    round(col('model_score')*100)
  ).groupBy('probability_rounded', 'label').count().sort('probability_rounded', 'label')
)

probability_rounded,label,count
6.0,0,93
7.0,0,2210
7.0,1,23
8.0,0,2780
8.0,1,30
9.0,0,2587
9.0,1,33
10.0,0,3005
10.0,1,22
11.0,0,5376


In [16]:
from pyspark.sql.window import Window
from pyspark.sql.functions import cume_dist, row_number

windowval = (Window.partitionBy('label').orderBy(desc('probability_rounded'))
             .rangeBetween(Window.unboundedPreceding, 0))

cdf = transformed_training.select(
  'label', 
  secondelement('probability').alias('model_score')
).withColumn(
  "probability_rounded",
  round(col('model_score')*100)
).groupBy(
  'probability_rounded', 'label'
).agg(
  count('*').alias('frequency')
).join(
  transformed_training.groupBy('label').count(),
  'label'
).withColumn(
  'density', col('frequency')/col('count')
).withColumn('cum_density', sum('density').over(windowval)).sort('probability_rounded', 'label')

display(cdf)

label,probability_rounded,frequency,count,density,cum_density
0,6.0,9,21834,0.00041220115416323167,1.0
0,7.0,269,21834,0.0123202344966565,0.9995877988458368
1,7.0,11,21985,0.0005003411416875142,1.0000000000000004
0,8.0,316,21834,0.0144728405239534,0.9872675643491802
1,8.0,16,21985,0.000727768933363657,0.9994996588583128
0,9.0,280,21834,0.0128240359073005,0.9727947238252268
1,9.0,14,21985,0.0006367978166931999,0.9987718899249493
0,10.0,257,21834,0.0117706329577722,0.9599706879179262
1,10.0,9,21985,0.0004093700250170571,0.998135092108256
0,11.0,504,21834,0.0230832646331409,0.948200054960154


In [17]:
from pyspark.sql.window import Window
from pyspark.sql.functions import cume_dist, row_number

windowval = (Window.partitionBy('label').orderBy(desc('probability_rounded'))
             .rangeBetween(Window.unboundedPreceding, 0))

cdf = transformed_testing.select(
  'label', 
  secondelement('probability').alias('model_score')
).withColumn(
  "probability_rounded",
  round(col('model_score')*100)
).groupBy(
  'probability_rounded', 'label'
).agg(
  count('*').alias('frequency')
).join(
  transformed_testing.groupBy('label').count(),
  'label'
).withColumn(
  'density', col('frequency')/col('count')
).withColumn('cum_density', sum('density').over(windowval)).sort('probability_rounded', 'label')

display(cdf)

label,probability_rounded,frequency,count,density,cum_density
0,6.0,93,247340,0.00037600064688283336,1.0
0,7.0,2210,247340,0.0089350691356028,0.9996239993531172
1,7.0,23,28184,0.000816065852966222,1.0000000000000002
0,8.0,2780,247340,0.0112395892294008,0.9906889302175144
1,8.0,30,28184,0.0010644337212602,0.999183934147034
0,9.0,2587,247340,0.0104592868116762,0.9794493409881134
1,9.0,33,28184,0.0011708770933863,0.9981195004257736
0,10.0,3005,247340,0.0121492682137947,0.9689900541764372
1,10.0,22,28184,0.0007805847289242123,0.9969486233323872
0,11.0,5376,247340,0.0217352632004528,0.9568407859626424
