In [1]:
%run "./data-pipeline-naive-bayes2"

In [2]:
# Add libraries needed for the ML pipeline
from pyspark.ml import Pipeline
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml.tuning import *
from pyspark.ml.evaluation import *
from pyspark.ml.classification import *

In [3]:
# Optional sanity check: row counts of training_dataset and testing_dataset.
# Each count() triggers a Spark job over the corresponding DataFrame.
for split_dataset in (training_dataset, testing_dataset):
    print(split_dataset.count())

In [4]:
# Pack the three per-feature positive-class probabilities into a single
# `features` vector column, which is what the classifier consumes.
assembler = VectorAssembler(
    inputCols=["p_positive_industry", "p_positive_pitch", "p_positive_salary"],
    outputCol="features")

# VectorAssembler is a plain transformer (nothing is fitted), so the same
# instance is applied to both splits instead of building an identical copy.
train = assembler.transform(training_dataset)
test = assembler.transform(testing_dataset)

# Eyeball the assembled vectors next to the columns they were built from.
display(
  train.select('application_id', 'job_industry_code', 'cand_industry_code', 'label', 'p_positive_industry','p_positive_pitch', 'p_positive_salary', 'features')
)

application_id,job_industry_code,cand_industry_code,label,p_positive_industry,p_positive_pitch,p_positive_salary,features
389560396,36,11.0,0.0,0.437100794168,0.183328717807,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))"
388833851,10,51.0,0.0,0.437100794168,0.183328717807,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))"
388553176,10,0.0,0.0,0.437100794168,0.183328717807,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))"
390344332,51,35.0,0.0,0.437100794168,0.183328717807,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))"
391169388,51,27.0,0.0,0.437100794168,0.316407750511,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))"
389517490,51,58.0,0.0,0.437100794168,0.316407750511,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))"
389014055,22,7.0,0.0,0.437100794168,0.316407750511,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))"
389555649,42,42.0,0.0,0.0626356741498,0.316407750511,0.496363430896,"List(1, 3, List(), List(0.0626356741498, 0.316407750511, 0.496363430896))"
392096843,42,18.0,0.0,0.437100794168,0.183328717807,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))"
391293271,42,1.0,0.0,0.437100794168,0.316407750511,0.496363430896,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))"


In [5]:
# Multinomial Naive Bayes with a smoothing parameter of 1.0, fitted on the
# assembled training split (reads the `features` and `label` columns by default).
nb = NaiveBayes(
    modelType="multinomial",
    smoothing=1.0,
)
model = nb.fit(train)

In [6]:
# Score the held-out split with the fitted model; later cells read the
# `prediction` and `probability` columns from this DataFrame.
predictions = model.transform(test)

In [7]:
# F1 on the test predictions. "f1" is the evaluator's default metric; it is
# spelled out here so the reported number is unambiguous.
evaluator = MulticlassClassificationEvaluator(metricName="f1")
evaluator.evaluate(predictions)

In [8]:
# Area under the ROC curve on the test predictions. "areaUnderROC" is the
# evaluator's default metric; it is spelled out here for clarity.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
evaluator.evaluate(predictions)

In [9]:
# How many test rows were assigned to each predicted class.
(
    predictions
    .groupBy('prediction')
    .count()
    .show()
)

In [10]:
# Confusion table: number of test rows for every (label, prediction) pair.
display(
  predictions
    .groupBy('label', 'prediction')
    .agg(count('*').alias('total'))
)

label,prediction,total
1.0,1.0,61758
0.0,1.0,606333
1.0,0.0,49044
0.0,0.0,549333


In [11]:
# Confusion-matrix cells expressed as a percentage of all scored rows.
df = predictions.select('label','prediction').groupBy('label','prediction').agg(count('*').alias('total'))

# Collect once: every collect() call launches a separate Spark job.
confusion_rows = df.collect()

# Key the counts by (label, prediction). Spark does not guarantee the row
# order of a groupBy result, so positional indexing could silently swap cells.
cell_counts = dict(
    ((row['label'], row['prediction']), row['total']) for row in confusion_rows
)

# A pair that never occurs (e.g. the model predicts a single class) has no
# row in the aggregate, so it defaults to 0 instead of raising IndexError.
v1 = cell_counts.get((1.0, 1.0), 0)  # label 1, predicted 1
v2 = cell_counts.get((0.0, 1.0), 0)  # label 0, predicted 1
v3 = cell_counts.get((1.0, 0.0), 0)  # label 1, predicted 0
v4 = cell_counts.get((0.0, 0.0), 0)  # label 0, predicted 0

# The group counts partition the scored rows, so their sum is the row count;
# this avoids re-running the whole prediction pipeline for predictions.count().
total_count = sum(row['total'] for row in confusion_rows)

# The trailing newlines are concatenated inside print(...): the output is
# unchanged on Python 2, and on Python 3 it no longer fails with
# `None + str`.

# true positives
TP_11 = (float(v1) / float(total_count))*100
print('TP_11: ' + str(TP_11) + '\n')

# true negatives
TN_00 = (float(v4) / float(total_count))*100
print('TN_00: ' + str(TN_00) + '\n' + '\n')

# false positives
FP_01 = (float(v2) / float(total_count))*100
print('FP_01: ' + str(FP_01) + '\n')

# false negatives
FN_10 = (float(v3) / float(total_count))*100
print('FN_10: ' + str(FN_10))

In [12]:

# Per-row log loss, built up one derived column at a time.
# positive_probability, negative_probability, score and binary_logloss are not
# defined in the cells shown here -- presumably column helpers (UDFs) brought
# in by the %run notebook; verify their definitions there.
predictions_logloss = predictions.select(
  'application_id', 'label', 'features', 'prediction', 'probability'
)

# Split the model's probability vector into its two class probabilities.
predictions_logloss = predictions_logloss.withColumn(
  'positive_probability', positive_probability(predictions['probability'])
)
predictions_logloss = predictions_logloss.withColumn(
  'negative_probability', negative_probability(predictions['probability'])
)

# Derive a single score from the two class probabilities.
predictions_logloss = predictions_logloss.withColumn(
  'score',
  score(
    predictions_logloss['positive_probability'],
    predictions_logloss['negative_probability']
  )
)

# Log loss of that score against the true label.
predictions_logloss = predictions_logloss.withColumn(
  'logloss',
  binary_logloss(predictions_logloss['label'], predictions_logloss['score'])
)

# Mean log loss over all scored rows.
predictions_logloss.agg(avg('logloss')).show()

In [13]:
# Mean of the `score` column over all scored rows.
predictions_logloss.agg(avg('score')).show()

In [14]:
# Inspect the per-row probabilities, score and log loss.
display(predictions_logloss)

application_id,label,features,prediction,probability,positive_probability,negative_probability,score,logloss
399883966,0.0,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))",1.0,"List(1, 2, List(), List(0.499760861055633, 0.500239138944367))",0.500239138944367,0.499760861055633,0.500239138944367,0.6936255728600305
393171301,0.0,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))",0.0,"List(1, 2, List(), List(0.5009594431388954, 0.4990405568611047))",0.4990405568611047,0.5009594431388954,0.4990405568611047,0.6912301329926198
397455095,0.0,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))",0.0,"List(1, 2, List(), List(0.5009594431388954, 0.4990405568611047))",0.4990405568611047,0.5009594431388954,0.4990405568611047,0.6912301329926198
399252377,0.0,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))",0.0,"List(1, 2, List(), List(0.5009594431388954, 0.4990405568611047))",0.4990405568611047,0.5009594431388954,0.4990405568611047,0.6912301329926198
398321736,0.0,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))",1.0,"List(1, 2, List(), List(0.499760861055633, 0.500239138944367))",0.500239138944367,0.499760861055633,0.500239138944367,0.6936255728600305
393832482,0.0,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))",0.0,"List(1, 2, List(), List(0.5009594431388954, 0.4990405568611047))",0.4990405568611047,0.5009594431388954,0.4990405568611047,0.6912301329926198
398341887,0.0,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))",1.0,"List(1, 2, List(), List(0.499760861055633, 0.500239138944367))",0.500239138944367,0.499760861055633,0.500239138944367,0.6936255728600305
396897212,0.0,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))",1.0,"List(1, 2, List(), List(0.499760861055633, 0.500239138944367))",0.500239138944367,0.499760861055633,0.500239138944367,0.6936255728600305
393696224,0.0,"List(1, 3, List(), List(0.437100794168, 0.183328717807, 0.496363430896))",0.0,"List(1, 2, List(), List(0.5009594431388954, 0.4990405568611047))",0.4990405568611047,0.5009594431388954,0.4990405568611047,0.6912301329926198
391130649,0.0,"List(1, 3, List(), List(0.437100794168, 0.316407750511, 0.496363430896))",1.0,"List(1, 2, List(), List(0.499760861055633, 0.500239138944367))",0.500239138944367,0.499760861055633,0.500239138944367,0.6936255728600305
