In [2]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col, lit, udf
from pyspark.ml.classification import NaiveBayes

In [3]:
# Input locations on Google Cloud Storage.
# featuresPath: Parquet output of the feature-extraction step.
# labelsPath:   Avro file whose rows are joined on person_id as positive labels.
featuresPath = "gs://bigdataween-ngo/dataproc-results/features.parquet"
labelsPath = "gs://bigdataween-ngo/death.avro"

In [5]:
# Obtain the Spark session explicitly instead of relying on a kernel-injected
# `spark` global, so this cell also works after "Restart Kernel -> Run All".
# getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.getOrCreate()

# Load the features extracted in the Extract-features notebook
features = spark.read.parquet(featuresPath)
# Load the labels (reading Avro requires the spark-avro data source to be
# available on the cluster)
labels = spark.read.format("avro").load(labelsPath)

In [7]:
# Join the features with the labels for training.
# Every row present in `labels` marks its person_id as a positive example
# (label = 1); feature rows without a match get label = 0 via fillna.
# The label side is de-duplicated on person_id first: with a left join, a
# person_id appearing more than once in `labels` would otherwise duplicate
# that person's feature row and skew both training and the FP/TP counts.
data = (
    features
    .join(
        labels.dropDuplicates(["person_id"]).withColumn("label", lit(1)),
        on="person_id",
        how="left",
    )
    .fillna({"label": 0})
)

In [10]:
# Split the data into train and test (50% / 50%).
# A fixed seed makes the split repeatable across runs; without it every
# execution trains and evaluates on a different random partition.
# NOTE(review): repeatability also assumes the upstream DataFrame is produced
# with the same partitioning on each run - confirm if exact reproducibility matters.
SPLIT_SEED = 42
trainData, testData = data.randomSplit([0.5, 0.5], seed=SPLIT_SEED)

In [11]:
# Configure a Bernoulli Naive Bayes estimator that reads its input vector from
# the "featuresBinomial" column and its target from the "label" column.
clf = NaiveBayes(
    modelType="bernoulli",
    featuresCol="featuresBinomial",
    labelCol="label",
)

# Fit on the training half of the split; the result is the trained model.
mlModel = clf.fit(trainData)

In [None]:
# Compute the predictions and measure false positives and true positives.
predictions = mlModel.transform(testData)

# Count all positive predictions in a single Spark job, grouped by the true
# label, instead of running two separate count() actions that would each
# re-execute the whole (lazy) join/split/transform pipeline.
positiveCounts = {
    # Item access is required here: `row.count` would resolve to the tuple
    # method rather than the aggregated "count" column.
    row["label"]: row["count"]
    for row in predictions.where(col("prediction") == 1.0)
                          .groupBy("label")
                          .count()
                          .collect()
}

# A label with no positive predictions is absent from the result, hence the 0 default.
fp = positiveCounts.get(0, 0)  # predicted 1, actual label 0
tp = positiveCounts.get(1, 0)  # predicted 1, actual label 1
print("FP = {} - TP ={}".format(fp, tp))

In [None]:
# Print the column names and types of the DataFrame returned by
# mlModel.transform(testData), e.g. to check the `prediction` column that the
# FP/TP computation filters on.
predictions.printSchema()