In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

In [None]:
!pip install -q findspark

In [None]:
import os

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [None]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.sql("select 'spark' as hello ")
df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [None]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 72kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 35.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=535e9c3ecea2e69d4b828d204ecc8d744fef92eb224d8b52358343b57f80af7b
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [None]:
from pyspark.ml.feature import VectorAssembler
import numpy as np
import scipy
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.ml.linalg import SparseVector

#read input and show
df = spark.read.options(delimiter=',', header=True).csv('/content/Absenteeism_at_work.csv')
df = df.withColumn("MOA", df["Month of absence"] - 0).withColumn("label", df['Seasons'] - 0).withColumn("ROA", df["Reason for absence"] - 0).\
    withColumn("distance", df["Distance from Residence to Work"] - 0).withColumn("BMI", df["Body mass index"] - 0)
df.show(5)

+---+------------------+----------------+---------------+-------+----------------------+-------------------------------+------------+---+----------------------+----------+--------------------+---------+---+--------------+-------------+---+------+------+---------------+-------------------------+---+-----+----+--------+----+
| ID|Reason for absence|Month of absence|Day of the week|Seasons|Transportation expense|Distance from Residence to Work|Service time|Age|Work load Average/day |Hit target|Disciplinary_failure|Education|Son|Social drinker|Social smoker|Pet|Weight|Height|Body mass index|Absenteeism_time_in_hours|MOA|label| ROA|distance| BMI|
+---+------------------+----------------+---------------+-------+----------------------+-------------------------------+------------+---+----------------------+----------+--------------------+---------+---+--------------+-------------+---+------+------+---------------+-------------------------+---+-----+----+--------+----+
| 11|                26| 

In [None]:
#combine column label and distance to new column name features
#https://spark.apache.org/docs/latest/ml-features#vectorindexer
merge_col = VectorAssembler(inputCols=["label", "MOA"], outputCol='features')
df = merge_col.transform(df)
df.select("features").show(5)

+---------+
| features|
+---------+
|[1.0,7.0]|
|[1.0,7.0]|
|[1.0,7.0]|
|[1.0,7.0]|
|[1.0,7.0]|
+---------+
only showing top 5 rows



In [None]:
#split 70, 30
(trainingData, testData) = df.randomSplit([0.7, 0.3], 1000)
#make the prediction
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
#train the model
model = nb.fit(trainingData)
#select example rows to display
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show(5)
#compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Naive Bayes - Test Accuracy = %g" % (accuracy))
print("Naive Bayes - Test Error = %g" % (1.0 - accuracy))

+----------+-----+----------+
|prediction|label|  features|
+----------+-----+----------+
|       1.0|  2.0| [2.0,1.0]|
|       3.0|  4.0|[4.0,11.0]|
|       0.0|  1.0| [1.0,8.0]|
|       3.0|  4.0|[4.0,12.0]|
|       0.0|  1.0| [1.0,8.0]|
+----------+-----+----------+
only showing top 5 rows

Naive Bayes - Test Accuracy = 0.0612245
Naive Bayes - Test Error = 0.938776


In [None]:
#save model
model.save('/content/myModel')