In [1]:
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark.sql.functions import *

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer , VectorAssembler
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [3]:
# File location and type
file_location = "/FileStore/tables/acme_v1__3_-fec8c.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "True"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df.limit(5))

_c0,x1_common,x2_common,label,pii,AEF4WWEIRL,W1GZOJ08EO,BZEZO9N92C,R07UHWAHJS,2WRC17IY73,A9HHX76KTM,Z32DK0RBYO,BD6MYSRUED,ZGJTWQKEHJ
0,12.381735341663855,11.85775926993845,1,BAMU28882CEU8Q7ANC6T,7.235086904497715,2.0549818241379096,0.0917409959731845,-5.104800492412162,-3.689885558537489,7.980177705739071,3.954580146867475,-1.0724734835904155,1.0460186497055737
1,10.17998359035289,14.548865558560244,1,U913B15ZY31OIK283WWG,2.9670936547714244,0.6424813545656485,6.23959455146703,3.1678980061994397,7.62650952867228,6.81490388225262,-0.4883851575679546,2.533829122641387,-0.5404640891120633
2,9.407502925242463,26.96852137618436,1,X57P6F402BBQWF8DRR9O,4.242884025687742,5.37467628662406,3.2201881815041924,0.0843882839563772,8.789391043291062,7.316668274896622,2.9268591734990275,0.8900107280250427,-0.3209941729398013
3,9.760448504193729,21.80953754127941,1,3RD2ME3R1KEEI4R9OA8V,4.931865594516976,3.246671182409486,-0.4106572597630933,-3.249715733753884,7.196447704669756,1.2410646993677652,-0.9292499917143175,5.922778931079563,5.832253431953291
4,9.993611320573129,4.516766363045848,1,A3KI5I8BW0U40QZL0QDL,-0.0351122521765461,-2.3243470907169046,7.341788030228343,6.751744277755482,5.320464403145406,-0.4476127485691914,-0.9556982909101316,6.232263764815457,0.1872486595670639


In [4]:
df.printSchema()

In [5]:
df_str_int = df.select(
col('label').cast(IntegerType()),
col('_c0').cast(IntegerType()),
col('x1_common').cast(IntegerType()),
col('x2_common').cast(IntegerType()),
# col('pii').cast(IntegerType()),
col('AEF4WWEIRL').cast(IntegerType()),
col('W1GZOJ08EO').cast(IntegerType()),
col('BZEZO9N92C').cast(IntegerType()),
col('R07UHWAHJS').cast(IntegerType()),
col('2WRC17IY73').cast(IntegerType()),
col('A9HHX76KTM').cast(IntegerType()),
col('Z32DK0RBYO').cast(IntegerType()),
col('BD6MYSRUED').cast(IntegerType()),
col('ZGJTWQKEHJ').cast(IntegerType()),
)


In [6]:
display(df_str_int.select([count(when(col(c).isNull(), c)).alias(c) for c in df_str_int.columns]))

label,_c0,x1_common,x2_common,AEF4WWEIRL,W1GZOJ08EO,BZEZO9N92C,R07UHWAHJS,2WRC17IY73,A9HHX76KTM,Z32DK0RBYO,BD6MYSRUED,ZGJTWQKEHJ
0,0,0,0,0,0,1,0,0,0,1,1,0


In [7]:
df_no_null = df_str_int.fillna(0)
display(df_no_null.select([count(when(col(c).isNull(), c)).alias(c) for c in df_no_null.columns]))

label,_c0,x1_common,x2_common,AEF4WWEIRL,W1GZOJ08EO,BZEZO9N92C,R07UHWAHJS,2WRC17IY73,A9HHX76KTM,Z32DK0RBYO,BD6MYSRUED,ZGJTWQKEHJ
0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
features = df_no_null.drop('label').columns
va = VectorAssembler(inputCols=features, outputCol="features")
dataset_assembled = va.transform(df_no_null)


In [9]:
(train , test) = dataset_assembled.randomSplit([0.6, 0.4])

In [10]:
logistic_regression = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)
model = logistic_regression.fit(train)
predictions = model.transform(test)

In [11]:
confusion_matrix = predictions.groupBy('label').pivot('prediction').count()
display(confusion_matrix)

label,0.0,1.0
1,109,1071
0,3978,80


In [12]:
TP = 1071
TN = 3978
FP = 80
FN = 109

cm = confusion_matrix.collect()
print("Test Accuracy: {0}".format((cm[0][2]+train_results[1][1])/train.count()))

In [13]:
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

In [14]:
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
evaluator.evaluate(predictions)