In [1]:
from pyspark.sql import SparkSession

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import IndexToString
from pyspark.ml import Pipeline

# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession.builder
    .appName("Python Spark Logistic")
    .getOrCreate()
)

In [4]:
# Helper functions
def get_dummy(df, categoricalCols, continuousCols, labelCol):
    """Turn a raw DataFrame into a two-column (features, label) DataFrame.

    Each categorical column is string-indexed and then one-hot encoded; the
    encoded columns and the continuous columns are assembled, in that order,
    into a single vector column named ``features``. The pipeline is fitted on
    ``df`` and applied to that same ``df``.

    Args:
        df: input Spark DataFrame containing every referenced column.
        categoricalCols: list of column names to index and one-hot encode.
        continuousCols: list of column names appended to the vector as-is.
        labelCol: name of the column copied into the output ``label`` column.

    Returns:
        A DataFrame with exactly two columns: ``features`` and ``label``.
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.sql.functions import col

    # One StringIndexer per categorical column: "<name>" -> "<name>_indexed".
    index_stages = []
    for name in categoricalCols:
        index_stages.append(
            StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name)))

    # One-hot encode every indexed column: "<name>_indexed" -> "<name>_indexed_encoded".
    onehot_stages = []
    for stage in index_stages:
        indexed_name = stage.getOutputCol()
        onehot_stages.append(
            OneHotEncoder(inputCol=indexed_name,
                          outputCol="{0}_encoded".format(indexed_name)))

    # Gather the encoded columns followed by the continuous ones into one vector column.
    vector_inputs = [stage.getOutputCol() for stage in onehot_stages] + continuousCols
    to_vector = VectorAssembler(inputCols=vector_inputs, outputCol="features")

    all_stages = index_stages + onehot_stages + [to_vector]
    fitted = Pipeline(stages=all_stages).fit(df)

    transformed = fitted.transform(df)
    labelled = transformed.withColumn('label', col(labelCol))
    return labelled.select('features', 'label')

In [5]:
# Load the bank-marketing CSV: the first row holds the column names and the
# column types are inferred from the data. The built-in CSV reader replaces the
# legacy 'com.databricks.spark.csv' source name, and `header` is given once.
df = spark.read.csv("data/banks.csv", header=True, inferSchema=True)

# Preview the first rows without the day, month and poutcome columns.
# drop() returns a new DataFrame, so `df` itself keeps every column.
df.drop('day','month','poutcome').show(5)

+---+----------+-------+---------+-------+-------+-------+----+-------+--------+--------+-----+--------+-------+
|age|       job|marital|education|default|balance|housing|loan|contact|duration|campaign|pdays|previous|deposit|
+---+----------+-------+---------+-------+-------+-------+----+-------+--------+--------+-----+--------+-------+
| 59|    admin.|married|secondary|     no|   2343|    yes|  no|unknown|    1042|       1|   -1|       0|    yes|
| 56|    admin.|married|secondary|     no|     45|     no|  no|unknown|    1467|       1|   -1|       0|    yes|
| 41|technician|married|secondary|     no|   1270|    yes|  no|unknown|    1389|       1|   -1|       0|    yes|
| 55|  services|married|secondary|     no|   2476|    yes|  no|unknown|     579|       1|   -1|       0|    yes|
| 54|    admin.|married| tertiary|     no|    184|     no|  no|unknown|     673|       2|   -1|       0|    yes|
+---+----------+-------+---------+-------+-------+-------+----+-------+--------+--------+-----+-

In [9]:
# Rename the target column: deposit -> y.
# withColumnRenamed returns a new DataFrame and does nothing when the source
# column is absent, so re-running this cell is harmless.
df = df.withColumnRenamed('deposit', 'y')

In [10]:
# Categorical columns: get_dummy string-indexes and one-hot encodes these.
catcols = [
    'job', 'marital', 'education', 'default',
    'housing', 'loan', 'contact', 'poutcome',
]

# Numeric columns: passed to the vector assembler as they are.
num_cols = ['balance', 'duration', 'campaign', 'pdays', 'previous']

# Name of the target column in `df`.
labelCol = 'y'

data = get_dummy(df, categoricalCols=catcols, continuousCols=num_cols, labelCol=labelCol)
data.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(29,[3,11,13,16,1...|  yes|
|(29,[3,11,13,16,1...|  yes|
|(29,[2,11,13,16,1...|  yes|
|(29,[4,11,13,16,1...|  yes|
|(29,[3,11,14,16,1...|  yes|
+--------------------+-----+
only showing top 5 rows



In [12]:
# Fit a StringIndexer on the string `label` column: every distinct label is
# mapped to a numeric index written to a new `indexedLabel` column (with the
# default ordering the most frequent label gets index 0.0).
# fit() returns a StringIndexerModel; its `.labels` list is used later to turn
# numeric predictions back into the original strings.
labelIndexer = StringIndexer(inputCol='label', outputCol='indexedLabel').fit(data)
labelIndexer.transform(data).show(5, True)

+--------------------+-----+------------+
|            features|label|indexedLabel|
+--------------------+-----+------------+
|(29,[3,11,13,16,1...|  yes|         1.0|
|(29,[3,11,13,16,1...|  yes|         1.0|
|(29,[2,11,13,16,1...|  yes|         1.0|
|(29,[4,11,13,16,1...|  yes|         1.0|
|(29,[3,11,14,16,1...|  yes|         1.0|
+--------------------+-----+------------+
only showing top 5 rows



In [15]:
# Fit a VectorIndexer on the assembled `features` vector. It decides which
# vector slots are categorical (at most `maxCategories` distinct values),
# re-indexes those, and writes the result to `indexedFeatures`; slots with more
# distinct values are treated as continuous and left as they are.
# This step works on the feature vector only; it does not read the label.
featureIndexer = VectorIndexer(inputCol='features',
                               outputCol='indexedFeatures',
                               maxCategories=4).fit(data)
featureIndexer.transform(data).show(5, True)

+--------------------+-----+--------------------+
|            features|label|     indexedFeatures|
+--------------------+-----+--------------------+
|(29,[3,11,13,16,1...|  yes|(29,[3,11,13,16,1...|
|(29,[3,11,13,16,1...|  yes|(29,[3,11,13,16,1...|
|(29,[2,11,13,16,1...|  yes|(29,[2,11,13,16,1...|
|(29,[4,11,13,16,1...|  yes|(29,[4,11,13,16,1...|
|(29,[3,11,14,16,1...|  yes|(29,[3,11,14,16,1...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [16]:
# train / test split: 70% / 30%.
# randomSplit assigns rows at random, so the seed is fixed to keep the split
# repeatable between runs.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

trainingData.show(5,False)
testData.show(5,False)

+-----------------------------------------------------------------------------------------------+-----+
|features                                                                                       |label|
+-----------------------------------------------------------------------------------------------+-----+
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,37.0,84.0,11.0,-1.0])|no   |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,80.0,155.0,3.0,-1.0])|no   |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,99.0,15.0,5.0,-1.0]) |no   |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,189.0,90.0,2.0,-1.0])|no   |
|(29,[0,11,13,16,17,18,19,21,24,25,26,27],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,333.0,80.0,6.0,-1.0])|no   |
+-----------------------------------------------------------------------------------------------+-----+
only showing top 5 rows

+--------------------------------------

In [66]:
# Classifiers
# Choose which classifier to use
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import NaiveBayes

In [67]:
# Candidate classifiers. Each one reads the `indexedFeatures` vector and the
# numeric `indexedLabel` column produced by the indexers defined earlier.
logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')
dTree = DecisionTreeClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
# NOTE(review): NaiveBayes' default (multinomial) model requires non-negative
# feature values, while this dataset contains negatives (e.g. pdays = -1) -
# rescale the features or pick another model type before fitting `nb`.
nb = NaiveBayes(featuresCol='indexedFeatures', labelCol='indexedLabel')

In [73]:
# !
# There is no XGBoost in Spark
# use this instead (http://docs.h2o.ai/sparkling-water/2.1/latest-stable/doc/pysparkling.html)
# or https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_tutorial.html

In [68]:
# Assemble the pipeline.
# IndexToString maps the numeric `prediction` column back to the original
# label strings, using the labels learned by labelIndexer.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedLabel",
                               labels=labelIndexer.labels)
# The classifier must sit between feature indexing and label conversion:
# without it nothing produces the `prediction` column labelConverter reads.
# `dTree` or `nb` can be used here instead, but the training-summary cell
# further down expects a logistic regression model at stage index 2.
pipeline = Pipeline(stages=[labelIndexer,
                            featureIndexer,
                            logr,
                            labelConverter])
model = pipeline.fit(trainingData)

In [69]:
# Run the fitted pipeline on the test set and preview actual vs. predicted labels.
predictions = model.transform(testData)
predictions.select(["features", "label", "predictedLabel"]).show(n=5)

+--------------------+-----+--------------+
|            features|label|predictedLabel|
+--------------------+-----+--------------+
|(29,[0,11,13,16,1...|  yes|           yes|
|(29,[0,11,13,16,1...|   no|            no|
|(29,[0,11,13,16,1...|  yes|           yes|
|(29,[0,11,13,16,1...|  yes|           yes|
|(29,[0,11,13,16,1...|   no|           yes|
+--------------------+-----+--------------+
only showing top 5 rows



In [70]:
# Evaluate model quality on the test-set predictions.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# The evaluator needs a numeric label, so `prediction` is compared against
# `indexedLabel` (the raw `label` column holds the strings yes/no).
# The metric is set to accuracy so that "1 - metric" below really is the error
# rate; the evaluator's default metric is f1.
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.209125


In [72]:
# Take the stage at index 2 of the fitted pipeline as the trained classifier
# and read its summary object.
lrModel = model.stages[2]
trainingSummary = lrModel.summary

# ROC curve points and the area under the curve
trainingSummary.roc.show(5)
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# F-measure
# Which other metric would be useful here, and why?
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = (
    fMeasure
    .groupBy()
    .max('F-Measure')
    .select('max(F-Measure)')
    .head(5)
)

+--------------------+-------------------+
|                 FPR|                TPR|
+--------------------+-------------------+
|                 0.0|                0.0|
|0.001418439716312...|0.01926721415034744|
|0.003404255319148936|0.03790271636133923|
|0.004539007092198...| 0.0574857864813645|
|0.005673758865248227|0.07706885660138976|
+--------------------+-------------------+
only showing top 5 rows

areaUnderROC: 0.8875113685747952


In [65]:
# explore locally
# %pyspark
# https://github.com/jupyter-incubator/sparkmagic