In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

In [3]:
df = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("./datasets/uci-diabetes.csv"))

In [4]:
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

In [5]:
columns  = df.columns

dataset = df.select(col(columns[1]).cast('float'),
                    col(columns[2]).cast('float'),
                    col(columns[3]).cast('string'),
                    col(columns[4]).cast('float'),
                   )

In [6]:
indexer = StringIndexer(inputCol="code", outputCol="codeIndex")
encoder = OneHotEncoder(inputCol="codeIndex", outputCol="codeVec")
vectorAssembler = VectorAssembler(inputCols=['value','codeVec'],
                                  outputCol="features")
#normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)
scaler = StandardScaler(inputCol="features", outputCol="features_norm")
pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler])
final_df = pipeline.fit(dataset).transform(dataset)

In [7]:
(training_data, test_data) = final_df.randomSplit([0.8,0.2])

In [8]:
lr = LogisticRegression()
param_grid = (ParamGridBuilder()
              .baseOn({lr.labelCol : 'Class'})
              .baseOn([lr.predictionCol, 'pred_lr'])
              .baseOn([lr.featuresCol, 'features_norm'])
              .addGrid(lr.regParam, [0.01,0.1,0,1,10,20])
              .addGrid(lr.elasticNetParam, [0,0.25,0.75,1])
              .addGrid(lr.fitIntercept, [True,False])
              .addGrid(lr.maxIter, [10,100,1000])
              .addGrid(lr.aggregationDepth, [2,5,10]).build())

#evaluator1 = BinaryClassificationEvaluator(labelCol= 'Class',rawPredictionCol= 'pred_lr',metricName='areaUnderROC')
evaluator2 = BinaryClassificationEvaluator(labelCol= 'Class',rawPredictionCol= 'pred_lr',metricName='areaUnderPR')


#cv1 = CrossValidator(estimator=svc, estimatorParamMaps=param_grid, evaluator=evaluator1,seed=0,parallelism=2)
cv2 = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=evaluator2,seed=0,parallelism=4)

In [11]:
cvModel2 = cv2.fit(training_data)

In [12]:
evaluator2.evaluate(cvModel2.transform(training_data))

0.9999441289751798

In [13]:
evaluator2.evaluate(cvModel2.transform(test_data))

0.9999260119862776

In [15]:
cvModel2.save('./SavedModel/lr_model')

In [10]:
from pyspark.ml.tuning import CrossValidatorModel

model_path= r'.\model\lr_main_model_without_timestamp'
cvModelRead = CrossValidatorModel.read().load(model_path)

In [12]:
evaluator2.evaluate(cvModelRead.transform(test_data))

0.9998488797037863