In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

In [3]:
df = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("./datasets/uci-diabetes.csv"))

In [4]:
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

In [5]:
columns  = df.columns

dataset = df.select(col(columns[1]).cast('float'),
                    col(columns[2]).cast('float'),
                    col(columns[3]).cast('string'),
                    col(columns[4]).cast('float'),
                   )

In [6]:
indexer = StringIndexer(inputCol="code", outputCol="codeIndex")
encoder = OneHotEncoder(inputCol="codeIndex", outputCol="codeVec")
vectorAssembler = VectorAssembler(inputCols=['value','codeVec'],
                                  outputCol="features")
#normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)
scaler = StandardScaler(inputCol="features", outputCol="features_norm")
pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler])
final_df = pipeline.fit(dataset).transform(dataset)

In [7]:
(training_data, test_data) = final_df.randomSplit([0.8,0.2])

In [8]:
svc = LinearSVC()
param_grid = (ParamGridBuilder()
              .baseOn({svc.labelCol : 'Class'})
              .baseOn([svc.predictionCol, 'pred_svc'])
              .baseOn([svc.featuresCol, 'features_norm'])
              .addGrid(svc.regParam, [0.01,0.1,0,1,10,20,50,100]).build())

evaluator1 = BinaryClassificationEvaluator(labelCol= 'Class',rawPredictionCol= 'pred_svc',metricName='areaUnderROC')
evaluator2 = BinaryClassificationEvaluator(labelCol= 'Class',rawPredictionCol= 'pred_svc',metricName='areaUnderPR')


cv1 = CrossValidator(estimator=svc, estimatorParamMaps=param_grid, evaluator=evaluator1,seed=0,parallelism=2)
cv2 = CrossValidator(estimator=svc, estimatorParamMaps=param_grid, evaluator=evaluator2,seed=0,parallelism=2)

In [9]:
cvModel1 = cv1.fit(training_data)

In [10]:
cvModel2 = cv2.fit(training_data)

In [11]:
evaluator1.evaluate(cvModel1.transform(training_data))

0.9998800575723652

In [12]:
evaluator2.evaluate(cvModel2.transform(training_data))

0.9999445237920661

In [13]:
evaluator1.evaluate(cvModel1.transform(test_data))

0.9998187749184487

In [14]:
evaluator2.evaluate(cvModel2.transform(test_data))

0.9996771068776235

In [16]:
cvModel2.save('./SavedModel/svc_model')

In [None]:
CrossV