In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [2]:
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

spark = SparkSession \
    .builder \
    .getOrCreate()

### Reading the Dataset

In [3]:
df = (spark.read
          .format("csv")
          .option('header', 'true')
          .load("./datasets/uci-diabetes.csv"))

In [4]:
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Normalizer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

### Selecting Features from Dataset and casting it into required Dtype

In [5]:
columns  = df.columns

dataset = df.select(col(columns[1]).cast('float'),
                    col(columns[2]).cast('float'),
                    col(columns[3]).cast('string'),
                    col(columns[4]).cast('float'),
                   )

### Feature Transformation

In [6]:
indexer = StringIndexer(inputCol="code", outputCol="codeIndex")
encoder = OneHotEncoder(inputCol="codeIndex", outputCol="codeVec")
vectorAssembler = VectorAssembler(inputCols=['value','codeVec'],
                                  outputCol="features")
#normalizer = Normalizer(inputCol="features", outputCol="features_norm", p=1.0)
scaler = StandardScaler(inputCol="features", outputCol="features_norm")
pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, scaler])
final_df = pipeline.fit(dataset).transform(dataset)

### Splitting the Dataset into training data and test data

In [7]:
(training_data, test_data) = final_df.randomSplit([0.8,0.2])

### Building model

In [8]:
rfc = RandomForestClassifier()
param_grid = (ParamGridBuilder()
              .baseOn({rfc.labelCol : 'Class'})
              .baseOn([rfc.predictionCol, 'pred_rfc'])
              .baseOn([rfc.featuresCol, 'features_norm'])
              .addGrid(rfc.maxDepth, [2,3,5,7,10])
              .addGrid(rfc.numTrees, [20,40,60,100]).build())

evaluator = BinaryClassificationEvaluator(labelCol= 'Class',rawPredictionCol= 'pred_rfc')


cv = CrossValidator(estimator=rfc, estimatorParamMaps=param_grid, evaluator=evaluator,seed=0,parallelism=2)

### Training the Model

In [9]:
cvModel = cv.fit(training_data)

In [10]:
cvModel.avgMetrics

[0.9907995934538381,
 0.98932682254515,
 0.9896496412132924,
 0.9904135883706606,
 0.9946421483289132,
 0.99472240110399,
 0.9927426197917957,
 0.9957333813049786,
 0.9977731023353886,
 0.9990947528184775,
 0.9990608183789493,
 0.9991855197425894,
 0.9994780640569818,
 0.9995967075709027,
 0.9995967075709027,
 0.9996363995610589,
 0.9995177560471382,
 0.9996889414760861,
 0.9995570155807463,
 0.9996363995610589]

### Evaluating the Model Using Different Evaluation Metrics

In [11]:
evaluator.evaluate(cvModel.transform(test_data),{evaluator.metricName: 'areaUnderPR'})

0.9999265793613511

In [12]:
evaluator.evaluate(cvModel.transform(test_data),{evaluator.metricName: 'areaUnderROC'})

0.9998403575989783

### Best Params

In [13]:
cvModel.bestModel.getNumTrees

40

In [14]:
cvModel.bestModel.getMaxDepth()

10

### Saving the Trained Problem

In [15]:
path= r'./SavedModel/rfc_model'
cvModel.save(path)

### Loading the Trained Model

In [16]:
#rfc_model= CrossValidatorModel.read().path(path)

### Result Summary

In [1]:
import pandas as pd

summary_= {
    'areaUnderPR' : 0.9999265793613511,
    'areaUnderROC' : 0.9998403575989783
}
summary= pd.Series(summary_)
summary

areaUnderPR     0.999927
areaUnderROC    0.999840
dtype: float64