In [203]:
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS,SVMWithSGD,LogisticRegressionWithSGD, NaiveBayes
from pyspark.mllib.regression import LabeledPoint,LinearRegressionWithSGD,RidgeRegressionWithSGD,LassoWithSGD
from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
from pyspark.mllib.feature import StandardScaler,ChiSqSelector,Normalizer,PCA
from pyspark.mllib.evaluation import RegressionMetrics, BinaryClassificationMetrics
from pyspark.mllib.stat import Statistics, MultivariateStatisticalSummary
from pyspark.mllib.util import MLUtils
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import DecisionTreeRegressor, LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time

In [204]:
def importRawData(sc, trainingFile, validationFile):
    """
    Load the training and validation CSV files, drop unusable records and
    encode every remaining record as a flat numeric list.

    Lines that are empty or contain a '?' are discarded. The remaining lines
    have all spaces removed and are split on ','. The encoding is derived from
    the training data only (via makeTransformationFunction) and is applied to
    both data sets.

    :param sc: Spark Context
    :param trainingFile: path to the training data .csv file
    :param validationFile: path to the validation data .csv file
    :return: tuple of (training RDD, validation RDD); every element is the
             list produced by the transformation function (label first,
             encoded features after it)
    """
    def loadAndClean(path):
        # returns (number of raw lines, RDD of cleaned and split records)
        rawRdd = sc.textFile(path)
        rawCount = rawRdd.count()
        # cached because the cleaned records are scanned many times below
        # (record count, per-column statistics, final encoding); without it
        # every one of those actions re-reads and re-parses the file
        cleanRdd = rawRdd.filter(lambda line: '?' not in line and line != "") \
                         .map(lambda line: str(line).replace(" ", "").split(",")) \
                         .cache()
        return rawCount, cleanRdd

    tCount, tRdd = loadAndClean(trainingFile)
    vCount, vRdd = loadAndClean(validationFile)
    print("Filtering Invaild records...")
    print("Training data Input: %s records, %s remain" % (tCount, tRdd.count()))
    print("Validation data Input: %s records, %s remain" % (vCount, vRdd.count()))
    # the encoder is fitted on the training records only and reused as-is
    # for the validation records
    transformationFunction = makeTransformationFunction(tRdd)
    tRdd = tRdd.map(transformationFunction)
    vRdd = vRdd.map(transformationFunction)
    return (tRdd, vRdd)

def makeTransformationFunction(rdd):
    """
    Build a per-record encoder from the given data set.

    The last field of every record is the label; all other fields are
    features. A feature column is treated as numeric when its value in the
    first record consists only of digits, otherwise as categorical:
      * numeric: encoded as the integer bin index (value - min) // binSize,
        where binSize is one tenth of the value range seen in rdd (at least 1)
      * categorical: one-hot encoded over the distinct values seen in rdd;
        a value that was not seen yields all zeros

    :param rdd: RDD of records, each a list of string fields with the label last
    :return: function mapping one record to [label, encoded features...],
             where label is 1 if the last field starts with '>50K', else 0
    """
    # fetched once instead of once (or twice) per column
    firstRow = rdd.first()
    fCount = len(firstRow) - 1
    indic = []
    for i in range(fCount):
        if firstRow[i].isdigit():
            # col=i pins the column index to this iteration
            temp = rdd.map(lambda a, col=i: int(a[col]))
            binMin = temp.min()
            # explicit floor division keeps the bins integral; a column whose
            # range is below 10 would otherwise get a bin size of 0 and make
            # the encoder divide by zero, so the size is clamped to 1
            binSize = max(1, (temp.max() - binMin) // 10)
            indic.append((False, [binMin, binSize]))
        else:
            distinct = rdd.map(lambda a, col=i: a[col]).distinct().collect()
            indic.append((True, distinct))

    def transformationFunction(array):
        result = [1 if array[-1].startswith('>50K') else 0]
        for i, f in enumerate(array[:-1]):
            isCategorical, info = indic[i]
            if isCategorical:
                result.extend(1 if f == known else 0 for known in info)
            else:
                binMin, binSize = info
                result.append((int(f) - binMin) // binSize)
        return result
    return transformationFunction


def featureEngineering(trainingData,validationData, zNorm = True, l2Norm = True, categorical = True, topFeature = 20):
    """
    Split every record into label and features, optionally keep only the best
    features by chi square selection, and wrap the result as LabeledPoints.

    :param trainingData: RDD of records; item 0 is the label, the rest are features
    :param validationData: RDD of records laid out like trainingData
    :param zNorm: accepted but not used by this function
    :param l2Norm: accepted but not used by this function
    :param categorical: boolean of whether to perform chi square feature selection or not
    :param topFeature: number of features kept when chi square selection is performed
    :return: tuple of (RDD[LabeledPoint] trainingData, RDD[LabeledPoint] validationData), both cached
    """
    def toLabeledPoints(labels, features):
        # pair each label with its feature vector by position
        return labels.zip(features).map(lambda pair: LabeledPoint(pair[0], pair[1]))

    banner = "=============================================="
    print(banner)
    print("FEATURE ENGINEERING")
    print(banner)
    print("partitioning...")
    trainFeatures = trainingData.map(lambda row: row[1:])
    validFeatures = validationData.map(lambda row: row[1:])
    trainLabels = trainingData.map(lambda row: row[0])
    validLabels = validationData.map(lambda row: row[0])
    print("Feature Selection... ")
    if categorical:
        # the selector is fitted on the training set only and then applied to both sets
        chiSqModel = ChiSqSelector(topFeature).fit(toLabeledPoints(trainLabels, trainFeatures))
        trainFeatures = chiSqModel.transform(trainFeatures)
        validFeatures = chiSqModel.transform(validFeatures)
    selectedCount = len(trainFeatures.first())
    print("Selected %s Features: " % selectedCount)

    trainPoints = toLabeledPoints(trainLabels, trainFeatures).cache()
    validPoints = toLabeledPoints(validLabels, validFeatures).cache()
    return (trainPoints, validPoints)

def selectClassificationModel(sc, trainingData, validationData):
    """
    Train and report every candidate classification model, linear models
    first and tree based models after them.

    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :return: None
    """
    banner = "=============================================="
    print(banner)
    print("CLASSIFICATION")
    print(banner)
    # models trained through modelClass.train(...)
    linearModels = [
        (SVMWithSGD, {"intercept":True, "regType":None}),
        (SVMWithSGD, {"intercept":True, "regType":"l1"}),
        (SVMWithSGD, {"intercept":True, "regType":"l2"}),
        (LogisticRegressionWithLBFGS, {"intercept":True, "regType":"l1"}),
        (LogisticRegressionWithLBFGS, {"intercept":True, "regType":"l2"}),
        (LogisticRegressionWithSGD, {"intercept":True, "regType":None}),
        (LogisticRegressionWithSGD, {"intercept":True, "regType":"l1"}),
        (LogisticRegressionWithSGD, {"intercept":True, "regType":"l2"}),
    ]
    # models trained through modelClass.trainClassifier(...);
    # GBT is left out on purpose: way too slow for this dataset
    treeModels = [
        (DecisionTree, {"numClasses":2,"categoricalFeaturesInfo":{},"minInstancesPerNode":100 ,"impurity":"gini"}),
        (DecisionTree, {"numClasses":2,"categoricalFeaturesInfo":{},"minInstancesPerNode":100 , "impurity":"entropy"}),
        (RandomForest, {"numClasses":2,"categoricalFeaturesInfo":{},"numTrees":20, "impurity":"gini"}),
        (RandomForest, {"numClasses":2,"categoricalFeaturesInfo":{},"numTrees":20, "impurity":"entropy"}),
    ]
    schedule = (
        (trainClassificationModel, linearModels),
        (trainClassificationTreeModel, treeModels),
    )
    for trainer, candidates in schedule:
        for modelClass, kwargs in candidates:
            trainer(sc, trainingData, validationData, modelClass, **kwargs)

def trainClassificationModel(sc, trainingData, validationData, modelClass, **kwargs):
    """
    Train one non-tree classification model and print its validation results.

    The model's threshold is cleared so that predict() returns raw scores.
    The reported error is the lowest misclassification rate found over the
    cut-offs 0.1, 0.2, ..., 1.0 applied to those scores; the areas under the
    precision-recall and ROC curves are reported next to it.

    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :modelClass: model CLASS that use to train (must offer train())
    :kwargs: key-value paired arguments for modelClass.train, passed in directly
    :return: None
    """
    print("Classification Model: %s %s" % (modelClass.__name__, kwargs))
    startTime = time.time()
    model = modelClass.train(trainingData, **kwargs)
    model.clearThreshold()
    # cached: the (score, label) pairs are scanned by the metrics, by the
    # record count and by each of the ten threshold passes below
    validationsResult = validationData.map(lambda lp:(float(model.predict(lp.features)), lp.label)).cache()
    metric = BinaryClassificationMetrics(validationsResult)
    # the record count does not depend on the threshold, so compute it once
    totalCount = float(validationsResult.count())
    # the error rate search is to search for overall best error rate
    # regardless of precision and recall, however they could be evaluate by PR area and ROC area
    errors = []
    for i in range(1, 11):
        threshold = i/10.0
        errCount = validationsResult.filter(
            lambda scoreAndLabel, cut=threshold: (1 if scoreAndLabel[0] > cut else 0) != scoreAndLabel[1]
        ).count()
        errors.append((errCount / totalCount, threshold))
    # min() returns the first of equally low errors, i.e. the lowest threshold
    bestError = min(errors, key=lambda t: t[0])
    print("[ Error: %.4f\t\tPrecision-recall: %.4f\tROC: %.4f ] - %s sec"
          % (bestError[0], metric.areaUnderPR, metric.areaUnderROC, (time.time()-startTime)))
    validationsResult.unpersist()
        
def trainClassificationTreeModel(sc, trainingData, validationData, modelClass, **kwargs):
    """
    Train one tree based classification model and print its validation
    error rate (share of validation records whose predicted label differs
    from the true label).

    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :modelClass: model CLASS that use to train (must offer trainClassifier())
    :kwargs: key-value paired arguments for modelClass.trainClassifier, passed in directly
    :return: None
    """
    print("Classification Model: %s %s" % (modelClass.__name__, kwargs))
    startTime = time.time()
    model = modelClass.trainClassifier(trainingData, **kwargs)
    validationFeatures = validationData.map(lambda lp:lp.features)
    # !!!beware, due to some strange bug, DO NOT chain RDD transformation on tree model predict, count() immediately!!!
    validationsResult = model.predict(validationFeatures)
    totalCount = validationsResult.count()
    # pair every prediction with its true label by position
    validationsResult = validationsResult.zip(validationData.map(lambda lp:lp.label))
    errCount = validationsResult.filter(lambda predictAndLabel: predictAndLabel[0] != predictAndLabel[1]).count()
    err = float(errCount) / totalCount
    print("[ Error: %.4f ] - %s sec" % (err, (time.time()-startTime)))

In [205]:
def selectClassificationPipelineModel(sqlContext, trainingData, validationData):
    """
    Run the cross validated evaluation for every candidate pyspark.ml
    classification estimator.

    :param sqlContext: spark sql context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :return: None
    """
    banner = "=============================================="
    print(banner)
    print("CLASSIFICATION WITH PIPELINE CV")
    print(banner)
    # each entry: (estimator instance, parameter settings handed to the trainer)
    candidates = (
        (LogisticRegression(), {"fitIntercept":True, "regParam":[0.1,0.01,0.001]}),
    )
    for estimator, paramSettings in candidates:
        trainClassificationPipelineModel(sqlContext, trainingData, validationData, estimator, paramSettings)
    
def trainClassificationPipelineModel(sqlContext, trainingData, validationData, modelObject, kwargs):
    """
    Tune a pyspark.ml classifier with 5-fold cross validation on the training
    data, then print the areas under the precision-recall and ROC curves
    obtained on the validation data.

    :param sqlContext: spark sql context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :modelObject: model OBJECT (estimator instance) that is cross validated
    :kwargs: dict describing the parameter grid. A key given as a parameter
             name is looked up on modelObject; its value is searched over
             when it is a list/tuple and held fixed otherwise. A key that
             already is a Param object is held fixed at its value.
    :return: None
    """
    from pyspark.ml.param import Param

    print("Classification Model: %s %s" % (modelObject.__class__.__name__, kwargs))
    startTime = time.time()
    trainingDF = sqlContext.createDataFrame(trainingData).cache()
    validationDF = sqlContext.createDataFrame(validationData).cache()
    # The grid has to be keyed by Param objects. Handing the name-keyed dict
    # straight to baseOn() stored the names as keys and the whole candidate
    # list as one fixed value, so no grid was actually searched.
    gridBuilder = ParamGridBuilder()
    for key, value in kwargs.items():
        if isinstance(key, Param):
            gridBuilder.addGrid(key, [value])
        elif isinstance(value, (list, tuple)):
            gridBuilder.addGrid(modelObject.getParam(key), list(value))
        else:
            gridBuilder.addGrid(modelObject.getParam(key), [value])
    grid = gridBuilder.build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=modelObject, estimatorParamMaps=grid, evaluator=evaluator, numFolds = 5)
    cvModel = cv.fit(trainingDF)
    validationResult = cvModel.transform(validationDF)
    areaUnderROC = evaluator.evaluate(validationResult, {evaluator.metricName: "areaUnderROC"})
    areaUnderPR = evaluator.evaluate(validationResult, {evaluator.metricName: "areaUnderPR"})
    print("[ Precision-recall: %.4f\tROC: %.4f ] - %s sec"
          % (areaUnderPR, areaUnderROC, (time.time()-startTime)))

In [206]:
if __name__=="__main__":
    # input files: training set and held-out test set used for validation
    filePath1 = "file:///ipython/adult_data.csv"
    filePath2 = "file:///ipython/adult.test.csv"
    sc = SparkContext(appName="MainContext")
    try:
        # created inside the try so that the context is also stopped when
        # setting up the SQL context fails
        sqlContext = SQLContext(sc)
        trainingData, validationData = importRawData(sc, filePath1,filePath2)
        trainingData, validationData = featureEngineering(trainingData,validationData)
        selectClassificationModel(sc, trainingData, validationData)
        selectClassificationPipelineModel(sqlContext, trainingData, validationData)
    finally:
        # errors propagate unchanged; the context is released either way
        sc.stop()

Filtering Invaild records...
Training data Input: 32562 records, 30162 remain
Validation data Input: 16282 records, 15060 remain
FEATURE ENGINEERING
partitioning...
Feature Selection... 
Selected 20 Features: 
CLASSIFICATION
Classification Model: SVMWithSGD {'regType': None, 'intercept': True}
[ Error: 0.2156		Precision-recall: 0.5730	ROC: 0.8046 ] - 5.46849608421 sec
Classification Model: SVMWithSGD {'regType': 'l1', 'intercept': True}
[ Error: 0.2064		Precision-recall: 0.5810	ROC: 0.8109 ] - 2.77928996086 sec
Classification Model: SVMWithSGD {'regType': 'l2', 'intercept': True}
[ Error: 0.2276		Precision-recall: 0.5588	ROC: 0.7971 ] - 2.861068964 sec
Classification Model: LogisticRegressionWithLBFGS {'regType': 'l1', 'intercept': True}
[ Error: 0.1695		Precision-recall: 0.7033	ROC: 0.8760 ] - 2.29076313972 sec
Classification Model: LogisticRegressionWithLBFGS {'regType': 'l2', 'intercept': True}
[ Error: 0.1663		Precision-recall: 0.7118	ROC: 0.8817 ] - 2.2789170742 sec
Classification

In [207]:
# Manual cleanup cell: stops the SparkContext `sc` created by the main cell,
# whose `finally` block already calls sc.stop() as well.
sc.stop()

In [None]:
# Scratch cell (never executed): bare `pwd` relies on IPython automagic to
# show the kernel's current working directory.
pwd