In [15]:
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS,SVMWithSGD,LogisticRegressionWithSGD, NaiveBayes
from pyspark.mllib.regression import LabeledPoint,LinearRegressionWithSGD,RidgeRegressionWithSGD,LassoWithSGD
from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
from pyspark.mllib.feature import StandardScaler,ChiSqSelector,Normalizer,PCA
from pyspark.mllib.evaluation import RegressionMetrics, BinaryClassificationMetrics
from pyspark.mllib.stat import Statistics, MultivariateStatisticalSummary
from pyspark.mllib.util import MLUtils
from pyspark.ml import Pipeline
from pyspark.sql import SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import DecisionTreeRegressor, LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import time

In [32]:
def importRawData(sc, filePath1,filePath2):
    """
    :param sc: Spark Context
    :param filePath: path to data .csv file
    :return: RDD of (LabeledPoint, index)
    """
    rdd1 = sc.textFile(filePath1)
    rdd2 = sc.textFile(filePath2)
    # index was keep for this dataset, we need index to perform partition
    # spark DOES NOT gurantee tranformation and action execute order for RDD[Vector]
    Trdd = rdd1.map(lambda line: line.split(",")) \
                 .map(lambda array: [float(n) for n in array])
    Vrdd = rdd2.map(lambda line: line.split(",")) \
                 .map(lambda array: [float(n) for n in array]) 
            
    return (Trdd,Vrdd)



def featureEngineering(trainingData,validationData, corrSelection = None, zNorm = True, l2Norm = True, categorical = True, topFeature = 10):
    """
    this function is to provide feature engineering and transformation
    including partition, normalization, feature selection, dimension reduction
    :param rawData: RDD[Vector] of raw data
    :param corrSelection: local matrix of correliation matrix, if provided will perform CFS
    :param zNorm: boolean of whether to perform Z normalization or not
    :param l2Norm: boolean of whether to perform L2 normalization or not
    :param categorical: boolean of whether to perform chi square feature selection or not
    :param topFeature: select top features if using chi square selection
    :return: tuple of (RDD[LabeledPoint] trainingData, RDD[LabeledPoint] validationData)
    """
    print "=============================================="
    print "FEATURE ENGINEERING"
    print "=============================================="
    print "partitioning..."
    # beware this partitioning is HARD CODED!
    # because it was suggested by data set creator
    # first 463715 used as trainning, last 51630 used as validation
    tFeatures = trainingData.map(lambda a:a[:-1])
    vFeatures = validationData.map(lambda a:a[:-1])
    tLabel = trainingData.map(lambda a:a[-1])
    vLabel = validationData.map(lambda a:a[-1])
    
    print "Normalization and Scaling... "
    if(zNorm):
        zN = StandardScaler(withMean=True, withStd=True).fit(tFeatures)
        tFeatures = zN.transform(tFeatures)
        vFeatures = zN.transform(vFeatures)
    if(l2Norm):
        l2N = Normalizer()
        tFeatures = l2N.transform(tFeatures)
        vFeatures = l2N.transform(vFeatures)
    '''
    print "Feature Selection... "
    
    # only categorical value for classification problem could use chi square selector
    # otherwise, use correlation based selector instead
    if categorical or (corrSelection is None):
        
        selector = ChiSqSelector(topFeature).fit(tLabel.zip(tFeatures).map(lambda lp:LabeledPoint(lp[0], lp[1])))
        tFeatures = selector.transform(tFeatures)
        vFeatures = selector.transform(vFeatures)
    else:
        bestFeatures = cfs(corrSelection)
        tFeatures = tFeatures.map(lambda a:[a[n-1] for n in bestFeatures])
        vFeatures = vFeatures.map(lambda a:[a[n-1] for n in bestFeatures])
    featureCount = len(tFeatures.first())
    print "Selected %s Features: " % featureCount
    print "Dimension Reduction..."
    # PCA dimension reduction = EVD of cov matrix = SVD of data matrix
    # spark use EVD implement, hence we need to calculate eigen vector ourself to decide cutoff
    # cutoff point the reduced dimension could represent 90% of original total variance
    selector = PCA(len(bestFeatures)).fit(tFeatures.map(lambda a: LabeledPoint(1,a).features))
    temp = selector.transform(tFeatures)
    eigen = eigenvalues(temp)
    print "PCA Eigen Vector: %s" % eigen
    accVarPor = [sum(eigen[:i+1])/sum(eigen) for i in range(len(eigen))]
    print "Accumulate Variance Porpotion:"
    reduction = 0
    for i in range(len(accVarPor)):
        print "%s Dimensions:  %.2f%%" % (i+1, accVarPor[i]*100)
        if(reduction==0 and accVarPor[i]>0.9): reduction = i+1
    print "Reduce to %s dimensions..." % reduction
    selector = PCA(reduction).fit(tFeatures.map(lambda a: LabeledPoint(1,a).features))
    '''
    return (tLabel.zip(tFeatures).map(lambda lp:LabeledPoint(lp[0], lp[1])).cache(), \
            vLabel.zip(vFeatures).map(lambda lp:LabeledPoint(lp[0], lp[1])).cache())
'''
def eigenvalues(data):
    """
    helper function of calculate eigen vector of PCA, to determine how many dimension after reduction
    :param data: RDD[Vector] of data after PCA transformation
    :return: local list of eigen vector based on input data
    """
    # only eigen value is calculated, not the whole covariance because that would be too slow
    count = data.count()
    miu = data.reduce(lambda a,b:[a[i]+b[i] for i in range(len(a))])
    miu = map(lambda s:s/count, miu)
    eigen = data.map(lambda a:[(a[i]-miu[i])**2 for i in range(len(a))]) \
                .reduce(lambda a,b:[a[i]+b[i] for i in range(len(a))])
    eigen = map(lambda s:s/count, eigen)
    return eigen
'''
def selectClassificationModel(sc, trainingData, validationData):
    """
    wrapper function to evaluate and select all the classification models
    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :return: None
    """
    print "=============================================="
    print "CLASSIFICATION"
    print "=============================================="
    classificationModels = [
                                (SVMWithSGD, {"intercept":True, "regType":None}), 
                                (SVMWithSGD, {"intercept":True, "regType":"l1"}), 
                                (SVMWithSGD, {"intercept":True, "regType":"l2"}), 
                                (LogisticRegressionWithLBFGS, {"intercept":True, "regType":"l1"}), 
                                (LogisticRegressionWithLBFGS, {"intercept":True, "regType":"l2"}), 
                                (LogisticRegressionWithSGD, {"intercept":True, "regType":None}), 
                                (LogisticRegressionWithSGD, {"intercept":True, "regType":"l1"}), 
                                (LogisticRegressionWithSGD, {"intercept":True, "regType":"l2"})
                           ]
    for modelClass, kwargs in classificationModels:
        trainClassificationModel(sc, trainingData, validationData, modelClass, **kwargs)
    #GBT is waaaaaay too slow for this dataset
    classificationModels = [
                                (DecisionTree, {"numClasses":2,"categoricalFeaturesInfo":{},"minInstancesPerNode":100 ,"impurity":"gini"}), 
                                (DecisionTree, {"numClasses":2,"categoricalFeaturesInfo":{},"minInstancesPerNode":100 , "impurity":"entropy"}), 
                                (RandomForest, {"numClasses":2,"categoricalFeaturesInfo":{},"numTrees":20, "impurity":"gini"}), 
                                (RandomForest, {"numClasses":2,"categoricalFeaturesInfo":{},"numTrees":20, "impurity":"entropy"}) 
                           ]
    for modelClass, kwargs in classificationModels:
        trainClassificationTreeModel(sc, trainingData, validationData, modelClass, **kwargs)

def trainClassificationModel(sc, trainingData, validationData, modelClass, **kwargs):
    """
    train classification models for NOT-TREE based model
    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :modelClass: model CLASS that use to train
    :kwargs: key-value paired arguments for modelClass, would be passes in directly
    :return: None
    """
    print "Classification Model: %s %s" % (modelClass.__name__, kwargs)
    startTime = time.time()
    trainingData = trainingData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=2000 else LabeledPoint(0, lp.features))
    validationData = validationData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=2000 else LabeledPoint(0, lp.features))
    model = modelClass.train(trainingData, **kwargs)
    model.clearThreshold()
    validationsResult = validationData.map(lambda lp:(float(model.predict(lp.features)), lp.label))
    metric = BinaryClassificationMetrics(validationsResult)
    # the error rate search is to search for overall best error rate
    # regardless of precision and recall, however they could be evaluate by PR area and ROC area
    errors = []
    for i in range(1, 11):
        err = validationsResult.filter(lambda (predict,label):(1 if predict>i/10.0 else 0)!=label).count() \
                                            / float(validationsResult.count())
        errors.append((err, i/10.0))
    errors.sort(key=lambda t:t[0])
    print "[ Error: %.4f\t\tPrecision-recall: %.4f\tROC: %.4f ] - %s sec" \
            % (errors[0][0], metric.areaUnderPR, metric.areaUnderROC, (time.time()-startTime))
        
def trainClassificationTreeModel(sc, trainingData, validationData, modelClass, **kwargs):
    """
    train classification models for TREE based model
    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :modelClass: model CLASS that use to train
    :kwargs: key-value paired arguments for modelClass, would be passes in directly
    :return: None
    """
    print "Classification Model: %s %s" % (modelClass.__name__, kwargs)
    startTime = time.time()
    trainingData = trainingData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=2000 else LabeledPoint(0, lp.features))
    validationData = validationData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=2000 else LabeledPoint(0, lp.features))
    model = modelClass.trainClassifier(trainingData, **kwargs)
    validationFeatures = validationData.map(lambda lp:lp.features)
    # !!!beware, due to some stange bug, DO NOT chain RDD transformation on tree model predict, count() immediately!!!
    validationsResult = model.predict(validationFeatures)
    totalCount = validationsResult.count()
    validationsResult = validationsResult.zip(validationData.map(lambda lp:lp.label))
    errCount = validationsResult.filter(lambda (predict,label):predict!=label).count()
    validationsResult = validationsResult.zip(validationData.map(lambda lp:lp.label))
    err = float(errCount) / totalCount
    print "[ Error: %.4f ] - %s sec" % (err, (time.time()-startTime))

In [33]:
def selectClassificationPipelineModel(sqlContext, trainingData, validationData):
    """
    wrapper function to evaluate and select all the classification models using pipeline CV
    :param sqlContext: spark sql context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :return: None
    """
    print "=============================================="
    print "CLASSIFICATION WITH PIPELINE CV"
    print "=============================================="
    classificationModels = [
                                (LogisticRegression(), {"fitIntercept":True, "regParam":[0.1,0.01,0.001]})
                           ]
    for modelObject, kwargs in classificationModels:
        trainClassificationPipelineModel(sqlContext, trainingData, validationData, modelObject, kwargs)
    
def trainClassificationPipelineModel(sqlContext, trainingData, validationData, modelObject, kwargs):
    """
    train classification models for NOT-TREE based model
    :param sqlContext: spark sql context
    :param trainingData: RDD[LabeledPoint] of training data
    :param validationData: RDD[LabeledPoint] of validation data
    :modelObject: model OBJECT that use to train
    :kwargs: key-value dict for cross validation, would be passes in directly
    :return: None
    """
    print "Classification Model: %s %s" % (modelObject.__class__.__name__, kwargs)
    startTime = time.time()
    trainingData = trainingData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=2000 else LabeledPoint(0, lp.features))
    validationData = validationData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=2000 else LabeledPoint(0, lp.features))
    trainingDF = sqlContext.createDataFrame(trainingData).cache()
    validationDF = sqlContext.createDataFrame(validationData).cache()
    grid = ParamGridBuilder().baseOn(kwargs).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=modelObject, estimatorParamMaps=grid, evaluator=evaluator, numFolds = 5)
    cvModel = cv.fit(trainingDF)
    validationResult = cvModel.transform(validationDF)
    areaUnderROC = evaluator.evaluate(validationResult, {evaluator.metricName: "areaUnderROC"})
    areaUnderPR = evaluator.evaluate(validationResult, {evaluator.metricName: "areaUnderPR"})
    print "[ Precision-recall: %.4f\tROC: %.4f ] - %s sec" \
            % (areaUnderPR, areaUnderROC, (time.time()-startTime))

In [34]:
if __name__=="__main__":
    filePath1 = "file:///ipython/CSYE7374_ASSIGNMENT2/training_data.csv"
    filePath2 = "file:///ipython/CSYE7374_ASSIGNMENT2/training_data.csv"
    sc = SparkContext(appName="MainContext")
    sqlContext = SQLContext(sc)
    #rawData = importRawData(sc, filePath1,filePath2).cache()
    trainingData, validationData = importRawData(sc, filePath1,filePath2)
    
    try:
        trainingData, validationData = featureEngineering(trainingData,validationData)
        selectClassificationModel(sc, trainingData, validationData)
        #selectRegressionModel(sc, trainingData, validationData)
        selectClassificationPipelineModel(sqlContext, trainingData, validationData)
        #selectRegressionPipelineModel(sqlContext, trainingData, validationData)
    except Exception:
        raise
    finally:
        sc.stop()

FEATURE ENGINEERING
partitioning...
Normalization and Scaling... 
CLASSIFICATION
Classification Model: SVMWithSGD {'regType': None, 'intercept': True}
[ Error: 0.0000		Precision-recall: 0.0000	ROC: 0.0000 ] - 15.1522059441 sec
Classification Model: SVMWithSGD {'regType': 'l1', 'intercept': True}
[ Error: 0.0000		Precision-recall: 0.0000	ROC: 0.0000 ] - 13.6986620426 sec
Classification Model: SVMWithSGD {'regType': 'l2', 'intercept': True}
[ Error: 0.0000		Precision-recall: 0.0000	ROC: 0.0000 ] - 9.83493089676 sec
Classification Model: LogisticRegressionWithLBFGS {'regType': 'l1', 'intercept': True}
[ Error: 0.0000		Precision-recall: 0.0000	ROC: 0.0000 ] - 7.79879188538 sec
Classification Model: LogisticRegressionWithLBFGS {'regType': 'l2', 'intercept': True}
[ Error: 0.0000		Precision-recall: 0.0000	ROC: 0.0000 ] - 6.30630803108 sec
Classification Model: LogisticRegressionWithSGD {'regType': None, 'intercept': True}
[ Error: 0.0000		Precision-recall: 0.0000	ROC: 0.0000 ] - 9.6822929382

Py4JJavaError: An error occurred while calling o2540.fit.
: java.lang.ArrayIndexOutOfBoundsException: 1
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:183)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:52)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:71)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:207)
	at java.lang.Thread.run(Thread.java:744)


In [7]:
sc.stop()

In [11]:
pwd

u'/ipython/CSYE7374_ASSIGNMENT2'