In [19]:
from pyspark import SparkContext
from pyspark.mllib.stat import Statistics, MultivariateStatisticalSummary
from pyspark.mllib.util import MLUtils
from pyspark.sql import SQLContext
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.clustering import GaussianMixture
import time

def importRawData(sc, filePath):
    """
    Load a LIBSVM-formatted data file through MLUtils.loadLibSVMFile.

    :param sc: Spark Context
    :param filePath: path to the LIBSVM-formatted data file
    :return: the RDD returned by MLUtils.loadLibSVMFile (RDD of LabeledPoint)
    """
    # No row index is attached to the records here; the loader's result is
    # handed back untouched.
    return MLUtils.loadLibSVMFile(sc, filePath)
            

def exploreData(rawData, corrThreshold = 0.05, colinearThreshold = 0.8):
    """
    Print basic column statistics and Pearson correlations of the data.

    The label is placed in column 0 and the features in columns 1..n, so that
    "Feature i" always refers to column i of the returned matrix.

    :param rawData: RDD[LabeledPoint] of raw data (read through .label and .features)
    :param corrThreshold: minimum absolute feature-to-label correlation to print
    :param colinearThreshold: absolute feature-to-feature correlation above which a pair is printed
    :return: local correlation matrix, corr[0] as label, rest are features
    """
    from pyspark.mllib.linalg import SparseVector, Vectors

    def withLabel(lp):
        # Column 0 becomes the label; every feature index shifts up by one.
        # Sparse input stays sparse so wide datasets are not densified.
        features = lp.features
        if isinstance(features, SparseVector):
            return Vectors.sparse(features.size + 1,
                                  [0] + [int(i) + 1 for i in features.indices],
                                  [float(lp.label)] + [float(v) for v in features.values])
        return Vectors.dense([float(lp.label)] + [float(v) for v in features.toArray()])

    print("==============================================")
    print("EXPLORE DATA")
    print("==============================================")
    matrix = rawData.map(withLabel)
    trait = Statistics.colStats(matrix)
    # Read each summary vector once instead of on every loop iteration.
    means = trait.mean()
    variances = trait.variance()
    nonzeros = trait.numNonzeros()
    maxima = trait.max()
    minima = trait.min()
    print("Features:")
    print("Format:\t\t[% 10s\t% 12s\t% 10s\t% 12s\t% 10s]" % ("mean","variance","numNonzeors","max","min"))
    for i in range(1, len(means)):
        print("Feature %s:\t[ % 10.2f\t% 12.2f\t% 10d\t% 12.2f\t% 10.2f ] "
              % (i, means[i], variances[i], nonzeros[i], maxima[i], minima[i]))
    print("Label:")
    print("Label :\t[ % 10.2f\t% 12.2f\t% 10d\t% 12.2f\t% 10.2f ] "
          % (means[0], variances[0], nonzeros[0], maxima[0], minima[0]))
    corr = Statistics.corr(matrix, method="pearson")
    print("Correlation between Features and Label (only show corr > %s): " % corrThreshold)
    # Pair every score with its own feature index BEFORE filtering, otherwise
    # the surviving scores get renumbered 1, 2, 3, ...
    labelCorr = [(row[0], i) for i, row in enumerate(corr[1:], 1)
                 if abs(row[0]) >= corrThreshold]
    labelCorr.sort(key=lambda t: abs(t[0]), reverse=True)
    for corrScore, i in labelCorr:
        print("Feature %s: % 6.4f" % (i, corrScore))
    print("Correlation between Features (only show colinear > %s): " % colinearThreshold)
    # Upper triangle only, skipping row/column 0 (the label).
    size = len(corr)
    for i in range(1, size):
        for j in range(i + 1, size):
            r = corr[i][j]
            if abs(r) > colinearThreshold:
                print("Feature %s and Feature %s have r=%.4f" % (i, j, r))
    return corr

def featureEngineering(rawData, corrSelection = None, zNorm = True, l2Norm = True, categorical = False, topFeature = 10, trainCount = 463715):
    """
    Partition, normalize, select and reduce features.

    Steps: index-based train/validation split, optional Z and L2 normalization,
    feature selection (chi square or CFS), then PCA keeping enough components
    to exceed 90% of the total variance.

    :param rawData: RDD of (vector, index) pairs; vector[0] is read as the label,
                    vector[1:] as the features and index as the row number.
                    NOTE(review): this is not the record shape produced by
                    MLUtils.loadLibSVMFile - confirm against the caller.
    :param corrSelection: local correlation matrix, if provided (and categorical is False) CFS is performed
    :param zNorm: boolean of whether to perform Z normalization or not
    :param l2Norm: boolean of whether to perform L2 normalization or not
    :param categorical: boolean of whether to perform chi square feature selection or not
    :param topFeature: select top features if using chi square selection
    :param trainCount: rows with index < trainCount are training data, the rest validation data
    :return: tuple of (RDD[LabeledPoint] trainingData, RDD[LabeledPoint] validationData)
    """
    # Imported locally: none of these names are imported at the top of the file.
    from pyspark.mllib.feature import ChiSqSelector, Normalizer, PCA, StandardScaler
    from pyspark.mllib.regression import LabeledPoint

    print("==============================================")
    print("FEATURE ENGINEERING")
    print("==============================================")
    print("partitioning...")
    # The default split point is the one suggested by the data set creator:
    # first 463715 rows for training, the remainder for validation.
    training = rawData.filter(lambda a: a[1] < trainCount)
    validation = rawData.filter(lambda a: a[1] >= trainCount)
    tFeatures = training.map(lambda a: a[0][1:])
    vFeatures = validation.map(lambda a: a[0][1:])
    tLabel = training.map(lambda a: a[0][0])
    vLabel = validation.map(lambda a: a[0][0])
    print("Normalization and Scaling... ")
    if zNorm:
        # Fitted on the training split only, then applied to both splits.
        zN = StandardScaler(withMean=True, withStd=True).fit(tFeatures)
        tFeatures = zN.transform(tFeatures)
        vFeatures = zN.transform(vFeatures)
    if l2Norm:
        l2N = Normalizer()
        tFeatures = l2N.transform(tFeatures)
        vFeatures = l2N.transform(vFeatures)
    print("Feature Selection... ")
    # only categorical value for classification problem could use chi square selector
    # otherwise, use correlation based selector instead
    if categorical or (corrSelection is None):
        # ChiSqSelector is fitted on labelled points, not on bare feature vectors.
        labeled = tLabel.zip(tFeatures).map(lambda lp: LabeledPoint(lp[0], lp[1]))
        selector = ChiSqSelector(topFeature).fit(labeled)
        tFeatures = selector.transform(tFeatures)
        vFeatures = selector.transform(vFeatures)
    else:
        bestFeatures = cfs(corrSelection)
        # cfs() returns 1-based feature indices; the label was already stripped.
        tFeatures = tFeatures.map(lambda a: [a[n-1] for n in bestFeatures])
        vFeatures = vFeatures.map(lambda a: [a[n-1] for n in bestFeatures])
    featureCount = len(tFeatures.first())
    print("Selected %s Features: " % featureCount)
    print("Dimension Reduction...")
    # PCA dimension reduction = EVD of cov matrix = SVD of data matrix.
    # A full-rank PCA is fitted first so the per-component variances can be
    # inspected to choose the cutoff.
    # Wrapping each row in a LabeledPoint turns plain lists into Vectors for PCA.fit.
    asVectors = tFeatures.map(lambda a: LabeledPoint(1, a).features)
    # featureCount is defined on both selection branches (bestFeatures is not).
    selector = PCA(featureCount).fit(asVectors)
    temp = selector.transform(tFeatures)
    eigen = eigenvalues(temp)
    print("PCA Eigen Vector: %s" % eigen)
    total = sum(eigen)
    accVarPor = []
    running = 0.0
    for value in eigen:
        running += value
        accVarPor.append(running / total if total else 0.0)
    print("Accumulate Variance Porpotion:")
    reduction = 0
    for i in range(len(accVarPor)):
        print("%s Dimensions:  %.2f%%" % (i+1, accVarPor[i]*100))
        if reduction == 0 and accVarPor[i] > 0.9:
            reduction = i+1
    if reduction == 0:
        # No prefix exceeded 90% (e.g. zero total variance): keep every dimension.
        reduction = featureCount
    print("Reduce to %s dimensions..." % reduction)
    selector = PCA(reduction).fit(asVectors)
    # Apply the reduced projection; fitting alone leaves the data unreduced.
    tFeatures = selector.transform(tFeatures)
    vFeatures = selector.transform(vFeatures)
    return (tLabel.zip(tFeatures).map(lambda lp: LabeledPoint(lp[0], lp[1])).cache(),
            vLabel.zip(vFeatures).map(lambda lp: LabeledPoint(lp[0], lp[1])).cache())

def cfs(corr):
    """
    helper function of correlation feature selection (CFS), implemented as a greedy forward search

    For a candidate subset of k features the merit is
        sum(|corr(label, f)|) / sqrt(k + 2 * sum(|corr(f_i, f_j)|))
    where the second sum runs over unordered feature pairs of the subset.

    :param corr: local correlation matrix, row/column 0 is the label
    :return: local list of best k features, list element as feature index (1 based)
    """
    features = list(range(1, len(corr)))
    bestK = []
    merit = -1
    sumL = 0    # sum of |feature-to-label| correlations of the selected features
    sumF = 0    # sum of |feature-to-feature| correlations over selected pairs
    print("Greedy Correlation Feature Selection (CFS)")
    # the stop condition of this algorithm is we reach maximum merit
    # i.e. merit starts to decrease as we include more features, because the
    # increase in feature-to-feature correlation outweighs feature-to-label's
    while features:
        k = len(bestK) + 1  # subset size if one more feature is accepted
        # Candidates are rebuilt every round so stale entries never pile up.
        candidates = []
        for f in features:
            tempL = sumL + abs(corr[0][f])
            tempF = sumF
            for cur in bestK:
                tempF += abs(corr[cur][f])
            candidates.append((tempL / ((k + 2 * tempF) ** 0.5), f, tempL, tempF))
        # max() returns the first best candidate, i.e. the lowest feature index on ties.
        best = max(candidates, key=lambda e: e[0])
        if best[0] <= merit:
            break
        merit, chosen, sumL, sumF = best
        bestK.append(chosen)
        features.remove(chosen)
        print("k=%s\tHighest Merit=%.4f\tSelected:%s" % (len(bestK), merit, bestK))
    return bestK

def eigenvalues(data):
    """
    helper function computing the per-column variance of the data, used to
    decide how many dimensions to keep after PCA

    The variance is the population variance (squared deviations divided by the
    row count).

    :param data: RDD[Vector] of data after PCA transformation
    :return: local list with one variance per column, [] for empty data
    """
    # only the per-column variance is calculated, not the whole covariance because that would be too slow
    count = float(data.count())  # float so integer data is not truncated on Python 2
    if count == 0:
        # reduce() has nothing to fold on an empty data set
        return []
    miu = data.reduce(lambda a, b: [a[i] + b[i] for i in range(len(a))])
    miu = [s / count for s in miu]
    eigen = data.map(lambda a: [(a[i] - miu[i]) ** 2 for i in range(len(a))]) \
                .reduce(lambda a, b: [a[i] + b[i] for i in range(len(a))])
    return [s / count for s in eigen]

def selectClusteringModel(sc, trainingData):
    """
    wrapper function that trains and reports on every configured clustering model
    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :return: None
    """
    banner = "=============================================="
    print(banner)
    print("CLUSTERING")
    print(banner)
    # Each entry is (model class, keyword arguments forwarded to its train()).
    # (GaussianMixture, {"k": 2}) is currently disabled.
    candidates = ((KMeans, {"k": 2}),)
    for candidate in candidates:
        modelClass, kwargs = candidate
        trainClusteringModel(sc, trainingData, modelClass, **kwargs)

def trainClusteringModel(sc, trainingData, modelClass, **kwargs):
    """
    train one clustering model on the features and print how its cluster
    assignment compares with the labels

    Three fractions of the whole data set are printed:
      - points whose label is 1 ("Commercial Rate in Dataset")
      - points assigned to cluster 0 ("Commercial Rate in Clustering Model")
      - points with label 1 that were assigned to cluster 0 ("Correct Rate")
    NOTE(review): cluster ids carry no inherent meaning, so treating cluster 0
    as the label-1 cluster is an assumption - confirm before relying on it.

    :param sc: spark context
    :param trainingData: RDD[LabeledPoint] of training data
    :modelClass: model CLASS that use to train (must offer train(data, **kwargs))
    :kwargs: key-value paired arguments for modelClass, would be passes in directly
    :return: None
    """
    print("Clustering Model: %s %s" % (modelClass.__name__, kwargs))
    startTime = time.time()
    data = trainingData.map(lambda lp: lp.features)

    model = modelClass.train(data, **kwargs)
    # (predicted cluster, label) pairs; cached because four actions read them.
    validationsResult = trainingData.map(
        lambda lp: (float(model.predict(lp.features)), lp.label)).cache()
    total = float(validationsResult.count())  # counted once, reused for every rate
    rate2 = validationsResult.filter(lambda t: t[0] == 0).count() / total
    rate1 = validationsResult.filter(lambda t: t[1] == 1).count() / total
    rate3 = validationsResult.filter(lambda t: t[1] == 1 and t[0] == 0).count() / total
    print("[Commercial Rate in Dataset: %.4f ] - %s sec"
          % (rate1, (time.time()-startTime)))
    print("[Commercial Rate in Clustering Model: %.4f ] - %s sec"
          % (rate2, (time.time()-startTime)))
    print("[Correct Rate: %.4f ] - %s sec"
          % (rate3, (time.time()-startTime)))
    validationsResult.unpersist()
        
        
if __name__=="__main__":
    # Run the clustering evaluation once per channel file, each in its own
    # SparkContext.
    fileName = [
        "file:///ipython/spark/BBC.txt",
        "file:///ipython/spark/CNN.txt",
        "file:///ipython/spark/CNNIBN.txt",
        "file:///ipython/spark/NDTV.txt",
        "file:///ipython/spark/TIMESNOW.txt",
    ]
    for filePath in fileName:
        print("==============================================")
        print(filePath)
        print("==============================================")
        sc = SparkContext(appName="MainCOntext")
        try:
            # Loading happens inside the try so a failed load still stops the
            # context; otherwise the next iteration cannot create a new one.
            rawData = importRawData(sc, filePath).cache()
            #corrSelection = exploreData(rawData)
            selectClusteringModel(sc, rawData)
        finally:
            sc.stop()


file:///ipython/spark/BBC.txt
CLUSTERING
Clustering Model: KMeans {'k': 2}
[Commercial Rate in Dataset: 0.4749 ] - 17.0943098068 sec
[Commercial Rate in Clustering Model: 0.7735 ] - 17.0951399803 sec
[Correct Rate: 0.4086 ] - 17.0951769352 sec
file:///ipython/spark/CNN.txt
CLUSTERING
Clustering Model: KMeans {'k': 2}
[Commercial Rate in Dataset: 0.6392 ] - 24.2854502201 sec
[Commercial Rate in Clustering Model: 0.5301 ] - 24.2862520218 sec
[Correct Rate: 0.4177 ] - 24.2862770557 sec
file:///ipython/spark/CNNIBN.txt
CLUSTERING
Clustering Model: KMeans {'k': 2}
[Commercial Rate in Dataset: 0.6550 ] - 32.4633169174 sec
[Commercial Rate in Clustering Model: 0.5144 ] - 32.4641370773 sec
[Correct Rate: 0.4037 ] - 32.4641709328 sec
file:///ipython/spark/NDTV.txt
CLUSTERING
Clustering Model: KMeans {'k': 2}
[Commercial Rate in Dataset: 0.7368 ] - 16.0542960167 sec
[Commercial Rate in Clustering Model: 0.6023 ] - 16.0550689697 sec
[Correct Rate: 0.5208 ] - 16.0551059246 sec
file:///ipython/spar

In [17]:
# Separate notebook cell: stop whatever SparkContext is currently bound to `sc`.
sc.stop()