In [51]:
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS,SVMWithSGD,LogisticRegressionWithSGD, NaiveBayes
from pyspark.mllib.regression import LabeledPoint,LinearRegressionWithSGD,RidgeRegressionWithSGD,LassoWithSGD
from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
from pyspark.mllib.feature import StandardScaler,ChiSqSelector,Normalizer,PCA
from pyspark.mllib.evaluation import RegressionMetrics, BinaryClassificationMetrics
from pyspark.mllib.stat import Statistics, MultivariateStatisticalSummary
from pyspark.mllib.util import MLUtils
import time

def importRawData(sc, filePath):
    '''
    :param sc: Spark Context
    :param filePath: path to data .csv file
    :return: RDD of (LabeledPoint, index)
    '''
    rdd = sc.textFile(filePath)
    return rdd.map(lambda line: line.split(",")) \
                .map(lambda array: [float(n) for n in array]) \
                .zipWithIndex()

def exploreData(rawData, corrThreshold = 0.1, colinearThreshold = 0.8):
    """
    """
    print "=============================================="
    print "EXPLORE DATA"
    print "=============================================="
    matrix = rawData.map(lambda t:t[0])
    trait = Statistics.colStats(matrix)
    print "Features:"
    print "Format:\t\t[% 10s\t% 12s\t% 10s\t% 12s\t% 10s]" % ("mean","variance","numNonzeors","max","min")
    for i in range(1, len(trait.mean())):
        print "Feature %s:\t[ % 10.2f\t% 12.2f\t% 10d\t% 12.2f\t% 10.2f ] " % (i+1, trait.mean()[i], trait.variance()[i],\
                                                    trait.numNonzeros()[i], trait.max()[i], trait.min()[i])
    print "Label:"
    print "Label :\t[ % 10.2f\t% 12.2f\t% 10d\t% 12.2f\t% 10.2f ] " % (trait.mean()[0], trait.variance()[0], \
                                            trait.numNonzeros()[0], trait.max()[0], trait.min()[0])
    corr = Statistics.corr(matrix, method="pearson")
    print "Correlation between Features and Label: "
    labelCorr = zip([row[0] for row in corr], range(1, len(corr)))
    labelCorr.sort(key=lambda t:abs(t[0]), reverse=True)
    selection = [];
    for corrScore, i in labelCorr:
        print "Feature %s: % 6.4f" % (i, corrScore)
        if(abs(corrScore)>=corrThreshold):
            selection.append(i)
    print "Correlation between Features (only show colinear > %s): " % colinearThreshold
    for i, row in enumerate(corr):
        for j, r in enumerate(row):
            if (i!=0 and j!=0 and i<j and abs(r)>colinearThreshold):
                print "Feature %s and Feature %s have r=%.4f" % (i, j, r)
                if(i in selection and j in selection):
                    selection.remove(j)
    print "Suggest Feature Selection based on Correlation >= %s: " % corrThreshold
    print "Features: %s" % selection
    return selection
    
                
def featureEngineering(rawData, selection = None, zNorm = True, l2Norm = True, categorical = False, topFeature = 10):
    """
    """
    #this partition is set by the suggestion of data set
    #first 463715 used as trainning, last 51630 used as validation
    print "=============================================="
    print "FEATURE ENGINEERING"
    print "=============================================="
    print "partitioning..."
    tFeatures = rawData.filter(lambda a:a[1]<463715).map(lambda a:a[0][1:])
    vFeatures = rawData.filter(lambda a:a[1]>=463715).map(lambda a:a[0][1:])
    tLabel = rawData.filter(lambda a:a[1]<463715).map(lambda a:a[0][0])
    vLabel = rawData.filter(lambda a:a[1]>=463715).map(lambda a:a[0][0])
    print "Normalization and Scaling... "
    if(zNorm):
        zN = StandardScaler(withMean=True, withStd=True).fit(tFeatures)
        tFeatures = zN.transform(tFeatures)
        vFeatures = zN.transform(vFeatures)
    if(l2Norm):
        l2N = Normalizer()
        tFeatures = l2N.transform(tFeatures)
        vFeatures = l2N.transform(vFeatures)
    print "Feature Selection... "
    if categorical and (selection == None):
        selector = ChiSqSelector(topFeature).fit(tFeatures)
        tFeatures = selector.transform(tFeatures)
        vFeatures = selector.transform(vFeatures)
    else:
        tFeatures = tFeatures.map(lambda a:[a[n-1] for n in selection])
        vFeatures = vFeatures.map(lambda a:[a[n-1] for n in selection])
    featureCount = len(tFeatures.first())
    print "Selected %s Features" % featureCount
    print "Dimension Reduction..."
    selector = PCA(featureCount).fit(tFeatures.map(lambda a: LabeledPoint(1,a).features))
    temp = selector.transform(tFeatures)
    eigen = eigenvalues(temp)
    print "PCA Eigen Vector: %s" % eigen
    accVarPor = [sum(eigen[:i+1])/sum(eigen) for i in range(len(eigen))]
    print "Accumulate Variance Porpotion:"
    reduction = 0
    for i in range(len(accVarPor)):
        print "%s Features:  %.2f%%" % (i+1, accVarPor[i]*100)
        if(reduction==0 and accVarPor[i]>0.9): reduction = i+1
    print "Select %s Features..." % reduction
    selector = PCA(reduction).fit(tFeatures.map(lambda a: LabeledPoint(1,a).features))
    return (tLabel.zip(tFeatures).map(lambda lp:LabeledPoint(lp[0], lp[1])).cache(), \
            vLabel.zip(vFeatures).map(lambda lp:LabeledPoint(lp[0], lp[1])).cache())
    
# def cov(data):
#     count = data.count()
#     miu = data.reduce(lambda a,b:[a[i]+b[i] for i in range(len(a))])
#     miu = map(lambda s:s/count, miu)
#     cov_ = []
#     for i in range(len(miu)):
#         temp = data.map(lambda a:[(a[i]-miu[i])*(a[j]-miu[j]) for j in range(len(a))])\
#                     .reduce(lambda a,b:[a[i]+b[i] for i in range(len(a))])
#         temp = map(lambda s:s/count, temp)
#         cov_.append(temp)
#     return cov_

def eigenvalues(data):
    count = data.count()
    miu = data.reduce(lambda a,b:[a[i]+b[i] for i in range(len(a))])
    miu = map(lambda s:s/count, miu)
    eigen = data.map(lambda a:[(a[i]-miu[i])**2 for i in range(len(a))]) \
                .reduce(lambda a,b:[a[i]+b[i] for i in range(len(a))])
    eigen = map(lambda s:s/count, eigen)
    return eigen

def selectClassificationModel(sc, trainingData, validationData):
    print "=============================================="
    print "CLASSIFICATION"
    print "=============================================="
    classificationModels = [
                                (SVMWithSGD, {"intercept":True, "regType":None}), 
                                (SVMWithSGD, {"intercept":True, "regType":"l1"}), 
                                (SVMWithSGD, {"intercept":True, "regType":"l2"}), 
                                (LogisticRegressionWithLBFGS, {"intercept":True, "regType":"l1"}), 
                                (LogisticRegressionWithLBFGS, {"intercept":True, "regType":"l2"}), 
                                (LogisticRegressionWithSGD, {"intercept":True, "regType":None}), 
                                (LogisticRegressionWithSGD, {"intercept":True, "regType":"l1"}), 
                                (LogisticRegressionWithSGD, {"intercept":True, "regType":"l2"})
                           ]
    for modelClass, kwargs in classificationModels:
        trainClassificationModelWithROC(sc, trainingData, validationData, modelClass, **kwargs)
    classificationModels = [
                                (DecisionTree, {"numClasses":2,"categoricalFeaturesInfo":{},"minInstancesPerNode":100 ,"impurity":"gini"}), 
                                (DecisionTree, {"numClasses":2,"categoricalFeaturesInfo":{},"minInstancesPerNode":100 , "impurity":"entropy"}), 
                                (RandomForest, {"numClasses":2,"categoricalFeaturesInfo":{},"numTrees":20, "impurity":"gini"}), 
                                (RandomForest, {"numClasses":2,"categoricalFeaturesInfo":{},"numTrees":20, "impurity":"entropy"}) 
                           ]
    for modelClass, kwargs in classificationModels:
        trainClassificationModelNoROC(sc, trainingData, validationData, modelClass, **kwargs)

def trainClassificationModelWithROC(sc, trainingData, validationData, modelClass, **kwargs):
    """
    """
    print "Classification Model: %s %s" % (modelClass.__name__, kwargs)
    startTime = time.time()
    trainingData = trainingData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=1965 else LabeledPoint(0, lp.features))
    validationData = validationData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=1965 else LabeledPoint(0, lp.features))
    model = modelClass.train(trainingData, **kwargs)
    model.clearThreshold()
    validationsResult = validationData.map(lambda lp:(float(model.predict(lp.features)), lp.label))
    metric = BinaryClassificationMetrics(validationsResult)
    errors = []
    for i in range(1, 11):
        err = validationsResult.filter(lambda (predict,label):(1 if predict>i/10.0 else 0)!=label).count() \
                                            / float(validationsResult.count())
        errors.append((err, i/10.0))
    errors.sort(key=lambda t:t[0])
    print "[ Error: %.4f\t\tPrecision-recall: %.4f\tROC: %.4f ] - %s sec" \
            % (errors[0][0], metric.areaUnderPR, metric.areaUnderROC, (time.time()-startTime))
        
def trainClassificationModelNoROC(sc, trainingData, validationData, modelClass, **kwargs):
    """
    """
    print "Classification Model: %s %s" % (modelClass.__name__, kwargs)
    startTime = time.time()
    trainingData = trainingData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=1965 else LabeledPoint(0, lp.features))
    validationData = validationData \
                    .map(lambda lp:LabeledPoint(1, lp.features) if lp.label>=1965 else LabeledPoint(0, lp.features))
    model = modelClass.trainClassifier(trainingData, **kwargs)
    validationFeatures = validationData.map(lambda lp:lp.features)
    validationsResult = model.predict(validationFeatures)
    totalCount = validationsResult.count()
    validationsResult = validationsResult.zip(validationData.map(lambda lp:lp.label))
    errCount = validationsResult.filter(lambda (predict,label):predict!=label).count()
    validationsResult = validationsResult.zip(validationData.map(lambda lp:lp.label))
    err = float(errCount) / totalCount
    print "[ Error: %.4f ] - %s sec" % (err, (time.time()-startTime))
        
if __name__=="__main__":
    filePath = "file:///ipython/YearPredictionMSD.txt"
    sc = SparkContext(appName="MainContext")
    rawData = importRawData(sc, filePath).cache()
    try:
        selection = exploreData(rawData)
        trainingData, validationData = featureEngineering(rawData, selection = selection)
        selectClassificationModel(sc, trainingData, validationData)
    except Exception:
        raise
    finally:
        sc.stop()

EXPLORE DATA
Features:
Format:		[      mean	    variance	numNonzeors	         max	       min]
Feature 2:	[      43.39	       36.82	    515345	       61.97	      1.75 ] 
Feature 3:	[       1.29	     2660.53	    515345	      384.07	   -337.09 ] 
Feature 4:	[       8.66	     1243.87	    515345	      322.85	   -301.01 ] 
Feature 5:	[       1.16	      266.43	    515345	      335.77	   -154.18 ] 
Feature 6:	[      -6.55	      522.62	    515345	      262.07	   -181.95 ] 
Feature 7:	[      -9.52	      165.32	    515345	      166.24	    -81.79 ] 
Feature 8:	[      -2.39	      212.34	    515345	      172.40	   -188.21 ] 
Feature 9:	[      -1.79	       63.42	    515345	      126.74	    -72.50 ] 
Feature 10:	[       3.73	      112.00	    515345	      146.30	   -126.48 ] 
Feature 11:	[       1.88	       42.64	    515345	       60.35	    -41.63 ] 
Feature 12:	[      -0.15	       19.10	    515343	       88.02	    -69.68 ] 
Feature 13:	[       2.55	       69.23	    515345	       87.91	    -94.04 ] 
Fe