In [None]:
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS,SVMWithSGD,LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint,LinearRegressionWithSGD,RidgeRegressionWithSGD,LassoWithSGD
from pyspark.mllib.feature import StandardScaler,ChiSqSelector,Normalizer,PCA
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.stat import Statistics, MultivariateStatisticalSummary
from pyspark.mllib.util import MLUtils
import time

def importRawData(sc, filePath):
    '''
    Load a comma-separated text file into an indexed RDD of labeled points.

    The first field of every line is parsed as an integer label and all
    remaining fields are parsed as float features.

    :param sc: Spark Context
    :param filePath: path to data .csv file
    :return: RDD of (LabeledPoint, index)
    '''
    def toLabeledPoint(line):
        # One record per line: label first, features after it.
        fields = line.split(",")
        label = int(fields[0])
        features = [float(value) for value in fields[1:]]
        return LabeledPoint(label, features)

    lines = sc.textFile(filePath)
    return lines.map(toLabeledPoint).zipWithIndex()

def exploreData(rawData):
    """
    Print column statistics for features and label, plus highly correlated
    feature pairs.

    For every feature column (and for the label) the mean, variance, number
    of non-zero entries, max and min are printed. Afterwards every pair of
    distinct features whose Pearson correlation exceeds 0.8 is reported.
    Feature numbers are 1-based in both reports so that they refer to the
    same columns.

    :param rawData: RDD of (LabeledPoint, index) pairs
    :return: None, everything is written to stdout
    """
    x = rawData.map(lambda lp: lp[0].features)
    # colStats is applied to vectors, so each label is wrapped in a
    # one-element list to form a single-column data set.
    y = rawData.map(lambda lp: [lp[0].label])

    xTrait = Statistics.colStats(x)
    # Fetch every statistic once instead of once per feature inside the loop.
    means = xTrait.mean()
    variances = xTrait.variance()
    nonzeros = xTrait.numNonzeros()
    maxima = xTrait.max()
    minima = xTrait.min()
    print("Features:")
    print("Format:\t\t[ mean\tvariance\tnumNonzeors\tmax\tmin ]")
    for i in range(len(means)):
        print("Feature %s:\t[ %.4f\t%.4f\t%d\t%.4f\t%.4f ] " % (i+1, means[i], variances[i],
                                                    nonzeros[i], maxima[i], minima[i]))

    yTrait = Statistics.colStats(y)
    print("Label:")
    print("Label :\t[ %.2f\t%.2f\t%d\t%.2f\t%.2f ] " % (yTrait.mean()[0], yTrait.variance()[0],
                                            yTrait.numNonzeros()[0], yTrait.max()[0], yTrait.min()[0]))

    corr = Statistics.corr(x, method="pearson")
    print("Pearson Correlation test: ")
    for i, row in enumerate(corr):
        for j, r in enumerate(row):
            # i<j visits each unordered pair once and skips the diagonal.
            if i < j and r > 0.8:
                # +1 keeps the numbering consistent with the table above.
                print("Feature %s and Feature %s have r=%s" % (i+1, j+1, r))
                
def featureEngineering(rawData, zNorm = True, l2Norm = True, chiSq = False, topFeature = 10, trainSize = 463715):
    """
    Split the data into training/validation sets and transform the features.

    Records whose index is below trainSize form the training set, the rest
    form the validation set. The features then go through, in order:
    optional standardization (fitted on the training features only),
    optional normalization, and a reduction to topFeature columns using
    ChiSqSelector when chiSq is true or PCA otherwise. Every fitted
    transformer is fitted on training data and applied to both sets.

    :param rawData: RDD of (LabeledPoint, index) pairs
    :param zNorm: apply StandardScaler (withMean=True, withStd=True)
    :param l2Norm: apply Normalizer with its default settings
    :param chiSq: use ChiSqSelector instead of PCA for the reduction step
    :param topFeature: number of features/components to keep
    :param trainSize: index of the first validation record; the default is
                      the split suggested by the data set (first 463715
                      records for training, last 51630 for validation)
    :return: tuple (trainingData, validationData) of cached RDDs of LabeledPoint
    """
    def toLabeledPoints(labels, features):
        # Re-attach labels to (possibly transformed) feature vectors.
        return labels.zip(features).map(lambda lp: LabeledPoint(lp[0], lp[1]))

    # Filter once per split and derive labels and features from the same RDD.
    trainPart = rawData.filter(lambda lp: lp[1] < trainSize)
    validPart = rawData.filter(lambda lp: lp[1] >= trainSize)
    tFeatures = trainPart.map(lambda lp: lp[0].features)
    vFeatures = validPart.map(lambda lp: lp[0].features)
    tLabel = trainPart.map(lambda lp: lp[0].label)
    vLabel = validPart.map(lambda lp: lp[0].label)

    if zNorm:
        zN = StandardScaler(withMean=True, withStd=True).fit(tFeatures)
        tFeatures = zN.transform(tFeatures)
        vFeatures = zN.transform(vFeatures)
    if l2Norm:
        l2N = Normalizer()
        tFeatures = l2N.transform(tFeatures)
        vFeatures = l2N.transform(vFeatures)

    if chiSq:
        # ChiSqSelector is supervised: it must be fitted on LabeledPoints,
        # not on bare feature vectors.
        selector = ChiSqSelector(topFeature).fit(toLabeledPoints(tLabel, tFeatures))
    else:
        selector = PCA(topFeature).fit(tFeatures)
    tFeatures = selector.transform(tFeatures)
    vFeatures = selector.transform(vFeatures)

    return (toLabeledPoints(tLabel, tFeatures).cache(),
            toLabeledPoints(vLabel, vFeatures).cache())
    

def trainClassificationModel(sc, trainingData, validationData, modelClass, **kwargs):
    """
    Train one binary classifier and print its validation error rate,
    weights and total run time.

    The original label is turned into a class variable first: records with
    label >= 1965 become class 0, all others become class 1.

    :param sc: Spark Context (not used inside this function)
    :param trainingData: RDD of LabeledPoint used for training
    :param validationData: RDD of LabeledPoint used to measure the error rate
    :param modelClass: class exposing ``train(data, **kwargs)``; the returned
                       model must offer ``predict(features)`` and ``weights``
    :param kwargs: forwarded unchanged to ``modelClass.train``
    :return: None, everything is written to stdout
    :raises ZeroDivisionError: if validationData is empty
    """
    def toBinaryClass(lp):
        # Same relabeling rule for training and validation data.
        return LabeledPoint(0, lp.features) if lp.label>=1965 else LabeledPoint(1, lp.features)

    print("==============================================")
    print("Classification: %s" % modelClass.__name__)
    if "regType" in kwargs:
        print("Regulization Type: %s" % kwargs["regType"])
    print("==============================================")
    startTime = time.time()
    print("Convert class variable ...")
    trainingData = trainingData.map(toBinaryClass)
    validationData = validationData.map(toBinaryClass)
    print("Training ...")
    model = modelClass.train(trainingData, **kwargs)
    print("Training Complete: ")
    # Cached because two counts are taken from it; without the cache every
    # prediction would be computed twice.
    validationsResult = validationData \
                    .map(lambda lp:(lp.label, model.predict(lp.features))) \
                    .cache()
    #use error rate as measurement
    # Index into the pair instead of using a tuple-parameter lambda, which is
    # Python-2-only syntax.
    wrongCount = validationsResult.filter(lambda pair: pair[0] != pair[1]).count()
    modelErr = wrongCount / float(validationsResult.count())
    validationsResult.unpersist()
    print("\nError Rate: %s " % modelErr)
    print("\n%s Model: " % modelClass.__name__)
    print(model.weights)
    print("\nTotal Run Time: %s seconds" % (time.time() - startTime))
        
# Script entry point: explore the raw data, engineer features, then train and
# evaluate every classifier / regularization combination listed below.
if __name__=="__main__":
    filePath = "file:///ipython/YearPredictionMSD.txt"
    sc = SparkContext(appName="MainContext")
    try:
        # Loading happens inside the try block so that the context is stopped
        # even when reading or indexing the input fails.
        rawData = importRawData(sc, filePath).cache()
        print("File Path: %s" % filePath)
        print("***********************************************")
        print("\nData Explore\n")
        print("***********************************************")
        exploreData(rawData)
        print("***********************************************")
        print("\nFeature Engineering\n")
        print("***********************************************")
        trainingData, validationData = featureEngineering(rawData, topFeature=10)
        # exploreData expects (LabeledPoint, index) pairs, hence zipWithIndex.
        exploreData(trainingData.zipWithIndex())
        print("***********************************************")
        print("\nClassification\n")
        print("***********************************************")
        # list of tuples: (ModelClass, [regularization types])
        classificationModels = [(SVMWithSGD, [None, "l1", "l2"]),
                                (LogisticRegressionWithLBFGS, ["l1", "l2"]),
                                (LogisticRegressionWithSGD, [None, "l1", "l2"])]
        for classificationModel, regTypes in classificationModels:
            for regType_ in regTypes:
                trainClassificationModel(sc, trainingData, validationData, classificationModel, iterations=200, intercept=True, regType=regType_)
            # A model listed without any regularization type is trained once
            # without passing regType.
            if not regTypes:
                trainClassificationModel(sc, trainingData, validationData, classificationModel, intercept=True, iterations=100)
    finally:
        # Errors propagate unchanged; the context is always released.
        sc.stop()

File Path: file:///ipython/YearPredictionMSD.txt
***********************************************

Data Explore

***********************************************
Features:
Format:		[ mean	variance	numNonzeors	max	min ]
Feature 1:	[ 43.3871	36.8153	515345	61.9701	1.7490 ] 
Feature 2:	[ 1.2896	2660.5326	515345	384.0657	-337.0925 ] 
Feature 3:	[ 8.6583	1243.8731	515345	322.8514	-301.0051 ] 
Feature 4:	[ 1.1641	266.4335	515345	335.7718	-154.1836 ] 
Feature 5:	[ -6.5536	522.6155	515345	262.0689	-181.9534 ] 
Feature 6:	[ -9.5220	165.3218	515345	166.2369	-81.7943 ] 
Feature 7:	[ -2.3911	212.3395	515345	172.4027	-188.2140 ] 
Feature 8:	[ -1.7932	63.4225	515345	126.7413	-72.5038 ] 
Feature 9:	[ 3.7279	111.9969	515345	146.2979	-126.4790 ] 
Feature 10:	[ 1.8824	42.6439	515345	60.3454	-41.6317 ] 
Feature 11:	[ -0.1465	19.1043	515343	88.0208	-69.6809 ] 
Feature 12:	[ 2.5461	69.2256	515345	87.9132	-94.0420 ] 
Feature 13:	[ 33.7140	495.4912	515345	549.7649	0.1328 ] 
Feature 14:	[ 2439.3594	3060287.3135