In [0]:
from collections import defaultdict
import numpy as np
from pyspark.ml.linalg import SparseVector
from pyspark.sql.functions import explode
from pyspark import SparkFiles
from pyspark.sql import Row

import bz2
import json
import time
from pyspark.ml import Pipeline
from pyspark.ml.feature import * # CountVectorizer, Tokenizer, RegexTokenizer, HashingTF
from pyspark.ml.regression import * # RandomForestRegressor, LinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
def timeit(method):
    '''
    Decorator that reports how long each call to ``method`` takes.

    The wrapper forwards all positional and keyword arguments to ``method``,
    prints the function name and the elapsed time in seconds once it returns,
    and hands back the result unchanged. Nothing is printed when ``method``
    raises.

    @params:
        method - callable to time
    @returns:
        wrapper that keeps the name and docstring of ``method``
    '''
    # local import keeps this cell runnable on its own
    import functools

    @functools.wraps(method)  # preserve __name__ / __doc__ of the decorated function
    def timed(*args, **kw):
        # perf_counter is monotonic, so the difference is not affected by
        # system clock adjustments
        ts = time.perf_counter()
        result = method(*args, **kw)
        te = time.perf_counter()

        print('%r took %2.2f sec\n' % (method.__name__, te-ts))

        return result
    return timed

In [0]:
import re

# Every pattern captures, in group 1, the raw text that follows the key.
# Scalar fields are captured lazily up to the next ',' or '}'; for a quoted
# value such as the id, the surrounding quotes are part of the capture.
ID_PATTERN = '"id":(.*?(?=,|}))'
UPS_PATTERN = '"ups":(.*?(?=,|}))'
SCORE_PATTERN = '"score":(.*?(?=,|}))'
# The body is free text that can itself contain ',' or '}', so it is matched
# as one complete quoted string literal: opening quote, then any run of
# non-quote/non-backslash characters or backslash escapes, then closing quote.
BODY_PATTERN = r'"body":\s*("(?:[^"\\]|\\.)*")'

def removeQuotes(s):
    """ Strip every double-quote character out of a string
    Args:
        s (str): text that may contain double-quote characters
    Returns:
        str: the same text with all double-quote characters dropped
    """
    return s.replace('"', '')

def parseDatafileLine(datafileLine):
    """ Parse one line of the data file into a comment tuple
    Args:
        datafileLine (bytes or str): one line of the data file; bytes are decoded as UTF-8
    Returns:
        tuple: ((id, ups, body, score, viralness), 1) when the id, ups, body and
            score fields are all found and ups/score are integers; otherwise
            (datafileLine, -1). The body has its quote characters removed and
            viralness is 1 when score is below -10 or above 10, else 0.
    """
    def invalid():
        # Same report and marker for every way a line can fail to parse.
        print('Invalid datafile line: %s' % datafileLine)
        return (datafileLine, -1)

    # Decode once instead of once per pattern; text input is searched as-is.
    if isinstance(datafileLine, bytes):
        try:
            line = datafileLine.decode('utf-8')
        except UnicodeDecodeError:
            return invalid()
    else:
        line = datafileLine

    matches = [re.search(pattern, line)
               for pattern in (ID_PATTERN, UPS_PATTERN, BODY_PATTERN, SCORE_PATTERN)]
    if any(match is None for match in matches):
        return invalid()
    id_match, ups_match, body_match, score_match = matches

    # A captured value that is not an integer (for example null) marks the
    # line invalid rather than raising inside the Spark task.
    try:
        ups = int(ups_match.group(1))
        score = int(score_match.group(1))
    except ValueError:
        return invalid()

    viralness = 1 if (score < -10 or score > 10) else 0
    # NOTE(review): the id is kept exactly as captured (including any
    # surrounding quotes); only the body goes through removeQuotes.
    comment = (id_match.group(1), ups, removeQuotes(body_match.group(1)), score, viralness)
    return (comment, 1)

In [0]:
import sys
import os
from pyspark import SparkFiles

# Location of the data file handed to loadData.
# NOTE(review): absolute, user-specific upload path -- adjust before running in another workspace.
RC_PATH = '/FileStore/shared_uploads/ddk1@andrew.cmu.edu/RC_2007_10'

def parseData(path):
    """ Read a data file and parse each of its lines
    Args:
        path (str): location of the data file
    Returns:
        RDD: a cached RDD with the result of parseDatafileLine for every line
    """
    # use_unicode=False: lines are not decoded here; parseDatafileLine calls
    # .decode('utf-8') on each line itself.
    raw_lines = sc.textFile(SparkFiles.get(path), minPartitions=4, use_unicode=False)
    parsed_lines = raw_lines.map(parseDatafileLine)
    return parsed_lines.cache()

def loadData(path, seed=None):
    """ Load a data file and return a class-balanced sample of its comments
    Args:
        path (str): input file name of the data file
        seed (int, optional): seed for the down-sampling of non-viral comments;
            None (the default) lets Spark pick one
    Returns:
        RDD: all viral comments plus a random sample of the non-viral comments
            of roughly the same size. Comments whose body is '[deleted]' and
            lines that failed to parse are excluded.
    """
    raw = parseData(path).cache()

    failed = (raw
              .filter(lambda s: s[1] == -1)
              .map(lambda s: s[0]))
    for line in failed.take(10):
        print('%s - Invalid datafile line: %s' % (path, line))

    # Only successfully parsed entries carry a comment tuple; unparsed entries
    # hold the raw line, so the body must not be indexed on them.
    parsed = (raw
              .filter(lambda s: s[1] == 1)
              .map(lambda s: s[0]))

    deleted = parsed.filter(lambda comment: comment[2] == '[deleted]')
    valid = parsed.filter(lambda comment: comment[2] != '[deleted]').cache()

    # Both classes are drawn from `valid` so deleted comments never reach the
    # returned sample.
    viral = valid.filter(lambda comment: comment[4] == 1).cache()
    nonviral = valid.filter(lambda comment: comment[4] == 0).cache()

    viral_count = viral.count()
    nonviral_count = nonviral.count()

    if nonviral_count == 0:
        # nothing to down-sample (and nothing to divide by)
        nonviral_cut = nonviral
    else:
        # sampling without replacement needs a fraction in [0, 1]
        fraction = min(1.0, float(viral_count) / nonviral_count)
        nonviral_cut = nonviral.sample(False, fraction, seed)

    viral_nonviral_cut = viral.union(nonviral_cut)
    print('%s - Read %d lines, successfully parsed %d lines, failed to parse %d lines, %d lines were deleted, %d lines were viral, %d lines were non-viral, %d viral and non-viral lines were returned' % (
        path,
        raw.count(),
        valid.count(),
        failed.count(),
        deleted.count(),
        viral_count,
        nonviral_count,
        viral_nonviral_cut.count()))
    return viral_nonviral_cut

# Parse the data file and keep the RDD returned by loadData for the cells below.
reddit = loadData(RC_PATH)

In [0]:
# Turn the RDD of 5-tuples into a DataFrame and rename the positional
# columns (_1 .. _5) to readable names.
sentenceDF = reddit.toDF().selectExpr("_1 as id", "_2 as ups", "_3 as body", "_4 as score", "_5 as viralness")
# Preview the first five rows.
sentenceDF.show(n=5)

In [0]:
# Tokens are separated by runs of non-word characters.
split_regex = r'\W+'
# Matches the literal backslash-escaped sequence \r\n\r\n inside the text.
linebreak_regex = r'\\r\\n\\r\\n'

def simpleTokenize(string):
    """ Lower-case a string and split it into word tokens
    Args:
        string (str): text to tokenize
    Returns:
        list: the non-empty pieces left after replacing every linebreak_regex
            match with a space, lower-casing, and splitting on split_regex
    """
    text = re.sub(linebreak_regex, " ", string).lower()
    return [piece for piece in re.split(split_regex, text) if piece]

In [0]:
# Fetch the stopword list through Spark's file distribution and load it into
# a local Python set for membership tests in tokenize().
stopfile = "https://raw.githubusercontent.com/10605/data/master/hw1/stopwords.txt"
sc.addFile(stopfile)
# SparkFiles.get gives the path of the added file; the "file://" prefix is
# presumably there so textFile reads it from the local filesystem -- TODO confirm.
stopwords = set(sc.textFile("file://" + SparkFiles.get("stopwords.txt")).collect())
print('These are the stopwords: %s' % stopwords)

def tokenize(string):
    """ Tokenize a string and drop the stopwords
    Args:
        string (str): text to tokenize
    Returns:
        list: the tokens produced by simpleTokenize, in order, minus every
            token that is a member of `stopwords`
    """
    return [token for token in simpleTokenize(string) if token not in stopwords]

In [0]:
# Replace each record's body (index 2) with its stopword-filtered token list;
# the other four fields pass through unchanged.
redditRecToToken = reddit.map(lambda line: (line[0], line[1], tokenize(line[2]),line[3], line[4]))

# Peek at the first five tokenized records.
print(redditRecToToken.take(5))

def countTokens(vendorRDD, tokenIndex=2):
    """ Count and return the total number of tokens over all records
    Args:
        vendorRDD (RDD of tuples): records whose element at `tokenIndex` is the
            list of tokens for that record
        tokenIndex (int): position of the token list inside each record. The
            default of 2 matches the (id, ups, tokens, score, viralness) tuples
            of redditRecToToken; pass 1 for (recordId, tokenizedValue) pairs.
    Returns:
        count: sum of the token-list lengths over all records
    """
    # Measure the token list, not the record id stored at index 0.
    tokensPerRecord = vendorRDD.map(lambda record: len(record[tokenIndex]))
    return tokensPerRecord.sum()

# Report the value countTokens returns for the tokenized records.
totalTokens = countTokens(redditRecToToken)
print('There are %s tokens in the combined datasets' % totalTokens)

In [0]:
# NOTE(review): timeit is also defined in an earlier cell of this notebook;
# this later definition is the one bound when the functions below are decorated.
def timeit(method):
    '''
    Decorator to time functions.

    Calls ``method`` with the given arguments, prints its name and the elapsed
    time in seconds after it returns, and passes the result through unchanged.
    Nothing is printed when ``method`` raises.

    @params:
        method - callable to time
    @returns:
        wrapper that keeps the name and docstring of ``method``
    '''
    # local import keeps this cell runnable on its own
    import functools

    @functools.wraps(method)  # preserve __name__ / __doc__ of the decorated function
    def timed(*args, **kw):
        # monotonic clock: immune to system clock adjustments during the call
        started = time.perf_counter()
        result = method(*args, **kw)
        elapsed = time.perf_counter() - started

        print('%r took %2.2f sec\n' % (method.__name__, elapsed))

        return result
    return timed

In [0]:
@timeit
def term_frequency(df, inputCol, outputCol, hashFeatures=None):
    '''
    Returns the result of applying a term-frequency feature extractor to df,
    reading tokens from inputCol and writing the features to outputCol.
    Passing hashFeatures selects feature hashing instead of a fitted vocabulary.

    @params:
        df - DataFrame
        inputCol - name of the input column to extract features from
        outputCol - name of the column that receives the features
        hashFeatures - number of features for HashingTF; when None a
            CountVectorizer is fitted on df instead
    '''
    if hashFeatures is not None:
        # HashingTF is used directly as a transformer; there is no fit step
        extractor = HashingTF(inputCol=inputCol, outputCol=outputCol, numFeatures=hashFeatures)
    else:
        # no feature count given: fit a CountVectorizer on df
        vectorizer = CountVectorizer(inputCol=inputCol, outputCol=outputCol)
        extractor = vectorizer.fit(df)

    return extractor.transform(df)

In [0]:
# Build a DataFrame from the tokenized records and name its five columns.
wordsFilteredDF = spark.createDataFrame(redditRecToToken).toDF("id", "ups", "filtered_words", "score", "viralness")

# Feature-hash the comment tokens.
# hashFeatures is the number of features for the hashed matrix; a power of 2 is recommended.
hashDF = term_frequency(\
    df=wordsFilteredDF, inputCol="filtered_words", outputCol="features", hashFeatures=1024)

# Display a snippet of the new DataFrame
hashDF.select('filtered_words','features').show(5)

In [0]:
@timeit
def random_forest_regression(df, featuresCol, labelCol, seed=None):
    '''
    Returns the held-out test rows of df with the model's predictions added.
    Fits a random forest regressor predicting labelCol from featuresCol,
    y = rf(x), on a random 80% of df and applies it to the remaining 20%.

    @params:
        df - DataFrame
        featuresCol - input features, x
        labelCol - output variable, y
        seed - optional seed for the train/test split and the forest;
            None (the default) leaves both unseeded
    '''
    # split the training and test data using the holdout method
    trainingData, testData = df.randomSplit([0.8, 0.2], seed=seed)

    # create the random forest regressor; the number of trees is not set
    # here, so the library default applies
    rf_options = {} if seed is None else {'seed': seed}
    rf = RandomForestRegressor(
        featuresCol=featuresCol, labelCol=labelCol, **rf_options)

    # fit the training data to the regressor to create the model
    model = rf.fit(trainingData)

    # apply the fitted model to the held-out rows
    return model.transform(testData)

In [0]:
# train random forest regression with the comment score as the label
rfPredictions = random_forest_regression(df=hashDF,featuresCol="features",labelCol="score")

# compute the root-mean-squared error between the "score" and "prediction" columns
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(rfPredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [0]:
# Preview ten rows of the random forest score predictions.
rfPredictions.show(10)

In [0]:
# train random forest regression with the 0/1 viralness flag as the label
viralPredictions = random_forest_regression(df=hashDF,featuresCol="features",labelCol="viralness")

# compute the root-mean-squared error between the "viralness" and "prediction" columns
evaluator = RegressionEvaluator(labelCol="viralness", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(viralPredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [0]:
# Preview ten rows of the random forest viralness predictions.
viralPredictions.show(10)

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel, LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
@timeit
def logistic_regression(df, featuresCol, labelCol, viralness, seed=None):
    '''
    Returns the held-out test rows of df with the model's predictions added.
    Fits a logistic regression predicting labelCol from featuresCol on a
    random 80% of df and applies it to the remaining 20%. The row counts
    before and after filtering, a ten-row preview, and the training accuracy
    are printed along the way.

    @params:
        df - DataFrame
        featuresCol - input features, x
        labelCol - output variable, y
        viralness - when falsy, df is first restricted to rows with
            0 <= score < 10; when truthy, df is used as given
        seed - optional seed for the train/test split; None (the default)
            leaves the split unseeded
    '''
    print(df.count())
    if not viralness:
        df = df.filter((df.score >= 0) & (df.score < 10))
    # show() prints the preview itself and returns None, so it is not wrapped in print()
    df.show(10)
    print(df.count())

    # split the training and test data using the holdout method
    trainingData, testData = df.randomSplit([0.8, 0.2], seed=seed)

    # Given hyperparameters
    standardization = False
    elastic_net_param = 0.8
    reg_param = .3
    max_iter = 10

    lr = LogisticRegression(
        featuresCol=featuresCol,
        labelCol=labelCol,
        regParam=reg_param,
        standardization=standardization,
        maxIter=max_iter,
        elasticNetParam=elastic_net_param)
    lr_model_basic = lr.fit(trainingData)

    # accuracy on the training data, taken from the model's training summary
    print(lr_model_basic.summary.accuracy)

    # apply the fitted model to the held-out rows
    return lr_model_basic.transform(testData)

In [0]:
# train logistic regression with the comment score as the label
# (viralness=False restricts the rows to 0 <= score < 10 inside logistic_regression)
lrPredictions = logistic_regression(df=hashDF,featuresCol="features",labelCol="score", viralness=False)

# compute the root-mean-squared error between the "score" and "prediction" columns
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(lrPredictions)
print ("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [0]:
# Preview ten rows of the logistic regression score predictions.
lrPredictions.show(10)

In [0]:
# train logistic regression with the 0/1 viralness flag as the label
# (viralness=True skips the score filter inside logistic_regression)
lrViralPredictions = logistic_regression(df=hashDF,featuresCol="features",labelCol="viralness", viralness=True)

# compute the root-mean-squared error between the "viralness" and "prediction" columns
evaluator = RegressionEvaluator(labelCol="viralness", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(lrViralPredictions)
print ("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [0]:
# Preview ten rows of the logistic regression viralness predictions.
lrViralPredictions.show(10)