In [2]:
from collections import defaultdict
import numpy as np
from pyspark.ml.linalg import SparseVector
from pyspark.sql.functions import explode
from pyspark import SparkFiles
from pyspark.sql import Row

import bz2
import json
import time
from pyspark.ml import Pipeline
from pyspark.ml.feature import * # CountVectorizer, Tokenizer, RegexTokenizer, HashingTF
from pyspark.ml.regression import * # RandomForestRegressor, LinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
def timeit(method):
    '''
    Decorator that reports how long each call to method takes.

    The wrapper forwards every positional and keyword argument to method,
    prints the function name with the elapsed seconds, and returns whatever
    method returned.

    @params:
        method - the callable to wrap
    '''
    # local import so this cell does not depend on an edit to the import cell
    from functools import wraps

    # wraps() copies __name__ / __doc__ onto the wrapper so a decorated
    # function still identifies as itself (help(), tracebacks, stacked decorators)
    @wraps(method)
    def timed(*args, **kw):
        # perf_counter is monotonic, so the measured interval is not skewed
        # by adjustments to the system clock
        ts = time.perf_counter()
        result = method(*args, **kw)
        te = time.perf_counter()

        print('%r took %2.2f sec\n' % (method.__name__, te-ts))

        return result
    return timed

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
@timeit
def load_data(filename, test=True, mb=1):
    '''
    Returns a DataFrame of the comments read as JSON from filename, with the
    rows whose body is '[deleted]' filtered out.

    @params:
        filename - path handed to spark.read.json
        test - currently unused; kept so existing callers keep working
        mb - currently unused; kept so existing callers keep working
    '''
    # NOTE: the test / mb arguments used to select between two code paths, but
    # both paths were the same spark.read.json call, so the branch was collapsed
    # into a single read. The arguments remain only for backward compatibility.
    df = spark.read.json(filename)

    # return a new DataFrame that doesn't contain deleted comments
    return df.filter("body != '[deleted]'")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Path given to the JSON reader
filename = 's3://dsml-vasu-simar-daniel/RC_2015-0*'

# Build the comment DataFrame
commentDF = load_data(filename, mb=1)

# Preview a handful of columns, then list every column name
preview_columns = ['body', 'ups', 'downs', 'gilded', 'subreddit', 'score']
print("Snippet of Comment DataFrame:")
commentDF.select(*preview_columns).show(5)
print("Column names of comment DataFrame:")
print(commentDF.columns)

# Report how many comments are in the DataFrame
total_comments = commentDF.count()
print("\nThe total number of comments: %s (deleted comments removed)" % total_comments)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'load_data' took 81.39 sec

Snippet of Comment DataFrame:
+--------------------+---+-----+------+--------------+-----+
|                body|ups|downs|gilded|     subreddit|score|
+--------------------+---+-----+------+--------------+-----+
|Most of us have s...| 14|    0|     0|      exmormon|   14|
|But Mill's career...|  3|    0|     0|CanadaPolitics|    3|
|Mine uses a strai...|  1|    0|     0| AdviceAnimals|    1|
|Very fast, thank ...|  2|    0|     0|    freedonuts|    2|
|The guy is a prof...|  6|    0|     0|           WTF|    6|
+--------------------+---+-----+------+--------------+-----+
only showing top 5 rows

Column names of comment DataFrame:
['approved_by', 'archived', 'author', 'author_flair_css_class', 'author_flair_text', 'banned_by', 'body', 'body_html', 'controversiality', 'created', 'created_utc', 'distinguished', 'downs', 'edited', 'gilded', 'id', 'likes', 'link_id', 'mod_reports', 'name', 'num_reports', 'parent_id', 'removal_reason', 'replies', 'report_reasons'

In [6]:
# Narrow the comment DataFrame to four columns and preview the first rows
sentence_columns = ['id', 'ups', 'body', 'score']
sentenceDF = commentDF.select(*sentence_columns)
sentenceDF.show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+---+--------------------+-----+
|     id|ups|                body|score|
+-------+---+--------------------+-----+
|cnas8zv| 14|Most of us have s...|   14|
|cnas8zw|  3|But Mill's career...|    3|
|cnas8zx|  1|Mine uses a strai...|    1|
|cnas8zz|  2|Very fast, thank ...|    2|
|cnas900|  6|The guy is a prof...|    6|
+-------+---+--------------------+-----+
only showing top 5 rows

In [24]:
# Show how many partitions back hashDF.
# NOTE(review): in the visible notebook hashDF is only assigned in a later cell
# (the term_frequency call), so this cell fails on a fresh top-to-bottom run —
# TODO move it below that cell.
hashDF.rdd.getNumPartitions()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

694

In [7]:
# Regex kept for the alternative RegexTokenizer below; the active tokenizer does not use it
pattern = "\\W"
# tokenizer = RegexTokenizer(inputCol="body", outputCol="words", pattern=pattern)

# Split each comment body into a "words" array
tokenizer = Tokenizer(inputCol="body", outputCol="words")
wordsDF = tokenizer.transform(sentenceDF)

# Strip stop words into "filtered_words", then drop body and words,
# which are no longer used
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
kept_columns = ['id', 'ups', 'filtered_words', 'score']
wordsFilteredDF = remover.transform(wordsDF).select(*kept_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
@timeit
def term_frequency(df, inputCol, outputCol, hashFeatures=None):
    '''
    Returns a DataFrame with a new column holding the term-frequency features
    extracted from inputCol.

    @params:
        df - DataFrame
        inputCol - name of input column from DataFrame to find features
        outputCol - name of the column to save the features
        hashFeatures - number of features for HashingTF, if None will perform
            CountVectorization
    '''

    if hashFeatures is None:
        # no feature count was passed, so perform standard CountVectorization;
        # the vocabulary is fitted on the DataFrame that was passed in, not on
        # a notebook-level global, so the function works for any input
        cv = CountVectorizer(inputCol=inputCol, outputCol=outputCol)
        feature_extractor = cv.fit(df)
    else:
        # otherwise hash the terms into a fixed number of features;
        # HashingTF is used directly, without a fit step
        feature_extractor = HashingTF(
            inputCol=inputCol, outputCol=outputCol, numFeatures=hashFeatures)

    # create a new DataFrame using either feature extraction method
    return feature_extractor.transform(df)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Feature-hash the filtered words of every comment.
# A power of two is recommended for the number of hash features.
hashDF = term_frequency(
    df=wordsFilteredDF,
    inputCol="filtered_words",
    outputCol="features",
    hashFeatures=1024,
)

# Preview the token lists next to their hashed feature vectors
hashDF.select('filtered_words', 'features').show(5)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'term_frequency' took 0.05 sec

+--------------------+--------------------+
|      filtered_words|            features|
+--------------------+--------------------+
|[us, family, memb...|(1024,[368,386,45...|
|[mill's, career, ...|(1024,[102,211,22...|
|[mine, uses, stra...|(1024,[112,120,18...|
|[fast,, thank, you!]|(1024,[206,220,36...|
|[guy, professiona...|(1024,[95,358,366...|
+--------------------+--------------------+
only showing top 5 rows

In [16]:
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel, LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
@timeit
def logistic_regression(df, featuresCol, labelCol, seed=None):
    '''
    Returns a DataFrame containing a column of predicted values of the labelCol.
    Keeps the rows whose label is in [0, 10), fits a logistic regression on an
    80% split of them and predicts the label for the remaining 20%.

    @params:
        df - DataFrame
        featuresCol - input features, x
        labelCol - output variable, y
        seed - optional seed for the train/test split; the default None keeps
            the previous unseeded behaviour
    '''
    # row count before filtering
    print(df.count())

    # filter on labelCol instead of a hard-coded "score" column so the
    # function honours its own parameter
    label = df[labelCol]
    df = df.filter((label >= 0) & (label < 10))

    # show() prints the rows itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output
    df.show(10)

    # row count after filtering
    print(df.count())

    # split the training and test data using the holdout method
    (trainingData, testData) = df.randomSplit([0.8, 0.2], seed=seed)

    # Given hyperparameters
    standardization = False
    elastic_net_param = 0.8
    reg_param = .3
    max_iter = 20

    lr = LogisticRegression(
        featuresCol=featuresCol,
        labelCol=labelCol,
        regParam=reg_param,
        standardization=standardization,
        maxIter=max_iter,
        elasticNetParam=elastic_net_param)

    # fit the training data to create the model
    lr_model_basic = lr.fit(trainingData)

    # report the accuracy stored in the fitted model's summary
    trainingSummary = lr_model_basic.summary
    accuracy = trainingSummary.accuracy
    print(accuracy)

    # create a DataFrame contained a column with predicted values of the labelCol
    predictions = lr_model_basic.transform(testData)

    return predictions




VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
@timeit
def random_forest_regression(df, featuresCol, labelCol, seed=None):
    '''
    Returns a DataFrame containing a column of predicted values of the labelCol.
    Predict the output of labelCol using values in featuresCol y = rf(x).
    Only rows whose label is in [0, 10) are used; the model is fitted on an
    80% split of them and predicts the label for the remaining 20%.

    @params:
        df - DataFrame
        featuresCol - input features, x
        labelCol - output variable, y
        seed - optional seed for the train/test split; the default None keeps
            the previous unseeded behaviour
    '''
    # row count before filtering
    print(df.count())

    # filter on labelCol instead of a hard-coded "score" column so the
    # function honours its own parameter
    label = df[labelCol]
    df = df.filter((label >= 0) & (label < 10))

    # show() prints the rows itself and returns None, so it is not wrapped
    # in print()
    df.show(10)

    # row count after filtering
    print(df.count())

    # split the training and test data using the holdout method
    (trainingData, testData) = df.randomSplit([0.8, 0.2], seed=seed)

    # create the random forest regressor; no tree count is passed, so the
    # library default applies
    rf = RandomForestRegressor(featuresCol=featuresCol, labelCol=labelCol)

    # fit the training data to the regressor to create the model
    model = rf.fit(trainingData)

    # create a DataFrame contained a column with predicted values of the labelCol
    predictions = model.transform(testData)

    return predictions

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# train random forest regression
# repartition() returns a new DataFrame rather than changing hashDF in place,
# so its result has to be captured and passed on for the repartition to apply
repartitionedDF = hashDF.repartition(1000)
predictions = random_forest_regression(df=repartitionedDF, featuresCol="features", labelCol="score")

# compute the error
evaluator = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Fit the logistic regression and keep the predictions it returns
predictions = logistic_regression(df=hashDF, featuresCol="features", labelCol="score")

# Score the predicted values against the score column using RMSE
evaluator = RegressionEvaluator(
    labelCol="score",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

147697374
+-------+---+--------------------+-----+--------------------+
|     id|ups|      filtered_words|score|            features|
+-------+---+--------------------+-----+--------------------+
|cnas8zw|  3|[mill's, career, ...|    3|(2048,[234,317,59...|
|cnas8zx|  1|[mine, uses, stra...|    1|(2048,[112,120,18...|
|cnas8zz|  2|[fast,, thank, you!]|    2|(2048,[206,1244,1...|
|cnas900|  6|[guy, professiona...|    6|(2048,[95,358,792...|
|cnas901|  1|[great, question,...|    1|(2048,[8,144,406,...|
|cnas902|  1|[ie-shiv-ghostbla...|    1|(2048,[27,377,396...|
|cnas903|  1|               [:d.]|    1|  (2048,[449],[1.0])|
|cnas905|  2|[know, describe, ...|    2|(2048,[406,637,65...|
|cnas906|  2|           [says, g]|    2|(2048,[34,305],[1...|
|cnas908|  1|      [love, music!]|    1|(2048,[112,1550],...|
+-------+---+--------------------+-----+--------------------+
only showing top 10 rows

None
129748976
0.49816970795908483
'logistic_regression' took 1024.93 sec

Root Mean Squared Err

In [21]:
# Preview the first ten prediction rows.
# NOTE(review): `predictions` is assigned by both the random-forest cell and the
# logistic-regression cell, so what is shown depends on which ran last.
predictions.show(10)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+---+--------------------+-----+--------------------+------------------+
|     id|ups|      filtered_words|score|            features|        prediction|
+-------+---+--------------------+-----+--------------------+------------------+
|cnas8zw|  3|[mill's, career, ...|    3|(1024,[102,211,22...| 2.026020253653934|
|cnas909|  2|[mean, village, h...|    2|(1024,[40,233,289...| 2.026020253653934|
|cnas90e|  1|[haha, awesome, m...|    1|(1024,[342,537,55...| 2.026020253653934|
|cnas90k|  2|["hey, rocky,, wa...|    2|(1024,[19,116,241...| 2.107411685668756|
|cnas90n|  7|[math, prof, ride...|    7|(1024,[27,34,60,1...| 2.082950787227361|
|cnas90s|  2|[thank, you,, cer...|    2|(1024,[362,614,76...|1.8643675997624751|
|cnas90u|  1|           [goonies]|    1|  (1024,[932],[1.0])| 2.026020253653934|
|cnas90y|  2|[thought, wanted,...|    2|(1024,[5,303,312,...| 2.026499951540571|
|cnas917|  2|[[](/hellohuman)m...|    2|(1024,[71,90,139,...| 2.026020253653934|
|cnas927|  3|[this., always,