In [0]:
# /FileStore/tables/beauty4.csv
import os.path
# baseDir = os.path.join('data')
# inputPath = os.path.join('SENG501', 'lab1', 'deerfoot.csv')
fileName = '/FileStore/tables/beauty4.csv'

reviewsRDD = sc.textFile(fileName)
trailed = reviewsRDD.map(lambda x: x.rstrip("'"))
print(trailed.take(1))
print(trailed.count())

["'5','True','09 1, 2016','A3CIUOJXQ5VDQ2','B0000530HU','{''Size:'': '' 7.0 oz'', ''Flavor:'': '' Classic Ice Blue''}','Shelly F','As advertised. Reasonably priced','Five Stars','1472688000','',"]
5269


In [0]:
# TODO: Replace <FILL IN> with appropriate code
def extractFields(reviewsRDD):
    fieldsList = reviewsRDD.split("','")
    print(len(fieldsList))
    rate = fieldsList[0].replace("'","")
    return (rate, fieldsList[7], fieldsList[8])
     
print(extractFields(reviewsRDD.take(1)[0]))

12
('5', 'As advertised. Reasonably priced', 'Five Stars')


In [0]:
reviewsRatingRDD = reviewsRDD.map(extractFields)
print(reviewsRatingRDD.take(3))

[('5', 'As advertised. Reasonably priced', 'Five Stars'), ('5', 'Like the oder and the feel when I put it on my face.  I have tried other brands but the reviews from people I know they prefer the oder of this brand. Not hard on the face when dry.  Does not leave dry skin.', 'Good for the face'), ('1', 'I bought this to smell nice after I shave.  When I put it on I smelled awful.  I am 19 and I smelled like a grandmother with too much perfume.', 'Smells awful')]


### Text Preprocessing and TFIDF
###### Make all text from review-text and summary-text lowercase and remove all punctuation
###### do tfidf
###### ref: <https://spark.apache.org/docs/latest/ml-features.html#tf-idf>

In [0]:
import string
from pyspark.mllib.feature import HashingTF, IDF

# remove punctuation and make everything lowercase

def prepData(rdd):
    r1 = rdd[1].translate(str.maketrans('','',string.punctuation))
    r2 = rdd[2].translate(str.maketrans('','',string.punctuation))
    rev = r2.lower() + ' ' + r1.lower()
    # rev = rev.translate(str.maketrans('','',string.punctuation))
    label = int(rdd[0])
    return (label, rev)


reviewsData = reviewsRatingRDD.map(prepData)
print(reviewsData.take(2))



[(5, 'five stars as advertised reasonably priced'), (5, 'good for the face like the oder and the feel when i put it on my face  i have tried other brands but the reviews from people i know they prefer the oder of this brand not hard on the face when dry  does not leave dry skin')]


In [0]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
reviewsDF = reviewsData.toDF(['Label', 'Text'])
reviewsDF.show(n=5, truncate=True)

tokenizer = Tokenizer(inputCol="Text", outputCol="Words")
words = tokenizer.transform(reviewsDF)
                           
hashingTF = HashingTF(inputCol="Words", outputCol="rawFeatures", numFeatures=1000)
tf = hashingTF.transform(words)
tf.cache()

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(tf)
tfidf = idfModel.transform(tf)




+-----+--------------------+
|Label|                Text|
+-----+--------------------+
|    5|five stars as adv...|
|    5|good for the face...|
|    1|smells awful i bo...|
|    5|truth is there is...|
|    5|bvlgari shampoo i...|
+-----+--------------------+
only showing top 5 rows



In [0]:
tfidf.select("Label", "Text", "features").show(n=5)

+-----+--------------------+--------------------+
|Label|                Text|            features|
+-----+--------------------+--------------------+
|    5|five stars as adv...|(1000,[41,242,306...|
|    5|good for the face...|(1000,[17,34,48,6...|
|    1|smells awful i bo...|(1000,[29,76,322,...|
|    5|truth is there is...|(1000,[4,9,17,44,...|
|    5|bvlgari shampoo i...|(1000,[17,35,115,...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [0]:
featurizedRDD = tfidf.select("label", "features").rdd
print(featurizedRDD.take(1))

[Row(label=5, features=SparseVector(1000, {41: 4.2003, 242: 1.1637, 306: 5.5253, 486: 5.104, 749: 1.8412, 974: 1.2499}))]


In [0]:
# Convert to RDD LabeledPoint
from pyspark.mllib.regression import LabeledPoint

labeledPnt = featurizedRDD.map(lambda x: LabeledPoint(x["label"], x["features"].toArray()))

## Random Forest Model
##### ref: <https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.mllib.tree.RandomForest.html#pyspark.mllib.tree.RandomForest>

#### Splitting Data

In [0]:
weights = [0.6, 0.2, 0.2]
seed = 20
trainData, valData, testData = labeledPnt.randomSplit(weights, seed)

trainData.cache()
valData.cache()
testData.cache()

nTrain = trainData.count()
nVal = valData.count()
nTest = testData.count()

print(nTrain, nVal, nTest, nTrain + nVal + nTest)

3153 1026 1090 5269


### Start training model
##### ref <https://spark.apache.org/docs/latest/mllib-ensembles.html#classification>

In [0]:
from pyspark.mllib.tree import RandomForest, RandomForestModel

model = RandomForest.trainClassifier(trainData, numClasses=6, categoricalFeaturesInfo={}, numTrees=10, maxDepth=5)
model2 = RandomForest.trainClassifier(trainData, numClasses=6, categoricalFeaturesInfo={}, numTrees=10, maxDepth=25)
model3 = RandomForest.trainClassifier(trainData, numClasses=6, categoricalFeaturesInfo={}, numTrees=50, maxDepth=5)
model4 = RandomForest.trainClassifier(trainData, numClasses=6, categoricalFeaturesInfo={}, numTrees=20, maxDepth=30)


#### Predicting Labels and Evaluating Models

In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics

def printMetrics(metrics, m_id):
    print()
    print("--------------------------------------------------------------")    
    print("Performance Metrics for Model " + str(m_id))
    print()
    print("Accuracy: %.2f" % (metrics.accuracy))
    for i in range(1, 6):
        print("Label " + str(i))
        print("Precision: %.2f" % (metrics.precision(i)))
        print("Recall: %.2f" % (metrics.recall(i)))
       # print("F-Score: %.2f%%" % (metrics.fMeasure(i,beta=1.0) * 100))
        print()

#evaluation
pred = model.predict(valData.map(lambda x: x.features))
y_labels = valData.map(lambda x: x.label)
predAndLab = pred.zip(y_labels)
valMetrics = MulticlassMetrics(predAndLab)
printMetrics(valMetrics, 1)

pred = model2.predict(valData.map(lambda x: x.features))
y_labels = valData.map(lambda x: x.label)
predAndLab = pred.zip(y_labels)
valMetrics = MulticlassMetrics(predAndLab)
printMetrics(valMetrics, 2)


pred = model3.predict(valData.map(lambda x: x.features))
y_labels = valData.map(lambda x: x.label)
predAndLab = pred.zip(y_labels)
valMetrics = MulticlassMetrics(predAndLab)
printMetrics(valMetrics, 3)

pred = model4.predict(valData.map(lambda x: x.features))
y_labels = valData.map(lambda x: x.label)
predAndLab = pred.zip(y_labels)
valMetrics = MulticlassMetrics(predAndLab)
printMetrics(valMetrics, 4)





--------------------------------------------------------------
Performance Metrics for Model 1

Accuracy: 0.88
Label 1
Precision: 0.00
Recall: 0.00

Label 2
Precision: 0.00
Recall: 0.00

Label 3
Precision: 1.00
Recall: 0.10

Label 4
Precision: 0.00
Recall: 0.00

Label 5
Precision: 0.88
Recall: 1.00


--------------------------------------------------------------
Performance Metrics for Model 2

Accuracy: 0.95
Label 1
Precision: 1.00
Recall: 0.83

Label 2
Precision: 1.00
Recall: 0.64

Label 3
Precision: 0.75
Recall: 0.29

Label 4
Precision: 0.95
Recall: 0.59

Label 5
Precision: 0.95
Recall: 1.00


--------------------------------------------------------------
Performance Metrics for Model 3

Accuracy: 0.88
Label 1
Precision: 0.00
Recall: 0.00

Label 2
Precision: 0.00
Recall: 0.00

Label 3
Precision: 1.00
Recall: 0.05

Label 4
Precision: 0.00
Recall: 0.00

Label 5
Precision: 0.88
Recall: 1.00


--------------------------------------------------------------
Performance Metrics for Model 

### Use Optimal Model for Testing

In [0]:
test_labels = testData.map(lambda x: x.label)
testPred = model4.predict(testData.map(lambda x: x.features))
predAndLab = testPred.zip(test_labels)
testMetrics = MulticlassMetrics(predAndLab)
print("test data performance")
printMetrics(testMetrics, 4)

test data performance

--------------------------------------------------------------
Performance Metrics for Model 4

Accuracy: 0.95
Label 1
Precision: 1.00
Recall: 0.73

Label 2
Precision: 1.00
Recall: 0.57

Label 3
Precision: 1.00
Recall: 0.58

Label 4
Precision: 0.96
Recall: 0.62

Label 5
Precision: 0.95
Recall: 1.00

