# Big Data Project 20211

In [3]:
!pip install -r requirements.txt

Collecting spark-nlp==3.0.1
  Downloading spark_nlp-3.0.1-py2.py3-none-any.whl (146 kB)
     |████████████████████████████████| 146 kB 1.5 MB/s            
[?25hCollecting numpy==1.21.4
  Downloading numpy-1.21.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
     |████████████████████████████████| 15.7 MB 1.5 MB/s            
[?25hInstalling collected packages: spark-nlp, numpy
Successfully installed numpy-1.21.4 spark-nlp-3.0.1


In [4]:
ifrom pyspark.sql import SparkSession

spark = SparkSession.\
        builder.\
        appName("pyspark-notebook").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        config("spark.driver.memory","16G").\
        config("spark.driver.maxResultSize", "0").\
        config("spark.kryoserializer.buffer.max", "2000M").\
        config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.0.1").\
        getOrCreate()

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0a97ba91-6af8-4c0e-8fc7-595376315e45;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.0.1 in central
	found com.typesafe#config;1.3.0 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b7abb58b-1a95-43bd-a801-cb3512640ab4;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.0.1 in central
	found com.typesafe#config;1.3.0 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3

In [2]:
spark

In [4]:
spark.version

'3.0.0'

In [5]:
# dataset = spark.read \
#       .option("header", True) \
#       .json("data/Prime_Pantry.json")
dataset = spark.read \
      .option("header", True) \
      .option("inferSchema", True) \
      .option('quote', '"') \
      .option('escape', '"') \
      .csv("hdfs://namenode:9000/data/input/IMDB.csv")
      

                                                                                

In [6]:
dataset = dataset[['overall', 'reviewText']]
dataset.show(5)

+--------+--------------------+
| overall|          reviewText|
+--------+--------------------+
|positive|One of the other ...|
|positive|A wonderful littl...|
|positive|I thought this wa...|
|negative|Basically there's...|
|positive|Petter Mattei's "...|
+--------+--------------------+
only showing top 5 rows



In [5]:
# filter rows with include positive and negative reviews
dataset = dataset.filter(dataset.overall.isin(["positive", "negative"]))
# count positive and negative reviews
dataset.groupBy('overall').count().show()

                                                                                

+--------+-----+
| overall|count|
+--------+-----+
|positive|25000|
|negative|25000|
+--------+-----+



                                                                                

In [11]:
from pyspark.sql import functions as F
# apply overall > 3.0 to positive reviews, else negative
dataset = dataset.withColumn('label', F.when(F.col('overall') > 3.0, 'positive').otherwise('negative'))
dataset.show(5)
# count positive and negative reviews
dataset.groupBy('label').count().show()

+-------+--------------------+-----+
|overall|          reviewText|label|
+-------+--------------------+-----+
|    5.0|       Good clinging|    1|
|    4.0|Fantastic buy and...|    1|
|    4.0|                  ok|    1|
|    3.0|Saran Cling Plus ...|    0|
|    4.0|This is my go to ...|    1|
+-------+--------------------+-----+
only showing top 5 rows



In [4]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
# stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "overall", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(dataset)
dataset = pipelineFit.transform(dataset)
dataset.show(5)

                                                                                

+--------+--------------------+--------------------+--------------------+--------------------+-----+
| overall|          reviewText|               words|            filtered|            features|label|
+--------+--------------------+--------------------+--------------------+--------------------+-----+
|positive|One of the other ...|[one, of, the, ot...|[one, of, other, ...|(10000,[0,1,2,3,4...|  1.0|
|positive|A wonderful littl...|[a, wonderful, li...|[a, wonderful, li...|(10000,[0,1,2,3,4...|  1.0|
|positive|I thought this wa...|[i, thought, this...|[i, thought, this...|(10000,[0,1,2,3,4...|  1.0|
|negative|Basically there's...|[basically, there...|[basically, there...|(10000,[0,1,2,3,4...|  0.0|
|positive|Petter Mattei's "...|[petter, mattei, ...|[petter, mattei, ...|(10000,[0,1,2,3,4...|  1.0|
+--------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [6]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.5, 0.5], seed = 100)
# print("Training Dataset Count: " + str(trainingData.count()))
# print("Test Dataset Count: " + str(testData.count()))

In [7]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

22/01/01 05:19:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/01/01 05:19:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [8]:

predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("reviewText","overall","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)



+------------------------------+--------+------------------------------+-----+----------+
|                    reviewText| overall|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|All I could think while wat...|negative|[0.9999986788311754,1.32116...|  0.0|       0.0|
|Pier Paolo Pasolini, or Pee...|negative|[0.9999981302138441,1.86978...|  0.0|       0.0|
|Zombi 3 starts as a group o...|negative|[0.9999977403304414,2.25966...|  0.0|       0.0|
|Plankton, or Creatures from...|negative|[0.9999933154370028,6.68456...|  0.0|       0.0|
|Serum starts as Eddie (Dere...|negative|[0.9999905481682364,9.45183...|  0.0|       0.0|
|Interferencia starts as une...|negative|[0.9999892957177893,1.07042...|  0.0|       0.0|
|...the first? Killjoy 1. Bu...|negative|[0.9999854888488539,1.45111...|  0.0|       0.0|
|Waitress: Honey, here's the...|negative|[0.9999851843852937,1.48156...|  0.0|       0.0|
|Holy crap

                                                                                

## Evaluate Model

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

                                                                                

0.8942080954580265

In [11]:
''' Close Spark '''
# spark.close()

' Close Spark '