In [2]:
import findspark
findspark.init("/usr/local/spark")

from pyspark.sql import SparkSession 
import pyspark.sql.functions as F 
from pyspark.ml import Pipeline
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
from delta import *

# Import Spark NLP
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

# Session builder with the Delta Lake SQL extension and catalog enabled.
# Written as a parenthesised chain so the optional cluster settings can stay
# commented out without a backslash continuation running into a comment line.
builder = (
    SparkSession.builder.appName("Sentiment Analysis")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # .master("spark://namenode:7077")
    # .config("spark.executor.instances", "1")
    # .config("spark.executor.cores", "2")
)

# configure_spark_with_delta_pip() sets "spark.jars.packages" itself, which
# replaces any value configured on the builder. The Spark NLP coordinate is
# therefore passed through `extra_packages` so both Delta and Spark NLP jars
# are requested.
spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=["com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.0"],
).getOrCreate()


# Load the dataset

In [3]:
# Read the filtered reviews stored as a Delta table.
df = spark.read.load("/temp/filtered_reviews", format="delta")

                                                                                

In [3]:
# Preview the first rows (20 rows, long values truncated - the defaults).
df.show(n=20, truncate=True)



+--------------------+--------------------+--------------------+-------------------+--------------------+-----+
|           review_id|         business_id|                text|               date|          categories|stars|
+--------------------+--------------------+--------------------+-------------------+--------------------+-----+
|WI2cpA5VSEgGE5Pl9...|pym7c6ZFEtmoH16xN...|We saw the review...|2019-10-04 11:50:29|Restaurants, Bars...|  4.0|
|JMm_2beO-LpbNc4r6...|Ei5HBqe012ImhqEr2...|This is a delight...|2019-09-08 21:52:26|Italian, Restaura...|  5.0|
|bWk_7CwRfQpdkZMMM...|UmjITdXHhEF46ho6I...|This place is FUC...|2019-04-14 04:48:15|Adult Entertainme...|  1.0|
|bBqyHGJpbjp68gmmP...|hy5GpGXAna-5qrb3z...|Literally disgust...|2019-05-20 22:54:11|Mexican, Restaura...|  1.0|
|SH_DWH_hzRBTc2TRK...|vN6v8m4DO45Z4pp8y...|Surrey's is great...|2013-01-07 02:00:05|Vegetarian, Resta...|  4.0|
|eT1QJGwbjVbRiElo4...|g_nLH7QGP3_l1eE-7...|Oh how yummy and ...|2017-08-11 01:34:23|Seafood, Specialt...

                                                                                

# Building the pipeline
## Create a document assembler

In [3]:
# Stage 1: reads the raw "text" column and writes a "document" column.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

## Create a tokenizer

In [4]:
# Stage 2: reads "document" and writes "token".
# The tokenizer is fitted here on df, so `token` holds the fitted result.
token = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
    .fit(df)
)

## Create a normalizer

In [5]:
# Stage 3: reads "token" and writes the normalized tokens to "normal".
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

## Load a pretrained sentiment analysis model

In [7]:
# Stage 4: default pretrained Vivekn sentiment model; reads "document" and
# "normal" and writes its annotations to "result_sentiment".
vivekn = (
    ViveknSentimentModel.pretrained()
    .setInputCols(["document", "normal"])
    .setOutputCol("result_sentiment")
)

sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
[ | ]sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
[ / ]Download done! Loading the resource.
[OK!]


## Create a finisher

In [6]:
# Stage 5: exposes the "result_sentiment" annotations as the plain
# "final_sentiment" column.
finisher = (
    Finisher()
    .setInputCols(["result_sentiment"])
    .setOutputCols("final_sentiment")
)

## Create the pipeline

In [7]:
# Chain the stages in the order their input/output columns depend on each other.
pipeline = Pipeline(
    stages=[
        document,
        token,
        normalizer,
        vivekn,
        finisher,
    ]
)

# Create the data as a Spark dataframe


In [10]:
# Two hand-written example sentences in a single "text" column.
# NOTE(review): the cells shown here fit and transform `df`, not `data`.
data = spark.createDataFrame(
    [
        ("I recommend this movie",),
        ("Dont waste your time!!!",),
    ],
    ["text"],
)

# Fit the pipeline to the data


In [11]:
# Fit the pipeline on the review DataFrame to obtain a reusable fitted model.
pipelineModel = pipeline.fit(dataset=df)

In [12]:
# Run the fitted pipeline over the reviews; the output carries "final_sentiment".
result = pipelineModel.transform(dataset=df)

In [13]:
# Preview the predictions (20 rows, long values truncated - the defaults).
result.show(n=20, truncate=True)



+--------------------+--------------------+--------------------+-------------------+--------------------+-----+---------------+
|           review_id|         business_id|                text|               date|          categories|stars|final_sentiment|
+--------------------+--------------------+--------------------+-------------------+--------------------+-----+---------------+
|WI2cpA5VSEgGE5Pl9...|pym7c6ZFEtmoH16xN...|We saw the review...|2019-10-04 11:50:29|Restaurants, Bars...|  4.0|     [negative]|
|JMm_2beO-LpbNc4r6...|Ei5HBqe012ImhqEr2...|This is a delight...|2019-09-08 21:52:26|Italian, Restaura...|  5.0|     [negative]|
|bWk_7CwRfQpdkZMMM...|UmjITdXHhEF46ho6I...|This place is FUC...|2019-04-14 04:48:15|Adult Entertainme...|  1.0|     [positive]|
|bBqyHGJpbjp68gmmP...|hy5GpGXAna-5qrb3z...|Literally disgust...|2019-05-20 22:54:11|Mexican, Restaura...|  1.0|     [positive]|
|SH_DWH_hzRBTc2TRK...|vN6v8m4DO45Z4pp8y...|Surrey's is great...|2013-01-07 02:00:05|Vegetarian, Resta...

                                                                                

In [7]:
# Flag each review with 1 when the predicted sentiment agrees with the star
# rating and 0 otherwise. A 3.0-star review is accepted for either label.
result = result.withColumn(
    "right_prediction",
    F.when(
        F.array_contains(F.col("final_sentiment"), "positive")
        & F.col("stars").isin("5.0", "4.0", "3.0"),
        1,
    )
    .when(
        F.array_contains(F.col("final_sentiment"), "negative")
        & F.col("stars").isin("3.0", "2.0", "1.0"),
        1,
    )
    .otherwise(0),
)

NameError: name 'result' is not defined

In [15]:
# Total number of rows flagged as correctly predicted (sum of the 0/1 flags).
count_ones = result.agg(F.sum("right_prediction")).first()[0]

ERROR:root:KeyboardInterrupt while sending command.               (0 + 18) / 20]
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Display the sum of the right_prediction flags computed above.
count_ones

3227628

In [None]:
# Number of rows in df; used as the denominator of the ratio below.
total_count = df.count()

In [None]:
# Share of rows whose right_prediction flag is 1.
count_ones / total_count

0.6831727932118494

In [None]:
# Inspect a few reviews where prediction and star rating disagree.
# "right_prediction" is added to `result` (the pipeline output), not to the
# raw `df`, so the filter must run on `result` to work from a fresh kernel.
result.where(F.col("right_prediction") == 0).take(5)

                                                                                

[Row(review_id='p8Tlu_gaub7KoBEleGg2fA', business_id='f8ss-XDpkdwwbQf0-V6JCA', text='Very authentic Chinese food, and great happy hour specials. Dined here probably around 10 times and already looking forward to going back. Standouts include crab ragoon, the whole fried fish, garlic clams and their pea tips.', date='2020-09-20 20:23:35', categories='Cantonese, Restaurants, Nightlife, Cajun\\/Creole, Asian Fusion, Seafood, Chinese, Bars, Wine Bars', stars='4.0', final_sentiment=['negative'], right_prediction=0),
 Row(review_id='_ZNA4BU4HPke6N-g1tmuyA', business_id='X5pLH_HQG0ckmsqkjDxEHA', text='Giving 4 stars only because the whole grouper dinner meal was undercooked! However, service, tuna trio, fish tacos and the kids meal were all great!', date='2021-08-11 20:56:19', categories='American (Traditional), Nightlife, Bars, Restaurants, Seafood', stars='4.0', final_sentiment=['negative'], right_prediction=0),
 Row(review_id='qw29d27PzPs66Aqk2UghFA', business_id='uR7G8I4Cef9D9R340TN24Q', 

# Improve this by training our own Vivekn model

In [8]:
# 80/20 train/test split with a fixed seed.
trainDF, testDF = df.randomSplit(weights=[0.8, 0.2], seed=42)

In [9]:
# Drop the 3.0-star rows from both splits.
testDF = testDF.filter(F.col("stars") != "3.0")

# Label the remaining training rows from their star rating:
# 5.0/4.0 -> "positive", 1.0/2.0 -> "negative", anything else -> null.
trainDF = (
    trainDF
    .filter(F.col("stars") != "3.0")
    .withColumn(
        "train_sentiment",
        F.when(F.col("stars").isin("5.0", "4.0"), "positive")
        .when(F.col("stars").isin("1.0", "2.0"), "negative")
        .otherwise(None),
    )
)

In [10]:
# Trainable Vivekn sentiment stage: same input/output columns as the pretrained
# stage, with "train_sentiment" configured as the sentiment column.
train_vivekn = ViveknSentimentApproach() \
    .setInputCols(["document", "normal"]) \
    .setOutputCol("result_sentiment") \
    .setSentimentCol("train_sentiment")

In [11]:
# Same stage layout as the earlier pipeline, with the trainable sentiment stage
# in place of the pretrained one.
training_pipeline = Pipeline(
    stages=[
        document,
        token,
        normalizer,
        train_vivekn,
        finisher,
    ]
)

In [12]:
# Preview the held-out split (20 rows, long values truncated - the defaults).
testDF.show(n=20, truncate=True)

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+--------------------+--------------------+-------------------+--------------------+-----+
|           review_id|         business_id|                text|               date|          categories|stars|
+--------------------+--------------------+--------------------+-------------------+--------------------+-----+
|--2KOIf3Rg7qDqpaY...|1Vo4BLw75ntATAJHY...|"Great view, that...|2014-07-30 18:25:20|Restaurants, Musi...|  1.0|
|--4INAzazK6omgf3m...|vz0oI7GR9AOtmJpO5...|Although my pork ...|2011-06-02 19:47:41|Irish, Pubs, Amer...|  4.0|
|--4fzr2yfPWIvZP9X...|ZAnLKdfQhX0pj3cwo...|I am not sure if ...|2015-07-19 12:59:20|Breakfast & Brunc...|  1.0|
|--DogY-DxnpGI6HgI...|IWHdx0NhDKADkGOgX...|First time I went...|2012-01-17 22:37:38|Restaurants, Amer...|  5.0|
|--Esto4G8tuwgW1ZM...|tsOEt8v3chHwL1rZi...|Located less than...|2018-03-04 21:00:55|Vietnamese, Resta...|  4.0|
|--KsMF-3s2D0Lxwu8...|uwoXUyqzKAiiM6DW1...|Go somewhere else...|2014-07-14 14:50:56|Sports Bars, Amer...

                                                                                

In [13]:
# Train on the labelled split.
# NOTE(review): this rebinds `pipeline`, which an earlier cell set to the
# unfitted Pipeline; if this fit is interrupted the old object stays bound.
pipeline = training_pipeline.fit(dataset=trainDF)

[Stage 11:>                                                       (0 + 18) / 20]

12:28:57.273 [dispatcher-CoarseGrainedScheduler] ERROR org.apache.spark.scheduler.TaskSchedulerImpl - Lost executor 2 on 192.168.1.138: Executor heartbeat timed out after 135563 ms
12:28:57.332 [dispatcher-CoarseGrainedScheduler] ERROR org.apache.spark.scheduler.TaskSchedulerImpl - Lost executor 1 on 192.168.1.192: Executor heartbeat timed out after 139810 ms
12:28:57.334 [dispatcher-CoarseGrainedScheduler] ERROR org.apache.spark.scheduler.TaskSchedulerImpl - Lost executor 0 on 192.168.1.59: Executor heartbeat timed out after 141939 ms


[Stage 11:>                                                       (0 + 18) / 20]

12:29:21.167 [rpc-server-4-1] ERROR org.apache.spark.network.server.TransportRequestHandler - Error while invoking RpcHandler#receive() on RPC id 7591011196615457716
java.io.OptionalDataException: null
	at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1692) ~[?:1.8.0_362]
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:503) ~[?:1.8.0_362]
	at java.io.ObjectInputStream.readObject(ObjectInputStream.java:461) ~[?:1.8.0_362]
	at scala.collection.mutable.HashMap.$anonfun$readObject$1(HashMap.scala:195) ~[scala-library-2.12.15.jar:?]
	at scala.collection.mutable.HashTable.init(HashTable.scala:110) ~[scala-library-2.12.15.jar:?]
	at scala.collection.mutable.HashTable.init$(HashTable.scala:89) ~[scala-library-2.12.15.jar:?]
	at scala.collection.mutable.HashMap.init(HashMap.scala:44) ~[scala-library-2.12.15.jar:?]
	at scala.collection.mutable.HashMap.readObject(HashMap.scala:195) ~[scala-library-2.12.15.jar:?]
	at sun.reflect.GeneratedMethodAccessor9.invoke(Unknow

ERROR:root:KeyboardInterrupt while sending command.               (0 + 18) / 20]
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

12:31:57.223 [dispatcher-CoarseGrainedScheduler] ERROR org.apache.spark.scheduler.TaskSchedulerImpl - Lost executor 5 on 192.168.1.59: Executor heartbeat timed out after 173899 ms
12:31:57.240 [dispatcher-CoarseGrainedScheduler] ERROR org.apache.spark.scheduler.TaskSchedulerImpl - Lost executor 4 on 192.168.1.192: Executor heartbeat timed out after 173842 ms
12:31:57.248 [dispatcher-CoarseGrainedScheduler] ERROR org.apache.spark.scheduler.TaskSchedulerImpl - Lost executor 3 on 192.168.1.138: Executor heartbeat timed out after 162549 ms


[Stage 11:>                                                       (0 + 18) / 20]

In [None]:
# Apply `pipeline` to the held-out split. This needs the fitted model from the
# training cell; an unfitted Pipeline has no transform().
result = pipeline.transform(dataset=testDF)

AttributeError: 'Pipeline' object has no attribute 'transform'

# Other language model:

In [4]:
# Reads the raw "text" column and writes a "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained Universal Sentence Encoder ('tfhub_use', English); reads
# "document" and writes "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained(name='tfhub_use', lang="en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# classifier = SentimentDLModel().pretrained('sentimentdl_use_twitter')\
#     .setInputCols(["sentence_embeddings"])\
#     .setOutputCol("sentiment")

#nlp_pipeline = Pipeline(stages=[document_assembler,use,classifier])

#l_model = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text")))


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ / ]Download done! Loading the resource.
[ \ ]

2023-04-17 15:19:06.830330: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[ — ]

2023-04-17 15:19:13.974786: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory.


[ \ ]

2023-04-17 15:19:16.305972: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 60236800 exceeds 10% of free system memory.
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


[OK!]


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


Py4JError: An error occurred while calling z:com.johnsnowlabs.nlp.pretrained.PythonResourceDownloader.downloadModel

In [None]:
# Pretrained SentimentDL classifier; reads "sentence_embeddings" and writes
# "sentiment". `pretrained` is called on the class, as for the other annotators
# in this notebook, instead of on a throwaway SentimentDLModel() instance.
classifier = SentimentDLModel.pretrained('sentimentdl_use_twitter')\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

nlp_pipeline = Pipeline(stages=[document_assembler, use, classifier])

# Fit on a one-row placeholder DataFrame with an empty "text" value, then wrap
# the fitted model in a LightPipeline.
l_model = LightPipeline(nlp_pipeline.fit(spark.createDataFrame([['']]).toDF("text")))

In [8]:
# Annotate two example tweets with the light pipeline.
annotations = l_model.fullAnnotate(
    [
        "im meeting up with one of my besties tonight! Cant wait!!  - GIRL TALK!!",
        "is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
    ]
)


NameError: name 'l_model' is not defined

In [5]:
# Load the pretrained English "movies_sentiment_analysis" pipeline and annotate
# a single sentence with it.
# NOTE(review): this rebinds both `pipeline` and `result` from earlier cells.
pipeline = PretrainedPipeline(name="movies_sentiment_analysis", lang="en")

result = pipeline.annotate("I love johnsnowlabs!  ")

movies_sentiment_analysis download started this may take some time.


ConnectionRefusedError: [Errno 111] Connection refused