In [32]:
from pyspark.sql import SparkSession

In [33]:
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [34]:
spark = SparkSession.builder.appName("Sentiment Analysis_Remote") \
    .master('spark://0.0.0.0:7077') \
    .config("spark.executor.resource.gpu.amount", "1") \
    .config("spark.rapids.sql.enabled", "true") \
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin") \
    .config("spark.executor.memory", "25g")\
    .config("spark.shuffle.service.enabled", "false")\
    .config("spark.dynamicAllocation.enabled", "false")\
    .getOrCreate()

In [35]:
spark

In [36]:
df = spark.read.csv("datasets/sentiment_data_trim.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Review: string (nullable = true)
 |-- Summary: string (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [37]:
df = df.drop("_c0")
# df.show()

In [38]:
from pyspark.sql.functions import concat_ws
df = df.withColumn("Reviews", concat_ws(" ", df.Review, df.Summary))
df = df.drop("Review", "Summary")

In [39]:
df = df.na.drop()

In [40]:
# from pyspark.sql.types import IntegerType
# df = df.withColumn("Sentiment", df["Sentiment"].cast(IntegerType()))

In [41]:
# function to remove special characters and numbers from the reviews
import re
from pyspark.sql.functions import udf
def removeSpecChar(raw_text):
    print(type(raw_text))
    clean_SpecialChar = re.sub("[^a-zA-Z]", " ", raw_text)  
    return clean_SpecialChar
removeSpecialChar = udf(removeSpecChar)

In [42]:
from pyspark.sql.functions import lower
df = df.withColumn("Reviews", removeSpecialChar(lower(df["Reviews"])))
# df.show(n=2)

In [43]:
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="Reviews", outputCol="reviewtoken")
df = tokenizer.transform(df)
# df.show(n=2)

In [44]:
from pyspark.ml.feature import StopWordsRemover
from nltk.corpus import stopwords
remover = StopWordsRemover(inputCol="reviewtoken", outputCol="reviewtokenfiltered", stopWords=stopwords.words("english"))
df = remover.transform(df)
# df.show(n=2)

In [45]:
df.printSchema()

root
 |-- Sentiment: integer (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- reviewtoken: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- reviewtokenfiltered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [46]:
from nltk.stem import WordNetLemmatizer
def get_wordnet_pos(tag):
    from nltk.corpus import wordnet
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else: 
        return wordnet.NOUN

In [47]:
lemmatizer = WordNetLemmatizer()
def lemmatizeScalar(sentance):
    from nltk.tag import pos_tag
    print(sentance)
    tagged = pos_tag( [i for i in sentance if i])
    lemmatized = []
    for word, tag in tagged:
        lemma = lemmatizer.lemmatize(word, pos = get_wordnet_pos(tag))
        lemmatized.append(lemma)
    return lemmatized
lemmatize = udf(lemmatizeScalar)

In [48]:
df = df.withColumn("reviewtokenfiltered2", lemmatize(df.reviewtokenfiltered))
# dfnew.show()

In [49]:
from pyspark.sql.functions import expr
df = df.withColumn('reviewtokenfiltered2', expr(r"regexp_extract_all(reviewtokenfiltered2, '(\\w+)', 1)"))


In [50]:
df = df.drop("reviewtoken", "Reviews","reviewtokenfiltered")
df = df.withColumnRenamed("reviewtokenfiltered2", "reviewtokenfiltered")
# df.show(n=2)

In [51]:
df.printSchema()

root
 |-- Sentiment: integer (nullable = true)
 |-- reviewtokenfiltered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [None]:
split = df.randomSplit([0.6,0.2,0.2], seed=26)

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
hashingTF = HashingTF(inputCol="reviewtokenfiltered", outputCol="termFrequency", numFeatures=37120)
split[0] = hashingTF.transform(split[0])
# split[0].show(n=2)

idf = IDF(inputCol="termFrequency", outputCol="features", )
idfModel = idf.fit(split[0])
split[0] = idfModel.transform(split[0])

In [None]:
idfModel

IDFModel: uid=IDF_cf19aebc35ba, numDocs=174944, numFeatures=37120

In [None]:
split[1] = hashingTF.transform(split[1])
split[2] = hashingTF.transform(split[2])
# split[1].show(n=2)

split[1] = idfModel.transform(split[1])
split[2] = idfModel.transform(split[2])

In [None]:
split[1].show()

+---------+--------------------+--------------------+--------------------+
|Sentiment| reviewtokenfiltered|       termFrequency|            features|
+---------+--------------------+--------------------+--------------------+
|        0|[absolute, rubbis...|(37120,[6205,1763...|(37120,[6205,1763...|
|        0|[absolute, rubbis...|(37120,[6205,1763...|(37120,[6205,1763...|
|        0|[absolute, rubbis...|(37120,[6205,1763...|(37120,[6205,1763...|
|        0|[absolute, rubbis...|(37120,[6205,1763...|(37120,[6205,1763...|
|        0|[absolute, rubbis...|(37120,[6205,1763...|(37120,[6205,1763...|
|        0|[absolute, rubbis...|(37120,[6205,1763...|(37120,[6205,1763...|
|        0|[absolute, rubbis...|(37120,[3267,4322...|(37120,[3267,4322...|
|        0|[absolute, rubbis...|(37120,[6205,1763...|(37120,[6205,1763...|
|        0|[absolute, rubbis...|(37120,[5181,6205...|(37120,[5181,6205...|
|        0|[absolute, rubbis...|(37120,[6205,1396...|(37120,[6205,1396...|
|        0|[absolute, rub

In [None]:
train = split[0].select('features', 'sentiment')
test = split[1].select('features', 'sentiment')
validation = split[2].select('features', 'sentiment')

train = train.withColumnRenamed('sentiment', 'label')
test = test.withColumnRenamed('sentiment', 'label')
validation = validation.withColumnRenamed('sentiment', 'label')

# train.show(n=2)

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

lrModel = lr.fit(train)

LogisticRegressionModel.save(lrModel, "LinearRegression_Spark2.model")

In [None]:
lrtest = lrModel.transform(test)
lrtest.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(37120,[6205,1763...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[6205,1763...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[6205,1763...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[6205,1763...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[6205,1763...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[6205,1763...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[3267,4322...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[6205,1763...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[5181,6205...|    0|[-1.2923707818879...|[0.21545180171940...|       1.0|
|(37120,[6205,13

In [59]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(lrtest)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.6916396150555925
Test Error = 0.30836038494440754


In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F

preds_and_labels = lrtest.select(['prediction','label']).withColumn('label', F.col('label').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','label'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())



[[    0. 12437.]
 [    0. 45644.]]


In [60]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LinearSVCModel

In [61]:
svm = LinearSVC()

In [62]:
svmModel = svm.fit(train)

In [63]:
svmtest = svmModel.transform(train)

In [64]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(svmtest)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.9753874348851735
Test Error = 0.024612565114826457


In [65]:
preds_and_labels = svmtest.select(['prediction','label']).withColumn('label', F.col('label').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','label'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())



[[ 34940.   2752.]
 [  1528. 135724.]]


In [66]:
LinearSVCModel.save(svmModel, "svm_spark.model")

In [52]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')

In [53]:
rfModel = rf.fit(train)

In [56]:
from pyspark.ml.classification import RandomForestClassificationModel
RandomForestClassificationModel.save(rfModel, "RandomForest_spark.model")

In [57]:
rftest = rfModel.transform(test)

In [47]:
rftest.show(n=50)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(262144,[145380,2...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[145380,2...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[145380,2...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[145380,2...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[55875,14...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[55875,14...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[5634,258...|    0|[4.76258274092123...|[0.23812913704606...|       1.0|
|(262144,[76764,14...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[109885,1...|    0|[4.24081373896948...|[0.21204068694847...|       1.0|
|(262144,[52879,

In [58]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(rftest)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.6916807150328762
Test Error = 0.3083192849671238


In [None]:
preds_and_labels = rftest.select(['prediction','label']).withColumn('label', F.col('label').cast(FloatType())).orderBy('prediction')
preds_and_labels = preds_and_labels.select(['prediction','label'])
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
print(metrics.confusionMatrix().toArray())

In [49]:
rfModel.trees

[DecisionTreeClassificationModel: uid=dtc_83579c1cfded, depth=5, numNodes=19, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_62a257923ca4, depth=4, numNodes=9, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_96a0d7155237, depth=5, numNodes=15, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_84a5a786e85c, depth=4, numNodes=11, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_8991e6bb84a7, depth=5, numNodes=27, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_77d9cef2614f, depth=5, numNodes=11, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_bf53e9d88f8b, depth=5, numNodes=13, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_c51f85d4f2ca, depth=5, numNodes=15, numClasses=2, numFeatures=262144,
 DecisionTreeClassificationModel: uid=dtc_289a14674f42, depth=5, numNodes=19, numClasses=2, numFeatures=2

In [67]:
loaded = LinearSVCModel.load("svm_spark.model/")

In [68]:
loaded

LinearSVCModel: uid=LinearSVC_ed819e61f2fc, numClasses=2, numFeatures=37120

In [71]:
loaded.transform(test).select("label", "prediction").write.csv("svmTest.csv")