In [6]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("NLP_Sentiment_Analysis").getOrCreate()

In [7]:
from pyspark.sql.functions import col

data = [
    ("I absolutely loved this movie, it was fantastic!", 1.0),
    ("The plot was boring and the acting was terrible.", 0.0),
    ("Great cinematography, but the story was weak.", 0.0),
    ("An absolute masterpiece, highly recommended.", 1.0),
    ("I wasted my time watching this. Extremely disappointed.", 0.0),
    ("The best film I have seen this year!", 1.0)
]

df = spark.createDataFrame(data, ["text", "label"])
df.show(truncate=False)

+-------------------------------------------------------+-----+
|text                                                   |label|
+-------------------------------------------------------+-----+
|I absolutely loved this movie, it was fantastic!       |1.0  |
|The plot was boring and the acting was terrible.       |0.0  |
|Great cinematography, but the story was weak.          |0.0  |
|An absolute masterpiece, highly recommended.           |1.0  |
|I wasted my time watching this. Extremely disappointed.|0.0  |
|The best film I have seen this year!                   |1.0  |
+-------------------------------------------------------+-----+



In [11]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression


tokenizer = Tokenizer(inputCol="text", outputCol="words")


remover = StopWordsRemover(inputCol="words", outputCol="filtered")


hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")


lr = LogisticRegression(maxIter=10, regParam=0.01)


pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

In [9]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

model = pipeline.fit(train_data)

predictions = model.transform(test_data)
predictions.select("text", "probability", "prediction").show()

+--------------------+--------------------+----------+
|                text|         probability|prediction|
+--------------------+--------------------+----------+
|The plot was bori...|[0.39553605915909...|       1.0|
|An absolute maste...|[0.39553605915909...|       1.0|
+--------------------+--------------------+----------+



In [10]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
accuracy = evaluator.evaluate(predictions)

print(f"Model Accuracy (Area under ROC): {accuracy:.2f}")

Model Accuracy (Area under ROC): 0.50
