In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Amazon Reviews Sentiment Analysis") \
    .getOrCreate()

# Load Data
data = spark.read.csv('test.csv', inferSchema=True, header=True)

# Preprocessing Steps
# Tokenize words
regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Convert words to feature vectors
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Convert label to numeric
label_stringIdx = StringIndexer(inputCol="label", outputCol="labelIndex")

# Pipeline
pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, label_stringIdx])

# Apply transformations
pipelineModel = pipeline.fit(data)
dataset = pipelineModel.transform(data)

dataset.show(5)


23/12/11 22:05:54 WARN StopWordsRemover: Default locale set was [en_SA]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
                                                                                

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|               title|             content|               words|            filtered|         rawFeatures|            features|labelIndex|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    1|            Great CD|"My lovely Pat ha...|[my, lovely, pat,...|[lovely, pat, one...|(262144,[3370,218...|(262144,[3370,218...|       1.0|
|    1|One of the best g...|Despite the fact ...|[despite, the, fa...|[despite, fact, p...|(262144,[6946,844...|(262144,[6946,844...|       1.0|
|    0|Batteries died wi...|I bought this cha...|[i, bought, this,...|[bought, charger,...|(262144,[1578,576...|(262144,[1578,576...|       0.0|
|    1|works fine, but M...|Check out Maha En...|[check, out, maha...|[check, maha, ene...|(262144,[82005,10...|(262144,[82005,10.

23/12/11 22:06:03 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


In [7]:
train_data, test_data = dataset.randomSplit([0.7, 0.3])


In [8]:
# Define Logistic Regression model
lr = LogisticRegression(featuresCol='features', labelCol='labelIndex')

# Train the model
lrModel = lr.fit(train_data)

# Make predictions
predictions = lrModel.transform(test_data)

# Evaluate the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="labelIndex", rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: ", accuracy)


23/12/11 22:06:26 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/12/11 22:06:35 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:44 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:44 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:06:44 WARN DAGSchedul

Accuracy:  0.7625791932943319


                                                                                

## Random Forest Classifier

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Amazon Reviews Classification") \
    .getOrCreate()

data = spark.read.csv('test.csv', inferSchema=True, header=True)


23/12/11 22:19:05 WARN Utils: Your hostname, Hadis-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.20.10.5 instead (on interface en0)
23/12/11 22:19:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/11 22:19:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [2]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

# Tokenize words
regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Feature transformation using TF-IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Convert label to numeric
label_stringIdx = StringIndexer(inputCol="label", outputCol="labelIndex")

# Pipeline
pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, label_stringIdx])

# Apply transformations
dataset = pipeline.fit(data).transform(data)


23/12/11 22:19:12 WARN StopWordsRemover: Default locale set was [en_SA]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
                                                                                

In [3]:
train_data, test_data = dataset.randomSplit([0.7, 0.3])


In [4]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Define the model
rf_classifier = RandomForestClassifier(featuresCol='features', labelCol='labelIndex')

# Train the model
rf_model = rf_classifier.fit(train_data)

# Predict on test data
rf_predictions = rf_model.transform(test_data)

# Evaluate the model
rf_evaluator = BinaryClassificationEvaluator(labelCol="labelIndex")
rf_accuracy = rf_evaluator.evaluate(rf_predictions)
print("Random Forest Classifier Accuracy: ", rf_accuracy)


23/12/11 22:19:29 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:19:32 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/12/11 22:19:38 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
23/12/11 22:20:10 WARN DAGScheduler: Broadcasting large task binary with size 1033.7 KiB
23/12/11 22:20:10 WARN DAGScheduler: Broadcasting large task binary with size 7.8 MiB
23/12/11 22:20:15 WARN MemoryStore: Not enough space to cache rdd_50_7 in memory! (computed 113.0 MiB so far)
23/12/11 22:20:15 WARN BlockManager: Persisting block rdd_50_7 to disk instead.
23/12/11 22:20:16 WARN BlockManager: Block rdd_50_5 could not be removed as it was not found on disk or in memory
23/12/11 22:20:16 WARN BlockManager: Block rdd_50_7 could not be removed as it was not found on disk or in memory
23/12/11 22:20:16 WARN MemoryStore: Not enough space to cache rdd_50_2 in memory! (computed 33.0 MiB so far)
23/12/11 22:20:16 WARN BlockManager: Per

ConnectionRefusedError: [Errno 61] Connection refused