<a href="https://colab.research.google.com/github/MaimunaSun/Classification-of-Land-Cover-Type/blob/main/NLP_Restaurant_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Restaurants receive thousands of text reviews but struggle to understand why their ratings go up or down. Star ratings alone donâ€™t explain customer sentiment or operational issues.

In [1]:
!pip install pyspark



In [2]:
import pyspark
print("PySpark installed and imported successfully!")

PySpark installed and imported successfully!


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("YelpAnalysis") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()


Loading the data

In [3]:
business_path = "/content/drive/MyDrive/yelp_dataset/yelp_academic_dataset_business.json"

business_df = spark.read.json(business_path)
business_df.printSchema()


root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [4]:
review_path = "/content/drive/MyDrive/yelp_dataset/yelp_academic_dataset_review.json"

reviews_df = spark.read.json(review_path)
reviews_df.printSchema()


root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



Filter only restaurants

In [16]:
from pyspark.sql.functions import col

restaurants_df = business_df.filter(
    col("categories").contains("Restaurants")
)


In [17]:
#Sample Review
reviews_sample = reviews_df.sample(fraction=0.05, seed=42)

In [18]:
#Only restaurent reviews are required
restaurant_reviews_df = reviews_sample.join(
    restaurants_df.select("business_id"),
    on="business_id",
    how="inner"
)


Select Useful Fields for Sentimen Analysis

In [20]:
sentiment_df = restaurant_reviews_df.select("review_id", "business_id", "text", "stars")

Convert Stars to Sentiment Lables

In [21]:
from pyspark.sql.functions import when

# Create sentiment label column
sentiment_df = sentiment_df.withColumn(
    "sentiment",
    when(col("stars") <= 2, "Negative")
    .when(col("stars") == 3, "Neutral")
    .otherwise("Positive")
)

# Check
sentiment_df.select("stars", "sentiment").show(10)


+-----+---------+
|stars|sentiment|
+-----+---------+
|  5.0| Positive|
|  1.0| Negative|
|  4.0| Positive|
|  5.0| Positive|
|  5.0| Positive|
|  5.0| Positive|
|  4.0| Positive|
|  4.0| Positive|
|  5.0| Positive|
|  5.0| Positive|
+-----+---------+
only showing top 10 rows


Preprocess text data

In [22]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# Tokenize text
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stopwords
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# TF-IDF
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Build pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
preprocessed_df = pipeline.fit(sentiment_df).transform(sentiment_df)


Convert sentiment to numeric labels

In [23]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="sentiment", outputCol="label")
preprocessed_df = indexer.fit(preprocessed_df).transform(preprocessed_df)


In [24]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Split data
train_df, test_df = preprocessed_df.randomSplit([0.8, 0.2], seed=42)

# Train classifier
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=20)
model = lr.fit(train_df)

# Predict
predictions = model.transform(test_df)

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.8282234532921724
