In [None]:
# NOTE(review): no visible cell imports TensorFlow — confirm this install is still needed.
!pip install tensorflow  # If TensorFlow is not already installed



In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no later visible cell reads from or writes to /content/drive —
# confirm the mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from getpass import getpass

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType, IntegerType, FloatType, StructType, StructField


# The Atlas URI embeds a username and password, so it must never be stored in
# the notebook. It is read from the MONGODB_URI environment variable, falling
# back to a non-echoing interactive prompt.
# Expected shape: "mongodb+srv://<dbusername>:<passwd>@<clustername>.mongodb.net/"
# NOTE(review): a credential was previously hardcoded on this line; it should be
# rotated, since it is already part of the notebook's history.
connection_string = (
    os.environ.get("MONGODB_URI") or getpass("MongoDB Atlas connection URI: ")
).strip()
if not connection_string:
    raise ValueError("A MongoDB connection URI is required (set MONGODB_URI).")
# "<database>.<collection>" is appended directly below, so the base URI has to
# end with a slash.
if not connection_string.endswith("/"):
    connection_string += "/"

# Create a SparkSession
# NOTE(review): the `spark.mongodb.read/write.connection.uri` keys appear to
# belong to connector 10.x, while the pinned package is 3.0.1 (which documents
# `spark.mongodb.input.uri` / `spark.mongodb.output.uri`). The reads below pass
# `uri` explicitly, so they do not depend on these keys — confirm which
# connector version is intended.
print("\n======= Creating SparkSession to connect PySpark to MongoDBAtlas...")
my_spark = (
    SparkSession.builder
    .appName("tutorial")
    .config("spark.mongodb.read.connection.uri", connection_string)
    .config("spark.mongodb.write.connection.uri", connection_string)
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)


# Load MongoDB Atlas Collections into PySpark SQL DataFrames
print("\n======= Loading Yelp Business Data from MongoDB into DataFrame...")
df_yelp_business = (
    my_spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .option('uri', connection_string + "yelp.business")
    .load()
)

print("\n======= Loading Yelp Review Data from MongoDB into DataFrame...")
df_yelp_review = (
    my_spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .option('uri', connection_string + "yelp.reviews")
    .load()
)






In [None]:
from pyspark.sql.functions import when

# Derive a three-class sentiment label from the star rating:
#   1 or 2 stars -> 0, exactly 3 stars -> 1, everything else -> 2.
star_rating = df_yelp_review["stars"]
sentiment_label = (
    when(star_rating.isin(1, 2), 0)
    .when(star_rating == 3, 1)
    .otherwise(2)
)
df_yelp_review = df_yelp_review.withColumn("sentiment", sentiment_label)



Text Preprocessing

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, lower, split, array_remove, array_join
from pyspark.sql.types import StringType
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below (stop-word list and punkt tokenizer data)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# English stop words, built once at module level; the UDF below captures this
# set through its closure.
stop_words = set(stopwords.words('english'))


@udf(returnType=StringType())
def preprocess_text(text):
    """Normalize one review into a space-joined string of content words.

    The text is lowercased, tokenized with NLTK's ``word_tokenize``, and only
    purely alphabetic tokens that are not English stop words are kept.

    Args:
        text: Raw review text, or ``None`` when the cell is null.

    Returns:
        The cleaned text; an empty string when ``text`` is ``None``.
    """
    # Spark hands SQL NULL to a Python UDF as None. Without this guard a single
    # null review raises AttributeError on `.lower()` and fails the job.
    if text is None:
        return ''
    tokens = word_tokenize(text.lower())
    kept = [word for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(kept)


# Apply the UDF to the 'text' column and create a new 'preprocessed_text' column
df_yelp_review = df_yelp_review.withColumn("preprocessed_text", preprocess_text(df_yelp_review["text"]))

# Show the first 5 rows of the DataFrame
df_yelp_review.show(5, truncate=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


+--------------------------+----------------------+----+-------------------+-----+----------------------+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+----------------------+---------+---------------

TF-IDF Vectorization

In [None]:

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql.functions import lower, regexp_replace

# Drop every character that is not an ASCII letter or whitespace from
# 'preprocessed_text', lowercase the result, and store it as 'cleaned_text'.
letters_only = regexp_replace(df_yelp_review["preprocessed_text"], "[^a-zA-Z\\s]", "")
df_yelp_review = df_yelp_review.withColumn("cleaned_text", lower(letters_only))

# Feature stages: tokens -> hashed term counts (1000 buckets) -> IDF weighting.
tokenizer = Tokenizer(inputCol="cleaned_text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=1000)  # Adjust numFeatures as needed
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Term frequencies
wordsData = tokenizer.transform(df_yelp_review)
featurizedData = hashingTF.transform(wordsData)

# Inverse document frequencies
# NOTE(review): the IDF model is fitted on every review, before the train/test
# split done in the Naive Bayes cell — confirm that is intended.
idfModel = idf.fit(featurizedData)
tfidf_matrix = idfModel.transform(featurizedData)

# Preview the TF-IDF vectors
tfidf_matrix.select("features").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

**Naive Bayes**

In [None]:

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import rand

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Keep only the TF-IDF vector and the sentiment class, named "features" and "label".
data = tfidf_matrix.select(
    tfidf_matrix["features"].alias("features"),
    tfidf_matrix["sentiment"].alias("label"),
)

# 67% / 33% train/test split with a fixed seed.
trainingData, testData = data.randomSplit([0.67, 0.33], seed=42)

# Multinomial Naive Bayes (smoothing 1.0) wrapped in a single-stage pipeline.
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
pipeline = Pipeline(stages=[nb])

# Fit on the training split, then score the held-out split.
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

# Report accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7368308338075396


**Logistic Regression**

In [None]:
from pyspark.ml.classification import LogisticRegression

# Elastic-net logistic regression on the TF-IDF split produced by the
# Naive Bayes cell (trainingData / testData).
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)  # Adjust parameters as needed

# Fit, then predict on the held-out split.
model = lr.fit(trainingData)
predictions = model.transform(testData)

# Test-set accuracy (the evaluator class is imported by the Naive Bayes cell).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)



Accuracy: 0.6870234138234431


**MLP**

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# The input layer width is read from the first training row's feature vector,
# so it follows whatever size the TF-IDF stage produced.
first_row = trainingData.first()
num_features = len(first_row['features'])

# Network shape: input layer, two hidden layers (128 and 64 units),
# and 3 outputs for the sentiment classes 0, 1 and 2.
layers = [num_features, 128, 64, 3]

# Multilayer perceptron, wrapped in a single-stage pipeline.
mlp = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)  # Adjust parameters as needed
pipeline = Pipeline(stages=[mlp])

# Fit on the training split, then score the held-out split.
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

# Report accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.829675304020162


Gensim Word Embedding

In [None]:
!pip install pyspark gensim

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType
from gensim.models import Word2Vec

# Assuming 'df_yelp_review' is your Spark DataFrame with a 'preprocessed_text' column

# 1. Train a Gensim Word2Vec model (if you don't have a pre-trained model)
# Replace 'your_text_data' with a list or iterable of your preprocessed review texts
# Adjust parameters like size, window, min_count, etc. as needed
text_data = df_yelp_review.select("text").rdd.map(lambda row: row[0].split()).collect()


model = Word2Vec(text_data, vector_size=300, window=5, min_count=5, workers=4)
model.save("word2vec_model") # save the trained model


# 2. Load the pre-trained Word2Vec model
model = Word2Vec.load("word2vec_model")

# 3. Define a UDF for vectorization




In [None]:
@udf(returnType=ArrayType(FloatType()))
def vectorize_text(text):
    """Embed a review as the mean of its words' Word2Vec vectors.

    Args:
        text: Whitespace-separated tokens, or ``None`` when the cell is null.

    Returns:
        A list of ``model.vector_size`` floats: the element-wise average of the
        vectors of all tokens present in ``model.wv``, or all zeros when no
        token is in the vocabulary (including null or empty input).
    """
    # Spark hands SQL NULL to a Python UDF as None; treat it as an empty review
    # instead of failing the job on `.split()`.
    words = (text or "").split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if not vectors:
        # Return a zero vector if no words are found
        return [0.0] * model.vector_size
    avg_vector = sum(vectors) / len(vectors)
    # Convert the numpy array to a plain Python list for Spark.
    return avg_vector.tolist()

# 4. Apply the UDF to the DataFrame
df_yelp_review = df_yelp_review.withColumn("gensim_vectors", vectorize_text(df_yelp_review["preprocessed_text"]))

# 5. Show the DataFrame with the new vectors column
df_yelp_review.select("preprocessed_text", "gensim_vectors").show(5, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors, VectorUDT # Import VectorUDT
from pyspark.sql.functions import udf, col # Import col
from pyspark.sql.types import ArrayType, FloatType
from gensim.models import Word2Vec


# 'gensim_vectors' is already on df_yelp_review: the cell that defines
# `vectorize_text` adds it. Re-applying the UDF here only rebuilt the same
# column, so that redundant step has been dropped.

# 4.1 UDF that wraps each array of floats in a dense ML vector
array_to_vector_udf = udf(lambda x: Vectors.dense(x), VectorUDT())

# 4.2 Convert 'gensim_vectors' to a Vector column
df_yelp_review = df_yelp_review.withColumn("gensim_vectors_vec", array_to_vector_udf(col("gensim_vectors")))

# 4.3 Expose the vector under the 'features' column name used for training
assembler = VectorAssembler(
    inputCols=["gensim_vectors_vec"], outputCol="features"
)

# 4.4 Add the 'features' column
df_yelp_review = assembler.transform(df_yelp_review)

# 5. Select the features and label columns for training
data = df_yelp_review.select(
    df_yelp_review["features"], df_yelp_review["sentiment"].alias("label")
)

In [None]:

# 67% / 33% train/test split of the embedding features, with a fixed seed.
gentrainingData, gentestData = data.randomSplit([0.67, 0.33], seed=42)

**Logistic Regression**

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Elastic-net logistic regression on the Word2Vec features.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)  # Adjust parameters as needed

# Fit on the gensim training split, then predict on the gensim test split.
lrmodel = lr.fit(gentrainingData)
predictions = lrmodel.transform(gentestData)

# Report accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6851290204231381


**Naive Bayes**

In [None]:

from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, FloatType
from gensim.models import Word2Vec
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Min-max rescale the Word2Vec vectors before feeding them to Naive Bayes.
# NOTE(review): this adds a 'scaled_features' column to df_yelp_review itself,
# and the split below rebinds gentrainingData / gentestData, so any later cell
# using those names sees the scaled features.
scaler = MinMaxScaler(inputCol="gensim_vectors_vec", outputCol="scaled_features")
scaler_model = scaler.fit(df_yelp_review)
df_yelp_review = scaler_model.transform(df_yelp_review)

# Scaled vector as "features", sentiment class as "label".
data = df_yelp_review.select(
    df_yelp_review["scaled_features"].alias("features"),
    df_yelp_review["sentiment"].alias("label"),
)

# 67% / 33% train/test split with a fixed seed.
gentrainingData, gentestData = data.randomSplit([0.67, 0.33], seed=42)

# Multinomial Naive Bayes (smoothing 1.0) wrapped in a single-stage pipeline.
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
pipeline = Pipeline(stages=[nb])

# Fit on the training split, then score the held-out split.
model = pipeline.fit(gentrainingData)
predictions = model.transform(gentestData)

# Report accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6875626880641926


**Neural Networks**

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Network shape: 300 inputs, two hidden layers (128 and 64 units),
# and 3 outputs for the sentiment classes 0, 1 and 2.
layers = [300, 128, 64, 3]

# Multilayer perceptron, wrapped in a single-stage pipeline.
mlp = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)  # Adjust parameters as needed
pipeline = Pipeline(stages=[mlp])

# Fit on the gensim training split, then score the gensim test split.
model = pipeline.fit(gentrainingData)
predictions = model.transform(gentestData)

# Report accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.8531862399980409


spaCy Word Embedding

In [None]:

# Install spaCy and download the `en_core_web_md` pipeline that the next cell loads.
# The download output asks for a runtime restart before the package is used.
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 12.6 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:

from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, FloatType, StringType
from pyspark.sql import SparkSession
import spacy


# spaCy pipeline whose document vectors are used as features.
nlp = spacy.load("en_core_web_md")

def spacy_process_text(text):
    """Return spaCy's document vector for one review as a list of floats.

    Args:
        text: Review text, or ``None`` when the cell is null.

    Returns:
        ``doc.vector`` converted to a Python list.
    """
    # Spark hands SQL NULL to a Python UDF as None, which `nlp` cannot process;
    # fall back to the empty string so a null review does not fail the job.
    # NOTE(review): spaCy is expected to give an all-zero vector for an empty
    # document — confirm against the Doc.vector documentation.
    doc = nlp(text or "")
    return doc.vector.tolist() # Return spaCy word embeddings

# Register the UDF
spacy_udf = udf(spacy_process_text, ArrayType(FloatType())) # FloatType matches spaCy's float vectors


# Apply the UDF to the raw 'text' column
df_yelp_review = df_yelp_review.withColumn(
    "spacy_features", spacy_udf(col("text"))
)

**Naive Bayes**

In [None]:
# Keep at most 1000 rows for the spaCy experiments.
# NOTE(review): `limit` is applied without an ordering, so which rows are kept
# is presumably not fixed between runs — confirm a stable sample is not needed.
df_yelp_review_reduced = df_yelp_review.limit(1000)

In [None]:

from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector # Import DenseVector
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, FloatType
from gensim.models import Word2Vec
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.storagelevel import StorageLevel # Import StorageLevel


# Define a UDF to convert array to vector
to_vector_udf = udf(lambda x: DenseVector(x), VectorUDT())

# Apply the UDF to create the 'spacy_features_vec' column
df_yelp_review_reduced = df_yelp_review_reduced.withColumn("spacy_features_vec", to_vector_udf("spacy_features"))

# The scaler writes to its own column name. df_yelp_review already carries a
# 'scaled_features' column from the gensim Naive Bayes cell, and the reduced
# frame inherits it, so reusing that name makes MinMaxScaler fail with
# "Output column scaled_features already exists" on a top-to-bottom run.
scaler = MinMaxScaler(inputCol="spacy_features_vec", outputCol="spacy_scaled_features")

# Persist the DataFrame to MEMORY_AND_DISK before the fit and transform below.
df_yelp_review_reduced.persist(StorageLevel.MEMORY_AND_DISK)

df_yelp_review_reduced = scaler.fit(df_yelp_review_reduced).transform(df_yelp_review_reduced)

# 5. Select the features and label columns for training
# Use the scaled features for Naive Bayes
assembler = VectorAssembler(inputCols=["spacy_scaled_features"], outputCol="scaled_features_ass")

data = assembler.transform(df_yelp_review_reduced).select(
    col("scaled_features_ass").alias("features"),
    col("sentiment").alias("label"),
)

# Split data into training and testing sets
(spacytrainingData, spacytestData) = data.randomSplit([0.67, 0.33], seed=42)

# Create a NaiveBayes model
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# Create a pipeline to chain the model
pipeline = Pipeline(stages=[nb])

# Train the model
model = pipeline.fit(spacytrainingData)

# Make predictions on the test set
predictions = model.transform(spacytestData)

# Evaluate the model - Accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7403508771929824


**MLP model**

In [None]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Network shape: 300 inputs, two hidden layers (128 and 64 units),
# and 3 outputs for the sentiment classes 0, 1 and 2.
layers = [300, 128, 64, 3]

# Multilayer perceptron, wrapped in a single-stage pipeline.
mlp = MultilayerPerceptronClassifier(
    layers=layers,
    maxIter=100,
    blockSize=128,
    seed=1234,
)  # Adjust parameters as needed
pipeline = Pipeline(stages=[mlp])

# Fit on the spaCy training split, then score the spaCy test split.
model = pipeline.fit(spacytrainingData)
predictions = model.transform(spacytestData)

# Report accuracy on the test split.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)

Accuracy: 0.7649122807017544
