### Sentiment Analysis using PySpark and Neural Network Models (important: PySpark and a Java JDK must be installed)

In [15]:
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

# PySpark for reading the CSV
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# TensorFlow / Keras for deep learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional,Input,Dropout

# For splitting data
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# The stop-word set and the lemmatizer do not depend on the input text, so they
# are built once on first use and shared by every clean_text() call instead of
# being re-created for each comment.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return ``(stop_words, lemmatizer)``, creating both on the first call."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _CLEANING_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEANING_RESOURCES["stop_words"], _CLEANING_RESOURCES["lemmatizer"]


def clean_text(text):
    """Normalize one comment for tokenization.

    Steps: lowercase, strip every character that is not a-z, 0-9 or
    whitespace, tokenize with NLTK, drop English stop words, lemmatize each
    remaining token, and join the tokens with single spaces.

    Args:
        text: The raw comment string. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned comment as a single space-separated string, or ``""``
        when ``text`` is ``None`` or empty.
    """
    # None / empty input: nothing to clean.
    if not text:
        return ""
    # Lowercase first so the a-z character class below also keeps letters
    # that were originally uppercase.
    text = re.sub(r"[^a-z0-9\s]", "", text.lower())
    stop_words, lemmatizer = _get_cleaning_resources()
    # Stop words are filtered before lemmatization (same order as before).
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    )


In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("SentimentAnalysis")
    .getOrCreate()
)

# Read the Reddit comments CSV: the first row is the header and the column
# types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv("../dataset/Reddit_Data.csv", **csv_options)
df.show()

+--------------------+--------+
|       clean_comment|category|
+--------------------+--------+
| family mormon ha...|       1|
|buddhism has very...|       1|
|seriously don say...|      -1|
|what you have lea...|       0|
|for your own bene...|       1|
|you should all si...|      -1|
| was teens when d...|       1|
|jesus was zen mee...|       0|
|there are two var...|      -1|
|dont worry about ...|       1|
| recently told fa...|       1|
| unto others you ...|       1|
|first understand ...|       1|
| recently heard s...|       1|
|different times d...|       1|
|does evil include...|      -1|
|our campaign has ...|       1|
|technically you c...|      -1|
|              zarus |       0|
|blood and souls f...|       0|
+--------------------+--------+
only showing top 20 rows



In [26]:
# Drop rows containing any null, then make sure the label column is an integer.
df = (
    df.dropna()
      .withColumn("category", col("category").cast("int"))
)

# Pull both columns to the driver as plain Python lists.
# collect() loads every row into driver memory, so this is only suitable for
# datasets that fit in memory.
data = df.select("clean_comment", "category").collect()
texts, labels = [], []
for record in data:
    texts.append(record['clean_comment'])
    labels.append(record['category'])

In [27]:
# How many distinct sentiment labels does the data contain?
category_column = df.select("category")
unique_count = category_column.distinct().count()
print("Unique categories:", unique_count)

# Number of rows per label, most frequent label first.
rows_per_category = df.groupBy("category").count()
rows_per_category.orderBy("count", ascending=False).show()

Unique categories: 3
+--------+-----+
|category|count|
+--------+-----+
|       1|15749|
|       0|12895|
|      -1| 8244|
+--------+-----+



In [28]:
# Preview the first five raw comments as collected from the DataFrame.
texts[:5]

[' family mormon have never tried explain them they still stare puzzled from time time like some kind strange creature nonetheless they have come admire for the patience calmness equanimity acceptance and compassion have developed all the things buddhism teaches ',
 'buddhism has very much lot compatible with christianity especially considering that sin and suffering are almost the same thing suffering caused wanting things shouldn want going about getting things the wrong way christian this would mean wanting things that don coincide with god will and wanting things that coincide but without the aid jesus buddhism could also seen proof god all mighty will and omnipotence certainly christians are lucky have one such christ there side but what about everyone else well many christians believe god grace salvation and buddhism god way showing grace upon others would also help study the things jesus said and see how buddha has made similar claims such rich man getting into heaven joke basic

In [29]:
# Preview the labels of the same first five rows (raw category values).
labels[:5]

[1, 1, -1, 0, 1]

In [30]:
# Clean Text Data
# Run clean_text over every raw comment and convert the labels to a NumPy array.
clean_texts = list(map(clean_text, texts))
labels = np.array(labels)
# Display the first five cleaned comments.
clean_texts[:5]

['family mormon never tried explain still stare puzzled time time like kind strange creature nonetheless come admire patience calmness equanimity acceptance compassion developed thing buddhism teach',
 'buddhism much lot compatible christianity especially considering sin suffering almost thing suffering caused wanting thing want going getting thing wrong way christian would mean wanting thing coincide god wanting thing coincide without aid jesus buddhism could also seen proof god mighty omnipotence certainly christian lucky one christ side everyone else well many christian believe god grace salvation buddhism god way showing grace upon others would also help study thing jesus said see buddha made similar claim rich man getting heaven joke basically advocating rid material possession fact distinctly remembered jesus making someone cry someone asked achieve salvation jesus replied live like buddhist roughly translated also point buddha rarely spoke anything god theory personally knew wel

In [31]:
# Split Data for Training, Validation, and Testing (70% / 15% / 15%)
# Step 1: keep 70% for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    clean_texts,
    labels,
    test_size=0.3,
    random_state=42,
)
# Step 2: cut the held-out 30% in half to get the validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
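# Earlier attempt at fixing a label dtype error (kept for reference; the next cell does the int32 cast together with the +1 label shift):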
#y_train = np.array(y_train, dtype=np.int32)
#y_valid = np.array(y_valid, dtype=np.int32)
#y_test  = np.array(y_test, dtype=np.int32)

In [38]:
# Cast the labels to int32 and shift them by +1 so the classes become
# 0 = negative, 1 = neutral, 2 = positive.
# NOTE: this cell adds 1 every time it runs; execute it exactly once after the
# train/validation/test split.
y_train = np.array(y_train, dtype=np.int32) + 1
y_valid = np.array(y_valid, dtype=np.int32) + 1
y_test = np.array(y_test, dtype=np.int32) + 1

In [32]:
# Tokenization & Padding
num_words = 10000   # vocabulary size limit handed to the tokenizer
max_length = 100    # every sequence is padded / truncated to this many tokens

# The vocabulary is learned from the training texts only; words outside it
# are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Turn each split into lists of integer token ids.
train_sequences, valid_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_valid, X_test)
)

# Bring every sequence to exactly max_length: zeros are appended to short
# sequences and long ones are cut at the end.
train_padded, valid_padded, test_padded = (
    pad_sequences(split_sequences, maxlen=max_length, padding='post', truncating='post')
    for split_sequences in (train_sequences, valid_sequences, test_sequences)
)


In [33]:
# Inspect one padded training example (index 1): token ids, post-padded to max_length.
train_padded[1]

array([4077,   82,  747,  168,    3,  164,  113,  434, 1712,  190, 1129,
        455,  319,  322,   42,  859,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

In [34]:
# Build Models

embedding_dim = 128  # output dimension of the Embedding layer

# Model 1: 3-Layer LSTM
# Embedding -> three stacked LSTM layers (64, 32, 16 units) -> softmax over
# the three sentiment classes. The first two LSTMs return full sequences so
# the next LSTM layer receives one vector per time step.
model_lstm = Sequential([
    Input(shape=(max_length,)),
    # `input_length` is intentionally not passed: the Input layer above already
    # fixes the sequence length, and Keras 3 (shipped with TensorFlow >= 2.16)
    # deprecates that argument and warns when it is supplied.
    # NOTE(review): padding zeros are not masked (mask_zero is left at its
    # default); consider mask_zero=True if padded positions hurt training.
    Embedding(input_dim=num_words, output_dim=embedding_dim),
    LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    LSTM(32, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    LSTM(16, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax')  # one probability per sentiment class
])
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print("3-Layer LSTM Model Summary:")
model_lstm.summary()

3-Layer LSTM Model Summary:




In [35]:
# Model 2: 3-Layer Bidirectional LSTM
# Same layout as the plain LSTM model, but every LSTM layer is wrapped in
# Bidirectional so each sequence is processed in both directions.
model_bilstm = Sequential([
    Input(shape=(max_length,)),
    # `input_length` is intentionally not passed: the Input layer above already
    # fixes the sequence length, and Keras 3 (shipped with TensorFlow >= 2.16)
    # deprecates that argument and warns when it is supplied.
    # NOTE(review): padding zeros are not masked (mask_zero is left at its
    # default); consider mask_zero=True if padded positions hurt training.
    Embedding(input_dim=num_words, output_dim=embedding_dim),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(16, dropout=0.2, recurrent_dropout=0.2)),
    Dense(3, activation='softmax')  # one probability per sentiment class
])
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model_bilstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print("3-Layer Bidirectional LSTM Model Summary:")
model_bilstm.summary()

3-Layer Bidirectional LSTM Model Summary:


In [39]:
# Train the Models
epochs = 5
batch_size = 32

# Both networks are fitted on the same data with identical settings; the
# validation split is scored at the end of every epoch.
fit_options = dict(
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(valid_padded, y_valid),
)

print("\nTraining 3-Layer LSTM Model:")
history_lstm = model_lstm.fit(train_padded, y_train, **fit_options)

print("\nTraining 3-Layer Bidirectional LSTM Model:")
history_bilstm = model_bilstm.fit(train_padded, y_train, **fit_options)


Training 3-Layer LSTM Model:
Epoch 1/5


[1m807/807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 107ms/step - accuracy: 0.4450 - loss: 1.0598 - val_accuracy: 0.4936 - val_loss: 1.0303
Epoch 2/5
[1m807/807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 129ms/step - accuracy: 0.5138 - loss: 1.0223 - val_accuracy: 0.5064 - val_loss: 1.0188
Epoch 3/5
[1m807/807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 159ms/step - accuracy: 0.5121 - loss: 1.0220 - val_accuracy: 0.4739 - val_loss: 1.0444
Epoch 4/5
[1m807/807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m304s[0m 375ms/step - accuracy: 0.5123 - loss: 0.9824 - val_accuracy: 0.7237 - val_loss: 0.6279
Epoch 5/5
[1m807/807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 121ms/step - accuracy: 0.7626 - loss: 0.5779 - val_accuracy: 0.8513 - val_loss: 0.4293

Training 3-Layer Bidirectional LSTM Model:
Epoch 1/5
[1m807/807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 122ms/step - accuracy: 0.6354 - loss: 0.8202 - val_accuracy: 0.83

In [40]:
# Evaluate Models on Test Data
# Each score is indexed below as [0] = loss and [1] = accuracy.
score_lstm = model_lstm.evaluate(test_padded, y_test, verbose=0)
score_bilstm = model_bilstm.evaluate(test_padded, y_test, verbose=0)

lstm_loss, lstm_accuracy = score_lstm[0], score_lstm[1]
bilstm_loss, bilstm_accuracy = score_bilstm[0], score_bilstm[1]

print("\n3-Layer LSTM Test Loss: {:.4f}, Accuracy: {:.4f}".format(lstm_loss, lstm_accuracy))
print("3-Layer Bidirectional LSTM Test Loss: {:.4f}, Accuracy: {:.4f}".format(bilstm_loss, bilstm_accuracy))



3-Layer LSTM Test Loss: 0.4589, Accuracy: 0.8412
3-Layer Bidirectional LSTM Test Loss: 0.5124, Accuracy: 0.8524


In [41]:
# Save the Best Model
# The model with the higher accuracy score wins; a tie goes to the plain LSTM.
# NOTE(review): the scores compared here are test-set scores, so the test set
# is also used for model selection - consider comparing validation accuracy.
lstm_is_best = score_lstm[1] >= score_bilstm[1]
best_model = model_lstm if lstm_is_best else model_bilstm
print("\nBest Model: 3-Layer LSTM" if lstm_is_best else "\nBest Model: 3-Layer Bidirectional LSTM")

# Write the winning model to disk in H5 format.
best_model.save("best_sentiment_model.h5")
print("Best model saved as 'best_sentiment_model.h5'")




Best Model: 3-Layer Bidirectional LSTM
Best model saved as 'best_sentiment_model.h5'


#### The best model currently saved is the 3-layer bidirectional LSTM

In [42]:
# Persist the fitted tokenizer so the same word-to-index mapping can be
# reloaded together with the saved models.
import pickle

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
# The plain LSTM performs comparably to the bidirectional model, so keep a
# saved copy of it as well.
lstm_model_file = "lstm_sentiment_model.h5"
model_lstm.save(lstm_model_file)
print("LSTM model saved as 'lstm_sentiment_model.h5'")



LSTM model saved as 'lstm_sentiment_model.h5'


In [None]:
# Test the selected model on a sample Reddit-style comment about Apple.
text="I just upgraded to the new iPhone from Apple, and while I love its sleek design and powerful performance, the battery life leaves much to be desired."

# Store the cleaned string under its own name: assigning it to `clean_text`
# would replace the clean_text() function, so this cell (and any later call to
# clean_text) would fail with "'str' object is not callable" on the next run.
cleaned_text = clean_text(text)
sequence = tokenizer.texts_to_sequences([cleaned_text])
padded = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')
prediction = best_model.predict(padded)

# prediction[0] holds the three softmax probabilities for the single input.
# The predicted class is the index of the largest probability; comparing a
# probability with exactly 0 or 1 (as this cell used to do) almost never
# matches and made the code fall through to "Positive Sentiment".
# Index order follows the shifted training labels:
# 0 = negative, 1 = neutral, 2 = positive.
sentiment_names = ("Negative Sentiment", "Neutral Sentiment", "Positive Sentiment")
predicted_class = int(np.argmax(prediction[0]))
print(sentiment_names[predicted_class])
# NOTE: the previously recorded "Positive Sentiment" output came from the old
# fall-through branch; re-run this cell to see the model's actual prediction.

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 755ms/step
Positive Sentiment


In [1]:
# Report the TensorFlow version in use.
import tensorflow as tf
print(tf.__version__)

2.17.0


#### Next steps: try different pretrained embedding models and transformer-based models such as BERT, and compare how they perform