In [1]:
#For the model to be trained, we need to install required libraries.
!pip install tensorflow keras numpy pandas scikit-learn nltk



In [41]:
from sklearn.utils import resample
import pandas as pd

# Load dataset
df = pd.read_csv("labeled_reddit_comments.csv")  # Original labeled dataset

# Separate classes
df_positive = df[df["Sentiment"] == "positive"]
df_negative = df[df["Sentiment"] == "negative"]
df_neutral = df[df["Sentiment"] == "neutral"]

# Oversample Negative & Neutral to match 835 samples
df_negative_oversampled = resample(df_negative, replace=True, n_samples=835, random_state=42)
df_neutral_oversampled = resample(df_neutral, replace=True, n_samples=835, random_state=42)

# Combine all classes
df_balanced = pd.concat([df_positive, df_negative_oversampled, df_neutral_oversampled])

# Shuffle data
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# **Apply preprocessing to text before saving**
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if pd.isna(text):  
        return ""
    text = text.lower()
    words = word_tokenize(text)
    
    # Handle negations like "not good" → "not_good"
    negation_words = {"not", "no", "never", "n't"}
    processed_words = []
    i = 0
    while i < len(words):
        if words[i] in negation_words and i + 1 < len(words):
            processed_words.append(words[i] + "_" + words[i + 1])
            i += 1  
        else:
            processed_words.append(words[i])
        i += 1

    words = [lemmatizer.lemmatize(word) for word in processed_words if word.isalnum() and word not in stop_words]
    return " ".join(words)

# **Apply preprocessing**
df_balanced["Processed_Comment"] = df_balanced["Comment"].apply(preprocess_text)

# Save the **fully processed & balanced dataset**
df_balanced.to_csv("balanced_reddit_comments_updated.csv", index=False)

# Check the new distribution
print(df_balanced["Sentiment"].value_counts())
print(" Balanced & Preprocessed dataset saved as 'balanced_reddit_comments_updated.csv'!")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guntu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guntu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\guntu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sentiment
neutral     835
negative    835
positive    551
Name: count, dtype: int64
 Balanced & Preprocessed dataset saved as 'balanced_reddit_comments_updated.csv'!


In [43]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

df = pd.read_csv("balanced_reddit_comments_updated.csv")  #  Use updated dataset

# Convert to string and handle NaN values
df["Processed_Comment"] = df["Processed_Comment"].astype(str).fillna("")

# **Use Processed_Comment**
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["Processed_Comment"])  #  Use preprocessed text
X = tokenizer.texts_to_sequences(df["Processed_Comment"])  #  Use preprocessed text
X = pad_sequences(X, maxlen=50)  # Pad sequences to same length

# Convert Sentiment labels to categorical
y = pd.get_dummies(df["Sentiment"]).values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **Fine-Tuned LSTM Model**
model = Sequential([
    Embedding(10000, 256, input_length=60),
    SpatialDropout1D(0.4),  
    Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.4)),  
    Dense(64, activation="relu"),  
    Dropout(0.4),  
    Dense(3, activation="softmax")  
])

# Compile Model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train Model
history = model.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Evaluate Model
loss, acc = model.evaluate(X_test, y_test)
print(f" Fine-Tuned LSTM Model Accuracy: {acc:.4f}")


Epoch 1/15




[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 188ms/step - accuracy: 0.3723 - loss: 1.0839 - val_accuracy: 0.4571 - val_loss: 1.0243
Epoch 2/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 143ms/step - accuracy: 0.5293 - loss: 0.9644 - val_accuracy: 0.5609 - val_loss: 0.8736
Epoch 3/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 144ms/step - accuracy: 0.6346 - loss: 0.7631 - val_accuracy: 0.7585 - val_loss: 0.6383
Epoch 4/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 150ms/step - accuracy: 0.8591 - loss: 0.3843 - val_accuracy: 0.7864 - val_loss: 0.5530
Epoch 5/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 180ms/step - accuracy: 0.9426 - loss: 0.1846 - val_accuracy: 0.8084 - val_loss: 0.6007
Epoch 6/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 172ms/step - accuracy: 0.9715 - loss: 0.1014 - val_accuracy: 0.8343 - val_loss: 0.6131
Epoch 7/15
[1m32/32[0m [32m━━━━━━━━

In [45]:
# Save trained LSTM model
model.save("lstm_sentiment_model.h5")
print(" Model saved as 'lstm_sentiment_model.h5'")




 Model saved as 'lstm_sentiment_model.h5'


In [47]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Load the trained LSTM model
model = tf.keras.models.load_model("lstm_sentiment_model.h5")  # Ensure the file exists

# Load dataset for tokenization (Use the same dataset used during training)
df = pd.read_csv("balanced_reddit_comments_updated.csv")  # Use updated dataset
  # Load dataset
tokenizer = Tokenizer(num_words=5000)  
df["Processed_Comment"] = df["Processed_Comment"].astype(str).fillna("")
tokenizer.fit_on_texts(df["Processed_Comment"])  # Fit tokenizer on training data

print("Model & Tokenizer Loaded Successfully!")




Model & Tokenizer Loaded Successfully!


In [51]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Load trained model
model = tf.keras.models.load_model("lstm_sentiment_model.h5")

# Load tokenizer (use the same dataset for consistency)
df = pd.read_csv("balanced_reddit_comments_updated.csv")
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df["Comment"])  # Ensure same tokenization as training

def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=60)

    probabilities = model.predict(padded_sequence)
    sentiment_labels = ["Negative", "Positive", "Neutral"]
    
    print("Raw Softmax Probabilities:", probabilities)  # Debug softmax outputs
    print("Predicted Sentiment:", sentiment_labels[np.argmax(probabilities)])

    return sentiment_labels[np.argmax(probabilities)]

# Test with clear negative & positive examples
print(predict_sentiment("It's okay, nothing too great or too bad."))  # Should be Neutral
print(predict_sentiment("This is the worst experience ever. I regret buying this."))  # Should be negative





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 784ms/step
Raw Softmax Probabilities: [[1.8276625e-06 4.9813181e-10 9.9999821e-01]]
Predicted Sentiment: Neutral
Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Raw Softmax Probabilities: [[9.9858618e-01 2.8892771e-05 1.3848791e-03]]
Predicted Sentiment: Negative
Negative
