In [3]:
#For the model to be trained, we need to install required libraries.
!pip install tensorflow keras numpy pandas scikit-learn nltk



In [1]:
#This snippet of code helps to balance our labelled data 

from sklearn.utils import resample
import pandas as pd

# Load dataset
df = pd.read_csv("labeled_reddit_comments.csv")

# Separate classes
df_positive = df[df["Sentiment"] == "positive"]
df_negative = df[df["Sentiment"] == "negative"]
df_neutral = df[df["Sentiment"] == "neutral"]

# Oversample Negative & Neutral to match 835 samples
df_negative_oversampled = resample(df_negative, replace=True, n_samples=835, random_state=42)
df_neutral_oversampled = resample(df_neutral, replace=True, n_samples=835, random_state=42)

# Combine all classes
df_balanced = pd.concat([df_positive, df_negative_oversampled, df_neutral_oversampled])

# Shuffle data
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset
df_balanced.to_csv("balanced_reddit_comments.csv", index=False)

# Check the new distribution
print(df_balanced["Sentiment"].value_counts())
print("✅ Balanced dataset saved as 'balanced_reddit_comments.csv'!")


Sentiment
neutral     835
positive    835
negative    835
Name: count, dtype: int64
✅ Balanced dataset saved as 'balanced_reddit_comments.csv'!


In [61]:
#Training the model and finding accuracies with 15 epochs

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

df = pd.read_csv("balanced_reddit_comments_updated.csv")  # Use updated dataset

# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["Comment"])
X = tokenizer.texts_to_sequences(df["Comment"])
X = pad_sequences(X, maxlen=50)  # Pad sequences to same length

# Convert Sentiment labels to categorical
y = pd.get_dummies(df["Sentiment"]).values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM Model

model = Sequential([
    Embedding(10000, 256, input_length=60),
    SpatialDropout1D(0.4),  # Increased dropout to prevent overfitting
    Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.4)),  
    Dense(64, activation="relu"),  
    Dropout(0.4),  # Regularization to prevent overfitting
    Dense(3, activation="softmax")  
])

# Compile Model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train Model
history = model.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Evaluate Model
loss, acc = model.evaluate(X_test, y_test)
print(f" LSTM Model Accuracy: {acc:.4f}")


Epoch 1/15




[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 437ms/step - accuracy: 0.3991 - loss: 1.0783 - val_accuracy: 0.4990 - val_loss: 0.9882
Epoch 2/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 411ms/step - accuracy: 0.4951 - loss: 0.9875 - val_accuracy: 0.5549 - val_loss: 0.9162
Epoch 3/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 343ms/step - accuracy: 0.5952 - loss: 0.8209 - val_accuracy: 0.6707 - val_loss: 0.7405
Epoch 4/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 426ms/step - accuracy: 0.7938 - loss: 0.5127 - val_accuracy: 0.7465 - val_loss: 0.6225
Epoch 5/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 353ms/step - accuracy: 0.8992 - loss: 0.2892 - val_accuracy: 0.7665 - val_loss: 0.6362
Epoch 6/15
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 324ms/step - accuracy: 0.9465 - loss: 0.1515 - val_accuracy: 0.7505 - val_loss: 0.6650
Epoch 7/15
[1m32/32[0m [32m━━━

In [63]:
# Save trained LSTM model
model.save("lstm_sentiment_model.h5")
print(" Model saved as 'lstm_sentiment_model.h5'")




 Model saved as 'lstm_sentiment_model.h5'


In [65]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Load the trained LSTM model
model = tf.keras.models.load_model("lstm_sentiment_model.h5")  # Ensure the file exists

# Load dataset for tokenization (Use the same dataset used during training)
df = pd.read_csv("balanced_reddit_comments_updated.csv")  # Use updated dataset
  # Load dataset
tokenizer = Tokenizer(num_words=5000)  
tokenizer.fit_on_texts(df["Comment"])  # Fit tokenizer on training data

print("Model & Tokenizer Loaded Successfully!")




Model & Tokenizer Loaded Successfully!


In [56]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Load trained model
model = tf.keras.models.load_model("lstm_sentiment_model.h5")

# Load tokenizer (use the same dataset for consistency)
df = pd.read_csv("balanced_reddit_comments_updated.csv")
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df["Comment"])  # Ensure same tokenization as training

def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=60)

    probabilities = model.predict(padded_sequence)
    sentiment_labels = ["Negative", "Positive", "Neutral"]
    
    print("Raw Softmax Probabilities:", probabilities)  # Debug softmax outputs
    print("Predicted Sentiment:", sentiment_labels[np.argmax(probabilities)])

    return sentiment_labels[np.argmax(probabilities)]

# Test with clear negative & positive examples
print(predict_sentiment("This is absolutely terrible. I hate it!"))  # Should be Negative
print(predict_sentiment("I am so happy, this is amazing!"))  # Should be Positive









[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step   
Raw Softmax Probabilities: [[0.31326613 0.02186757 0.66486627]]
Predicted Sentiment: Neutral
Neutral
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
Raw Softmax Probabilities: [[0.05294358 0.4684852  0.4785713 ]]
Predicted Sentiment: Neutral
Neutral


Sentiment
neutral     835
positive    835
negative    835
Name: count, dtype: int64
