In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK data packages used by the preprocessing helper in this cell.
nltk.download("stopwords")  # English stop-word list
nltk.download("wordnet")    # dictionary backing WordNetLemmatizer
# word_tokenize() relies on the Punkt tokenizer models, which are a separate
# download; without them the first call raises LookupError on a fresh machine.
nltk.download("punkt")
# Recent NLTK releases look up the "punkt_tab" package instead of "punkt".
# Requesting both keeps the cell working across NLTK versions.
nltk.download("punkt_tab")

# Module-level resources shared by preprocess_text(): a set for fast
# membership tests and a single reusable lemmatizer instance.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    not purely alphanumeric (``str.isalnum``) or that appear in ``stop_words``
    are dropped; every remaining token is passed through
    ``lemmatizer.lemmatize`` and the results are joined with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text as one string (empty if no token survives).
    """
    tokens = word_tokenize(text.lower())

    lemmas = []
    for token in tokens:
        # Skip punctuation / mixed-symbol tokens and stop words.
        if not token.isalnum() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))

    return " ".join(lemmas)


In [None]:
# Train the model for 15 epochs and report its accuracy on the held-out split

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd  # used below (read_csv / get_dummies) but not imported by any earlier cell

# --- Configuration ---------------------------------------------------------
SEED = 42                  # shared by the data split and TensorFlow
MAX_WORDS = 5000           # vocabulary size kept by the tokenizer
MAX_SEQUENCE_LENGTH = 50   # every comment is padded/truncated to this many tokens
EMBEDDING_DIM = 256

# Seed TensorFlow so repeated runs start from the same random state.
tf.random.set_seed(SEED)

df = pd.read_csv("balanced_reddit_comments_updated.csv")  # Use updated dataset

# Tokenization
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df["Comment"])
X = tokenizer.texts_to_sequences(df["Comment"])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)  # Pad sequences to same length

# Convert Sentiment labels to one-hot rows.
# dtype is set explicitly because newer pandas versions return boolean dummy
# columns by default, while the loss expects numeric targets.
# NOTE(review): the column order of get_dummies defines which output index
# means which sentiment; any label list used at inference time must follow
# the same order - confirm against the values in df["Sentiment"].
y = pd.get_dummies(df["Sentiment"], dtype="float32").values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Build LSTM Model
# The embedding's vocabulary size and input length are tied to the tokenizer
# and padding settings above (they were previously 10000/60 against 5000/50).
# Tokenizer(num_words=N) only emits word indices below N, and 0 is the padding
# value, so N embedding rows are sufficient.
model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    SpatialDropout1D(0.4),  # Increased dropout to prevent overfitting
    Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.4)),
    Dense(64, activation="relu"),
    Dropout(0.4),  # Regularization to prevent overfitting
    Dense(3, activation="softmax")  # one output per sentiment class
])

# Compile Model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train Model
# NOTE(review): the test split doubles as validation data here, so the
# accuracy printed below is measured on the same rows that were monitored
# during training.
history = model.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Evaluate Model
loss, acc = model.evaluate(X_test, y_test)
print(f" LSTM Model Accuracy: {acc:.4f}")

# NOTE(review): the trained model is not written to disk in this cell, while
# the inference cell loads "lstm_sentiment_model.h5" - confirm where that file
# is produced.


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

# Locations and settings needed to restore the inference pipeline.
MODEL_PATH = "lstm_sentiment_model.h5"  # Change filename if different
DATASET_PATH = "balanced_reddit_comments_updated.csv"  # Use updated dataset
VOCABULARY_SIZE = 5000

# Restore the trained network from disk.
model = tf.keras.models.load_model(MODEL_PATH)

# Rebuild the tokenizer by fitting it on the dataset's "Comment" column again.
# It uses the same CSV file and num_words as the training cell.
df = pd.read_csv(DATASET_PATH)
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts(df["Comment"])

# Function to predict sentiment
def predict_sentiment(text):
    """Classify one piece of text with the loaded model.

    The text is converted to a sequence of word indices with the module-level
    ``tokenizer``, padded to length 50, and passed to ``model.predict``.

    Args:
        text: The sentence to classify.

    Returns:
        "Negative", "Positive" or "Neutral" - the entry of the label list at
        the position of the highest model output.
    """
    sequence = tokenizer.texts_to_sequences([text])  # Convert text to sequence
    padded_sequence = pad_sequences(sequence, maxlen=50)  # Pad sequence
    # verbose=0 stops Keras from printing a progress bar for every single
    # prediction, which would clutter the interactive prompt.
    prediction = model.predict(padded_sequence, verbose=0)

    # NOTE(review): this order must match the model's output columns. The
    # training cell builds its targets with pd.get_dummies(df["Sentiment"]),
    # whose columns appear to follow the sorted label values; if the labels
    # are the strings Negative/Neutral/Positive, index 1 would be Neutral
    # rather than Positive. Confirm against the dataset before relying on it.
    sentiment_labels = ["Negative", "Positive", "Neutral"]

    # Exactly one text was submitted, so use the first (only) row of scores.
    predicted_label = sentiment_labels[int(np.argmax(prediction[0]))]

    return predicted_label

# Test with user input: keep prompting until the user types 'exit'.
while True:
    # Strip surrounding whitespace so inputs such as "exit " still stop the loop.
    user_input = input("\nEnter a sentence to analyze sentiment (or type 'exit' to stop): ").strip()
    if user_input.lower() == "exit":
        break
    if not user_input:
        # A blank line has no tokens; it would be fed to the model as pure
        # padding and yield a meaningless label, so ask again instead.
        continue
    sentiment = predict_sentiment(user_input)
    print(f"🔹 Predicted Sentiment: {sentiment}")
