# In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
# "train.csv" is a relative path, so it is resolved against the current
# working directory -- make sure the file is there (or adjust the path).
data = pd.read_csv(filepath_or_buffer="train.csv")

def clean_text(text):
    """Normalise a raw text value before tokenisation.

    Lower-cases the text, removes URLs, and strips every character that is
    not an ASCII letter or a space.

    Args:
        text: The raw value from the text column. Anything that is not a
            ``str`` (for example the float NaN that marks a missing cell,
            or ``None``) is treated as empty.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""  # Handle NaN / None / other non-text values
    text = text.lower()
    # Remove URLs. The dot after "www" is escaped and the match is anchored
    # at a word boundary: an unescaped "www." also matches inside ordinary
    # words (e.g. "awww so cute" -> "a cute"), deleting real text.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Turn tabs/newlines into plain spaces first, so the words on either
    # side are not glued together when non-letters are stripped below.
    text = re.sub(r'\s', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Remove non-alphabetic characters
    return text

# Clean the raw text column (clean_text returns "" for missing values).
data['cleaned_text'] = data['text'].apply(clean_text)

# Encode Sentiments
# Series.map produces NaN for any value that is not a key of this mapping
# (including missing labels).
label_to_id = {'positive': 2, 'neutral': 1, 'negative': 0}
data['sentiment'] = data['sentiment'].map(label_to_id)

# Drop rows whose label could not be encoded: a NaN label would otherwise be
# passed straight into fit() and classification_report() as a class id.
unlabeled_rows = data['sentiment'].isna()
if unlabeled_rows.any():
    print(f"Dropping {int(unlabeled_rows.sum())} rows with missing or unrecognised sentiment labels")
    data = data.loc[~unlabeled_rows].copy()
# The presence of NaN forces a float column; restore integer class ids.
data['sentiment'] = data['sentiment'].astype('int64')

# Splitting Data
# 80/20 train/test split; random_state fixes the shuffle so runs are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], data['sentiment'], test_size=0.2, random_state=42
)

# Tokenization & Padding
# The vocabulary is fitted on the training split only; the test split is
# encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Encode each split as integer id sequences of length 100, with padding
# appended after the tokens (padding='post').
X_train_seq, X_test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=100, padding='post')
    for split in (X_train, X_test)
)

# Model 1: RNN Model
# The input sequences are zero-padded at the END to 100 steps
# (padding='post'). Without masking, SimpleRNN keeps updating its state on
# those padding steps, so the final state it hands to the Dense layers is
# computed after a long run of padding rather than right after the text.
# mask_zero=True makes the Embedding emit a mask for id 0 so the recurrent
# layer skips the padded steps.
# NOTE(review): this relies on id 0 being used only for padding -- confirm
# against the tokenizer in use.
rnn_model = Sequential([
    Embedding(10000, 128, input_length=100, mask_zero=True),
    SimpleRNN(64, return_sequences=False),  # only the final state is kept
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # one unit per sentiment class (0/1/2)
])

# Labels are integer class ids, hence the sparse categorical loss.
rnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train RNN
# NOTE(review): the test split is also used as validation data here.
rnn_history = rnn_model.fit(X_train_seq, y_train, validation_data=(X_test_seq, y_test), epochs=5, batch_size=32)

# Model 2: LSTM Model
# The input sequences are zero-padded at the END to 100 steps
# (padding='post'). Without masking, the LSTM keeps updating its state on
# those padding steps, so the final state it hands to the Dense layers is
# computed after a long run of padding rather than right after the text.
# mask_zero=True makes the Embedding emit a mask for id 0 so the recurrent
# layer skips the padded steps.
# NOTE(review): this relies on id 0 being used only for padding -- confirm
# against the tokenizer in use.
lstm_model = Sequential([
    Embedding(10000, 128, input_length=100, mask_zero=True),
    LSTM(64, return_sequences=False),  # only the final state is kept
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # one unit per sentiment class (0/1/2)
])

# Labels are integer class ids, hence the sparse categorical loss.
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train LSTM
# NOTE(review): the test split is also used as validation data here.
lstm_history = lstm_model.fit(X_train_seq, y_train, validation_data=(X_test_seq, y_test), epochs=5, batch_size=32)

# Evaluate Models
# Predict with both models first (RNN, then LSTM); the predicted class is the
# index of the largest softmax output in each row.
rnn_preds, lstm_preds = (
    np.argmax(fitted.predict(X_test_seq), axis=1)
    for fitted in (rnn_model, lstm_model)
)

# Print one per-class report per model, RNN first.
for heading, predicted in (
    ("RNN Model Performance:", rnn_preds),
    ("LSTM Model Performance:", lstm_preds),
):
    print(heading)
    print(classification_report(y_test, predicted, target_names=['Negative', 'Neutral', 'Positive']))

# Custom Prediction
def predict_sentiment(model, text):
    """Classify a single piece of text with the given trained model.

    The text is cleaned with ``clean_text``, encoded with the module-level
    ``tokenizer`` and post-padded to length 100 before being passed to
    ``model.predict``.

    Args:
        model: Object exposing ``predict``; called with the padded sequence.
        text: Raw text to classify.

    Returns:
        'Negative', 'Neutral' or 'Positive', chosen by the index of the
        largest value in the model's output.
    """
    # Class id -> display name: {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    id_to_name = dict(enumerate(['Negative', 'Neutral', 'Positive']))

    cleaned = clean_text(text)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(token_ids, maxlen=100, padding='post')

    scores = model.predict(padded)
    best_class = np.argmax(scores)
    return id_to_name[best_class]

# Run the same sample sentence through both trained models.
sample_text = "I love this product!"
for model_tag, fitted_model in (("RNN", rnn_model), ("LSTM", lstm_model)):
    print(f"Text: '{sample_text}' -> Sentiment ({model_tag}):", predict_sentiment(fitted_model, sample_text))