In [1]:
pip install numpy pandas nltk scikit-learn tensorflow



In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import joblib
import os

# Fetch the NLTK "stopwords" corpus; the text-cleaning step below reads it
# through stopwords.words("english").
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# ✅ Load Sample Dataset (Modify for Your Own Data)
# Location of the labelled tweets CSV; point this at your own file if needed.
DATA_PATH = "/content/Tweets.csv"
data = pd.read_csv(DATA_PATH)

# Work on an explicit copy so the raw frame in `data` stays as loaded.
df = data.copy()

# ✅ Convert Sentiments to Numeric Values
sentiment_map = {"positive": 1, "negative": 0, "neutral": 2}
df["sentiment"] = df["sentiment"].map(sentiment_map)

# Series.map yields NaN for any label that is not a key of sentiment_map.
# Stop here with a readable message instead of letting NaN labels reach training.
unmapped_rows = df["sentiment"].isna()
if unmapped_rows.any():
    unknown_labels = sorted(data.loc[unmapped_rows, "sentiment"].astype(str).unique())
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have sentiment labels outside "
        f"{sorted(sentiment_map)}: {unknown_labels}"
    )

In [4]:
# ✅ Text Cleaning Function
_ENGLISH_STOP_WORDS = None  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Normalise one piece of raw text for the sentiment models.

    Steps, in order: lowercase, drop URLs, drop @mentions and '#' signs,
    drop punctuation, drop digits, drop stop words, collapse whitespace.

    Args:
        text: Raw input. Anything that is not a ``str`` (e.g. a NaN cell)
            is treated as missing.
        stop_words: Optional collection of words to remove. When ``None``
            the NLTK English stop-word list is used (loaded once and reused).

    Returns:
        The cleaned text as a single space-separated string; ``""`` for
        non-string input.
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = _english_stop_words()

    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # URLs
    # `\w+` (not a literal "w+") so the whole handle after '@' is removed.
    text = re.sub(r"@\w+|#", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", "", text)
    # split() with no argument also trims leading/trailing/repeated whitespace.
    return " ".join(word for word in text.split() if word not in stop_words)

# ✅ Apply Preprocessing
df["cleaned_text"] = df["text"].apply(preprocess_text)

print(df.head())

       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text  sentiment  \
0  I`d have responded, if I were going          2   
1                             Sooo SAD          0   
2                          bullying me          0   
3                       leave me alone          0   
4                        Sons of ****,          0   

                               cleaned_text  
0                        id responded going  
1                   sooo sad miss san diego  
2                             boss bullying  
3                     interview leave alone  
4  sons couldnt put releases already bough

In [7]:
# ✅ Split Data into Train & Test Sets
# Hold out 20% of the cleaned tweets for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_text"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training text only, then encode both splits.
tfidf_vectorizer = TfidfVectorizer(max_features=5000).fit(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit the baseline classifier on the TF-IDF features.
logreg_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Persist the fitted classifier and vectorizer for the demo app.
for artifact, filename in (
    (logreg_model, "sentiment_model.pkl"),
    (tfidf_vectorizer, "tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, filename)

# Score the held-out split.
y_pred = logreg_model.predict(X_test_tfidf)
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", logreg_accuracy)
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.6838275422957977
              precision    recall  f1-score   support

           0       0.73      0.59      0.65      1562
           1       0.76      0.69      0.72      1705
           2       0.62      0.74      0.68      2230

    accuracy                           0.68      5497
   macro avg       0.70      0.68      0.68      5497
weighted avg       0.69      0.68      0.68      5497



In [6]:
# ✅ Tokenization
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat from run to run.
tf.keras.utils.set_random_seed(42)

# Map the 5000 most frequent words to integer ids and pad every tweet to 100 tokens.
# NOTE(review): the tokenizer vocabulary is fitted on the full dataset before the
# split below, so it also sees the held-out text.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df["cleaned_text"])
X_seq = tokenizer.texts_to_sequences(df["cleaned_text"])
X_padded = pad_sequences(X_seq, maxlen=100)

# ✅ Train-Test Split
# From here on X_train / X_test are padded integer arrays (they replace any
# earlier variables of the same name).
X_train, X_test, y_train, y_test = train_test_split(X_padded, df["sentiment"], test_size=0.2, random_state=42)

# ✅ Build LSTM Model
lstm_model = Sequential([
    Embedding(5000, 128, input_length=100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation="softmax")  # 3 Sentiment Classes
])

# Integer labels 0/1/2 with a softmax output -> sparse categorical cross-entropy.
lstm_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# ✅ Train Model
lstm_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# ✅ Save Model & Tokenizer
lstm_model.save("lstm_sentiment_model.h5")
# Context manager guarantees the pickle file is flushed and closed.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

Epoch 1/5




[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 191ms/step - accuracy: 0.5484 - loss: 0.9232 - val_accuracy: 0.6969 - val_loss: 0.7197
Epoch 2/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 197ms/step - accuracy: 0.7508 - loss: 0.6274 - val_accuracy: 0.7084 - val_loss: 0.7016
Epoch 3/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 197ms/step - accuracy: 0.7824 - loss: 0.5496 - val_accuracy: 0.7069 - val_loss: 0.7178
Epoch 4/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 202ms/step - accuracy: 0.8094 - loss: 0.5005 - val_accuracy: 0.7020 - val_loss: 0.7429
Epoch 5/5
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 200ms/step - accuracy: 0.8299 - loss: 0.4462 - val_accuracy: 0.6922 - val_loss: 0.7966




In [10]:
# ✅ Evaluate LSTM Model
# X_test is a padded integer array when it comes from the LSTM split cell, but a
# Series of cleaned text when the TF-IDF split cell was the last one executed.
# Accept both so this cell works regardless of which split ran most recently.
X_test_np = np.asarray(X_test)
if X_test_np.dtype.kind in "OUS":  # object / unicode / bytes -> still raw text
    X_test_np = tokenizer.texts_to_sequences(X_test_np.tolist())  # Tokenize
    X_test_np = pad_sequences(X_test_np, maxlen=100)              # Pad
y_test_np = np.asarray(y_test).astype(np.int32)  # labels are the integer classes 0/1/2

loss, accuracy = lstm_model.evaluate(X_test_np, y_test_np, verbose=1)

print(f"LSTM Model Accuracy: {accuracy:.4f}")
print(f"LSTM Model Loss: {loss:.4f}")

# ✅ Generate Predictions
y_pred_probs = lstm_model.predict(X_test_np)  # Probabilities for each class
y_pred = y_pred_probs.argmax(axis=1)  # Convert probabilities to class labels

# ✅ Classification Report
# classification_report is already imported in the notebook's import cell.
print("LSTM Model Classification Report:")
print(classification_report(y_test_np, y_pred))

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.6836 - loss: 0.8036
LSTM Model Accuracy: 0.6922
LSTM Model Loss: 0.7966
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 45ms/step
LSTM Model Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.65      0.67      1562
           1       0.73      0.75      0.74      1705
           2       0.66      0.68      0.67      2230

    accuracy                           0.69      5497
   macro avg       0.70      0.69      0.69      5497
weighted avg       0.69      0.69      0.69      5497



In [11]:
pip install gradio

Collecting gradio
  Downloading gradio-5.19.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [12]:
import gradio as gr
import joblib
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

# ✅ Load Trained Models
# Reload the artifacts written by the training cells.
logreg_model = joblib.load("sentiment_model.pkl")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
lstm_model = load_model("lstm_sentiment_model.h5")
# SECURITY: unpickling can execute arbitrary code, so only load a tokenizer.pkl
# produced by this notebook. The context manager closes the file handle.
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# ✅ Text Preprocessing Function
def preprocess_text(text, stop_words=None):
    """Clean user input the same way the training text was cleaned.

    Both models were fitted on text that had URLs, @mentions, '#' signs,
    punctuation, digits and English stop words removed. Feeding them text
    that was only lowercased would give the vectorizer and tokenizer tokens
    they never saw during training, so the full cleaning is applied here.

    Args:
        text: Raw user input. Non-string values are treated as empty.
        stop_words: Optional collection of words to drop. When ``None`` the
            NLTK English stop-word list is used.

    Returns:
        The cleaned, space-separated text (``""`` for non-string input).
    """
    # Local imports keep this cell usable without relying on earlier cells.
    import re
    import string

    if not isinstance(text, str):
        return ""
    if stop_words is None:
        from nltk.corpus import stopwords
        stop_words = set(stopwords.words("english"))

    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # URLs
    text = re.sub(r"@\w+|#", "", text)  # @mentions and '#' signs
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", "", text)
    # split() with no argument also trims and collapses whitespace.
    return " ".join(word for word in text.split() if word not in stop_words)

# ✅ Sentiment Analysis Function
def predict_sentiment(user_text):
    """Classify ``user_text`` with both trained models and describe the result.

    Args:
        user_text: Text typed into the app.

    Returns:
        A two-line string with the Logistic Regression and LSTM labels, or a
        short prompt when the input is missing or contains only whitespace.
    """
    # Nothing to classify: a prediction on blank input would carry no meaning.
    if not isinstance(user_text, str) or not user_text.strip():
        return "Please enter some text to analyze."

    processed_text = preprocess_text(user_text)

    # ✅ Logistic Regression Prediction
    text_tfidf = tfidf_vectorizer.transform([processed_text])
    # Cast to a plain int so the label lookup does not depend on the NumPy dtype.
    logreg_pred = int(logreg_model.predict(text_tfidf)[0])

    # ✅ LSTM Prediction
    text_seq = tokenizer.texts_to_sequences([processed_text])
    text_padded = pad_sequences(text_seq, maxlen=100)
    # verbose=0: no progress bar printed for every single request.
    lstm_probs = lstm_model.predict(text_padded, verbose=0)
    lstm_pred = int(np.argmax(lstm_probs, axis=1)[0])

    # Inverse of the label encoding used for training.
    sentiment_map = {0: "Negative", 1: "Positive", 2: "Neutral"}

    return f"Logistic Regression: {sentiment_map[logreg_pred]}\nLSTM Model: {sentiment_map[lstm_pred]}"

# ✅ Gradio Interface
# Collect the UI settings first, then build the text-in / text-out app from them.
interface_options = dict(
    fn=predict_sentiment,
    inputs="text",
    outputs="text",
    title="Sentiment Analysis System",
    description="Enter text to analyze its sentiment using both Logistic Regression & LSTM models.",
)
interface = gr.Interface(**interface_options)

# Start serving the app with Gradio's default launch settings.
interface.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://37fcdf66aa8da1a444.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


