<a href="https://colab.research.google.com/github/NayanaSaikumar21/twitter_sentiment_analysis/blob/main/twitterSentimentAnalysisUsingLstm_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---- STEP 0: Install Required Packages ----
!pip install numpy pandas tensorflow nltk tqdm gradio kaggle

import os
import shutil
import pandas as pd
import numpy as np
import re
import string
import nltk
import tensorflow as tf
import gradio as gr
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from google.colab import files

# Fetch the NLTK data packages this notebook asks for. "stopwords" backs
# `stopwords.words(...)` and "wordnet" backs `WordNetLemmatizer`; "punkt" is
# downloaded as well although no tokenizer call is visible in this notebook.
for nltk_resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(nltk_resource)

# ---- STEP 1: Upload & Configure Kaggle API ----
# Places the user's Kaggle API token at /root/.kaggle/kaggle.json with
# owner-only permissions so the `kaggle` CLI call in STEP 2 can authenticate.
kaggle_json_path = "kaggle.json"
kaggle_dest_path = "/root/.kaggle/kaggle.json"

if os.path.exists(kaggle_dest_path):
    # Cell re-run in the same session: the key is already installed, so do
    # not force the user through another upload dialog.
    uploaded = {}
    print("🔑 Existing Kaggle API key found; skipping upload.")
else:
    print("📂 Please upload your 'kaggle.json' file.")
    uploaded = files.upload()

    # Fail with an actionable message if the dialog was cancelled or the file
    # was saved under a different name, instead of an opaque shutil error.
    if not os.path.exists(kaggle_json_path):
        raise FileNotFoundError(
            f"❌ '{kaggle_json_path}' was not uploaded "
            f"(received: {list(uploaded) or 'nothing'}). "
            "Re-run this cell and upload the API token downloaded from your Kaggle account."
        )

    # Move kaggle.json to the correct directory
    os.makedirs(os.path.dirname(kaggle_dest_path), exist_ok=True)
    shutil.move(kaggle_json_path, kaggle_dest_path)

# Restrict the credential file to the owner (read/write only).
os.chmod(kaggle_dest_path, 0o600)

print("✅ Kaggle API key configured successfully.")

# ---- STEP 2: Download & Extract Dataset ----
!kaggle datasets download -d kazanova/sentiment140 -p /mnt/data --unzip

# Define dataset path
DATA_PATH = "/mnt/data/training.1600000.processed.noemoticon.csv"

# Check if dataset exists
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"❌ {DATA_PATH} not found. Please check Kaggle download.")

print("✅ Dataset downloaded successfully.")

# ---- STEP 3: Load & Preprocess Dataset ----
SAMPLES_PER_CLASS = 50000  # rows drawn from each sentiment class
RANDOM_SEED = 42           # fixed seed so sampling and shuffling are repeatable

# The CSV has no header row, so column names are supplied here. Only the label
# and the tweet text are needed; `usecols` skips parsing the other four
# columns instead of loading them and discarding them afterwards.
columns = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(DATA_PATH, names=columns, usecols=["target", "text"], encoding="latin-1")

# Convert sentiment labels: 0 stays 0 (negative), 4 becomes 1 (positive)
df["target"] = df["target"].replace({0: 0, 4: 1})

# Check original class distribution
print("📊 Original Class Distribution:")
print(df["target"].value_counts())

# Balance dataset: draw the same number of rows from each class, then shuffle
# so the two classes are interleaved.
negative_samples = df[df["target"] == 0].sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED)
positive_samples = df[df["target"] == 1].sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_SEED)
df_balanced = pd.concat([negative_samples, positive_samples]).sample(frac=1, random_state=RANDOM_SEED)

# Check new class distribution
print("📊 Balanced Class Distribution:")
print(df_balanced["target"].value_counts())


# ---- STEP 4: Text Cleaning ----
def clean_text(text):
    """Normalize a raw tweet into lowercase, single-space-separated words.

    Steps, in order: lowercase; drop URLs; drop @mentions; strip the leading
    '#' from hashtags (the tag word itself is kept); delete ASCII punctuation;
    delete digit runs; collapse all whitespace runs to one space.

    Args:
        text: Raw tweet text. Any non-``str`` value (for example a NaN float
            coming out of pandas) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string or nothing
        is left after cleaning.
    """
    if not isinstance(text, str):  # Handle NaN values
        return ""

    text = text.lower()
    # Require "http(s)://" or a word-initial "www." before treating a token as
    # a URL. A bare "www\S+" also matches inside elongated words such as
    # "awwww" and would cut them down to "a".
    text = re.sub(r"https?://\S+|\bwww\.\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r"#(\w+)", r"\1", text)  # Keep hashtag word, drop the '#'
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    # Removed tokens leave gaps behind; squeeze them and trim both ends.
    return " ".join(text.split())

# Continue with the balanced sample built in STEP 3 (`df_balanced`, 50K rows
# per class) rather than the full frame: the balanced frame was created and
# printed but never used, so every later step ran on the unbalanced-by-design
# full dataset. `.copy()` keeps `df_balanced` itself untouched.
df = df_balanced.copy()
df["clean_text"] = df["text"].apply(clean_text)

# Remove stopwords and lemmatize the remaining words
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
df["clean_text"] = df["clean_text"].apply(
    lambda x: " ".join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words)
)

# Remove rows whose text is empty after cleaning; copy so later column
# assignments operate on an independent frame, not a filtered view.
df = df[df["clean_text"] != ""].copy()

# Ensure dataset is not too small
if len(df) < 100:
    raise ValueError(f"❌ Not enough data to train! Dataset contains only {len(df)} samples. Add more data.")

print(f"✅ Data preprocessing complete. {len(df)} samples available.")

# ---- STEP 5: Tokenization ----
# Hyper-parameters shared by the tokenizer here and the model cells below.
MAX_NUM_WORDS = 10000   # `num_words` cap handed to the tokenizer
MAX_SEQ_LENGTH = 50     # every encoded tweet is padded/cut to this many ids
EMBEDDING_DIM = 200     # width of each learned word vector

# Build the vocabulary from the cleaned tweets; words outside it are encoded
# with the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts=df["clean_text"])

# Encode each tweet as a list of ids, then pad at the end to a fixed length.
sequences = tokenizer.texts_to_sequences(texts=df["clean_text"])
padded_sequences = pad_sequences(sequences=sequences, padding="post", maxlen=MAX_SEQ_LENGTH)

print("✅ Tokenization complete.")


Collecting gradio
  Downloading gradio-5.23.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

KeyboardInterrupt: 

In [None]:

# ---- STEP 6: Define LSTM Model ----
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and hence
# the training run) is repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

# Two stacked LSTMs over learned word embeddings, ending in a single sigmoid
# unit that outputs the score thresholded at 0.5 in the prediction step.
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# Keras 3 and the sequence length is inferred from the data at fit time.
model = Sequential([
    Embedding(MAX_NUM_WORDS, EMBEDDING_DIM),
    LSTM(64, return_sequences=True),  # emit every timestep for the next LSTM
    Dropout(0.5),
    LSTM(32),                         # keep only the final state
    Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print("✅ LSTM Model Defined.")

# ---- STEP 7: Train the Model ----
# Use the same threshold as the STEP 4 size check. The previous `> 100` guard
# silently skipped training at exactly 100 rows while still reporting success.
if len(df) < 100:
    raise ValueError(f"❌ Not enough data to train! Dataset contains only {len(df)} samples. Add more data.")

# Keras takes `validation_split` from the END of the arrays before any
# shuffling, so shuffle features and labels together first. Otherwise the
# validation set reflects whatever order the rows happen to be in.
labels = df["target"].to_numpy()
shuffle_idx = np.random.default_rng(42).permutation(len(labels))

print("🚀 Training model with validation split...")
history = model.fit(
    padded_sequences[shuffle_idx],
    labels[shuffle_idx],
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

print("✅ Model training complete.")

# ---- STEP 8: Save the Model ----
MODEL_PATH = "/mnt/data/sentiment_lstm_model.h5"
# The target directory is otherwise only created as a side effect of the
# Kaggle download; create it here so saving does not depend on that.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)
print(f"✅ Model saved at {MODEL_PATH}")


In [None]:
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm  # Progress tracking
import numpy as np

# ---- STEP 9: Load & Compile Model ----
print("🔄 Loading model...")
# Load the architecture and weights only, then compile once explicitly.
# Without `compile=False` the model is compiled from the saved config and
# immediately compiled again by the next line.
model = tf.keras.models.load_model(MODEL_PATH, compile=False)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# ---- STEP 10: Optimize Sentiment Prediction ----
print("🔄 Preprocessing dataset for prediction...")

# `df["clean_text"]` was already cleaned, stop-word-filtered and lemmatized in
# STEP 4. Running that pipeline again here made the cell non-idempotent and
# can change tokens (a lemma may itself be lemmatized or dropped as a stop
# word), so the model would score text that differs from what it was trained
# on. Re-encode the existing column instead.
sequences = tokenizer.texts_to_sequences(df["clean_text"])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding="post")

print("🚀 Running batch predictions...")

# One `predict` call over the whole array; a large batch size keeps the number
# of forward passes low (the Keras default batch size is 32).
batch_predictions = model.predict(padded_sequences, batch_size=1024, verbose=1)

# Convert sigmoid scores to labels using a 0.5 threshold
df["predicted_sentiment"] = np.where(batch_predictions.flatten() > 0.5, "Positive 😊", "Negative 😠")

# ---- STEP 11: Save Predictions to CSV ----
output_file = "/mnt/data/predicted_sentiments.csv"
df.to_csv(output_file, index=False)
print(f"✅ Sentiment predictions saved to {output_file}")


In [None]:

# ---- STEP 12: Create a Gradio Interface ----
def predict_sentiment(input_text):
    """Classify one piece of free text with the trained LSTM.

    Applies the same preprocessing used for the training data (`clean_text`,
    stop-word removal, lemmatization, tokenization, post-padding to
    MAX_SEQ_LENGTH) and thresholds the model's sigmoid output at 0.5.

    Args:
        input_text: Text typed by the user.

    Returns:
        "Positive 😊" or "Negative 😠", or a short prompt when no words are
        left after cleaning (such rows were also excluded from training).
    """
    tokens = [
        lemmatizer.lemmatize(word)
        for word in clean_text(input_text).split()
        if word not in stop_words
    ]
    if not tokens:
        return "⚠️ Please enter some text containing at least one meaningful word."

    encoded = tokenizer.texts_to_sequences([" ".join(tokens)])
    padded = pad_sequences(encoded, maxlen=MAX_SEQ_LENGTH, padding="post")
    score = float(model.predict(padded, verbose=0)[0][0])
    return "Positive 😊" if score > 0.5 else "Negative 😠"


def gradio_predict(input_text):
    """Gradio callback: forward the textbox content to `predict_sentiment`."""
    return predict_sentiment(input_text)

# share=True asks Gradio for a public link in addition to the inline widget.
interface = gr.Interface(fn=gradio_predict, inputs="text", outputs="text")
interface.launch(share=True)
