<a href="https://colab.research.google.com/github/Harivamsh2005/NLP/blob/main/Untitled13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ================================
# Sentiment Analysis with pre-trained GloVe embeddings + Deep Learning
# Models: LSTM, CNN, Bi-LSTM
# ================================

import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional

# --------------------
# Step 1: Load Dataset
# --------------------
# Fetch the NLTK English stop-word list and keep it as a set for fast lookups.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Read the raw tweets and show which columns are available.
df = pd.read_csv("tweets.csv")
print("Columns found in dataset:", df.columns)

# Text column: first match among the known names, otherwise the first column.
for text_col in ("tweet", "text", "content"):
    if text_col in df.columns:
        break
else:
    text_col = df.columns[0]   # no known name present: assume first col is text

# Label column: first match among the known names, otherwise the second column.
for label_col in ("label", "sentiment", "target", "class"):
    if label_col in df.columns:
        break
else:
    label_col = df.columns[1]  # no known name present: assume second col is label

print(f"Using text column: {text_col}, label column: {label_col}")

# Clean text
def clean_text(text):
    """Normalise one raw tweet into a space-separated string of kept tokens.

    The input is coerced to ``str``; URL-like runs (``http...``/``www...``) are
    removed, every non-letter character becomes a space, the result is
    lower-cased and split on whitespace, and tokens found in the module-level
    ``stop_words`` collection are dropped.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).
    """
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', str(text))
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_urls)
    tokens = letters_only.lower().split()
    kept = filter(lambda token: token not in stop_words, tokens)
    return " ".join(kept)

# Add the cleaned text as a new column next to the raw text.
df["clean_tweet"] = df[text_col].apply(clean_text)

X = df["clean_tweet"].values
y = df[label_col].values

# Convert labels if they are strings ("positive"/"negative").
# `y` is a NumPy array and has no `.str` accessor, so the normalisation must go
# through the pandas Series. `astype(str)` also lets non-string entries in an
# object column (e.g. the integer 1) be compared against the accepted spellings.
if y.dtype == "O":
    label_text = df[label_col].astype(str).str.strip().str.lower()
    y = np.where(label_text.isin(["positive", "pos", "1"]), 1, 0)

# -------------------------
# Step 2: Tokenization + Pad
# -------------------------
# Vocabulary cap for the tokenizer and fixed sequence length for padding.
max_vocab, max_len = 20000, 30

# Build the word index from the cleaned tweets, then map each tweet to a
# sequence of integer ids padded/truncated to `max_len`.
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)

# Hold out 20% of the rows for testing; the seed is fixed so the split repeats.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, **split_options)

# ------------------------------
# Step 3: Load GloVe Embeddings
# ------------------------------
embedding_dim = 300

# Parse the GloVe text file: the first whitespace-separated field of each line
# is the token, the remaining fields are its vector components.
embedding_index = {}
with open("glove.6B.300d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        embedding_index[values[0]] = np.asarray(values[1:], dtype="float32")

# Row i of the matrix holds the vector of the word with tokenizer index i.
# Words without a GloVe entry, and indices at or above `max_vocab`, keep zeros.
embedding_matrix = np.zeros((max_vocab, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= max_vocab:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# ------------------------
# Step 4: Model Functions
# ------------------------
def build_lstm():
    """Build and compile the single-direction LSTM classifier.

    Stack: frozen embedding initialised from ``embedding_matrix`` -> LSTM(128)
    with input and recurrent dropout of 0.2 -> one sigmoid output unit.
    Compiled with binary cross-entropy, the Adam optimizer and accuracy.

    Returns:
        The compiled ``Sequential`` model.
    """
    embedding_layer = Embedding(max_vocab, embedding_dim, weights=[embedding_matrix],
                                input_length=max_len, trainable=False)
    model = Sequential([
        embedding_layer,
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

def build_cnn():
    """Build and compile the 1-D convolutional classifier.

    Stack: frozen embedding initialised from ``embedding_matrix`` ->
    Conv1D(128 filters, kernel size 5, ReLU) -> global max pooling ->
    one sigmoid output unit. Compiled with binary cross-entropy, the Adam
    optimizer and accuracy.

    Returns:
        The compiled ``Sequential`` model.
    """
    embedding_layer = Embedding(max_vocab, embedding_dim, weights=[embedding_matrix],
                                input_length=max_len, trainable=False)
    model = Sequential([
        embedding_layer,
        Conv1D(128, 5, activation="relu"),
        GlobalMaxPooling1D(),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

def build_bilstm():
    """Build and compile the bidirectional LSTM classifier.

    Stack: frozen embedding initialised from ``embedding_matrix`` ->
    Bidirectional(LSTM(128) with input and recurrent dropout of 0.2) ->
    one sigmoid output unit. Compiled with binary cross-entropy, the Adam
    optimizer and accuracy.

    Returns:
        The compiled ``Sequential`` model.
    """
    embedding_layer = Embedding(max_vocab, embedding_dim, weights=[embedding_matrix],
                                input_length=max_len, trainable=False)
    recurrent_layer = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model = Sequential([
        embedding_layer,
        recurrent_layer,
        Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# -----------------------
# Step 5: Train & Evaluate
# -----------------------
# One freshly built model per architecture, keyed by display name.
models = {
    "LSTM": build_lstm(),
    "CNN": build_cnn(),
    "BiLSTM": build_bilstm(),
}
results = {}

for name in models:
    model = models[name]
    print(f"\nTraining {name}...")
    # The held-out split doubles as validation data for per-epoch monitoring.
    model.fit(X_train, y_train, epochs=5, batch_size=64, verbose=1,
              validation_data=(X_test, y_test))

    # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
    y_pred = (model.predict(X_test) > 0.5).astype(int)

    acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
    results[name] = dict(Accuracy=acc, F1=f1)

    print(f"\n{name} Results:")
    print(classification_report(y_test, y_pred))

# -----------------------
# Step 6: Error Analysis
# -----------------------
def error_analysis(model, X_test, y_test, name, texts=None, max_examples=5):
    """Print misclassified test tweets for ``model``, grouped by true class.

    Predictions are the model outputs thresholded at 0.5. Up to
    ``max_examples`` tweets are printed for each kind of mistake: positives
    predicted as negative, then negatives predicted as positive.

    Args:
        model: Object whose ``predict(X_test)`` returns one score per row.
        X_test: Test inputs passed to ``model.predict``.
        y_test: True 0/1 labels aligned with ``X_test``.
        name: Model name used in the printed headings.
        texts: Optional raw texts aligned row-for-row with ``X_test``. When
            omitted, the test rows are recovered from the module-level ``df``
            by replaying the train/test split.
        max_examples: Maximum number of tweets printed per error type.

    Raises:
        ValueError: If ``texts`` and ``y_test`` differ in length.
    """
    y_pred = (model.predict(X_test) > 0.5).astype(int).flatten()

    if texts is None:
        # Position i in the test split is NOT row i of `df`: the split shuffles
        # rows. Replaying the split on row positions with the same
        # test_size/random_state as the notebook's split yields the same
        # partition, so these are the original rows behind X_test.
        # NOTE: keep these arguments in sync with the split that made X_test.
        _, test_rows = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
        texts = df[text_col].to_numpy()[test_rows]

    if len(texts) != len(y_test):
        raise ValueError(
            f"texts has {len(texts)} rows but y_test has {len(y_test)}; they must be aligned"
        )

    # Filter by error type first, then truncate, so each class gets its own
    # quota of examples instead of sharing the first few errors overall.
    missed_positives = [t for t, true, pred in zip(texts, y_test, y_pred)
                        if true == 1 and pred == 0]
    missed_negatives = [t for t, true, pred in zip(texts, y_test, y_pred)
                        if true == 0 and pred == 1]

    print(f"\nMisclassified Positive tweets by {name}:")
    for t in missed_positives[:max_examples]:
        print("Tweet:", t)
    print(f"\nMisclassified Negative tweets by {name}:")
    for t in missed_negatives[:max_examples]:
        print("Tweet:", t)

# Inspect the LSTM's mistakes on the held-out split.
error_analysis(models["LSTM"], X_test, y_test, "LSTM")

# -----------------------
# Step 7: Compare with ML
# -----------------------
print("\n=== Deep Learning Results ===")
for name in results:
    metrics = results[name]
    print(f"{name}: Accuracy={metrics['Accuracy']:.4f}, F1={metrics['F1']:.4f}")

# Hard-coded reference scores carried over from an earlier assignment; they are
# not computed anywhere in this notebook.
traditional_results = dict(
    SVM=dict(Accuracy=0.78, F1=0.76),
    NaiveBayes=dict(Accuracy=0.74, F1=0.72),
)
print("\n=== Traditional ML Results ===")
print(traditional_results)

# -----------------------
# Step 8: Conclusion
# -----------------------
# Fixed summary text; it is not derived from the metrics printed above.
print("\nConclusion:")
for sentence in (
    "Deep learning models (especially Bi-LSTM) generally outperform traditional ML models on sentiment detection when using pre-trained embeddings.",
    "CNN is faster and competitive, while traditional ML is useful only for very small datasets.",
):
    print(sentence)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Columns found in dataset: Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
Using text column: text, label column: target





Training LSTM...
Epoch 1/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 166ms/step - accuracy: 0.8419 - loss: 0.3996 - val_accuracy: 0.8729 - val_loss: 0.3211
Epoch 2/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 159ms/step - accuracy: 0.8777 - loss: 0.3047 - val_accuracy: 0.8914 - val_loss: 0.2838
Epoch 3/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 156ms/step - accuracy: 0.8812 - loss: 0.2751 - val_accuracy: 0.8993 - val_loss: 0.2665
Epoch 4/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 153ms/step - accuracy: 0.8990 - loss: 0.2498 - val_accuracy: 0.9006 - val_loss: 0.2659
Epoch 5/5
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 147ms/step - accuracy: 0.9046 - loss: 0.2314 - val_accuracy: 0.8997 - val_loss: 0.2624
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step

LSTM Results:
              precision    recall  f1-score   support

           0      

In [3]:
import os, zipfile, requests

glove_path = "glove.6B.300d.txt"

# Download and unpack the GloVe archive only when the 300-d file is missing,
# so re-running this cell is cheap.
if not os.path.exists(glove_path):
    print("Downloading GloVe embeddings...")
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    # Stream the archive to disk in chunks instead of buffering the whole
    # response in memory, and fail loudly on an HTTP error rather than saving
    # an error page as the zip file.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open("glove.6B.zip", "wb") as zip_file:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                zip_file.write(chunk)

    with zipfile.ZipFile("glove.6B.zip", "r") as zip_ref:
        zip_ref.extractall(".")

    print("GloVe downloaded and extracted!")

else:
    print("GloVe file already exists.")


Downloading GloVe embeddings...
GloVe downloaded and extracted!
