In [7]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from collections import Counter
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
def build_vocab(texts):
  all_words = ' '.join(texts).split()
  word_counts = Counter(all_words)
  vocab = {word: idx + 1 for idx, (word, _) in enumerate(word_counts.most_common())}
  vocab['<PAD>'] = 0  ## token for paddng
  return vocab

data = pd.read_csv('/content/sample_data/training.1600000.processed.noemoticon.csv') ## link to the dataset -> "https://www.kaggle.com/datasets/kazanova/sentiment140"
data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [9]:
data['target'] = data['target'].map(lambda x: 1 if x == 4 else x)
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,800000
0,799999


In [10]:
vocab = build_vocab(data['text'])
vocab_size = len(vocab)
vocab_size

1350280

In [11]:
def text_to_seq(text, vocab, max_len=20):
  tokens = text.split() ## 'I love Egypt' -> [I, love, egypt]
  seq = [vocab.get(token, vocab['<PAD>']) for token in tokens][:max_len]
  return seq + [vocab['<PAD>']] * (max_len - len(seq)) ## padding with zeros

data['sequence'] = data['text'].apply(lambda x: text_to_seq(x, vocab))

In [12]:
class SentimentDataset(Dataset):
  def __init__(self, sequences, labels):
    self.sequences = torch.tensor(sequences, dtype=torch.long, device=device)
    self.labels = torch.tensor(labels, dtype=torch.float, device=device)

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    return self.sequences[idx], self.labels[idx]

dataset = SentimentDataset(data['sequence'].tolist(), data['target'].tolist())
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

In [13]:
class Sentiment_Analysis_LSTM_with_Atten_Mech(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
    super(Sentiment_Analysis_LSTM_with_Atten_Mech, self).__init__()

    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
    self.attention = nn.Linear(hidden_dim * 2, 1)  ## *2 because bidirectional

    self.fc = nn.Linear(hidden_dim * 2, output_dim)

  def forward(self, text): ## forward probagation
    ## text: (batch_size, max_len)
    embedded = self.embedding(text)  ## (batch_size, max_len, embedding_dim)
    lstm_out, _ = self.lstm(embedded)  ## lstm_out: (batch_size, max_len, hidden_dim * 2) / forget about hidden or cell for now (hidden, cell) -> _

    ## atten mechanism
    attn_scores = self.attention(lstm_out) ## (batch_size, max_len, 1)
    attn_weights = torch.softmax(attn_scores, dim=1) ## (batch_size, max_len, 1)
    context = torch.sum(attn_weights * lstm_out, dim=1) ## (batch_size, hidden_dim * 2)

    out = self.fc(context) ## (batch_size, output_dim)
    return out

embedding_dim = 50
hidden_dim = 64
output_dim = 1
model = Sentiment_Analysis_LSTM_with_Atten_Mech(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)

In [14]:
loss_fn = nn.BCEWithLogitsLoss()  ## Binary cross-entropy because we have 2 classes only (Binary Classificaton)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

In [15]:
def train_model(model, dataloader, epochs=10):
  model.train()
  for epoch in range(epochs):
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False) ## this 'leave' parameter deletes the output after each epoch when set to False

    for batch_seq, batch_labels in progress_bar:
      output = model(batch_seq).squeeze(-1)  ## (batch_size,)
      loss = loss_fn(output, batch_labels)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      total_loss += loss.item()
      progress_bar.set_postfix(loss=loss.item())

train_model(model, dataloader)



In [45]:
torch.save(model.state_dict(), "NLP_Task_model.pth")

In [42]:
def model_infer(model, text, vocab, max_len=10):
  model.eval()

  sequences = [text_to_seq(t[0], vocab) for t in text]
  seq = torch.tensor(sequences, dtype=torch.long, device=device)

  with torch.inference_mode():
    pred = model(seq)
    pred = torch.sigmoid(pred).squeeze(-1)

  return ["Positive" if p > 0.5 else "Negative" for p in pred] if len(text) > 1 else ("Positive" if pred.item() > 0.5 else "Negative")

In [43]:
## Generated by ChatGPT (so not that much good as a test but it's ok)

test_cases = [
    # Basic Sentiment Classification
    ("I love this movie! It's amazing.", "Positive"),
    ("This is the worst product I have ever used.", "Negative"),
    ("Absolutely fantastic service! Highly recommend.", "Positive"),
    ("I'm never buying from this company again.", "Negative"),

    # Negation Handling
    ("Not bad at all! I actually liked it.", "Positive"),
    ("I don't love it.", "Negative"),
    ("This isn't the worst thing I've seen.", "Positive"),

    # Sarcasm & Irony
    ("Oh great, another Monday. Just what I needed. 😒", "Negative"),
    ("Loved waiting two hours for customer service. Best experience ever. 🙄", "Negative"),
    ("Yeah, because being stuck in traffic for 3 hours is SO fun.", "Negative"),
    ("Wow, what a fantastic update! Now the app crashes even faster. 👏", "Negative"),

    # Emojis & Hashtags
    ("Best day ever! 😍🔥 #excited", "Positive"),
    ("Ugh, I can't believe this happened. 😡 #disappointed", "Negative"),
    ("Feeling super happy today! 😊", "Positive"),
    ("This is just terrible. 😞 #fail", "Negative"),

    # Slang & Informal Speech
    ("That movie was lit! 🔥", "Positive"),
    ("Man, that game was straight-up trash. 💀", "Negative"),
    ("Bruh, that was insane! 😂", "Positive"),
    ("Smh, this ain't it chief.", "Negative"),

    # Short & Incomplete Tweets
    ("Awesome!", "Positive"),
    ("Terrible.", "Negative"),
    ("Disgusting.", "Negative"),
    ("Loved it.", "Positive"),

    # Capitalization & Punctuation Emphasis
    ("I LOVE THIS SO MUCH!!!", "Positive"),
    ("WHY IS THIS HAPPENING?!?! 😭", "Negative"),
    ("This is TERRIBLE!!!", "Negative"),
    ("So good!! Best experience ever!!", "Positive"),

    # Mixed Language (Code-Switching)
    ("Esta película es increíble! Loved it!", "Positive"),
    ("هذا التطبيق سيء جدا. Worst app ever.", "Negative"),
    ("C'est génial! This is amazing!", "Positive"),
    ("Horrible. كرهته جدا", "Negative"),
]

analysis = model_infer(model, test_cases, vocab)

In [44]:
for i, (text, expected) in enumerate(test_cases):
  prediction = analysis[i]
  print(f"Text: {text} | Prediction: {prediction} | Expected: {expected} | {'✅' if prediction == expected else '❌'}")

Text: I love this movie! It's amazing. | Prediction: Positive | Expected: Positive | ✅
Text: This is the worst product I have ever used. | Prediction: Negative | Expected: Negative | ✅
Text: Absolutely fantastic service! Highly recommend. | Prediction: Positive | Expected: Positive | ✅
Text: I'm never buying from this company again. | Prediction: Negative | Expected: Negative | ✅
Text: Not bad at all! I actually liked it. | Prediction: Positive | Expected: Positive | ✅
Text: I don't love it. | Prediction: Negative | Expected: Negative | ✅
Text: This isn't the worst thing I've seen. | Prediction: Negative | Expected: Positive | ❌
Text: Oh great, another Monday. Just what I needed. 😒 | Prediction: Negative | Expected: Negative | ✅
Text: Loved waiting two hours for customer service. Best experience ever. 🙄 | Prediction: Positive | Expected: Negative | ❌
Text: Yeah, because being stuck in traffic for 3 hours is SO fun. | Prediction: Negative | Expected: Negative | ✅
Text: Wow, what a fanta