In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms
import nltk
from nltk import word_tokenize
import pandas as pd
import numpy as np 
nltk.download('punkt')
from tqdm import tqdm
from collections import Counter
import re
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


In [2]:
class Tokenizer:
    def __init__(self, corpus, min_freq=2):
        self.corpus = corpus
        self.tokens = nltk.word_tokenize(corpus.lower())
        
        self.freq = Counter(self.tokens)
        
        self.vocab = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[CLS]': 2,
            '[SEP]': 3,
        }
        
        idx = 4
        for word in sorted(self.freq):
            if self.freq[word] >= min_freq:
                self.vocab[word] = idx
                idx += 1

        self.id_to_word = {idx: word for word, idx in self.vocab.items()}
    
    def tokenize(self, sentence):
        return nltk.word_tokenize(sentence.lower())
    
    def batch_tokenize(self, batch_sentence):
        return [self.tokenize(sentence) for sentence in tqdm(batch_sentence)]
    
    def encode(self, tokens):
        return [self.vocab.get(word, self.vocab['[UNK]']) for word in tokens]
    
    def batch_encode(self, batch_tokens):
        return [self.encode(tokens) for tokens in tqdm(batch_tokens)]
    
    def decode(self, sequence):
        return ' '.join(self.id_to_word.get(idx, '[UNK]') for idx in sequence)
    
    def batch_decode(self, batch_sequences):
        return [self.decode(seq) for seq in tqdm(batch_sequences)]
    
    def prepare_inputs(self, tokens, max_seq):
        cls_token = [self.vocab['[CLS]']]
        sep_token = [self.vocab['[SEP]']]
        
        encoded = self.encode(tokens)
        max_tokens = max_seq - 2
        truncated = encoded[:max_tokens]
        
        full_seq = cls_token + truncated + sep_token
        padding_length = max_seq - len(full_seq)
        padded_seq = full_seq + [self.vocab['[PAD]']] * padding_length
        
        return padded_seq
    
    def batch_prepare_inputs(self, batch_tokens, max_seq):
        return [self.prepare_inputs(tokens, max_seq) for tokens in tqdm(batch_tokens)]

    def decode_on_inputs(self, sequence, skip_special_tokens=True):
        words = []
        for idx in sequence:
            word = self.id_to_word.get(idx, '[UNK]')
            if skip_special_tokens and word in {'[PAD]', '[CLS]', '[SEP]'}:
                continue
            words.append(word)
        return ' '.join(words)

    def batch_decode_on_inputs(self, batch_sequences, skip_special_tokens=True):
        return [self.decode_on_inputs(seq, skip_special_tokens) for seq in tqdm(batch_sequences)]



In [3]:
def load_data(path):
    df = df = pd.read_csv(path, encoding='latin-1', header=None).drop([0, 1], axis=1)
    df.columns = ['sentiment', 'text']
    sentiment2id = {'Positive': 0, 'Neutral': 1, 'Negative': 2, 'Irrelevant': 3}
    df['sentiment'] = df['sentiment'].map(sentiment2id)
    return df

In [4]:

def clean_text(text: str) -> str:
    """
    Clean raw text for tokenization:
      - Lowercase all text
      - Remove URLs (http, https, www) and t.co short links
      - Collapse long runs of punctuation
      - Strip out non‑ASCII (e.g. emojis)
      - Remove stray slashes and repeated slashes
      - Normalize whitespace
    """
    if not isinstance(text, str):
        text = str(text)

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs (http://, https://, www.) and Twitter t.co links
    text = re.sub(r'https?://\S+|www\.\S+|//t\.co/\S+', ' ', text)

    # 3. Collapse runs of dots (4+ → '...') and other punctuation (4+ → single char)
    text = re.sub(r'\.{4,}', '...', text)
    text = re.sub(r'([!?])\1{1,}', r'\1', text)

    # 4. Remove stray slashes (2+ → single '/')
    text = re.sub(r'/{2,}', '/', text)

    # 5. Strip non‑ASCII (emojis, fancy quotes, etc.)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 6. Remove any remaining standalone punctuation tokens longer than 3 chars
    text = re.sub(r'([^\w\s]){4,}', r'\1', text)

    # 7. Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [5]:
path_train = "/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv"
path_val = "/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv"

df_train = load_data(path_train)
df_val = load_data(path_val)
df_train = df_train.dropna(subset=['text'])
df_val = df_val.dropna(subset=['text'])

In [6]:
df_train['clean_text'] = df_train['text'].apply(clean_text)
df_val['clean_text'] = df_val['text'].apply(clean_text)

In [7]:
df_train['clean_text'][0], df_train['sentiment'][0]

('im getting on borderlands and i will murder you all ,', 0)

In [8]:
texts = " ".join(df_train['text'].values)
corpus = clean_text(texts)
tokenizer = Tokenizer(" ".join(df_train['clean_text'].values), min_freq=2)

In [9]:
len(tokenizer.vocab)

28406

In [10]:
max_seq_len = 64
train_inputs = tokenizer.batch_prepare_inputs(
    tokenizer.batch_tokenize(df_train['clean_text'].values), max_seq=max_seq_len
)
val_inputs = tokenizer.batch_prepare_inputs(
    tokenizer.batch_tokenize(df_val['clean_text'].values), max_seq=max_seq_len
)

100%|██████████| 73996/73996 [00:10<00:00, 6995.26it/s]
100%|██████████| 73996/73996 [00:00<00:00, 101918.60it/s]
100%|██████████| 1000/1000 [00:00<00:00, 5985.47it/s]
100%|██████████| 1000/1000 [00:00<00:00, 120139.32it/s]


In [11]:
train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)
train_labels = torch.tensor(df_train['sentiment'].values, dtype=torch.long)
val_labels = torch.tensor(df_val['sentiment'].values, dtype=torch.long)


train_data = TensorDataset(train_inputs, train_labels)
val_data = TensorDataset(val_inputs, val_labels)


train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

In [12]:
for text, label in val_loader:
    print(f"Text: {tokenizer.decode(text[0].tolist())}\nLabel: {label[0]}")
    break

Text: [CLS] i mentioned on facebook that i was struggling for motivation to go for a run the other day , which has been translated by tom s great auntie as hayley can t get out of bed and told to his grandma , who now thinks i m a lazy , terrible person [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Label: 3


In [13]:
for text, label in val_loader:
    print(f"Text: {tokenizer.decode_on_inputs(text[0].tolist())}\nLabel: {label[0]}")
    break

Text: i mentioned on facebook that i was struggling for motivation to go for a run the other day , which has been translated by tom s great auntie as hayley can t get out of bed and told to his grandma , who now thinks i m a lazy , terrible person
Label: 3


In [14]:
class SentimentAnalysis(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, dropout_rate=0.5):
        super(SentimentAnalysis, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, 
                            batch_first=True, dropout=0.2 if num_layers > 1 else 0.)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        hidden_last = hidden[-1] 
        hidden_last = self.dropout(hidden_last)
        out = self.fc(hidden_last)
        return out


vocab_size = len(tokenizer.vocab)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentAnalysis(
    vocab_size=vocab_size,
    embedding_dim=128,
    hidden_dim=256,
    num_layers=2,
    num_classes=4,
    dropout_rate=0.5
).to(device)

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=3, factor=0.5)  # Added

best_val_acc = 0
early_stop_counter = 0
patience = 5  # Added early stopping

In [16]:
epochs = 30
for epoch in range(1, epochs+1):
    # Training phase
    model.train()
    train_loop = tqdm(train_loader, desc=f"[Train] Epoch: {epoch:02d}/{epochs}")
    total_loss, total, correct = 0.0, 0, 0
    
    for text, label in train_loop:
        text, label = text.to(device), label.to(device)
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        total_loss += loss.item() * text.size(0)
        preds = output.argmax(dim=1)
        correct += (preds == label).sum().item()
        total += text.size(0)
        
        train_loop.set_postfix({
            'loss': total_loss / total,
            'acc': correct / total
        })

    train_loss = total_loss / total
    train_acc = correct / total
    
    # Validation phase
    model.eval()
    val_loop = tqdm(val_loader, desc=f"[Val] Epoch: {epoch:02d}/{epochs}")
    val_loss, val_total, val_correct = 0.0, 0, 0
    
    with torch.no_grad():
        for text, label in val_loop:
            text, label = text.to(device), label.to(device)
            output = model(text)
            loss = criterion(output, label)
            
            val_loss += loss.item() * text.size(0)
            preds = output.argmax(dim=1)
            val_correct += (preds == label).sum().item()
            val_total += text.size(0)
            
            val_loop.set_postfix({
                'loss': val_loss / val_total,
                'acc': val_correct / val_total
            })
    
    val_loss /= val_total
    val_acc = val_correct / val_total
    
    scheduler.step(val_acc)  # Update learning rate
    
    # Print metrics every epoch
    print(f"\nEpoch {epoch:02d}: "
          f"Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}")
    
    # Early stopping check
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        early_stop_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

[Train] Epoch: 01/30: 100%|██████████| 1157/1157 [00:18<00:00, 62.25it/s, loss=1.37, acc=0.306]
[Val] Epoch: 01/30: 100%|██████████| 16/16 [00:00<00:00, 155.26it/s, loss=1.36, acc=0.328]



Epoch 01: Train Loss: 1.3653 Acc: 0.3064 | Val Loss: 1.3610 Acc: 0.3280


[Train] Epoch: 02/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.29it/s, loss=1.34, acc=0.348]
[Val] Epoch: 02/30: 100%|██████████| 16/16 [00:00<00:00, 166.73it/s, loss=1.24, acc=0.443]



Epoch 02: Train Loss: 1.3440 Acc: 0.3479 | Val Loss: 1.2398 Acc: 0.4430


[Train] Epoch: 03/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.29it/s, loss=1.08, acc=0.542]
[Val] Epoch: 03/30: 100%|██████████| 16/16 [00:00<00:00, 165.54it/s, loss=0.789, acc=0.701]



Epoch 03: Train Loss: 1.0829 Acc: 0.5415 | Val Loss: 0.7888 Acc: 0.7010


[Train] Epoch: 04/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.35it/s, loss=0.714, acc=0.728]
[Val] Epoch: 04/30: 100%|██████████| 16/16 [00:00<00:00, 166.52it/s, loss=0.639, acc=0.825]



Epoch 04: Train Loss: 0.7140 Acc: 0.7279 | Val Loss: 0.6387 Acc: 0.8250


[Train] Epoch: 05/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.17it/s, loss=0.418, acc=0.851]
[Val] Epoch: 05/30: 100%|██████████| 16/16 [00:00<00:00, 168.59it/s, loss=0.586, acc=0.879]



Epoch 05: Train Loss: 0.4181 Acc: 0.8506 | Val Loss: 0.5856 Acc: 0.8790


[Train] Epoch: 06/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.24it/s, loss=0.261, acc=0.908]
[Val] Epoch: 06/30: 100%|██████████| 16/16 [00:00<00:00, 170.86it/s, loss=0.451, acc=0.905]



Epoch 06: Train Loss: 0.2609 Acc: 0.9080 | Val Loss: 0.4514 Acc: 0.9050


[Train] Epoch: 07/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.28it/s, loss=0.185, acc=0.935]
[Val] Epoch: 07/30: 100%|██████████| 16/16 [00:00<00:00, 165.39it/s, loss=0.522, acc=0.902]



Epoch 07: Train Loss: 0.1848 Acc: 0.9345 | Val Loss: 0.5221 Acc: 0.9020


[Train] Epoch: 08/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.37it/s, loss=0.144, acc=0.949]
[Val] Epoch: 08/30: 100%|██████████| 16/16 [00:00<00:00, 172.59it/s, loss=0.599, acc=0.901]



Epoch 08: Train Loss: 0.1442 Acc: 0.9488 | Val Loss: 0.5994 Acc: 0.9010


[Train] Epoch: 09/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.31it/s, loss=0.12, acc=0.956]
[Val] Epoch: 09/30: 100%|██████████| 16/16 [00:00<00:00, 171.70it/s, loss=0.621, acc=0.906]



Epoch 09: Train Loss: 0.1197 Acc: 0.9556 | Val Loss: 0.6206 Acc: 0.9060


[Train] Epoch: 10/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.25it/s, loss=0.104, acc=0.961]
[Val] Epoch: 10/30: 100%|██████████| 16/16 [00:00<00:00, 172.04it/s, loss=0.754, acc=0.911]



Epoch 10: Train Loss: 0.1035 Acc: 0.9610 | Val Loss: 0.7538 Acc: 0.9110


[Train] Epoch: 11/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.34it/s, loss=0.0954, acc=0.963]
[Val] Epoch: 11/30: 100%|██████████| 16/16 [00:00<00:00, 164.73it/s, loss=0.703, acc=0.905]



Epoch 11: Train Loss: 0.0954 Acc: 0.9631 | Val Loss: 0.7030 Acc: 0.9050


[Train] Epoch: 12/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.49it/s, loss=0.0872, acc=0.966]
[Val] Epoch: 12/30: 100%|██████████| 16/16 [00:00<00:00, 162.99it/s, loss=0.665, acc=0.895]



Epoch 12: Train Loss: 0.0872 Acc: 0.9664 | Val Loss: 0.6652 Acc: 0.8950


[Train] Epoch: 13/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.29it/s, loss=0.0788, acc=0.969]
[Val] Epoch: 13/30: 100%|██████████| 16/16 [00:00<00:00, 167.41it/s, loss=0.818, acc=0.904]



Epoch 13: Train Loss: 0.0788 Acc: 0.9688 | Val Loss: 0.8176 Acc: 0.9040


[Train] Epoch: 14/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.31it/s, loss=0.0737, acc=0.97]
[Val] Epoch: 14/30: 100%|██████████| 16/16 [00:00<00:00, 170.06it/s, loss=0.79, acc=0.906]



Epoch 14: Train Loss: 0.0737 Acc: 0.9703 | Val Loss: 0.7900 Acc: 0.9060


[Train] Epoch: 15/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.20it/s, loss=0.0574, acc=0.975]
[Val] Epoch: 15/30: 100%|██████████| 16/16 [00:00<00:00, 163.55it/s, loss=0.855, acc=0.919]



Epoch 15: Train Loss: 0.0574 Acc: 0.9749 | Val Loss: 0.8545 Acc: 0.9190


[Train] Epoch: 16/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.32it/s, loss=0.0496, acc=0.978]
[Val] Epoch: 16/30: 100%|██████████| 16/16 [00:00<00:00, 165.85it/s, loss=0.931, acc=0.908]



Epoch 16: Train Loss: 0.0496 Acc: 0.9782 | Val Loss: 0.9310 Acc: 0.9080


[Train] Epoch: 17/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.36it/s, loss=0.0488, acc=0.978]
[Val] Epoch: 17/30: 100%|██████████| 16/16 [00:00<00:00, 169.28it/s, loss=1.11, acc=0.909]



Epoch 17: Train Loss: 0.0488 Acc: 0.9779 | Val Loss: 1.1088 Acc: 0.9090


[Train] Epoch: 18/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.22it/s, loss=0.0487, acc=0.978]
[Val] Epoch: 18/30: 100%|██████████| 16/16 [00:00<00:00, 169.70it/s, loss=1.02, acc=0.911]



Epoch 18: Train Loss: 0.0487 Acc: 0.9778 | Val Loss: 1.0221 Acc: 0.9110


[Train] Epoch: 19/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.30it/s, loss=0.0496, acc=0.978]
[Val] Epoch: 19/30: 100%|██████████| 16/16 [00:00<00:00, 165.34it/s, loss=0.951, acc=0.908]



Epoch 19: Train Loss: 0.0496 Acc: 0.9776 | Val Loss: 0.9514 Acc: 0.9080


[Train] Epoch: 20/30: 100%|██████████| 1157/1157 [00:17<00:00, 65.21it/s, loss=0.0444, acc=0.979]
[Val] Epoch: 20/30: 100%|██████████| 16/16 [00:00<00:00, 164.96it/s, loss=1, acc=0.909]


Epoch 20: Train Loss: 0.0444 Acc: 0.9789 | Val Loss: 1.0020 Acc: 0.9090
Early stopping at epoch 20





In [17]:
def inference(sentence, max_seq=64, return_dict=False):
    cleaned_text = clean_text(sentence)
    if not cleaned_text.strip():
        print("Input text is empty after cleaning")
        return None
    tokens = tokenizer.tokenize(cleaned_text)
    sequence = tokenizer.prepare_inputs(tokens, max_seq)
    
    input_tensor = torch.tensor(sequence).unsqueeze(0).to(device)
    class_names = ['Positive', 'Neutral', 'Negative', 'Irrelevant']
    
    model.eval()
    with torch.no_grad():
        
        outputs = model(input_tensor)
        probabilities = F.softmax(outputs, dim=1) 
        conf, pred = torch.max(probabilities, dim=1)
        
        confidence = conf.item()
        predicted_class = class_names[pred.item()]
        
        print(f"Text: {sentence}")
        print(f"Prediction: {predicted_class}")
        print(f"Confidence: {confidence*100:.2f}%")

sentence = "im getting on borderlands and i will murder you all"
result = inference(sentence)

Text: im getting on borderlands and i will murder you all
Prediction: Positive
Confidence: 100.00%


In [18]:
sentence = "All these news are bad"
result = inference(sentence)

Text: All these news are bad
Prediction: Negative
Confidence: 99.98%


In [19]:
sentence = df_train.loc[12, 'clean_text']
inference(sentence)

Text: rock-hard la varlope, rare & powerful, handsome jackpot, borderlands 3 (xbox) dlvr.it/rmtrgf
Prediction: Neutral
Confidence: 100.00%
