In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import emoji
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
import random
import json

# Seed every RNG this notebook touches (torch, numpy, stdlib) for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Dataset
# Forward slashes resolve on every platform, Windows included. The previous
# literal mixed an invalid "\d" escape with "\\" and only worked on Windows.
# The sentences are scanned for emoji further down, so decode the file as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open("../data/final_dataset.json", 'r', encoding='utf-8') as f:
    data = json.load(f)


# Convert polarity to labels: -1 -> 0 (negative), 0 -> 1 (neutral), 1 -> 2 (positive)
# Re-running this cell is safe: 'label' is always recomputed from 'polarity'.
for item in data:
    item['label'] = item['polarity'] + 1


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Initialize BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Extract all unique emojis from the dataset
# (character-level scan: each single character that is a key of emoji.EMOJI_DATA)
unique_emojis = set()
for item in data:
    emojis = [char for char in item['sentence'] if char in emoji.EMOJI_DATA]
    unique_emojis.update(emojis)

# Add emojis directly as tokens to the tokenizer.
# Sort first: iteration order of a set of strings changes between interpreter
# runs (string hashing is randomized), so passing the raw set would give each
# emoji a different token id on every run and make the embedding rows
# non-reproducible despite the fixed seeds above.
bert_tokenizer.add_tokens(sorted(unique_emojis))

# Custom Dataset
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        data: Indexable collection of dicts, each with a 'sentence' entry
            (text passed to the tokenizer) and a 'label' entry (class index).
        tokenizer: Callable invoked as ``tokenizer(sentence, ...)`` that
            returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: Length every sentence is padded / truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized sentence and label for example ``idx``.

        Returns:
            dict with 'input_ids', 'attention_mask' (batch axis removed) and
            'label' (0-dim ``torch.long`` tensor).
        """
        item = self.data[idx]
        sentence = item['sentence']
        label = item['label']

        encoding = self.tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis the tokenizer adds for a single
        # sentence. A bare .squeeze() would also collapse the sequence axis
        # whenever max_length == 1, yielding a 0-dim tensor.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }





In [5]:
# Hold out 20% of the examples for testing; the fixed random_state keeps the split stable
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Wrap both splits with the same tokenizer
train_dataset, test_dataset = (
    SentimentDataset(split, bert_tokenizer) for split in (train_data, test_data)
)

# Only the training loader shuffles; the test loader keeps dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)


10149

In [6]:
# Attention Layer
class Attention(nn.Module):
    """Attention pooling over a sequence of bidirectional-LSTM outputs.

    Each timestep is scored by a single linear layer, the scores are
    softmax-normalised along the sequence axis, and the weighted sum of the
    timestep vectors is returned.

    Args:
        hidden_dim: Hidden size of one LSTM direction; inputs are expected to
            carry ``2 * hidden_dim`` features per timestep.
    """

    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, lstm_output, mask=None):
        """Pool ``lstm_output`` into one vector per sequence.

        Args:
            lstm_output: Tensor of shape (batch, seq_len, 2 * hidden_dim).
            mask: Optional (batch, seq_len) tensor; positions equal to 0 are
                treated as padding and receive zero attention weight. With
                ``None`` every position competes, as before.

        Returns:
            (context_vector, attention_weights) with shapes
            (batch, 2 * hidden_dim) and (batch, seq_len).
        """
        attention_scores = self.attention(lstm_output).squeeze(-1)
        if mask is not None:
            # Use the most negative finite value rather than -inf so that a
            # row with no valid positions degrades to uniform weights instead
            # of producing NaN.
            fill_value = torch.finfo(attention_scores.dtype).min
            padding = mask.to(attention_scores.device) == 0
            attention_scores = attention_scores.masked_fill(padding, fill_value)
        attention_weights = self.softmax(attention_scores)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), lstm_output).squeeze(1)
        return context_vector, attention_weights

# Sentiment Analysis Model
class SentimentModel(nn.Module):
    """Embedding -> bidirectional LSTM -> attention pooling -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of one LSTM direction.
        output_dim: Number of output classes (logits).
        dropout: Dropout probability applied to the pooled vector.
    """

    def __init__(self, vocab_size, embedding_dim=768, hidden_dim=256, output_dim=3, dropout=0.3):
        super(SentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape (batch, output_dim).

        Args:
            input_ids: (batch, seq_len) tensor of token ids.
            attention_mask: (batch, seq_len) tensor, 0 at padded positions;
                may be ``None`` to attend over every position.
        """
        embedded = self.embedding(input_ids)
        # The LSTM still runs over the full padded sequence; only the
        # attention pooling below ignores padded positions.
        lstm_output, _ = self.lstm(embedded)
        # Previously attention_mask was accepted but never used, so the
        # softmax spread weight over [PAD] positions.
        context_vector, _ = self.attention(lstm_output, attention_mask)
        output = self.dropout(context_vector)
        output = self.fc(output)
        return output

# Pick the GPU when one is available, then build the model on that device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentModel(
    vocab_size=len(bert_tokenizer),
    embedding_dim=768,
    hidden_dim=256,
    output_dim=3,
)
model.to(device)

# Seed the embedding table from pretrained BERT word embeddings
bert_model = BertModel.from_pretrained('bert-base-uncased')
base_vocab_size = bert_tokenizer.vocab_size
num_new_tokens = len(bert_tokenizer) - base_vocab_size
with torch.no_grad():
    # Rows of the original vocabulary come straight from BERT
    model.embedding.weight[:base_vocab_size] = bert_model.embeddings.word_embeddings.weight
    # Rows of the added emoji tokens get a small random initialisation
    if num_new_tokens > 0:
        new_embeddings = torch.randn(num_new_tokens, 768) * 0.02
        model.embedding.weight[base_vocab_size:] = new_embeddings

# Loss and optimizer used by the training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop with progress bars
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place and report the average loss of each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_loader: Sized iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the model's parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        list[float]: Average training loss per epoch, in epoch order.

    Raises:
        ValueError: If ``train_loader`` contains no batches (previously this
            surfaced as a ZeroDivisionError at the end of the first epoch).
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; nothing to train on")

    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    epoch_losses = []

    # Create a progress bar for epochs
    epoch_pbar = tqdm(range(num_epochs), desc="Epochs", position=0)

    for epoch in epoch_pbar:
        total_loss = 0.0
        # Create a progress bar for batches within the epoch
        batch_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}",
                          leave=False, position=1, total=num_batches)

        for batch in batch_pbar:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)
            labels = batch['label'].to(model_device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Read the scalar once and reuse it for both the running total
            # and the progress-bar postfix.
            batch_loss = loss.item()
            total_loss += batch_loss
            batch_pbar.set_postfix({"loss": f"{batch_loss:.4f}"})

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        # Update the epoch progress bar with average loss
        epoch_pbar.set_postfix({"avg_loss": f"{avg_loss:.4f}"})
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

    return epoch_losses

# Evaluation with progress bar
def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader``; print and return the metrics.

    The model is switched to eval mode and left in it.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns per-class scores of shape (batch, num_classes).
        test_loader: Sized iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.

    Returns:
        tuple[float, float]: ``(accuracy, weighted_f1)``. Previously the
        metrics were only printed and could not be reused by the caller.
    """
    # Move batches to wherever the model's parameters live instead of relying
    # on a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    all_preds = []
    all_labels = []

    # Create a progress bar for evaluation
    eval_pbar = tqdm(test_loader, desc="Evaluating", total=len(test_loader))

    with torch.no_grad():
        for batch in eval_pbar:
            input_ids = batch['input_ids'].to(model_device)
            attention_mask = batch['attention_mask'].to(model_device)

            outputs = model(input_ids, attention_mask)
            preds = outputs.argmax(dim=1)

            # Labels are only compared on the host, so they never need to
            # travel to the model's device.
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch['label'].tolist())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test F1 Score: {f1:.4f}')
    return accuracy, f1

In [7]:
# Fit on the training split for 10 epochs, then score the held-out split
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
)
evaluate_model(model=model, test_loader=test_loader)

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

: 

In [2]:
def predict_sentiment(text, tokenizer, model, device, max_length=128):
    """Classify a single piece of text as Negative, Neutral or Positive.

    Args:
        text: Text passed to ``tokenizer``.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        model: Module called as ``model(input_ids, attention_mask)``; it is
            switched to eval mode and left in it.
        device: Device the encoded tensors are moved to before the forward pass.
        max_length: Padding / truncation length handed to the tokenizer.

    Returns:
        str: "Negative", "Neutral" or "Positive" for class index 0, 1 or 2.
    """
    # Class index -> human-readable name
    index_to_name = dict(enumerate(("Negative", "Neutral", "Positive")))

    model.eval()

    tokenized = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    ids = tokenized['input_ids'].to(device)
    mask = tokenized['attention_mask'].to(device)

    # Inference only: no gradients needed
    with torch.no_grad():
        logits = model(ids, mask)
        best_class = torch.max(logits, dim=1)[1]

    return index_to_name[best_class.item()]


In [13]:
# Smoke-test the trained model on one hand-written example
example_text = "happy,love joy"
sentiment = predict_sentiment(
    text=example_text,
    tokenizer=bert_tokenizer,
    model=model,
    device=device,
)
print(f"Sentiment for '{example_text}': {sentiment}")

Sentiment for 'happy,love joy': Negative
