In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import emoji
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import random

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Dataset
data = [
    {"sentence": "Oh, I just LOVE Mondays! ✨", "polarity": -1},
    {"sentence": "My internet is so fast, it takes only 5 minutes to load a single image. 🐌", "polarity": -1},
    {"sentence": "This meeting is SO productive. 😴", "polarity": -1},
    {"sentence": "My favorite activity is listening to my neighbor's dog bark at 3 AM. 🎶", "polarity": -1},
    {"sentence": "Nothing like spending my Friday night debugging code. 🎉", "polarity": -1},
    {"sentence": "I'm so glad I wore white pants today. It's only raining mud. 👍", "polarity": -1},
    {"sentence": "My hair looks AMAZING today. (After I spent an hour trying to fix it) Medusa Head", "polarity": -1},
    {"sentence": "This traffic is absolutely delightful. 🚗💨🤬", "polarity": -1},
    {"sentence": "I can't wait to hear this exciting news. 🙄", "polarity": -1},
    {"sentence": "I'm thrilled that my alarm didn't go off this morning. Now I'm definitely not late. ⏰", "polarity": -1},
    {"sentence": "Group projects are the BEST. Everyone contributes equally, right? 🤡", "polarity": -1},
    {"sentence": "I love it when people tell me what to do. 🥰", "polarity": -1},
    {"sentence": "I'm so good at adulting. I haven't paid my bills in weeks. 💯", "polarity": -1},
    {"sentence": "This weather is perfect for a picnic. (It's hailing) 🧺", "polarity": -1},
    {"sentence": "I'm so excited to work overtime this weekend. 🥳", "polarity": -1},
    {"sentence": "I just LOVE it when my phone dies right before I need to make an important call. 📱💀", "polarity": -1},
    {"sentence": "My bank account is overflowing with money. 🤑 (Not really)", "polarity": -1},
    {"sentence": "This cake is so delicious, I could eat the whole thing. 🤮 (It's actually terrible)", "polarity": -1},
    {"sentence": "Cleaning my room is my favorite hobby. ✨", "polarity": -1},
    {"sentence": "I always follow the recipe exactly. That's why my food looks like this. 🔥", "polarity": -1},
    {"sentence": "I'm having so much fun writing this report. 📝😴", "polarity": -1},
    {"sentence": "This is fine. 🔥", "polarity": -1},
    {"sentence": "I'm not stressed at all. 😌", "polarity": -1},
    {"sentence": "This is my favorite song. 🙉", "polarity": -1},
    {"sentence": "I'm so glad I left my umbrella at home. 🌧️", "polarity": -1},
    {"sentence": "I love getting stuck in traffic. 🚗😣", "polarity": -1},
    {"sentence": "This is going to be a great day. 😒", "polarity": -1},
    {"sentence": "I'm so excited for this exam. 📚😭", "polarity": -1},
    {"sentence": "I can't wait to do laundry. 🧺🙄", "polarity": -1},
    {"sentence": "I love waking up early. ⏰😣", "polarity": -1},
    {"sentence": "The weather is pleasant today.", "polarity": 0},
    {"sentence": "The train arrives at 2 PM.", "polarity": 0},
    {"sentence": "Water boils at 100 degrees Celsius.", "polarity": 0},
    {"sentence": "The Earth revolves around the Sun.", "polarity": 0},
    {"sentence": "Paris is the capital of France.", "polarity": 0},
    {"sentence": "There are 24 hours in a day.", "polarity": 0},
    {"sentence": "The library opens at 9 AM.", "polarity": 0},
    {"sentence": "The movie starts at 7 PM.", "polarity": 0},
    {"sentence": "Today is Wednesday.", "polarity": 0},
    {"sentence": "I'm so happy to be here! 😄", "polarity": 1},
    {"sentence": "This is amazing! 🤩", "polarity": 1},
    {"sentence": "I love this! ❤️", "polarity": 1},
    {"sentence": "This is the best day ever! 🎉", "polarity": 1},
    {"sentence": "I'm feeling great! 😊", "polarity": 1},
    {"sentence": "This is fantastic! ✨", "polarity": 1},
    {"sentence": "I'm so grateful for this! 🙏", "polarity": 1},
    {"sentence": "This is wonderful! 😍", "polarity": 1},
    {"sentence": "I'm so lucky! 🍀", "polarity": 1},
    {"sentence": "This is perfect! 👌", "polarity": 1}
]

# Convert polarity to labels: -1 -> 0 (negative), 0 -> 1 (neutral), 1 -> 2 (positive)
for item in data:
    item['label'] = item['polarity'] + 1

# Initialize BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Extract all unique emojis from the dataset
unique_emojis = set()
for item in data:
    emojis = [char for char in item['sentence'] if char in emoji.EMOJI_DATA]
    unique_emojis.update(emojis)

# Add emojis directly as tokens to the tokenizer
bert_tokenizer.add_tokens(list(unique_emojis))

# Custom Dataset
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        sentence = item['sentence']
        label = item['label']

        encoding = self.tokenizer(
            sentence,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Split dataset
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_dataset = SentimentDataset(train_data, bert_tokenizer)
test_dataset = SentimentDataset(test_data, bert_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Attention Layer
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, lstm_output):
        attention_scores = self.attention(lstm_output).squeeze(-1)
        attention_weights = self.softmax(attention_scores)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), lstm_output).squeeze(1)
        return context_vector, attention_weights

# Sentiment Analysis Model
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=768, hidden_dim=256, output_dim=3, dropout=0.3):
        super(SentimentModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        lstm_output, _ = self.lstm(embedded)
        context_vector, attention_weights = self.attention(lstm_output)
        output = self.dropout(context_vector)
        output = self.fc(output)
        return output

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentModel(vocab_size=len(bert_tokenizer), embedding_dim=768, hidden_dim=256, output_dim=3)
model.to(device)

# Load pretrained BERT embeddings for the embedding layer
bert_model = BertModel.from_pretrained('bert-base-uncased')
with torch.no_grad():
    # Copy BERT's pretrained embeddings for the original vocabulary
    model.embedding.weight[:bert_tokenizer.vocab_size] = bert_model.embeddings.word_embeddings.weight
    # Initialize embeddings for new emoji tokens with random values
    num_new_tokens = len(bert_tokenizer) - bert_tokenizer.vocab_size
    if num_new_tokens > 0:
        new_embeddings = torch.randn(num_new_tokens, 768) * 0.02  # Small random init
        model.embedding.weight[bert_tokenizer.vocab_size:] = new_embeddings

# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}')

# Evaluation
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Test F1 Score: {f1:.4f}')

# Run training and evaluation
train_model(model, train_loader, criterion, optimizer, num_epochs=10)
evaluate_model(model, test_loader)

2025-05-03 13:36:01.215225: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746279361.369388      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746279361.413600      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1/10, Loss: 1.1060
Epoch 2/10, Loss: 1.1001
Epoch 3/10, Loss: 1.1005
Epoch 4/10, Loss: 1.0972
Epoch 5/10, Loss: 1.0909
Epoch 6/10, Loss: 1.0869
Epoch 7/10, Loss: 1.0861
Epoch 8/10, Loss: 1.0803
Epoch 9/10, Loss: 1.0758
Epoch 10/10, Loss: 1.0758
Test Accuracy: 0.6000
Test F1 Score: 0.4500


In [2]:
def predict_sentiment(text, tokenizer, model, device, max_length=128):
    model.eval()
    # Tokenize input text
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, pred = torch.max(outputs, dim=1)
    
    # Map prediction to sentiment label
    sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return sentiment_map[pred.item()]


In [13]:
example_text = "happy,love joy"
sentiment = predict_sentiment(example_text, bert_tokenizer, model, device)
print(f"Sentiment for '{example_text}': {sentiment}")

Sentiment for 'happy,love joy': Negative
