In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import pandas as pd


try:
    tweet_dataset = load_dataset("SetFit/tweet_sentiment_extraction", download_mode="force_redownload")
except Exception as e:
    print("Error loading tweet dataset:", e)

emoji_dataset = pd.read_csv("/kaggle/input/data543/Emoji_Sentiment_Data_v1.0.csv")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
bertweet_model = AutoModel.from_pretrained("vinai/bertweet-base").to(device)


class CharCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(CharCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(embed_dim, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.fc = nn.Linear(256, 128)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.max(x, dim=2).values
        x = self.dropout(x)
        return self.fc(x)


class HybridSentimentModel(nn.Module):
    def __init__(self, bert_model, char_cnn, hidden_dim, output_dim):
        super(HybridSentimentModel, self).__init__()
        self.bert = bert_model
        self.char_cnn = char_cnn
        self.fc = nn.Linear(bert_model.config.hidden_size + 128, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, text_input, emoji_input):
        text_feat = self.bert(**text_input).pooler_output
        emoji_feat = self.char_cnn(emoji_input)
        combined = torch.cat((text_feat, emoji_feat), dim=1)
        x = torch.relu(self.fc(combined))
        x = self.dropout(x)
        return self.out(x)


vocab_size = len(emoji_dataset) + 1
embed_dim = 64
hidden_dim = 256
output_dim = 3
char_cnn = CharCNN(vocab_size, embed_dim, output_dim).to(device)
model = HybridSentimentModel(bertweet_model, char_cnn, hidden_dim, output_dim).to(device)


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)


def encode_emoji(emoji_input):
    emoji_indices = [emoji_dataset[emoji_dataset['Emoji'] == e].index[0] if e in emoji_dataset['Emoji'].values else len(emoji_dataset) for e in emoji_input]
    if not emoji_indices:
        return torch.zeros((1, 10), dtype=torch.long, device=device)
    emoji_tensor = torch.tensor(emoji_indices, dtype=torch.long, device=device)
    emoji_tensor = emoji_tensor.unsqueeze(0)
    if emoji_tensor.size(1) < 10:
        pad = torch.zeros((1, 10 - emoji_tensor.size(1)), dtype=torch.long, device=device)
        emoji_tensor = torch.cat((emoji_tensor, pad), dim=1)
    return emoji_tensor


def train(model, dataloader, criterion, optimizer, scheduler, epochs=3):
    model.train()
    all_preds = []
    all_labels = []
    epoch_losses = []
    learning_rates = []

    for epoch in range(epochs):
        running_loss = 0
        for batch in dataloader:
            text_input, emoji_input, labels = batch
            for key in text_input:
                text_input[key] = text_input[key].squeeze(1).to(device)
            emoji_input = emoji_input.squeeze(1).to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            output = model(text_input, emoji_input)
            loss = criterion(output, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            all_preds.extend(torch.argmax(output, dim=1).tolist())
            all_labels.extend(labels.tolist())

        scheduler.step(running_loss / len(dataloader))

        epoch_loss = running_loss / len(dataloader)
        epoch_losses.append(epoch_loss)
        learning_rates.append(optimizer.param_groups[0]['lr'])

        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Learning Rate: {learning_rates[-1]:.6f}")

    
    correct = sum(1 for p, l in zip(all_preds, all_labels) if p == l)
    total = len(all_labels)
    accuracy = correct / total * 100

    print(f"Final Accuracy: {accuracy:.2f}%")


class SentimentDataset(Dataset):
    def __init__(self, texts, emojis, labels):
        self.texts = texts
        self.emojis = emojis
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = bertweet_tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        emoji = encode_emoji(self.emojis[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return text, emoji, label

train_texts = tweet_dataset['train']['text']
train_emojis = [[c for c in text if c in emoji_dataset['Emoji'].values] for text in train_texts]
train_labels = tweet_dataset['train']['label']

train_dataset = SentimentDataset(train_texts, train_emojis, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)


train(model, train_loader, criterion, optimizer, scheduler)


while True:
    user_input = input("Enter your text with emojis (type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break

    user_text = ''.join(c for c in user_input if c.isalnum() or c.isspace())
    user_emojis = [c for c in user_input if not c.isalnum() and not c.isspace()]

    emoji_sample = encode_emoji(user_emojis)
    text_sample = bertweet_tokenizer([user_text], padding='max_length', truncation=True, max_length=128, return_tensors='pt').to(device)

    model.eval()
    with torch.no_grad():
        output = model(text_sample, emoji_sample)
        predicted_label = torch.argmax(output, dim=1).item()

    print("Predicted sentiment:", ["Negative", "Neutral", "Positive"][predicted_label])


Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/3.93M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/503k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/27481 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3534 [00:00<?, ? examples/s]

Epoch 1, Loss: 0.5797, Learning Rate: 0.000020
Epoch 2, Loss: 0.4407, Learning Rate: 0.000020
Epoch 3, Loss: 0.3474, Learning Rate: 0.000020
Final Accuracy: 82.02%


Enter your text with emojis (type 'exit' to quit):  I am glad that it happened😁


Predicted sentiment: Positive


Enter your text with emojis (type 'exit' to quit):  I got kicked out badly🤬🤬


Predicted sentiment: Negative


Enter your text with emojis (type 'exit' to quit):  Somebody walked past me😶😶😶


Predicted sentiment: Neutral


Enter your text with emojis (type 'exit' to quit):  exit
