In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from datasets import load_dataset
import pandas as pd

# Load datasets
try:
    tweet_dataset = load_dataset(
        "AdamLucek/twittersentiment-llama-3.1-405B-labels",
        download_mode="force_redownload",
    )
except Exception as e:
    # The rest of the script indexes tweet_dataset['train']; carrying on after
    # a failed load would only resurface later as an unrelated NameError, so
    # report the cause and stop here.
    print("Error loading tweet dataset:", e)
    raise

# Emoji table; later code reads its 'Emoji', 'Negative', 'Neutral' and
# 'Positive' columns.
emoji_dataset = pd.read_csv("/kaggle/input/data543/Emoji_Sentiment_Data_v1.0.csv")

# Load BERTweet tokenizer and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
bertweet_model = AutoModel.from_pretrained("vinai/bertweet-base").to(device)

# Character-Level CNN for emoji encoding
class CharCNN(nn.Module):
    """Convolutional encoder mapping a batch of integer id sequences to 128 features.

    The ids are embedded, passed through two width-3 convolutions with ReLU,
    max-pooled over the sequence axis, regularised with dropout and projected
    by a final linear layer.

    Args:
        vocab_size: Number of distinct ids the embedding table can hold.
        embed_dim: Size of each embedding vector.
        num_classes: Accepted as part of the signature; no layer in this
            class uses it.
    """

    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=256, kernel_size=3, padding=1)
        self.fc = nn.Linear(in_features=256, out_features=128)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Encode ``x`` (integer ids, one sequence per row) into 128 features per row."""
        embedded = self.embedding(x)
        # Conv1d convolves over the last axis, so move the embedding channels
        # in front of the sequence positions.
        channels_first = embedded.permute(0, 2, 1)
        hidden = self.conv1(channels_first).relu()
        hidden = self.conv2(hidden).relu()
        # Keep the strongest activation of each channel across positions.
        pooled, _ = torch.max(hidden, dim=2)
        return self.fc(self.dropout(pooled))

# Combined Hybrid Model
class HybridSentimentModel(nn.Module):
    """Classifier that fuses a text encoder's pooled output with emoji features.

    Args:
        bert_model: Text encoder. It must expose ``config.hidden_size`` and,
            when called with ``**text_input``, return an object that has a
            ``pooler_output`` attribute.
        char_cnn: Emoji encoder called with ``emoji_input``.
        hidden_dim: Width of the hidden layer applied to the fused features.
        output_dim: Number of output logits.
        emoji_scale: Factor applied to the emoji features before fusion.
            Defaults to 2.0, the value that used to be hard-coded.
    """

    def __init__(self, bert_model, char_cnn, hidden_dim, output_dim, emoji_scale=2.0):
        super().__init__()
        self.bert = bert_model
        self.char_cnn = char_cnn
        self.emoji_scale = emoji_scale
        # Read the emoji feature width from the encoder's final layer so the
        # two modules cannot silently drift apart; fall back to the historic
        # 128 when the encoder has no ``fc.out_features`` to inspect.
        emoji_dim = getattr(getattr(char_cnn, "fc", None), "out_features", 128)
        self.fc = nn.Linear(bert_model.config.hidden_size + emoji_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, text_input, emoji_input):
        """Return logits for a batch.

        Args:
            text_input: Mapping of keyword arguments forwarded to the text
                encoder.
            emoji_input: Input forwarded unchanged to the emoji encoder.
        """
        text_feat = self.bert(**text_input).pooler_output
        emoji_feat = self.char_cnn(emoji_input) * self.emoji_scale  # Prioritize emoji features
        combined = torch.cat((text_feat, emoji_feat), dim=1)
        x = torch.relu(self.fc(combined))
        x = self.dropout(x)
        return self.out(x)

# Model hyperparameters
embed_dim, hidden_dim, output_dim = 64, 256, 3
# One id per row of the emoji table, plus one spare id.
vocab_size = len(emoji_dataset) + 1

# Build the emoji encoder first, then wrap it together with BERTweet.
char_cnn = CharCNN(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_classes=output_dim,
).to(device)
model = HybridSentimentModel(
    bert_model=bertweet_model,
    char_cnn=char_cnn,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
).to(device)

# Objective, optimiser and learning-rate schedule
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5, weight_decay=1e-4)
# Halve the learning rate once the monitored value stops decreasing for more
# than two consecutive scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# Encode emoji input
_EMOJI_LOOKUP_CACHE = {}


def _emoji_index_lookup():
    """Return a cached ``emoji -> first row label`` mapping for ``emoji_dataset``."""
    cached = _EMOJI_LOOKUP_CACHE.get("entry")
    # Rebuild when the module-level table has been replaced by another object.
    if cached is None or cached[0] is not emoji_dataset:
        lookup = {}
        for label, symbol in zip(emoji_dataset.index, emoji_dataset['Emoji']):
            # setdefault keeps the first row for a symbol that appears twice.
            lookup.setdefault(symbol, label)
        cached = (emoji_dataset, lookup)
        _EMOJI_LOOKUP_CACHE["entry"] = cached
    return cached[1]


def encode_emoji(emoji_input, max_len=10):
    """Encode emojis as a fixed-length row of integer ids.

    Each emoji becomes the index label of its first row in ``emoji_dataset``;
    an emoji that is not in the table becomes ``len(emoji_dataset)``.

    The result always has exactly ``max_len`` columns. Longer inputs are
    truncated so that samples can be stacked into a batch. Shorter inputs are
    padded with the same out-of-table id rather than 0, because 0 is the
    label of the first real emoji row.

    Args:
        emoji_input: Iterable of emoji characters; may be empty.
        max_len: Number of ids in the returned row.

    Returns:
        A ``torch.long`` tensor of shape ``(1, max_len)`` on ``device``.
    """
    unknown_idx = len(emoji_dataset)
    lookup = _emoji_index_lookup()
    indices = [int(lookup.get(e, unknown_idx)) for e in list(emoji_input)[:max_len]]
    indices.extend([unknown_idx] * (max_len - len(indices)))
    return torch.tensor([indices], dtype=torch.long, device=device)

# Enhanced sentiment weighting from emoji features
def get_emoji_sentiment_weight(emoji_input):
    """Average the sentiment columns of ``emoji_dataset`` over ``emoji_input``.

    Every element is compared against the 'Emoji' column. The first matching
    row contributes its 'Negative', 'Neutral' and 'Positive' values; an
    element with no match contributes zeros.

    NOTE(review): ``train`` calls this with the tensor of encoded emoji ids,
    not with emoji characters - confirm that comparison yields the intended
    rows.

    Args:
        emoji_input: Iterable of values to look up in the 'Emoji' column.

    Returns:
        A float32 tensor of three values on ``device``: the column-wise mean
        over all elements, or zeros when ``emoji_input`` is empty.
    """
    score_columns = ['Negative', 'Neutral', 'Positive']
    no_signal = [0, 0, 0]

    per_emoji = []
    for symbol in emoji_input:
        matches = emoji_dataset[emoji_dataset['Emoji'] == symbol]
        per_emoji.append(no_signal if matches.empty else matches[score_columns].values[0])

    if not per_emoji:
        return torch.tensor(no_signal, dtype=torch.float32, device=device)
    stacked = torch.tensor(per_emoji, dtype=torch.float32, device=device)
    return stacked.mean(dim=0)

# Model training (example loop)
def train(model, dataloader, criterion, optimizer, scheduler, epochs=3):
    """Train ``model`` and report metrics on the training batches.

    Each batch must unpack to ``(text_input, emoji_input, labels)``, where
    ``text_input`` is a mapping of tensors. Dimension 1 is squeezed out of
    every text tensor and out of ``emoji_input`` before the forward pass.

    Predictions and labels are collected afresh each epoch, so the printed
    and returned metrics describe the final epoch only rather than a blend of
    all epochs.

    Args:
        model: Module called as ``model(text_input, emoji_input)``.
        dataloader: Sized iterable of batches.
        criterion: Loss called as ``criterion(output, labels)``.
        optimizer: Optimizer that updates ``model``.
        scheduler: Scheduler stepped once per epoch with the mean epoch loss.
        epochs: Number of passes over ``dataloader``.

    Returns:
        Dict with keys ``'f1'``, ``'precision'``, ``'recall'`` and
        ``'accuracy'``.

    Raises:
        ValueError: If ``dataloader`` contains no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        # Otherwise the mean-loss division below fails with ZeroDivisionError.
        raise ValueError("dataloader contains no batches; nothing to train on")

    model.train()
    all_preds, all_labels = [], []
    for epoch in range(epochs):
        # Start over so the metrics reflect this epoch alone.
        all_preds, all_labels = [], []
        running_loss = 0.0
        for text_input, emoji_input, labels in dataloader:
            # Build a new dict instead of mutating the batch in place.
            text_input = {
                key: value.squeeze(1).to(device) for key, value in text_input.items()
            }
            emoji_input = emoji_input.squeeze(1).to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            output = model(text_input, emoji_input)

            # Adjust loss using emoji sentiment weight
            # NOTE(review): emoji_input is the tensor of encoded ids here, while
            # get_emoji_sentiment_weight compares its elements with the 'Emoji'
            # column - confirm this term contributes what is intended.
            emoji_sentiment_weight = get_emoji_sentiment_weight(emoji_input)
            loss = criterion(output, labels) - 0.1 * torch.sum(output * emoji_sentiment_weight)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            all_preds.extend(torch.argmax(output, dim=1).tolist())
            all_labels.extend(labels.tolist())

        epoch_loss = running_loss / num_batches
        scheduler.step(epoch_loss)

        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

    # Calculate metrics
    f1 = f1_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"\nF1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")

    return {"f1": f1, "precision": precision, "recall": recall, "accuracy": accuracy}

# Prepare training data
class SentimentDataset(Dataset):
    """Dataset yielding ``(tokenized text, encoded emojis, label)`` per sample.

    Args:
        texts: Indexable collection of raw texts.
        emojis: Indexable collection holding, per sample, the emojis passed
            to ``encode_emoji``.
        labels: Indexable collection of integer labels.
    """

    def __init__(self, texts, emojis, labels):
        self.texts, self.emojis, self.labels = texts, emojis, labels

    def __len__(self):
        """Return the number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize and encode sample ``idx`` on demand."""
        # Every text is padded or truncated to 128 tokens.
        encoded_text = bertweet_tokenizer(
            self.texts[idx],
            max_length=128,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        emoji_ids = encode_emoji(self.emojis[idx])
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return encoded_text, emoji_ids, target

# Build the membership set once: testing each character against the whole
# 'Emoji' column would rescan that column for every character of every tweet.
known_emojis = set(emoji_dataset['Emoji'])

train_texts = tweet_dataset['train']['text']
# Per tweet, keep the characters that appear in the emoji table, in order.
train_emojis = [[c for c in text if c in known_emojis] for text in train_texts]
train_labels = tweet_dataset['train']['label']

train_dataset = SentimentDataset(train_texts, train_emojis, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Train the model
train(model, train_loader, criterion, optimizer, scheduler)

# User Input Loop
# Only inference happens from here on, so switch to eval mode once instead of
# on every prompt.
model.eval()
sentiment_names = ["Negative", "Neutral", "Positive"]
# Emoji vocabulary used to extract the training features; reusing it keeps
# punctuation from being encoded as unknown emojis.
inference_emojis = set(emoji_dataset['Emoji'])

while True:
    try:
        user_input = input("Enter your text with emojis (type 'exit' to quit): ")
    except EOFError:
        # Input stream closed (e.g. a non-interactive run): leave quietly.
        break
    if user_input.strip().lower() == 'exit':
        break

    # Mirror the training pipeline: the tokenizer sees the raw text, and the
    # emoji encoder sees only characters present in the emoji table.
    user_emojis = [c for c in user_input if c in inference_emojis]

    emoji_sample = encode_emoji(user_emojis)
    text_sample = bertweet_tokenizer([user_input], padding='max_length', truncation=True, max_length=128, return_tensors='pt').to(device)

    with torch.no_grad():
        output = model(text_sample, emoji_sample)
        predicted_label = torch.argmax(output, dim=1).item()

    print("Predicted sentiment:", sentiment_names[predicted_label])

README.md:   0%|          | 0.00/2.86k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/282k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4992 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/998 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch 1, Loss: 0.8493
Epoch 2, Loss: 0.5229
Epoch 3, Loss: 0.4133

F1 Score: 0.7565
Precision: 0.7635
Recall: 0.7553
Accuracy: 0.7553


Enter your text with emojis (type 'exit' to quit):  I am happy😁😁😁


Predicted sentiment: Positive


Enter your text with emojis (type 'exit' to quit):  It was a very rough day😢😢😢


Predicted sentiment: Negative


Enter your text with emojis (type 'exit' to quit):  someone was walking past me😶😶


Predicted sentiment: Neutral
