In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [2]:
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [3]:
!pip install transformers



In [4]:
!pip install scikit-learn




In [5]:
!pip install -U transformers




In [6]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset

# Load the "emotion" dataset from Hugging Face
dataset = load_dataset('emotion')

# Get the unique label names directly from the dataset's feature information
label_names = dataset['train'].features['label'].names
label_mapping = {label_name: index for index, label_name in enumerate(label_names)}

# Print the mapping for verification
print("Label Mapping:", label_mapping)

class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Initialize the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Create datasets for training, validation, and testing
train_texts = dataset['train']['text']
train_labels = [int(label) for label in dataset['train']['label']]
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)

val_texts = dataset['validation']['text']
val_labels = [int(label) for label in dataset['validation']['label']]
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer)

test_texts = dataset['test']['text']
test_labels = [int(label) for label in dataset['test']['label']]
test_dataset = EmotionDataset(test_texts, test_labels, tokenizer)

# DataLoaders for training, validation, and testing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load the DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_names))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Save the trained model
model_save_path = "emotion_detection_model.pth"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Label Mapping: {'sadness': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'surprise': 5}




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.5491775274276733
Epoch 2, Loss: 0.048390403389930725
Epoch 3, Loss: 0.08287172764539719
Model saved to emotion_detection_model.pth


In [8]:
# Function to load the model
def load_model(model_path, model_class, tokenizer_class, pretrained_model_name, num_labels):
    model = model_class.from_pretrained(pretrained_model_name, num_labels=num_labels)
    model.load_state_dict(torch.load(model_path))
    tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
    model.eval()
    return model, tokenizer

# Load the saved model
model_path = "emotion_detection_model.pth"
num_labels=len(label_names)
model, tokenizer = load_model(model_path, DistilBertForSequenceClassification, DistilBertTokenizerFast, 'distilbert-base-uncased',num_labels)

# Function to predict emotion for a given sample text
def predict_emotion(sample_text, model, tokenizer, max_len=128):
    encoding = tokenizer.encode_plus(
        sample_text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(model.device)
    attention_mask = encoding['attention_mask'].to(model.device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probs, dim=-1).cpu().numpy()[0]
    confidence_score = torch.max(probs, dim=-1).values.cpu().numpy()[0]
    predicted_label = label_names[predicted_class]
    return predicted_label, confidence_score

# Example usage
sample_text = "I'm feeling great today!"
predicted_emotion, confidence_score = predict_emotion(sample_text, model, tokenizer)
print(f"The predicted emotion for the text is: {predicted_emotion} with a confidence score of {confidence_score:.2f}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The predicted emotion for the text is: joy with a confidence score of 1.00


In [10]:
# Evaluation function
def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    print(classification_report(true_labels, predictions, target_names=label_names, digits=4))
    print("Accuracy:", accuracy_score(true_labels, predictions))

# Evaluate the model on the validation set
print("Validation Results:")
evaluate(model, val_loader)

# Evaluate the model on the test set
print("Test Results:")
evaluate(model, test_loader)

# # Function to predict sentiment/emotion for a given sample text with score
# def predict_emotion(sample_text, model, tokenizer, max_len=128):
#     # Tokenize the input text
#     encoding = tokenizer.encode_plus(
#         sample_text,
#         add_special_tokens=True,
#         max_length=max_len,
#         return_token_type_ids=False,
#         padding='max_length',
#         return_attention_mask=True,
#         truncation=True,
#         return_tensors='pt',
#     )

#     # Move tensors to the same device as the model
#     input_ids = encoding['input_ids'].to(model.device)
#     attention_mask = encoding['attention_mask'].to(model.device)

#     # Predict
#     with torch.no_grad():
#         outputs = model(input_ids, attention_mask=attention_mask)
#         logits = outputs.logits

#     # Convert logits to probabilities and then to class labels
#     probs = torch.nn.functional.softmax(logits, dim=-1)
#     predicted_class = torch.argmax(probs, dim=-1).cpu().numpy()[0]  # Get the predicted class index
#     confidence_score = torch.max(probs, dim=-1).values.cpu().numpy()[0]  # Get the confidence score

#     # Map the predicted class index back to the class label
#     predicted_label = label_names[predicted_class]
#     return predicted_label, confidence_score

# # Example usage:
# sample_text = "I'm feeling great today!"
# predicted_emotion, confidence_score = predict_emotion(sample_text, model, tokenizer)
# print(f"The predicted emotion for the text is: {predicted_emotion} with a confidence score of {confidence_score:.2f}")


Validation Results:


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)