In [1]:
!pip install pandas transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Load datasets
train_data = pd.read_csv('twitter_training.csv')
valid_data = pd.read_csv('twitter_validation.csv')
train_data = train_data.dropna(subset=['tweet'])
valid_data = valid_data.dropna(subset=['tweet'])
# Preprocessing function to map text labels to integers
label_mapping = {'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}
train_data['labels'] = train_data['true_label'].map(label_mapping)
valid_data['labels'] = valid_data['true_label'].map(label_mapping)

class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Setup tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create datasets
train_dataset = TweetDataset(train_data['tweet'].tolist(), train_data['labels'].tolist(), tokenizer)
valid_dataset = TweetDataset(valid_data['tweet'].tolist(), valid_data['labels'].tolist(), tokenizer)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=16, shuffle=False)

# Load model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-6)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.9152162671089172
Epoch 2, Loss: 0.49598702788352966
Epoch 3, Loss: 0.6377798914909363


In [3]:
model.eval()
model.to('cuda' if torch.cuda.is_available() else 'cpu')

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [5]:
from torch.nn.functional import softmax


In [6]:
# Function to predict sentiment from raw text input
def predict_sentiment_with_score(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Move tensors to the appropriate device
    input_ids = inputs['input_ids'].to(model.device)
    attention_mask = inputs['attention_mask'].to(model.device)

    # Predict sentiment
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = softmax(logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=-1)
        confidence_score = torch.max(probs, dim=-1).values

    # Mapping numeric predictions back to sentiment labels
    sentiment_labels = {0: 'Irrelevant', 1: 'Negative', 2: 'Neutral', 3: 'Positive'}
    predicted_label = sentiment_labels[predicted_class.item()]
    score = confidence_score.item()
    return predicted_label, score

# Example usage of the function
sample_text = "I really enjoy using this new product. It works wonders!"
predicted_sentiment, confidence = predict_sentiment_with_score(sample_text)
print(f"The predicted sentiment for the text is: {predicted_sentiment} with a confidence score of {confidence:.2f}")

The predicted sentiment for the text is: Positive with a confidence score of 0.94


In [None]:
# Validation function
def evaluate(model, data_loader):
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    print(classification_report(true_labels, predictions, target_names=label_mapping.keys(), digits=4))
    print("Accuracy:", accuracy_score(true_labels, predictions))

# Evaluate the model
evaluate(model, valid_loader)

In [None]:
# # Define the sentiment prediction function
# def predict_sentiment(text):
#     # Tokenize the input text and convert to Tensor
#     inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")

#     # Move tensors to the device where the model is
#     input_ids = inputs['input_ids'].to(model.device)
#     attention_mask = inputs['attention_mask'].to(model.device)

#     # Predict sentiment
#     with torch.no_grad():
#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#         prediction = torch.argmax(outputs.logits, dim=-1).item()

#     # Map numerical predictions to labels
#     label_dict = {0: 'Irrelevant', 1: 'Negative', 2: 'Neutral', 3: 'Positive'}
#     return label_dict[prediction]

# # Example usage
# sample_text = "I'm not very happy with the service."
# predicted_sentiment = predict_sentiment(sample_text)
# print("Predicted Sentiment:", predicted_sentiment)