<a href="https://colab.research.google.com/github/Munna-Prasad-Gupta/nlp/blob/main/Retrained_NewDataset_Distilbert_Sieberta_XLMRoberta_RobertaBase_EmotionAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Suppress WandB logging if not required
os.environ["WANDB_DISABLED"] = "true"

# # Load Dataset
# df = pd.read_csv("deforestation_sentiment_dataset.csv")
# print(len(df))

# import pandas as pd
data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
# data.head()

# Preview the data
# print(data.head())  # Optional: Check the column names

# Select the columns and rename
df = data[['cleaned_text', 'Sentiment']].rename(columns={'cleaned_text': 'text'})

# Map Sentiment values to numeric categories
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['Sentiment'].map(sentiment_mapping)
# Split Dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42
)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Custom Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

# Load Pre-trained Model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # Updated parameter name
    save_strategy="epoch",
    run_name="distilbert_deforestation_sentiment",  # Custom run name
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the Model
trainer.train()

# Evaluate the Model
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)
f1 = f1_score(val_labels, pred_labels, average='weighted')
print("F1 Score:", f1)

print(classification_report(val_labels, pred_labels, target_names=['Negative', 'Neutral', 'Positive']))

# Save the Model
model.save_pretrained('./fine_tuned_distilbert')
tokenizer.save_pretrained('./fine_tuned_distilbert')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.6915,0.463149
2,0.2138,0.176322
3,0.1574,0.142762


F1 Score: 0.9478158394585725
              precision    recall  f1-score   support

    Negative       0.97      0.93      0.95        81
     Neutral       0.88      1.00      0.94        23
    Positive       0.94      0.96      0.95        49

    accuracy                           0.95       153
   macro avg       0.93      0.96      0.95       153
weighted avg       0.95      0.95      0.95       153



('./fine_tuned_distilbert/tokenizer_config.json',
 './fine_tuned_distilbert/special_tokens_map.json',
 './fine_tuned_distilbert/vocab.txt',
 './fine_tuned_distilbert/added_tokens.json')

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Suppress WandB logging
os.environ["WANDB_DISABLED"] = "true"

# # Load Dataset
# df = pd.read_csv("deforestation_sentiment_dataset.csv")

# # Preprocess Dataset
# df['sentiment'] = df['sentiment'].astype(int)
data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
# data.head()

# Preview the data
# print(data.head())  # Optional: Check the column names

# Select the columns and rename
df = data[['cleaned_text', 'Sentiment']].rename(columns={'cleaned_text': 'text'})

# Map Sentiment values to numeric categories
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['Sentiment'].map(sentiment_mapping)


# Stratified Split for Training and Validation Sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

# Load Tokenizer for Siebert's Model
tokenizer = RobertaTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

# Custom Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx],
        }

# Create Datasets
train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

# Load Pre-trained Siebert Model with Fix
model = RobertaForSequenceClassification.from_pretrained(
    "siebert/sentiment-roberta-large-english",
    num_labels=3,  # Match the number of sentiment classes
    ignore_mismatched_sizes=True  # Allow resizing of the classifier layer
)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.1,
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none",  # Disable WandB
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the Model
trainer.train()

# Evaluate the Model
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

# Calculate Metrics
f1 = f1_score(val_labels, pred_labels, average='weighted')
classification_report_data = classification_report(val_labels, pred_labels, target_names=['Negative', 'Neutral', 'Positive'])

# Print Metrics
print(f"Weighted F1 Score: {f1}")
print(classification_report_data)

# Save the Fine-Tuned Model
model_save_path = "fine_tuned_siebert_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved at: {model_save_path}")

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at siebert/sentiment-roberta-large-english and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 1024]) in the checkpoint and torch.Size([3, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.671,0.445382
2,0.1539,0.192323
3,0.2201,0.273931
4,0.1472,0.246731


Weighted F1 Score: 0.948323785488432
              precision    recall  f1-score   support

    Negative       1.00      0.92      0.96        88
     Neutral       0.86      1.00      0.93        25
    Positive       0.91      0.97      0.94        40

    accuracy                           0.95       153
   macro avg       0.92      0.97      0.94       153
weighted avg       0.95      0.95      0.95       153

Model saved at: fine_tuned_siebert_model


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Suppress WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Upload Dataset in Colab
# uploaded = files.upload()  # This will prompt for file upload
# file_name = list(uploaded.keys())[0]  # Get the uploaded file name

# Load the dataset
# data = pd.read_csv(file_name)


data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
# Select the columns and rename
df = data[['cleaned_text', 'Sentiment']].rename(columns={'cleaned_text': 'text'})

# Map Sentiment values to numeric categories
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['Sentiment'].map(sentiment_mapping)

# Stratified Split for Training and Validation Sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

# Load XLM-RoBERTa Tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Custom Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': self.labels[idx],
        }

# Create Datasets
train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

# Load Pre-trained XLM-RoBERTa Model
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=3  # Match the number of sentiment classes
)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.1,
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none",  # Disable WandB
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the Model
trainer.train()

# Evaluate the Model
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

# Calculate Metrics
f1 = f1_score(val_labels, pred_labels, average='weighted')
classification_report_data = classification_report(val_labels, pred_labels, target_names=['Negative', 'Neutral', 'Positive'])

# Print Metrics
print(f"Weighted F1 Score: {f1}")
print(classification_report_data)

# Save the Fine-Tuned Model
model_save_path = "fine_tuned_xlm_roberta_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved at: {model_save_path}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.0577,0.817074
2,0.5692,0.294439
3,0.1842,0.186803
4,0.1456,0.176884


Weighted F1 Score: 0.9485568559895965
              precision    recall  f1-score   support

    Negative       0.98      0.93      0.95        88
     Neutral       0.83      0.96      0.89        25
    Positive       0.97      0.97      0.97        40

    accuracy                           0.95       153
   macro avg       0.93      0.96      0.94       153
weighted avg       0.95      0.95      0.95       153

Model saved at: fine_tuned_xlm_roberta_model


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Suppress WandB logging
os.environ["WANDB_DISABLED"] = "true"


data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
# Select the columns and rename
df = data[['cleaned_text', 'Sentiment']].rename(columns={'cleaned_text': 'text'})

# Map Sentiment values to numeric categories
sentiment_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
df['sentiment'] = df['Sentiment'].map(sentiment_mapping)

# Stratified Split for Training and Validation Sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

# Load RoBERTa Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Custom Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': self.labels[idx],
        }

# Create Datasets
train_dataset = SentimentDataset(train_texts, train_labels)
val_dataset = SentimentDataset(val_texts, val_labels)

# Load Pre-trained XLM-RoBERTa Model
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3  # Match the number of sentiment classes
)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.1,
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="none",  # Disable WandB
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the Model
trainer.train()

# Evaluate the Model
predictions = trainer.predict(val_dataset)
pred_labels = predictions.predictions.argmax(-1)

# Calculate Metrics
f1 = f1_score(val_labels, pred_labels, average='weighted')
classification_report_data = classification_report(val_labels, pred_labels, target_names=['Negative', 'Neutral', 'Positive'])

# Print Metrics
print(f"Weighted F1 Score: {f1}")
print(classification_report_data)

# Save the Fine-Tuned Model
model_save_path = "fine_tuned_roberta_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved at: {model_save_path}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6497,0.228115
2,0.0494,0.195473
3,0.143,0.212626
4,0.0943,0.213241


Weighted F1 Score: 0.9416621137619134
              precision    recall  f1-score   support

    Negative       0.98      0.92      0.95        88
     Neutral       0.96      0.96      0.96        25
    Positive       0.87      0.97      0.92        40

    accuracy                           0.94       153
   macro avg       0.93      0.95      0.94       153
weighted avg       0.94      0.94      0.94       153

Model saved at: fine_tuned_roberta_model


In [23]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
import torch
from transformers import pipeline

# Suppress WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load data
data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
df = data[['cleaned_text', 'Emotion']].rename(columns={'cleaned_text': 'text'})



# print(df['emotion'].unique())

# Initialize the emotion classification pipeline
emotion = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    device=0 if device == "cuda" else -1
)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Emotion'], test_size=0.2, random_state=42)

# Map labels in the dataset to ensure consistency
unique_labels = sorted(df['Emotion'].unique())
label_to_int = {label: idx for idx, label in enumerate(unique_labels)}
int_to_label = {idx: label for label, idx in label_to_int.items()}

# Map y_test to integers for evaluation
y_test_mapped = y_test.map(label_to_int)

# Predict emotions for the test set in batches
print("Processing predictions in batches...")
batch_size = 16  # Use batching for efficiency
y_pred = []

for i in range(0, len(X_test), batch_size):
    batch_texts = X_test[i:i+batch_size].tolist()
    batch_predictions = emotion(batch_texts)

    for pred in batch_predictions:
        predicted_label = pred['label']
        # Map predicted labels to dataset labels (handle mismatches)
        if predicted_label in label_to_int:
            y_pred.append(label_to_int[predicted_label])
        else:
            # Default to a neutral label if prediction does not match
            y_pred.append(label_to_int.get("Neutral", 0))

# Calculate metrics
accuracy = accuracy_score(y_test_mapped, y_pred)
f1 = f1_score(y_test_mapped, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")

# Generate the classification report
print("\nClassification Report:")
print(classification_report(y_test_mapped, y_pred, target_names=unique_labels))


Processing predictions in batches...
Accuracy: 0.9412
F1-Score: 0.9400

Classification Report:
              precision    recall  f1-score   support

       anger       0.87      0.97      0.92        34
     disgust       1.00      0.74      0.85        19
        fear       1.00      0.94      0.97        16
         joy       0.86      1.00      0.92         6
     neutral       0.97      0.98      0.98        66
     sadness       0.83      0.83      0.83         6
    surprise       1.00      1.00      1.00         6

    accuracy                           0.94       153
   macro avg       0.93      0.92      0.92       153
weighted avg       0.95      0.94      0.94       153



In [25]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import pipeline, Trainer, TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset

# Suppress WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load data
data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
df = data[['cleaned_text', 'Emotion']].rename(columns={'cleaned_text': 'text'})

# Label Mapping
label_mapping = {
    "anger": 0,
    "fear": 1,
    "joy": 2,
    "neutral": 3,
    "sadness": 4,
    "disgust":5,
    "surprise":6
}
df['label'] = df['Emotion'].map(label_mapping)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Initialize the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
model = RobertaForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base", num_labels=len(label_mapping))

# Tokenize the data
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts.iloc[item]
        label = self.labels.iloc[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = EmotionDataset(X_train, y_train, tokenizer)
test_dataset = EmotionDataset(X_test, y_test, tokenizer)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate after each epoch
    save_strategy="epoch",           # save model after each epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
print("Evaluating the model...")
predictions, true_labels, metrics = trainer.predict(test_dataset)

# Get the predicted labels
pred_labels = predictions.argmax(axis=1)

# Calculate accuracy and F1 score
accuracy = accuracy_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")

# Generate the classification report
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, target_names=[key for key in label_mapping]))


RuntimeError: Error(s) in loading state_dict for RobertaForSequenceClassification:
	size mismatch for classifier.out_proj.weight: copying a param with shape torch.Size([7, 768]) from checkpoint, the shape in current model is torch.Size([5, 768]).
	size mismatch for classifier.out_proj.bias: copying a param with shape torch.Size([7]) from checkpoint, the shape in current model is torch.Size([5]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [24]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import pipeline, Trainer, TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset

# Suppress WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load data
data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
df = data[['cleaned_text', 'Emotion']].rename(columns={'cleaned_text': 'text'})

# Label Mapping
label_mapping = {
    "Angry": 0,
    "Fear": 1,
    "Excited": 2,
    "Neutral": 3,
    "Sad": 4
}
# df['label'] = df['Emotion'].map(label_mapping)

df['Emotion'].unique()

array(['neutral', 'disgust', 'fear', 'anger', 'sadness', 'surprise',
       'joy'], dtype=object)

In [26]:
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import pipeline, Trainer, TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset

# Suppress WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load data
data = pd.read_csv("deforestation_tweets_with_filled_user_activity_updated.csv")
df = data[['cleaned_text', 'Emotion']].rename(columns={'cleaned_text': 'text'})

# Label Mapping
# Label Mapping
label_mapping = {
    "anger": 0,
    "fear": 1,
    "joy": 2,
    "neutral": 3,
    "sadness": 4,
    "disgust":5,
    "surprise":6
}
df['label'] = df['Emotion'].map(label_mapping)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Initialize the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
model = RobertaForSequenceClassification.from_pretrained(
    "j-hartmann/emotion-english-distilroberta-base",
    num_labels=len(label_mapping),  # Set number of labels to 5
    ignore_mismatched_sizes=True  # Ignore size mismatch for classifier layer
)

# Tokenize the data
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts.iloc[item]
        label = self.labels.iloc[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = EmotionDataset(X_train, y_train, tokenizer)
test_dataset = EmotionDataset(X_test, y_test, tokenizer)

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate after each epoch
    save_strategy="epoch",           # save model after each epoch
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
print("Evaluating the model...")
predictions, true_labels, metrics = trainer.predict(test_dataset)

# Get the predicted labels
pred_labels = predictions.argmax(axis=1)

# Calculate accuracy and F1 score
accuracy = accuracy_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")

# Generate the classification report
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, target_names=[key for key in label_mapping]))


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,2.9811,1.76391


Epoch,Training Loss,Validation Loss
1,2.9811,1.76391
2,0.8059,0.500902
3,0.3312,0.28334


Evaluating the model...


Accuracy: 0.9085
F1-Score: 0.8930

Classification Report:
              precision    recall  f1-score   support

       anger       0.97      0.88      0.92        34
        fear       0.68      0.94      0.79        16
         joy       0.00      0.00      0.00         6
     neutral       0.97      0.98      0.98        66
     sadness       0.83      0.83      0.83         6
     disgust       0.86      0.95      0.90        19
    surprise       1.00      1.00      1.00         6

    accuracy                           0.91       153
   macro avg       0.76      0.80      0.77       153
weighted avg       0.88      0.91      0.89       153



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
