In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, classification_report
from tqdm import tqdm
import numpy as np

In [None]:
# Read the training CSV into a DataFrame
df = pd.read_csv("/content/drive/MyDrive/train/eng.csv")

# Inputs are the raw "text" column as a Python list; targets are the five
# emotion columns stacked into one 2-D array (one row per sample).
texts = df["text"].tolist()
labels = df[["anger", "fear", "joy", "sadness", "surprise"]].to_numpy()

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
(texts_train, texts_test,
 labels_train, labels_test) = train_test_split(texts, labels, random_state=42, test_size=0.2)

# Tokenizer matching the "roberta-large" checkpoint
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

In [None]:
def tokenize_texts(texts, tokenizer, max_len=256):
    """Run ``tokenizer`` over ``texts`` with fixed-length padding and truncation.

    Args:
        texts: Passed to the tokenizer as its first positional argument.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_len: Forwarded as ``max_length``; padding is requested with
            ``padding="max_length"`` and truncation is enabled.

    Returns:
        Whatever ``tokenizer`` returns; PyTorch tensors are requested via
        ``return_tensors="pt"``.
    """
    options = {
        "max_length": max_len,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    encoded = tokenizer(texts, **options)
    return encoded

# Encode both splits with identical settings (max_len=256), train first
train_encodings, test_encodings = (
    tokenize_texts(split_texts, tokenizer, max_len=256)
    for split_texts in (texts_train, texts_test)
)

In [None]:
# Dataset class
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample label rows.

    Args:
        encodings: Mapping of field name -> indexable collection; ``encodings[key][idx]``
            must yield the value of that field for sample ``idx``.
        labels: Indexable collection of label rows; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _as_new_tensor(value):
        """Return an independent tensor copy of ``value``.

        Values that are already tensors are copied with ``clone().detach()``;
        calling ``torch.tensor`` on a tensor triggers PyTorch's copy-construct
        UserWarning for every field of every sample.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for ``idx`` plus float ``labels``."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Pair each split's encodings with its label matrix
train_dataset = EmotionDataset(encodings=train_encodings, labels=labels_train)
test_dataset = EmotionDataset(encodings=test_encodings, labels=labels_test)

In [None]:
# Pre-trained "roberta-large" with a 5-output head set up for multi-label classification
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-large", problem_type="multi_label_classification", num_labels=5
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
from torch.nn import BCEWithLogitsLoss
from torch.cuda.amp import GradScaler, autocast

# Unweighted binary cross-entropy applied to raw logits
loss_fn = BCEWithLogitsLoss()

# Batches of 16; only the training split is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# AdamW with a cosine schedule and no warm-up, sized for 3 passes over train_loader
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = 3 * len(train_loader)
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Gradient scaler consumed by the mixed-precision training loop
scaler = GradScaler()

In [None]:
# Training loop: 3 epochs of mixed-precision fine-tuning
model.train()
for epoch in range(3):  # 3 epochs
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Keep the targets out of the forward call: when `labels` is passed the
        # model also computes its own loss, duplicating the work done by loss_fn.
        targets = batch.pop("labels")

        with autocast():
            logits = model(**batch).logits
            loss = loss_fn(logits, targets)

        # Scale the loss before backward; scaler.step/update apply the optimizer
        # step and adjust the scale factor for the next iteration.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

# Evaluation
model.eval()
all_preds = []
all_labels = []

In [None]:
# Collect sigmoid probabilities and targets for the held-out split.
# The accumulators are (re)initialised here so this cell can be re-run on its
# own; after a first run `all_preds`/`all_labels` are ndarrays, not lists.
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Targets are only needed for scoring, not for the forward pass
        targets = batch.pop("labels")
        logits = model(**batch).logits
        preds = torch.sigmoid(logits).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(targets.cpu().numpy())

# Optimize thresholds per label
# NOTE: thresholds are tuned on the same split the metrics are reported on, so
# the reported scores are optimistic; a separate validation split would avoid this.
optimal_thresholds = []
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)
for i in range(all_labels.shape[1]):
    precision, recall, thresholds = precision_recall_curve(all_labels[:, i], all_preds[:, i])
    f1_scores = 2 * precision * recall / (precision + recall + 1e-8)
    # precision/recall carry one trailing entry with no matching threshold;
    # drop it so the argmax index is always valid for `thresholds`.
    optimal_idx = np.argmax(f1_scores[:-1])
    optimal_thresholds.append(thresholds[optimal_idx])

# Apply optimized thresholds
# precision_recall_curve scores a sample as positive when score >= threshold,
# so the same inclusive comparison must be used here; a strict `>` would drop
# the samples sitting exactly at the selected threshold.
all_preds_bin = (all_preds >= np.array(optimal_thresholds)).astype(int)

In [None]:
# Report element-wise agreement with the targets, then the per-label report
accuracy = np.mean(all_preds_bin == all_labels)
print(f"Accuracy: {accuracy:.4f}")
print(
    classification_report(
        all_labels,
        all_preds_bin,
        target_names=["Anger", "Fear", "Joy", "Sadness", "Surprise"],
    )
)

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr  # Importing pearsonr for correlation calculation

from sklearn.model_selection import train_test_split

# Read the training CSV (adjust the path as needed)
df = pd.read_csv("/content/drive/MyDrive/train/eng.csv")

# Inputs are the raw "text" column; targets are the five emotion columns
# stacked into one 2-D array (one row per sample).
texts = df["text"].tolist()
labels = df[["anger", "fear", "joy", "sadness", "surprise"]].to_numpy()

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable
(texts_train, texts_test,
 labels_train, labels_test) = train_test_split(texts, labels, random_state=42, test_size=0.2)

# Tokenizer matching the "roberta-large" checkpoint
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

def tokenize_texts(texts, tokenizer, max_len=256):
    """Run ``tokenizer`` over ``texts`` with fixed-length padding and truncation.

    Args:
        texts: Passed to the tokenizer as its first positional argument.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_len: Forwarded as ``max_length``; padding is requested with
            ``padding="max_length"`` and truncation is enabled.

    Returns:
        Whatever ``tokenizer`` returns; PyTorch tensors are requested via
        ``return_tensors="pt"``.
    """
    options = {
        "max_length": max_len,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    encoded = tokenizer(texts, **options)
    return encoded

# Encode both splits with identical settings (max_len=256), train first
train_encodings, test_encodings = (
    tokenize_texts(split_texts, tokenizer, max_len=256)
    for split_texts in (texts_train, texts_test)
)

In [None]:
# Dataset class
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-sample label rows.

    Args:
        encodings: Mapping of field name -> indexable collection; ``encodings[key][idx]``
            must yield the value of that field for sample ``idx``.
        labels: Indexable collection of label rows; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    @staticmethod
    def _as_new_tensor(value):
        """Return an independent tensor copy of ``value``.

        Values that are already tensors are copied with ``clone().detach()``;
        calling ``torch.tensor`` on a tensor triggers PyTorch's copy-construct
        UserWarning for every field of every sample.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for ``idx`` plus float ``labels``."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Wrap each split; the label matrices are cast to float32 first
train_dataset = EmotionDataset(train_encodings, labels_train.astype(np.float32))
test_dataset = EmotionDataset(test_encodings, labels_test.astype(np.float32))

# Load pre-trained Roberta model and modify for regression.
# problem_type is declared explicitly so that any loss the model computes
# itself (whenever `labels` reaches forward) is a regression loss, matching
# the MSE objective used for training; left unset, the library infers the
# problem type from the label dtype instead.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-large",
    num_labels=5,  # Number of emotions
    problem_type="regression",
)
# NOTE(review): num_labels=5 should already size this layer to 5 outputs, so
# this replacement appears only to re-initialise the output layer — confirm
# before removing, since dropping it changes the starting weights.
model.classifier.out_proj = torch.nn.Linear(model.classifier.out_proj.in_features, 5)

# Move model to device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
# Loss function and optimizer
from torch.nn import MSELoss
from transformers import AdamW, get_scheduler
loss_fn = MSELoss()
optimizer = AdamW(model.parameters(), lr=2e-5)

# Data loaders (batches of 16, only the training split shuffled) and a linear
# learning-rate schedule with no warm-up, sized for 3 passes over train_loader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
num_training_steps = len(train_loader) * 3  # 3 epochs
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Mixed precision training
from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler()

# Training loop
model.train()
for epoch in range(3):  # Train for 3 epochs
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Keep the targets out of the forward call: when `labels` is passed the
        # model computes a loss of its own, which is never used here and need
        # not be the MSE objective optimised below.
        targets = batch.pop("labels")

        with autocast():
            logits = model(**batch).logits
            loss = loss_fn(logits, targets)

        # Scale the loss before backward; scaler.step/update apply the optimizer
        # step and adjust the scale factor for the next iteration.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

In [None]:
# --- Evaluate the model on the Test Split ---
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Targets are only needed for scoring; forwarding them would make the
        # model compute an unused loss on every batch.
        targets = batch.pop("labels")
        logits = model(**batch).logits
        all_preds.extend(logits.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# --- Early Correction: clamp negative predictions to 0, then round to whole numbers ---
# All metrics below are computed on these clamped, integer-rounded predictions,
# not on the raw model outputs.
all_preds = np.round(np.clip(all_preds, 0, None)).astype(int)

# --- Calculate and Print Metrics ---
# One value per emotion column for each metric (multioutput='raw_values')
mse = mean_squared_error(all_labels, all_preds, multioutput='raw_values')
mae = mean_absolute_error(all_labels, all_preds, multioutput='raw_values')
r2 = r2_score(all_labels, all_preds, multioutput='raw_values')

# Pearson correlation per emotion column
pearson_corrs = np.array([
    pearsonr(all_labels[:, i], all_preds[:, i])[0]
    for i in range(all_labels.shape[1])
])

# Print metrics including Pearson correlation
print("--- Emotion-wise Metrics ---")
for i, emotion in enumerate(["Anger", "Fear", "Joy", "Sadness", "Surprise"]):
    print(f"{emotion}: MSE={mse[i]:.4f}, MAE={mae[i]:.4f}, R2={r2[i]:.4f}, Pearson={pearson_corrs[i]:.4f}")

print("\n--- Overall Metrics ---")
print(f"Mean MSE: {mse.mean():.4f}, Mean MAE: {mae.mean():.4f}, Mean R2: {r2.mean():.4f}, Mean Pearson: {pearson_corrs.mean():.4f}")

In [None]:
# --- Prediction on Data from `test.csv` ---
# Load the test dataset for prediction (Separate test data)
test_df = pd.read_csv("/content/drive/MyDrive/test/eng.csv")  # Adjust path to your test CSV
texts_test_csv = test_df["text"].tolist()  # The texts in the test.csv for prediction

# Tokenize the test texts
test_encodings_csv = tokenize_texts(texts_test_csv, tokenizer, max_len=256)

# EmotionDataset requires a labels collection (it defines the dataset length),
# so all-zero placeholders are supplied; they are discarded before the forward pass.
test_dataset_csv = EmotionDataset(test_encodings_csv, np.zeros((len(texts_test_csv), 5)))
test_loader_csv = DataLoader(test_dataset_csv, batch_size=16)

# Predict using the trained model
model.eval()
all_preds_csv = []

with torch.no_grad():
    for batch in test_loader_csv:
        batch = {k: v.to(device) for k, v in batch.items()}
        # Drop the placeholder labels: forwarding them would make the model
        # compute a meaningless loss against zeros on every batch.
        batch.pop("labels", None)
        logits = model(**batch).logits
        all_preds_csv.extend(logits.cpu().numpy())

# Convert predictions to numpy array for test.csv
all_preds_csv = np.array(all_preds_csv)

# Clamp negative values to 0, then round to whole numbers
all_preds_csv = np.round(np.clip(all_preds_csv, 0, None)).astype(int)

# Create DataFrame with correct column order
# NOTE(review): the training CSV uses lowercase emotion column names while the
# output uses capitalised ones — confirm which casing the consumer expects.
output_df = pd.DataFrame(all_preds_csv, columns=["Anger", "Fear", "Joy", "Sadness", "Surprise"])
output_df.insert(0, "id", test_df["id"])  # Insert ID at the first column

# Save predictions to CSV file
output_path = "/content/drive/MyDrive/pred_eng_1.csv"  # Update with your preferred path
output_df.to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")