In [None]:
!pip install -q transformers torch scikit-learn pandas scipy accelerate


In [None]:
!pip install -U transformers accelerate




In [None]:
# Print the installed transformers version so the environment used for this run
# is recorded next to the results.
import transformers
print(transformers.__version__)


4.57.3


In [None]:
import os
# Set the WANDB_DISABLED flag for this process; intended to keep Weights & Biases
# logging off for the Trainer runs below.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import GroupShuffleSplit
from scipy.stats import pearsonr




In [None]:
# Read the labelled training set and order it chronologically within each user,
# renumbering rows 0..n-1 so positional indices line up with the sorted order.
train_df = (
    pd.read_csv("/content/train_subtask1.csv")
    .sort_values(by=["user_id", "timestamp"])
    .reset_index(drop=True)
)

train_df.head()


Unnamed: 0,user_id,text_id,text,timestamp,collection_phase,is_words,valence,arousal
0,1,200,I feel good . I caught up on some sleep . Wo...,2021-06-09 12:41:57,1,False,2.0,1.0
1,1,201,I’ve been feeling good for days and days . I r...,2021-06-11 12:01:45,1,False,2.0,1.0
2,1,202,I’ve been feeling fine personally . I’ve been ...,2021-06-13 13:15:07,1,False,0.0,1.0
3,1,203,I feel great . I’ve had a day off . I’m going ...,2021-06-16 12:03:12,1,False,2.0,1.0
4,1,204,I feel great today . I am well rested . I have...,2021-06-17 12:38:38,1,False,2.0,2.0


In [None]:
# Hold out 20% of the *users* (not rows) for validation: grouping by user_id keeps
# every text of a given user on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(train_df, groups=train_df["user_id"]))

# The splitter yields positional indices, hence .iloc.
train_data, val_data = train_df.iloc[train_idx], train_df.iloc[val_idx]


In [None]:
# Checkpoint identifier shared by the tokenizer and by every model loaded in this notebook.
MODEL_NAME = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class AffectDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Reads the ``text``, ``user_id`` and ``text_id`` columns of ``df``. Unless
    ``is_test`` is true it also reads ``valence`` and ``arousal`` and attaches
    them to each item as a 2-element float tensor under the ``"labels"`` key.
    """

    def __init__(self, df, tokenizer, is_test=False, max_len=256):
        """Cache the needed columns as Python lists / a float array.

        Args:
            df: DataFrame with ``text``, ``user_id``, ``text_id`` columns
                (plus ``valence`` and ``arousal`` when ``is_test`` is false).
            tokenizer: Callable applied to a single text in ``__getitem__``.
            is_test: When true, no labels are read or returned.
            max_len: Length every encoded text is padded / truncated to.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test

        self.texts = df["text"].tolist()
        self.user_ids = df["user_id"].tolist()
        self.text_ids = df["text_id"].tolist()

        # Test-mode datasets deliberately have no `labels` attribute.
        if is_test:
            return
        self.labels = df[["valence", "arousal"]].values.astype(float)

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize text ``idx`` and return its tensors (plus labels if available)."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer is asked for "pt" tensors for a single text; squeeze(0)
        # removes the leading size-1 dimension from each returned tensor.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)

        if self.is_test:
            return sample

        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Load the pretrained encoder with a regression head of two outputs, matching the
# (valence, arousal) label pairs that AffectDataset attaches to each item.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="regression"
)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Hyperparameters for the train/validation run. Evaluation and checkpointing are
# both step-based and share the same interval (every 100 steps).
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,               # fewer epochs for faster runs
    per_device_train_batch_size=8,    # smaller batch to fit GPU
    gradient_accumulation_steps=4,    # effective batch size per device = 8*4 = 32
    eval_strategy="steps",
    eval_steps=100,                   # evaluate every 100 steps
    save_strategy="steps",
    save_steps=100,                   # checkpoint at the same interval as evaluation
    logging_steps=50,
    learning_rate=5e-5,
    fp16=True,                        # mixed precision for speed
    dataloader_num_workers=4,         # parallel data loading
    report_to="none"                  # disable WandB logging
)

In [None]:
# Fine-tune on the training users; the held-out users' texts serve as the
# evaluation set for the step-based evaluation configured in training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=AffectDataset(train_data, tokenizer),
    eval_dataset=AffectDataset(val_data, tokenizer),
)

trainer.train()




Step,Training Loss,Validation Loss
100,0.6278,0.704182


TrainOutput(global_step=136, training_loss=0.6797231365652645, metrics={'train_runtime': 13212.3824, 'train_samples_per_second': 0.328, 'train_steps_per_second': 0.01, 'total_flos': 569635434854400.0, 'train_loss': 0.6797231365652645, 'epoch': 2.0})

In [None]:
def semeval_task1_eval(user_ids, preds, labels):
    """Score one target (e.g. valence) with within-user, between-user and composite metrics.

    Parameters
    ----------
    user_ids : array-like, shape (n,)
        Identifier of the user each sample belongs to.
    preds : array-like, shape (n,)
        Predicted values.
    labels : array-like, shape (n,)
        Gold values.

    Returns
    -------
    dict
        ``r_within``    mean per-user Pearson r between predictions and labels,
        ``r_between``   Pearson r between per-user mean prediction and mean label,
        ``r_composite`` tanh of the average of the Fisher z-transforms
                        (``arctanh``) of ``r_within`` and ``r_between``,
        ``mae_within``  mean of per-user mean absolute errors,
        ``mae_between`` mean absolute error between per-user means.

    Notes
    -----
    * Users with fewer than two samples, or whose labels are all identical, are
      left out of both within-user metrics (their correlation is undefined).
    * Users whose *predictions* are all identical still count towards
      ``mae_within`` but are left out of ``r_within``: Pearson r is NaN for a
      constant input and a single NaN would otherwise turn ``r_within`` and
      ``r_composite`` into NaN.
    * A metric that cannot be computed (no eligible users, fewer than two users
      for ``r_between``) is returned as NaN instead of raising.

    Raises
    ------
    ValueError
        If the three inputs do not have the same length.
    """
    user_ids = np.asarray(user_ids)
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if not (len(user_ids) == len(preds) == len(labels)):
        raise ValueError("user_ids, preds and labels must have the same length")

    nan = np.float64("nan")

    r_vals, mae_vals = [], []
    user_pred_means, user_gold_means = [], []

    # One pass over users collects both the within-user and between-user statistics.
    for u in np.unique(user_ids):
        m = user_ids == u
        user_preds, user_gold = preds[m], labels[m]

        # BETWEEN-USER: every user contributes its means.
        user_pred_means.append(np.mean(user_preds))
        user_gold_means.append(np.mean(user_gold))

        # WITHIN-USER: needs at least two samples and non-constant labels.
        # Exact comparison against the first element is used rather than
        # variance == 0, which rounding can make non-zero for constant floats.
        if user_gold.size < 2 or np.all(user_gold == user_gold[0]):
            continue
        mae_vals.append(np.mean(np.abs(user_preds - user_gold)))

        if np.all(user_preds == user_preds[0]):
            continue  # constant predictions: correlation undefined
        r, _ = pearsonr(user_preds, user_gold)
        if np.isfinite(r):
            r_vals.append(r)

    r_within = np.mean(r_vals) if r_vals else nan
    mae_within = np.mean(mae_vals) if mae_vals else nan

    user_pred_means = np.asarray(user_pred_means, dtype=float)
    user_gold_means = np.asarray(user_gold_means, dtype=float)

    # pearsonr needs at least two points.
    if user_pred_means.size >= 2:
        r_between, _ = pearsonr(user_pred_means, user_gold_means)
    else:
        r_between = nan
    if user_pred_means.size >= 1:
        mae_between = np.mean(np.abs(user_pred_means - user_gold_means))
    else:
        mae_between = nan

    # COMPOSITE: average the two correlations in Fisher-z space.
    r_comp = np.tanh(
        0.5 * (np.arctanh(r_within) + np.arctanh(r_between))
    )

    return {
        "r_within": r_within,
        "r_between": r_between,
        "r_composite": r_comp,
        "mae_within": mae_within,
        "mae_between": mae_between
    }


In [None]:
# Predict on the held-out users; column 0 of the predictions is scored against
# valence and column 1 against arousal (same order as the training labels).
val_dataset = AffectDataset(val_data, tokenizer)
preds = trainer.predict(val_dataset).predictions

valence_metrics, arousal_metrics = (
    semeval_task1_eval(
        val_data["user_id"].values,
        preds[:, column],
        val_data[target].values,
    )
    for column, target in enumerate(("valence", "arousal"))
)

print("VALENCE:", valence_metrics)
print("AROUSAL:", arousal_metrics)




VALENCE: {'r_within': np.float64(0.6236920048474425), 'r_between': np.float64(0.7554024202141099), 'r_composite': np.float64(0.6953405951080504), 'mae_within': np.float64(0.8256400038067192), 'mae_between': np.float64(0.42125015516157227)}
AROUSAL: {'r_within': np.float64(0.31639966372207645), 'r_between': np.float64(0.2094175445229811), 'r_composite': np.float64(0.26371939840248926), 'mae_within': np.float64(0.5438585663345263), 'mae_between': np.float64(0.24034448857589888)}


The model performs well on valence, showing strong ability to distinguish users’ overall emotional polarity (r_between = 0.76) and to track emotional changes over time within users (r_within = 0.62), resulting in a solid composite correlation (r_composite = 0.70). In contrast, performance on arousal is substantially weaker, with low between-user (r_between = 0.21) and within-user (r_within = 0.32) correlations and a composite of only r_composite = 0.26, indicating difficulty capturing intensity or activation from text alone. Overall, the results suggest the model is effective at modeling positive vs. negative affect but struggles with emotional intensity, which limits overall leaderboard performance because the composite correlation is used for ranking.

In [None]:
# Retrain from the pretrained checkpoint on ALL labelled data (train + validation
# users) to produce the model that gets saved.
final_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="regression"
)

# No held-out split exists for this run, so evaluation has to be switched off:
# Trainer cannot follow an eval_strategy="steps" schedule without an eval_dataset.
# All other hyperparameters are read from training_args so the two runs stay in sync.
final_training_args = TrainingArguments(
    output_dir=training_args.output_dir,
    eval_strategy="no",
    num_train_epochs=training_args.num_train_epochs,
    per_device_train_batch_size=training_args.per_device_train_batch_size,
    gradient_accumulation_steps=training_args.gradient_accumulation_steps,
    save_strategy=training_args.save_strategy,
    save_steps=training_args.save_steps,
    logging_steps=training_args.logging_steps,
    learning_rate=training_args.learning_rate,
    fp16=training_args.fp16,
    dataloader_num_workers=training_args.dataloader_num_workers,
    report_to="none",
)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=AffectDataset(train_df, tokenizer)
)

final_trainer.train()


In [None]:
# Write the fully-trained model and its tokenizer to the same directory so the
# pair can be loaded together from that single path.
final_model.save_pretrained("/content/semeval_model")
tokenizer.save_pretrained("/content/semeval_model")
