In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import requests
import pandas as pd
from io import StringIO
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging

from datasets import load_dataset

#load train data
import pandas as pd
cols = ['id', 'text', 'label', 'intensity']
path = "https://raw.githubusercontent.com/vinayakumarr/WASSA-2017/refs/heads/master/wassa/data/training/"
anger_train = pd.read_csv(StringIO(requests.get(path + 'anger-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)
fear_train = pd.read_csv(StringIO(requests.get(path + 'fear-ratings-0to1.train').text), header=None, sep='\t', names=cols, index_col=0)
sad_train = pd.read_csv(StringIO(requests.get(path + 'sadness-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)
joy_train = pd.read_csv(StringIO(requests.get(path + 'joy-ratings-0to1.train.txt').text), header=None, sep='\t', names=cols, index_col=0)

dataset = pd.concat([anger_train, fear_train, sad_train, joy_train], axis=0)

# Reset index for the combined DataFrame (optional)
dataset.reset_index(inplace=True)

from datasets import Dataset
import pandas as pd
dataset = Dataset.from_pandas(dataset)


# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Inspect the dataset
print(dataset)

def is_valid_intensity(example):
    if example['intensity'] is not None:
        #print(example['intensity'])
        try: 
            k = float(example['intensity'])
            return True
        except:
        
            return False
    else:
        return False

# Filter the dataset
dataset = dataset.filter(is_valid_intensity)
print(dataset)
# Split the shuffled dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Access the train and test datasets
train_data = train_test_split['train']
val_data = train_test_split['test']

# Inspect the datasets
print("Train Dataset:", train_data)
print("Test Dataset:", val_data)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 2470
})


Filter: 100%|██████████| 2470/2470 [00:00<00:00, 57318.26 examples/s]

Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 2466
})
Train Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 1972
})
Test Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity'],
    num_rows: 494
})





In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig
#from roberta import RobertaForSequenceClassification
from modeling import CLMSequenceClassification


#model_name = "openai-community/gpt2-medium"
model_name = "HuggingFaceTB/SmolLM2-135M"
#config.num_labels=2
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
from transformers.activations import ACT2FN
import random



model = CLMSequenceClassification.from_pretrained(model_name, num_labels=1).to('cuda')
model.config.pad_token_id = tokenizer.eos_token_id
import RoCoFT

RoCoFT.PEFT(model, method='row', rank=3) 

In [4]:
from datasets import DatasetDict


def generate_prompt(data_point):
    """
    Generates a prompt for evaluating the humor intensity of an edited headline.
    Args:
        data_point (dict): A dictionary containing 'original', 'edit', and 'meanGrade'.
    Returns:
        str: The formatted prompt as a string.
    """
    return f"""# Input: {data_point['text']} # Label: {data_point['label']} # Output: The intensity is"""  # noqa: E501


# Assuming `dataset` is your DatasetDict
def add_label_column(example):

    example['labels'] = float(example['intensity'])
  
    example['input'] = generate_prompt(example)

    
    return example

# Map the function over train and validation datasets
train_data = train_data.map(add_label_column)
val_data = val_data.map(add_label_column)

# Remove unnecessary columns

# Inspect the updated datasets
print("Train Dataset:", train_data)
print("Validation Dataset:", val_data)

Map: 100%|██████████| 1972/1972 [00:00<00:00, 11701.38 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 13290.65 examples/s]

Train Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity', 'labels', 'input'],
    num_rows: 1972
})
Validation Dataset: Dataset({
    features: ['id', 'text', 'label', 'intensity', 'labels', 'input'],
    num_rows: 494
})





In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding


tokenizer.padding_side = 'left'


# col_to_delete = ['idx']
col_to_delete = ['label', 'intensity','id', 'text']  # Update as per your dataset


mask_token = tokenizer.mask_token
def preprocessing_function(examples):
   
    return tokenizer(examples['input'], truncation=True, max_length=512)

tokenized_train_data = train_data.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
tokenized_val_data = val_data.map(preprocessing_function, batched=True, remove_columns=col_to_delete)
# llama_tokenized_datasets = llama_tokenized_datasets.rename_column("target", "label")
tokenized_train_data.set_format("torch")
tokenized_val_data.set_format("torch")

# Data collator for padding a batch of examples to the maximum length seen in the batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 1972/1972 [00:00<00:00, 33950.54 examples/s]
Map: 100%|██████████| 494/494 [00:00<00:00, 32709.03 examples/s]


In [6]:
tokenizer.decode(tokenized_train_data['input_ids'][10])

'# Input: @TehShockwave turn that grumpy frown upside-down\\n\\nYou did something next to impossible today # Label: sadness # Output: The intensity is'

In [7]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # If predictions are logits or have extra dimensions, squeeze
    if predictions.ndim > 1:
        predictions = predictions.squeeze()

    mae = mean_absolute_error(labels, predictions)
    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, predictions)
    
    # Define an "accuracy" for regression:
    # Example: within some threshold tolerance
    tolerance = 0.1  # you can change this
    acc = np.mean(np.abs(predictions - labels) < tolerance)

    pearson_corr, _ = pearsonr(predictions, labels)
    spearman_corr, _ = spearmanr(predictions, labels)

    return {
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "Accuracy": acc,
        "R2": r2,
        "Pearson": pearson_corr,
        "Spearman's Rank": spearman_corr
    }


In [8]:
from transformers import TrainingArguments, Trainer

import time
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='dir',
    learning_rate=5e-4,
    per_device_train_batch_size=14,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps= 1,
    num_train_epochs=10,
    weight_decay=0.20,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    save_steps=10000000,
    logging_steps=100,
   
    load_best_model_at_end=True,
    lr_scheduler_type="cosine",  # You can choose from 'linear', 'cosine', 'cosine_with_restarts', 'polynomial', etc.
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,

    data_collator=data_collator,
    compute_metrics=compute_metrics
)

[2025-04-27 21:20:40,607] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/guangyu/anaconda3/envs/MD/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'
/home/guangyu/anaconda3/envs/MD/comp

In [9]:
trainer.train()

Step,Training Loss,Validation Loss,Mae,Mse,Rmse,Accuracy,R2,Pearson,Spearman's rank
100,0.0818,0.067051,0.210168,0.067051,0.258941,0.291498,-0.883342,0.213574,0.197856
200,0.0706,0.034804,0.153192,0.034804,0.186559,0.368421,0.022403,0.213114,0.185888
300,0.0539,0.034769,0.15169,0.034769,0.186465,0.388664,0.023394,0.302827,0.262307
400,0.0465,0.031461,0.145609,0.031461,0.177373,0.388664,0.116303,0.373419,0.343521
500,0.0399,0.027055,0.133777,0.027055,0.164483,0.435223,0.240076,0.542303,0.506074
600,0.0353,0.042308,0.167247,0.042308,0.20569,0.3583,-0.188373,0.609411,0.589299
700,0.0291,0.020734,0.11689,0.020734,0.143992,0.495951,0.417626,0.669621,0.658772
800,0.0264,0.020344,0.114824,0.020344,0.142631,0.512146,0.428581,0.67327,0.657573
900,0.0202,0.019334,0.110694,0.019334,0.139048,0.516194,0.456926,0.704984,0.689938
1000,0.0189,0.017473,0.106083,0.017473,0.132185,0.552632,0.509219,0.715939,0.702101


TrainOutput(global_step=1410, training_loss=0.03439154491779652, metrics={'train_runtime': 161.5475, 'train_samples_per_second': 122.069, 'train_steps_per_second': 8.728, 'total_flos': 4354484289372.0, 'train_loss': 0.03439154491779652, 'epoch': 10.0})